### Throughout this exercise, you may need to refer to online documentation.

Pandas: http://pandas.pydata.org/pandas-docs/stable/

Numpy & Scipy: https://docs.scipy.org/doc/

In [1]:
# First, import packages and load some data
import pandas      as pd
import numpy       as np
import scipy       as sp
import scipy.stats as stats

# Path to the Pima Indians diabetes CSV, relative to the working directory.
datafile = './pima-indians-diabetes.csv'

# Row 0 of the file supplies the column names; no column is used as the index.
df = pd.read_csv(
    datafile,
    index_col=None,
    header=0,
)
df

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [15]:
# Boolean view of columns 1-7 (by position): True wherever the stored value is 0.
df1.iloc[:, 1:8].eq(0)

Unnamed: 0,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age
0,False,False,False,True,False,False,False
1,False,False,False,True,False,False,False
2,False,False,True,True,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False
764,False,False,False,True,False,False,False
765,False,False,False,False,False,False,False
766,False,False,True,True,False,False,False


In [29]:
# Missing data is coded as 0 in all but the first and last columns (where 0 is meaningful)
# Change 0s in columns 1-7 to np.nan

# Hint: you can conditionally assign slices of a dataframe to a new value:
# e.g. df.loc[df[c] == x, c] = v  will change values in column c to v where they were originally x.
# And remember you can slice multiple columns by position, e.g. df.iloc[:, [0, 1]] for the first two columns
# You can also look at pandas' DataFrame.replace() function

df1 = df.copy()  # deep copy, so df keeps the original 0-coded values

# Every column except the first and the last uses 0 as its "missing" code.
coded_cols = df1.columns[1:-1]

# NaN cannot be stored in an integer column, so cast to float explicitly and
# then swap the 0 codes for NaN. Assigning whole replacement columns avoids
# writing NaN element-wise into integer columns, which relies on silent
# upcasting that pandas has deprecated.
df1[coded_cols] = df1[coded_cols].astype(float).replace(0, np.nan)

df1




In [51]:
# Independent deep copy of df1; the statistics below are computed on df2.
df2 = df1.copy(deep=True)
df2

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0
2,8,183.0,64.0,,,23.3,0.672,32.0,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0
764,2,122.0,70.0,27.0,,36.8,0.340,27.0,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0
766,1,126.0,60.0,,,30.1,0.349,47.0,1


In [63]:
# Cross-check: pandas' Series.mean() skips NaN by default, so it should agree
# with numpy's NaN-aware mean of the same column.
glucose = df2["plasma_glucose_concentration"]
print(glucose.mean())
np.nanmean(glucose)

121.6867627785059


121.6867627785059

In [59]:
# The basics: counts, min, max, mean, median and mode
# Get the count (of all non-NaNs), min, max, mean, median and mode of each column.

# Hint 1: use numpy's min(), max(), nanmean() and nanmedian() functions.
# Hint 2: scipy.stats has a mode function ... but it returns a complex object.

# Which column has the most missing values?
print("Column,NaN Count,Min,Max,Mean,Median,Mode")
for col in df2.columns:
    series = df2[col]
    nan_count = series.isnull().sum()

    # Series.mode() returns every tied value as a Series. Join the values with
    # "|" so each column still prints as a single CSV row instead of spilling
    # the Series repr over several lines.
    modes = "|".join(str(value) for value in series.mode())

    # The statistics are formatted inline rather than stored in variables
    # named min/max, which would shadow the builtins for the rest of the
    # notebook session.
    print(
        f"{col},{nan_count},{series.min()},{series.max()},"
        f"{series.mean()},{series.median()},{modes}"
    )

Column,NaN Count,Min,Max,Mean,Median,Mode
times_pregnant,0,0,17,3.8450520833333335,3.0,0    1
dtype: int64
plasma_glucose_concentration,5,44.0,199.0,121.6867627785059,117.0,0     99.0
1    100.0
dtype: float64
diastolic_blood_pressure,35,24.0,122.0,72.40518417462484,72.0,0    70.0
dtype: float64
triceps_thickness,227,7.0,99.0,29.153419593345657,29.0,0    32.0
dtype: float64
2-hour_serum_insulin,374,14.0,846.0,155.5482233502538,125.0,0    105.0
dtype: float64
BMI,11,18.2,67.1,32.45746367239099,32.3,0    32.0
dtype: float64
diabetes_pedigreen,0,0.078,2.42,0.4718763020833327,0.3725,0    0.254
1    0.258
dtype: float64
age,0,21.0,81.0,33.240885416666664,29.0,0    22.0
dtype: float64
diabetes,0,0,1,0.3489583333333333,0.0,0    0
dtype: int64


In [None]:
# Now let's try to better characterize the column distributions.
# First, we can "centre" each column (also called standardizing, or converting to z-scores) so that it has mean=0, variance=1
# To center a value, we simply subtract the mean of the distribution, and divide that value 
#   by the standard deviation of the distribution:
# for each value v in a distribution of values V:
#    v = (v-mean(V)) / std(V)

# Write a centre() function
# Hint: centre([0,1,2,3,5,10]) => [-1.059, -0.757, -0.454, -0.151, 0.454, 1.967]

# Remember to use np.nanmean() and np.nanstd() to deal with NaNs
def centre(v):
    """Return a copy of array `v` with centred (z-scored) values.

    Each value has the mean of `v` subtracted and is then divided by the
    standard deviation of `v`. NaNs are ignored when computing the mean and
    the standard deviation, and stay NaN in the result.

    Args:
        v: One-dimensional array-like of numbers (list, ndarray, Series, ...).

    Returns:
        A new float ndarray of the same length as `v`; `v` is not modified.
        The population standard deviation (np.nanstd's default, ddof=0) is
        used, so centre([0, 1, 2, 3, 5, 10]) is approximately
        [-1.059, -0.757, -0.454, -0.151, 0.454, 1.967]. If every non-NaN
        value is identical the standard deviation is 0 and the result follows
        NumPy's division-by-zero rules (NaN/inf plus a RuntimeWarning).
    """
    # np.array(...) always allocates, so the caller's data is never touched.
    values = np.array(v, dtype=float)
    return (values - np.nanmean(values)) / np.nanstd(values)

In [None]:
# cdf starts out as a deep copy of df; it is meant to hold the centred values.
cdf = df.copy(deep=True)

In [None]:
# Now fill in cdf by applying your centre() function to each column in df

In [None]:
# Finally! Let's look for outliers by getting the min and max in the new centred df:
# (You can re-use code from above!)

# print is a function in Python 3 (the rest of this notebook already uses
# f-strings), so the header needs parentheses.
print("Column,Min,Max")
for col in cdf.columns:
    # One CSV row per column: its smallest and largest value in cdf.
    # NOTE(review): these are only z-scores once cdf has actually been filled
    # with centred values in the cell above - confirm before interpreting.
    print(f"{col},{cdf[col].min()},{cdf[col].max()}")

In [None]:
# trick question:
# What is the mean of cdf['times_pregnant']?
# Why isn't it zero?