In [130]:
#import the raw dataset and name the columns 

import pandas as pd
import numpy as np


col_names = ['pregnant','glucose','bp','skin','insulin','bmi','pedigree','age','class']
df = pd.read_csv('pima-indians-diabetes.csv', names=col_names)

df.head(30)



Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


This looks like a dataset that relates incidence of diabetes to a variety of medical attributes of a person.  We can use this to make a model that predicts incidence of diabetes.  All of the predictors are numeric, which allows us to us Knn.  There are a number of 0s in some of the columns which must be missing values.    The features are not normalized, so we will need to do that before using them with KNN.  

Hypothesis:  Some of these medical attributes will correlate with having diabetes, enabling us to create a predictive model.  

In [131]:
#Fill in all of the missing values
#first replace all the 0s with NaN

missing_features = ['insulin','bp','glucose','skin','bmi','age','pedigree']

for column in missing_features:    
    df[column].replace(0, np.nan, inplace=True)
    
df.head(10)


Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,class
0,6,148,72.0,35.0,,33.6,0.627,50,1
1,1,85,66.0,29.0,,26.6,0.351,31,0
2,8,183,64.0,,,23.3,0.672,32,1
3,1,89,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116,74.0,,,25.6,0.201,30,0
6,3,78,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115,,,,35.3,0.134,29,0
8,2,197,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125,96.0,,,,0.232,54,1


In [132]:
df.describe()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,class
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [129]:
feature_mean = df['insulin'].mean()
feature_std = df['insulin'].std()
print feature_mean
print feature_std

df['insulin'] = df['insulin'].apply(lambda x: np.random.normal(feature_mean,feature_std) if np.isnan(x) else x)

df.head(1)

155.54822335
118.775855187


Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,class
0,6,148,72,35,153.520049,33.6,0.627,50,1


In [133]:
#Create a function to fill in random normal values using the normal distribution based on the values that are there

def fill_normal(feature):
    
    feature_mean = df[feature].mean()
    feature_std = df[feature].std()
    
    df[feature] = df[feature].apply(lambda x: np.random.normal(feature_mean,feature_std) if np.isnan(x) else x)

    #df[feature].fillna(np.random.normal(feature_mean,feature_std),inplace=True)
    

In [134]:
#run the function on all columns

for feature in missing_features:
    fill_normal(feature)

df.head(20)
    

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,class
0,6,148,72.0,35.0,140.076338,33.6,0.627,50,1
1,1,85,66.0,29.0,27.113181,26.6,0.351,31,0
2,8,183,64.0,40.331488,83.250039,23.3,0.672,32,1
3,1,89,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116,74.0,40.924579,19.181508,25.6,0.201,30,0
6,3,78,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115,71.238967,27.370179,35.475318,35.3,0.134,29,0
8,2,197,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125,96.0,26.463838,310.541777,26.588061,0.232,54,1


In [135]:
df.describe()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.639594,72.314882,29.297832,156.621565,32.458775,0.471876,33.240885,0.348958
std,3.369578,30.473218,12.334758,10.417808,117.295282,6.897741,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,-1.891885,-204.33903,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,22.0,78.992217,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,135.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,210.465687,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [136]:
#Normalize the data

features = ['pregnant','insulin','bp','glucose','skin','bmi','age','pedigree']

dfn['pregant'] = 

feature_mean = df['pregant'].mean()
feature_std = df['pregant'].std()


for feature in features:
    dfn
