### PIMA Dataset data analysis

This notebook explores the PIMA diabetes dataset and fits several classifiers to predict the onset of diabetes.

### Imports

In [114]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import Imputer
import pandas as pd
import numpy as np

# pd.options.mode.chained_assignment = None 

### Read dataset

In [115]:
# Path to the CSV file (relative path).
DATASET_NAME = 'dataset/diabetes_pima.csv'

# Feature columns used for modelling.
# NOTE(review): 'BMI' is present in the CSV but is not listed here —
# confirm the omission is intentional.
COLUMNS = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'DiabetesPedigreeFunction',
    'Age',
]

# Load the raw data and preview the first rows.
df = pd.read_csv(DATASET_NAME)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [116]:
# Target: the 'Outcome' label column, taken from `df` (the frame loaded from
# the CSV). `df` is the only frame this notebook defines, so every selection
# below reads from it.
df_target = df['Outcome']

# Features: only the columns listed in COLUMNS. `.copy()` yields an independent
# frame, so later column assignments on `df_attr` neither write through to `df`
# nor raise SettingWithCopyWarning. `df` itself is not mutated, which keeps this
# cell safe to re-run.
df_attr = df[COLUMNS].copy()
df_attr.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,0.627,50
1,1,85,66,29,0,0.351,31
2,8,183,64,0,0,0.672,32
3,1,89,66,23,94,0.167,21
4,0,137,40,35,168,2.288,33


### Preprocessing
Missing measurements are recorded as 0 in this dataset. Replace those zeros with the mean of the non-zero values in the same column.

In [117]:
# Columns in which a value of 0 stands for "not measured".
columns_to_impute = ["Insulin", "Glucose", "BloodPressure", "SkinThickness"]

# Work on an explicit copy so the assignment below never writes into a slice
# of another frame (avoids SettingWithCopyWarning) and the cell stays
# idempotent: re-running it finds no zeros left and changes nothing.
df_attr = df_attr.copy()

# Mean imputation done with pandas only: mark zeros as NaN, then fill each
# column with the mean of its remaining (non-zero) values. This does not rely
# on sklearn.preprocessing.Imputer, which was removed in scikit-learn 0.22.
# NOTE(review): the means are computed over all rows, before the train/test
# split in the next section, so test rows influence the imputed values.
observed = df_attr[columns_to_impute].replace(0, np.nan)
df_attr[columns_to_impute] = observed.fillna(observed.mean())

df_attr.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,155.548223,0.627,50
1,1,85.0,66.0,29.0,155.548223,0.351,31
2,8,183.0,64.0,29.15342,155.548223,0.672,32
3,1,89.0,66.0,23.0,94.0,0.167,21
4,0,137.0,40.0,35.0,168.0,2.288,33


#### Split attributes for training and testing

In [119]:
# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 1

# Hold out part of the rows for testing (train_test_split's default proportion).
X_train, X_test, Y_train, Y_test = train_test_split(
    df_attr,
    df_target,
    random_state=SPLIT_SEED,
)

## Models
Fit various models and check results

In [118]:
# Candidate classifiers. The tree-based models are seeded so that repeated
# runs of the notebook give the same scores.
models = [
    DecisionTreeClassifier(max_depth=5, random_state=1),
    LogisticRegression(),
    RandomForestClassifier(random_state=1),
]

# Fit each model on the training split and record its accuracy on the
# held-out split, so the predictions are actually evaluated instead of being
# overwritten on every iteration.
scores = []
for model in models:
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    scores.append({
        'model': type(model).__name__,
        'accuracy': accuracy_score(Y_test, Y_predict),
    })

# One row per model; displayed as the cell's output.
pd.DataFrame(scores)
                                       
                              