# Predicting Diabetes

## Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# do plottin inline instead of in a separated window
∞matplotlib inline

## Load and review data

In [2]:
# Location of the Pima diabetes CSV, relative to the notebook's working directory.
pima_csv_path = '../MachineLearningWithPython/Notebooks/data/pima-data.csv'
df = pd.read_csv(pima_csv_path)

In [4]:
# (row count, column count) of the loaded frame
len(df.index), len(df.columns)

(768, 10)

In [5]:
# first five rows of the raw data
df.iloc[:5]

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0,True
3,1,89,66,23,94,28.1,0.167,21,0.9062,False
4,0,137,40,35,168,43.1,2.288,33,1.379,True


In [6]:
# last five rows of the raw data
df.iloc[-5:]

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
763,10,101,76,48,180,32.9,0.171,63,1.8912,False
764,2,122,70,27,0,36.8,0.34,27,1.0638,False
765,5,121,72,23,112,26.2,0.245,30,0.9062,False
766,1,126,60,0,0,30.1,0.349,47,0.0,True
767,1,93,70,31,0,30.4,0.315,23,1.2214,False


### Check for null values

In [9]:
# True if at least one cell anywhere in the frame is null
df.isna().to_numpy().any()

False

In [10]:
# pairwise correlation between columns (Pearson is the pandas default method)
df.corr(method='pearson')

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
num_preg,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,-0.081672,0.221898
glucose_conc,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.057328,0.466581
diastolic_bp,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.207371,0.065068
thickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.074752
insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.436783,0.130548
bmi,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.392573,0.292695
diab_pred,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.183928,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,-0.11397,0.238356
skin,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.074752
diabetes,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,0.074752,1.0


* The closer a value is to 1, the stronger the correlation between the two columns (a column is always perfectly correlated with itself). Here `skin` and `thickness` have a correlation of 1.0, so they carry the same information.

### Delete correlated columns

In [11]:
# Drop 'skin': the correlation matrix shows it is perfectly correlated (1.0) with
# 'thickness', so it adds no information. Rebinding `df` through drop() with
# errors='ignore' keeps this cell idempotent: re-running it is a no-op, whereas
# `del df['skin']` raises KeyError once the column is already gone.
df = df.drop(columns=['skin'], errors='ignore')

In [12]:
# preview the frame now that the 'skin' column has been removed
df.head(n=5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


## Check data types

In [13]:
# preview the first rows to eyeball each column's values; 'diabetes' still holds True/False here
df.iloc[:5]

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


Change True to 1 and False to 0

In [16]:
# Lookup table turning the boolean label into an integer class (True -> 1, False -> 0).
diabetes_map = {False: 0, True: 1}
encoded_diabetes = df['diabetes'].map(diabetes_map)
df['diabetes'] = encoded_diabetes

In [17]:
# confirm the 'diabetes' column now shows 1/0 instead of True/False
df.head(n=5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Check True or False ratio

In [38]:
# Count the rows of each class, then report each count with its share of all rows.
diabetic_rows = df.loc[df['diabetes'] == True]
non_diabetic_rows = df.loc[df['diabetes'] == False]
true_count = len(diabetic_rows)
false_count = len(non_diabetic_rows)
total_count = true_count + false_count

print("True to False ratio {0} => {1:2.2f}%".format(true_count, true_count / total_count * 100))
print("False to True ratio {0} => {1:2.2f}%".format(false_count, false_count / total_count * 100))

True to False ratio 268 => 34.90%
False to True ratio 500 => 65.10%


Good distribution between True and False, so no special work needed on the data.

### Splitting the data

70% for training the model and 30% for testing it

In [39]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs, and the column the model has to predict.
feature_col_names = [
    'num_preg',
    'glucose_conc',
    'diastolic_bp',
    'thickness',
    'insulin',
    'bmi',
    'diab_pred',
    'age',
]
predicted_class_names = ['diabetes']

# Plain NumPy arrays: x holds the features, y the label as a single-column 2-D array.
x = df[feature_col_names].to_numpy()
y = df[predicted_class_names].to_numpy()

# Fraction of rows held out for testing; the fixed random_state makes the split repeatable.
split_test_size = 0.30
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=split_test_size,
    random_state=42,
)


We check to ensure that training and testing data distribution has been done correctly

In [44]:
# Row counts of each split and their share of the combined total.
training_size, testing_size = len(x_train), len(x_test)
combined_size = training_size + testing_size

print("Training data: count = {0} -> {1:0.2f}%".format(training_size, training_size / combined_size * 100))
print("Testing data: count = {0} -> {1:0.2f}%".format(testing_size, testing_size / combined_size * 100))
      

Training data: count = 537 -> 69.92%
Testing data: count = 231 -> 30.08%


#### Verifying predicted value is split correctly

In [57]:
# Count positive (1) and negative (0) labels in the full frame and in each split,
# then print each count with its share of the set it belongs to.
original_total = len(df['diabetes'])
original_true = len(df.loc[df['diabetes'] == 1])
original_false = len(df.loc[df['diabetes'] == 0])

training_total = len(y_train)
training_true = len(y_train[y_train == 1])
training_false = len(y_train[y_train == 0])

testing_total = len(y_test)
testing_true = len(y_test[y_test == 1])
testing_false = len(y_test[y_test == 0])

print("Original True : {0} - {1:0.2f}%".format(original_true, original_true / original_total * 100))
print("Original False: {0} - {1:0.2f}%".format(original_false, original_false / original_total * 100))
print("")
print("Training True : {0} - {1:0.2f}%".format(training_true, training_true / training_total * 100))
print("Training False: {0} - {1:0.2f}%".format(training_false, training_false / training_total * 100))
print("")
print("Testing True  : {0} - {1:0.2f}%".format(testing_true, testing_true / testing_total * 100))
print("Testing False : {0} - {1:0.2f}%".format(testing_false, testing_false / testing_total * 100))


Original True : 268 - 34.90%
Original False: 500 - 65.10%

Training True : 188 - 35.01%
Training False: 349 - 64.99%

Testing True  : 80 - 34.63%
Testing False : 151 - 65.37%


### Post split data preparation

#### Hidden missing values

In [58]:
# look at the first rows again: note the zeros in columns such as 'thickness' and 'insulin'
df.head(n=5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


 * having a skin thickness of 0 is not physically possible, so there's a hidden missing value

## Training Initial Algorithm - Naive Bayes

In [61]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with default settings.
nb_model = GaussianNB()

# ravel() flattens the single-column label array to 1-D before fitting.
# fit() is the last expression, so the notebook displays the fitted estimator.
nb_model.fit(X=x_train, y=y_train.ravel())

GaussianNB(priors=None, var_smoothing=1e-09)

### Performance on Training Data

In [62]:
# predict values using training data
nb_predict_train = nb_model.predict(x_train)

# import the performance metric library
from sklearn import metrics

# accuracy
print("Accuracy {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))
print()

Accuracy 0.7672



### Performance on Testing Data

In [63]:
# predict values using training data
nb_predict_train = nb_model.predict(x_test)

# import the performance metric library
from sklearn import metrics

# accuracy
print("Accuracy {0:.4f}".format(metrics.accuracy_score(y_test, nb_predict_train)))
print()

Accuracy 0.7446



#### Metrics

In [69]:
# Score the predictions currently held in nb_predict_train against the test labels.
nb_confusion = metrics.confusion_matrix(y_test, nb_predict_train)
nb_report = metrics.classification_report(y_test, nb_predict_train)

print("Confussion Matrix")
print(nb_confusion)
print()

print("Classification Report")
print(nb_report)

Confussion Matrix
[[119  32]
 [ 27  53]]

Classification Report
              precision    recall  f1-score   support

           0       0.82      0.79      0.80       151
           1       0.62      0.66      0.64        80

    accuracy                           0.74       231
   macro avg       0.72      0.73      0.72       231
weighted avg       0.75      0.74      0.75       231



## Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with a fixed random_state; every other setting is left at its default.
rf_model = RandomForestClassifier(random_state=42)

# Labels are flattened to 1-D for fitting; fit() is the last expression so the
# notebook displays the fitted estimator and its parameters.
rf_model.fit(X=x_train, y=y_train.ravel())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

### Predict Training Data

In [78]:
# predict values using training data
rf_predict_train = rf_model.predict(x_train)

# accuracy
print("Accuracy {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))
print()

Accuracy 0.9888



### Predict Testing Data

In [79]:
# predict values using the testing data
rf_predict_test = rf_model.predict(x_test)

# accuracy of those predictions against the held-out test labels
rf_test_accuracy = metrics.accuracy_score(y_test, rf_predict_test)
print("Accuracy {0:.4f}".format(rf_test_accuracy))
print()

Accuracy 0.7403



#### Metrics

In [81]:
# Confusion matrix and per-class report for the random forest's test-set predictions.
rf_confusion = metrics.confusion_matrix(y_test, rf_predict_test)
rf_report = metrics.classification_report(y_test, rf_predict_test)

print("Confussion Matrix")
print(rf_confusion)
print()

print("Classification Report")
print(rf_report)

Confussion Matrix
[[123  28]
 [ 32  48]]

Classification Report
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       151
           1       0.63      0.60      0.62        80

    accuracy                           0.74       231
   macro avg       0.71      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231



If the prediction accuracy on the training data and on the test data differ this much, it indicates that the model is overfitting considerably, i.e. it has "learned the training data too well".

## Logistic Regression

In [92]:
from sklearn.linear_model import LogisticRegression

# C is the regularization hyperparameter, used here to counter overfitting.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LogisticRegression(C=0.7, random_state=42).fit(x_train, y_train.ravel())
lr_predict_test = lr_model.predict(x_test)

# test-set metrics (the predictions above were made on x_test)
lr_test_accuracy = metrics.accuracy_score(y_test, lr_predict_test)
lr_confusion = metrics.confusion_matrix(y_test, lr_predict_test)
lr_report = metrics.classification_report(y_test, lr_predict_test)

print("Accuracy {0:.4f}".format(lr_test_accuracy))
print()

print("Confussion Matrix")
print(lr_confusion)
print()

print("Classification Report")
print(lr_report)

Accuracy 0.7619

Confussion Matrix
[[128  23]
 [ 32  48]]

Classification Report
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       151
           1       0.68      0.60      0.64        80

    accuracy                           0.76       231
   macro avg       0.74      0.72      0.73       231
weighted avg       0.76      0.76      0.76       231





We try compensating for the class imbalance when fitting the model (the recall for predicting diabetes (1) is only 0.60, versus 0.85 for predicting no diabetes (0)).

In [93]:
from sklearn.linear_model import LogisticRegression

# Same model as before, now with class_weight="balanced" to compensate for the
# uneven class counts. C is the regularization hyperparameter used to counter overfitting.
lr_model = LogisticRegression(
    class_weight="balanced",
    C=0.7,
    random_state=42,
).fit(x_train, y_train.ravel())
lr_predict_test = lr_model.predict(x_test)

# test-set metrics
lr_test_accuracy = metrics.accuracy_score(y_test, lr_predict_test)
lr_confusion = metrics.confusion_matrix(y_test, lr_predict_test)
lr_report = metrics.classification_report(y_test, lr_predict_test)

print("Accuracy {0:.4f}".format(lr_test_accuracy))
print()

print("Confussion Matrix")
print(lr_confusion)
print()

print("Classification Report")
print(lr_report)

Accuracy 0.7143

Confussion Matrix
[[107  44]
 [ 22  58]]

Classification Report
              precision    recall  f1-score   support

           0       0.83      0.71      0.76       151
           1       0.57      0.72      0.64        80

    accuracy                           0.71       231
   macro avg       0.70      0.72      0.70       231
weighted avg       0.74      0.71      0.72       231





Now the recall is above 0.70 in both cases (diabetes (1) vs. no diabetes (0)), so we accept this result.

## Logistic Regression CV

In [95]:
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression. Parameter notes (per the scikit-learn docs):
#   cv      - number of cross-validation folds
#   n_jobs  - number of processes (CPU cores) to use; -1 means all of them
#   Cs      - as an integer, how many candidate values of C (the regularization
#             hyperparameter used to counter overfitting) are tried, spaced on a log scale
#   refit   - False averages the per-fold best coefficients instead of refitting
#             once with the single best C
lrcv_model = LogisticRegressionCV(
    n_jobs=-1,
    class_weight="balanced",
    Cs=3,
    cv=10,
    refit=False,
    random_state=42,
)

# fit() is the last expression, so the notebook displays the fitted estimator.
lrcv_model.fit(X=x_train, y=y_train.ravel())

LogisticRegressionCV(Cs=3, class_weight='balanced', cv=10, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=-1, penalty='l2',
                     random_state=42, refit=False, scoring=None, solver='lbfgs',
                     tol=0.0001, verbose=0)

### Predict on Test Data

In [96]:
# Predict on the held-out test set with the cross-validated model, then report
# accuracy, the confusion matrix and the per-class precision/recall report.
lrcv_predict_test = lrcv_model.predict(x_test)

lrcv_test_accuracy = metrics.accuracy_score(y_test, lrcv_predict_test)
lrcv_confusion = metrics.confusion_matrix(y_test, lrcv_predict_test)
lrcv_report = metrics.classification_report(y_test, lrcv_predict_test)

print("Accuracy {0:.4f}".format(lrcv_test_accuracy))
print()

print("Confussion Matrix")
print(lrcv_confusion)
print()

print("Classification Report")
print(lrcv_report)

Accuracy 0.6970

Confussion Matrix
[[106  45]
 [ 25  55]]

Classification Report
              precision    recall  f1-score   support

           0       0.81      0.70      0.75       151
           1       0.55      0.69      0.61        80

    accuracy                           0.70       231
   macro avg       0.68      0.69      0.68       231
weighted avg       0.72      0.70      0.70       231

