# Predict whether a patient is suffreing from diabetes or not

In [195]:
active_imports()

import pandas as pd
import numpy as np


['import pandas as pd', 'import numpy as np']

In [196]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Upload data

In [197]:
ddata = pd.read_csv('C://Users/ntawi/Downloads/pima-indians-diabetes-database/diabetes.csv')
ddata.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [198]:
ddata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [199]:
ddata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [200]:
ddata.shape

(768, 9)

In [201]:
ddata['Pregnancies'].unique()

array([ 6,  1,  8,  0,  5,  3, 10,  2,  4,  7,  9, 11, 13, 15, 17, 12, 14],
      dtype=int64)

In [202]:
ddata['Outcome'].unique()

array([1, 0], dtype=int64)

## Data wrangling

In [203]:
mean_Ins = ddata['Insulin'].mean(skipna=True)
ddata['Insulin'] = ddata['Insulin'].replace(to_replace=0, value=mean_Ins)

In [204]:
mean_Glu = ddata['Glucose'].mean(skipna=True)
ddata['Glucose'] = ddata['Glucose'].replace(to_replace=0, value=mean_Glu)

In [205]:
mean_BP = ddata['BloodPressure'].mean(skipna=True)
ddata['BloodPressure'] = ddata['BloodPressure'].replace(to_replace=0, value=mean_BP)

In [206]:
mean_ST = ddata['SkinThickness'].mean(skipna=True)
ddata['SkinThickness'] = ddata['SkinThickness'].replace(to_replace=0, value=mean_ST)

In [207]:
mean_BMI = ddata['BMI'].mean(skipna=True)
ddata['BMI'] = ddata['BMI'].replace(to_replace=0, value=mean_BMI)

In [208]:
ddata.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


### Check correlation

In [209]:
corr_matrix = ddata.corr()
corr_matrix['Outcome'].sort_values(ascending=False)

Outcome                     1.000000
Glucose                     0.492908
BMI                         0.312254
Age                         0.238356
Pregnancies                 0.221898
Insulin                     0.179185
SkinThickness               0.175026
DiabetesPedigreeFunction    0.173844
BloodPressure               0.162986
Name: Outcome, dtype: float64

## Prepare data for training

In [210]:
ddata1 = ddata.copy()

# separating the feature vecotrs and the target values
X = ddata1.drop('Outcome', axis =1)
y = ddata1['Outcome']

X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [211]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(len(X_train), "train +", len(y_test), "test")

537 train + 231 test


## Prepare the data for Machine Learning Algorithms
    . Data cleaning
    . Handling categorical and numeric features
    . Transformers
    . Feature scaling
    . Transformation pipelines

In [212]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# create the preprocessing pipelines for numeric data
numeric_features = ['Insulin', 'Glucose', 'BloodPressure', 'Age', 'DiabetesPedigreeFunction',
                    'SkinThickness', 'BMI', 'Pregnancies']
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# create the preprocessing pipelines for categorical data
#categorical_features = ['New_age']
#categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    #('onehot', OneHotEncoder(handle_unknown='ignore'))])

# preprocessor process
preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features)])
        #('cat', categorical_transformer, categorical_features)])

diab_prepared = preprocessor.fit_transform(ddata1)

# Append regressor to preprocessing pipeline
# Now we have a full prediction pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='liblinear'))])

In [213]:
diab_prepared.shape

(768, 8)

In [214]:
clf.fit(X_train, y_train)
print('Accuracy of LogisticRegression on training set: {:.2f}'.format(reg.score(X_train, y_train)))

Accuracy of LogisticRegression on training set: 0.79


In [215]:
from sklearn.metrics import classification_report, confusion_matrix

lreg = LogisticRegression(solver='liblinear')
#gb_clf2 = GradientBoostingClassifier()
lreg.fit(diab_prepared, y)
diab_pred = lreg.predict(diab_prepared)


print("Confusion Matrix:")
print(confusion_matrix(y, diab_pred))

print("Classification Report")
print(classification_report(y, diab_pred))

Confusion Matrix:
[[446  54]
 [114 154]]
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       500
           1       0.74      0.57      0.65       268

    accuracy                           0.78       768
   macro avg       0.77      0.73      0.74       768
weighted avg       0.78      0.78      0.77       768



In [216]:
y_pred = clf.predict(X_test)

## Select and train your model

In [217]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC, SVR
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [218]:
classifiers = [
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    SVR(gamma='scale', C=1.0, epsilon=0.2),
    DecisionTreeClassifier(),
    KNeighborsClassifier(3),
    xgb.XGBClassifier(objective= 'reg:squarederror', alpha=10, n_estimators=100),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(solver='liblinear')
    ]

In [219]:
for classifier in classifiers:
    clf_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    clf_pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % clf_pipe.score(X_test, y_test))

SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.654
NuSVC(cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.5, probability=True, random_state=None,
      shrinking=True, tol=0.001, verbose=False)
model score: 0.745
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
model score: 0.183
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       

In [220]:
clf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__imputer', 'preprocessor__num__scaler', 'preprocessor__num__imputer__add_indicator', 'preprocessor__num__imputer__copy', 'preprocessor__num__imputer__fill_value', 'preprocessor__num__imputer__missing_values', 'preprocessor__num__imputer__strategy', 'preprocessor__num__imputer__verbose', 'preprocessor__num__scaler__copy', 'preprocessor__num__scaler__with_mean', 'preprocessor__num__scaler__with_std', 'classifier__C', 'classifier__class_weight', 'classifier__dual', 'classifier__fit_intercept', 'classifier__intercept_scaling', 'classifier__l1_ratio', 'classifier__max_iter', 'classifier__multi_class', 'classifier__n_jobs',

In [221]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

params = {
        'C':[1, 10],
        #'cache_size': [200],
        'class_weight': [None, 'balanced']
                         
}

log_reg = LogisticRegression(solver='liblinear')
#Nu_reg = NuSVC(degree=3, gamma='auto-deprecated', probability=True)
grid_search = GridSearchCV(log_reg, param_grid=params, cv=5, scoring='neg_log_loss', n_jobs=None)
#grid_serach = GridSearchCV(Nu_reg, param_grid=params, cv=5)
grid_search.fit(diab_prepared, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'class_weight': [None, 'balanced']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_log_loss', verbose=0)

In [222]:
grid_search.best_params_

{'C': 1, 'class_weight': None}

In [223]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [224]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.6892098293022249 {'C': 1, 'class_weight': None}
0.7115537089847835 {'C': 1, 'class_weight': 'balanced'}
0.6894083720985684 {'C': 10, 'class_weight': None}
0.7116862962077457 {'C': 10, 'class_weight': 'balanced'}


In [225]:
from sklearn.metrics import mean_squared_error

final_model = grid_search.best_estimator_

X_test = ddata1.drop('Outcome', axis=1)
y_test = ddata1['Outcome'].copy()

X_test_prepared = preprocessor.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
print(final_mse)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

0.22135416666666666
0.4704829079431756


In [226]:
active_imports()

import pandas as pd
import numpy as np


['import pandas as pd', 'import numpy as np']