In [3]:
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
import seaborn as sns

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression

from sklearn import set_config
set_config(transform_output="pandas")

# Set random seed 
RSEED = 42

warnings.filterwarnings("ignore")

In [4]:
# load the dataset
df = pd.read_csv("./df_diabetes.csv", encoding="utf-8")
df.head()


Unnamed: 0,pregnancies,Age,bmi,pedigree_f,outcome,skinthickness,insulin,glucose,bloodpressure
0,6,50,33.6,1,1,35,0,148,72
1,1,31,26.6,0,0,29,0,85,66
2,8,32,23.3,1,1,0,0,183,64
3,1,21,28.1,0,0,23,94,89,66
4,0,33,43.1,2,1,35,168,137,40


# Modelling:

In [5]:
# Defining X and y

X = df.drop('outcome', axis=1)
y = df['outcome']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

# Check the shape of the data sets
print("X_train:", X_train.shape)  
print("y_train:", y_train.shape)   
print("X_test:", X_test.shape)  
print("y_test:", y_test.shape)     

X_train: (537, 8)
y_train: (537,)
X_test: (231, 8)
y_test: (231,)


## Preprocessing Pipeline
No categorical column found. No need categorical pipeline. Also, no null values found and there is no need for imputation

In [12]:


num_pipeline = Pipeline([
    ('num_scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('num_processor', num_pipeline, ['Age','bmi','skinthickness','insulin','glucose','bloodpressure'])
])

preprocessor

In [13]:
preprocessor.fit_transform(X_train)

Unnamed: 0,num_processor__Age,num_processor__bmi,num_processor__skinthickness,num_processor__insulin,num_processor__glucose,num_processor__bloodpressure
209,0.632129,0.431483,0.812564,-0.722392,2.017945,0.780670
176,0.716992,-0.119832,-1.316902,-0.722392,-1.148619,0.465388
147,0.038086,-0.209581,0.941623,0.373154,-0.476923,-0.270269
454,-0.810548,0.726372,0.489918,0.244266,-0.668836,-0.795739
636,1.226172,-0.427543,-1.316902,-0.722392,-0.540894,0.255200
...,...,...,...,...,...,...
214,0.207812,0.264806,0.748035,0.888705,-0.285010,0.675576
113,-0.725684,0.239163,-1.316902,-0.722392,-1.436488,-0.375363
556,-0.301368,0.764836,1.264269,-0.722392,-0.764793,0.045012
759,2.753712,0.431483,-1.316902,-0.722392,2.209858,1.201045


## Training the model using logistic regression pipeline

In [14]:

l_regression_pipe = Pipeline([
    ('feature_engineering', preprocessor),
    ('logistic_regression', LogisticRegression(class_weight='balanced', max_iter=1000))

])

l_regression_pipe

In [16]:
y_train_predicted = cross_val_predict(l_regression_pipe, X_train, y_train, cv=10, n_jobs=-1)

In [17]:
# Calculating the accuracy for the LogisticRegression Classifier 
print('Cross validation scores:')
print('-------------------------')
print("Accuracy: {:.2f}".format(accuracy_score(y_train, y_train_predicted)))
print("Recall: {:.2f}".format(recall_score(y_train, y_train_predicted)))
print("Precision: {:.2f}".format(precision_score(y_train, y_train_predicted)))

Cross validation scores:
-------------------------
Accuracy: 0.73
Recall: 0.69
Precision: 0.60


## Improving the model using Gridsearch:

In [21]:
l_regression_pipe.get_params()

{'memory': None,
 'steps': [('feature_engineering',
   ColumnTransformer(transformers=[('num_processor',
                                    Pipeline(steps=[('num_scaler',
                                                     StandardScaler())]),
                                    ['Age', 'bmi', 'skinthickness', 'insulin',
                                     'glucose', 'bloodpressure'])])),
  ('logistic_regression',
   LogisticRegression(class_weight='balanced', max_iter=1000))],
 'verbose': False,
 'feature_engineering': ColumnTransformer(transformers=[('num_processor',
                                  Pipeline(steps=[('num_scaler',
                                                   StandardScaler())]),
                                  ['Age', 'bmi', 'skinthickness', 'insulin',
                                   'glucose', 'bloodpressure'])]),
 'logistic_regression': LogisticRegression(class_weight='balanced', max_iter=1000),
 'feature_engineering__n_jobs': None,
 'feature_engineer

In [34]:
param_logreg = {'logistic_regression__penalty':('l1','l2'),
                'logistic_regression__C': [0.001, 0.01, 0.1, 1, 10],
                'logistic_regression__solver': ['liblinear', 'newton-cg', 'lbfgs','sag','saga'],
                'logistic_regression__max_iter': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
                'logistic_regression__class_weight': [None, 'balanced'],
                'logistic_regression__random_state': [42]

               }

grid_logreg = GridSearchCV(
    l_regression_pipe,
    param_grid=param_logreg,
    scoring="accuracy",
    cv=5
)
grid_logreg

In [35]:
# we fit the grid_logreg on train data
grid_logreg.fit(X_train,y_train)

In [36]:
# Show best parameters
print('Best score:\n{:.2f}'.format(grid_logreg.best_score_))
print("Best parameters:\n{}".format(grid_logreg.best_params_))

Best score:
0.78
Best parameters:
{'logistic_regression__C': 1, 'logistic_regression__class_weight': None, 'logistic_regression__max_iter': 100, 'logistic_regression__penalty': 'l2', 'logistic_regression__random_state': 42, 'logistic_regression__solver': 'newton-cg'}


In [37]:
best_model = grid_logreg.best_estimator_
best_model

## Final Evaluation

In [39]:
y_test_predicted = best_model.predict(X_test)

print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_test_predicted)))
print("Recall: {:.2f}".format(recall_score(y_test, y_test_predicted)))
print("Precision: {:.2f}".format(precision_score(y_test, y_test_predicted)))

Accuracy: 0.74
Recall: 0.53
Precision: 0.67
