## Logistic Regression - Diabetes

In [1]:
import warnings
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

import configure

warnings.filterwarnings('ignore')

### 1. Problem Statement

### 2. Data Gathering

In [2]:
# Load the dataset from the CSV path defined in the local `configure` module.
df = pd.read_csv(configure.CSV_FILE_PATH)
# Show the loaded frame as the cell output.
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [3]:
# List the column names to confirm the feature columns and the `Outcome` target are present.
df.columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

### 3. EDA

In [4]:
# Summarise dtype and non-null count per column to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Glucose                   768 non-null    int64  
 1   BloodPressure             768 non-null    int64  
 2   SkinThickness             768 non-null    int64  
 3   Insulin                   768 non-null    int64  
 4   BMI                       768 non-null    float64
 5   DiabetesPedigreeFunction  768 non-null    float64
 6   Age                       768 non-null    int64  
 7   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 48.1 KB


### 4. Feature Selection

### 5. Model Training

In [5]:
# Separate the predictors from the `Outcome` target column.
x = df.drop(columns='Outcome')
y = df['Outcome']

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=55,
    stratify=y,
)


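##### Stratify Parameter 

`stratify=y` makes `train_test_split` keep the proportion of each `Outcome` class the same in the training and test sets.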

In [6]:
# Fit a logistic-regression classifier with scikit-learn's default settings.
# `fit` returns the estimator itself, so the last line echoes it as the cell output.
log_reg_model = LogisticRegression().fit(x_train, y_train)
log_reg_model

LogisticRegression()

In [9]:
## Evaluation on Training Dataset

# Predict on the rows the model was fitted on, then compute every metric up front.
y_pred_train = log_reg_model.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
accuracy = accuracy_score(y_train, y_pred_train)
clf_report = classification_report(y_train, y_pred_train)
precision_value = precision_score(y_train, y_pred_train)
recall_value = recall_score(y_train, y_pred_train)
f1_value = f1_score(y_train, y_pred_train)

# Report the metrics, separating sections with a rule of 50 asterisks.
print("Confusion Matrix :\n", cnf_matrix)
print("*"* 50)

print("Accuracy is :",accuracy)
print("*"* 50)

print('Classification Report :\n',clf_report)
print("*"* 50)

# Precision, recall and F1 here are for the positive class (label 1),
# which is the default for these scikit-learn scorers on binary targets.
print("Precision :",precision_value)
print('Recall :',recall_value)
print('F1 Score:',f1_value)
print("*"* 50)

Confusion Matrix :
 [[360  40]
 [ 84 130]]
**************************************************
Accuracy is : 0.7980456026058632
**************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.81      0.90      0.85       400
           1       0.76      0.61      0.68       214

    accuracy                           0.80       614
   macro avg       0.79      0.75      0.77       614
weighted avg       0.79      0.80      0.79       614

**************************************************
Precision : 0.7647058823529411
Recall : 0.6074766355140186
F1 Score: 0.6770833333333333
**************************************************


In [54]:
# Evaluation on Testing Dataset

# Predict on the held-out rows, then compute every metric up front.
y_pred = log_reg_model.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)
precision_value = precision_score(y_test, y_pred)
recall_value = recall_score(y_test, y_pred)
f1_value = f1_score(y_test, y_pred)

# Report the metrics, separating sections with a rule of 50 asterisks.
print("Confusion Matrix :\n", cnf_matrix)
print("*"* 50)

print("Accuracy is :",accuracy)
print("*"* 50)

print('Classification Report :\n',clf_report)
print("*"* 50)

# Precision, recall and F1 here are for the positive class (label 1),
# which is the default for these scikit-learn scorers on binary targets.
print("Precision :",precision_value)
print('Recall :',recall_value)
print('F1 Score:',f1_value)
print("*"* 50)

Confusion Matrix :
 [[81 19]
 [26 28]]
**************************************************
Accuracy is : 0.7077922077922078
**************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.76      0.81      0.78       100
           1       0.60      0.52      0.55        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154

**************************************************
Precision : 0.5957446808510638
Recall : 0.5185185185185185
F1 Score: 0.5544554455445543
**************************************************


## Prediction of Single Row

In [7]:
# Feature values for a single example row; the target (Outcome) is what gets predicted.
Glucose = 96.0
BloodPressure = 56.0
SkinThickness = 34.0
Insulin = 115.0
BMI = 24.7
DiabetesPedigreeFunction = 0.944
Age = 50.0

# Build a one-row, two-dimensional array (one row, seven features), listing
# the values in the same order as the feature columns used for training.
test_array = np.array([[
    Glucose,
    BloodPressure,
    SkinThickness,
    Insulin,
    BMI,
    DiabetesPedigreeFunction,
    Age,
]])
test_array

array([[ 96.   ,  56.   ,  34.   , 115.   ,  24.7  ,   0.944,  50.   ]])

In [8]:
# `predict` returns one label per input row; take the only element for the single-row input.
pred_class = log_reg_model.predict(test_array)[0]
print("Predicted CLass using 0.5 Threshold is :",pred_class)

Predicted CLass using 0.5 Threshold is : 0


In [71]:
# Prediction without indexing: keeps the full array of labels returned by `predict`.
# Note that this rebinds `pred_class` to an array rather than a single label.
pred_class = log_reg_model.predict(test_array)
pred_class

array([0], dtype=int64)

In [86]:
import pickle

### save model

In [87]:
# Persist the fitted classifier so it can be reloaded without retraining.
# The target directory is created first: opening the file for writing raises
# FileNotFoundError when `artifact/` does not exist (e.g. on a fresh checkout).
# NOTE: scikit-learn advises loading pickled models with the same library
# version that was used to save them.
model_path = Path(r"artifact/regression_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(log_reg_model, f)

### save columns

In [11]:
import json

In [12]:
# Number of features the model saw during `fit`.
log_reg_model.n_features_in_

7

In [13]:
# Store the feature names in the order used for training (presumably for code
# that later rebuilds an input row for the saved model).
column_names = x.columns
project_data = {"Column Names" : column_names.tolist()}

# Create `artifact/` if needed: opening the file for writing raises
# FileNotFoundError when the directory is missing.
json_path = Path(r'artifact/project_data.json')
json_path.parent.mkdir(parents=True, exist_ok=True)
with json_path.open('w') as f:
    json.dump(project_data, f)

In [14]:
# Display the feature names that were written to the JSON file.
x.columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],
      dtype='object')