<a href="https://colab.research.google.com/github/parhamalikhan/Diabetes-Prediction/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diabetes Prediction | Pima Indians Dataset
This notebook demonstrates a full data science pipeline on the Pima Indians Diabetes dataset.  
Key goals:
- Train a logistic regression model to predict diabetes
- Interpret model coefficients using **Odds Ratio**
- Evaluate performance via **ROC Curve**, **AUC**, and **Confusion Matrix**
- Regularize the model using **Ridge** and **Elastic Net**
- Bonus: Check model fit using **Deviance Residuals** and explore **Maximum Likelihood Estimation**

Libraries used: `pandas`, `numpy`, `seaborn`, `scikit-learn`, `statsmodels`, `matplotlib`

Ideal for learning statistical modeling & interpretability in healthcare data.


In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import statsmodels.api as sm

# Location of the diabetes CSV. The default is the original Google Drive path
# (assumes Drive is already mounted at /content/drive -- no mount call is
# visible in this notebook). Set the DIABETES_CSV environment variable to
# point at a different copy when running outside Colab.
file_path = os.environ.get('DIABETES_CSV', '/content/drive/MyDrive/diabetes.csv')

# Read the raw table into a DataFrame
df = pd.read_csv(file_path)

# Preview the first five rows as the cell's output
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Dimensions of the dataset as (rows, columns)
print("Shape:", df.shape)

# Column dtypes and non-null counts (info() prints its report directly)
df.info()

# Count NaN entries per column
print("Missing values:\n", df.isnull().sum())

# isnull() only detects NaN. The previewed rows contain literal zeros in
# measurement columns (e.g. SkinThickness and Insulin), so also count zeros.
# NOTE(review): presumably a zero in these columns is a placeholder for a
# missing reading rather than a real measurement -- confirm against the
# dataset documentation before deciding on imputation.
zero_check_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print("Zero counts (possible hidden missing values):\n", (df[zero_check_columns] == 0).sum())

# Count fully duplicated rows
print("Number of duplicate rows:", df.duplicated().sum())

# Remove duplicated rows; re-running this cell is harmless because the
# result of drop_duplicates() contains no duplicates to drop
df = df.drop_duplicates()

# Class balance of the target variable 'Outcome'
print("Target variable distribution:\n", df['Outcome'].value_counts())



Shape: (768, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
Missing values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction   

In [3]:
# Response vector: the 'Outcome' column is what the models will predict
y = df['Outcome']

# Design matrix: every remaining column is used as a predictor
X = df.drop(columns=['Outcome'])


In [4]:
 from sklearn.model_selection import train_test_split

#Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# 1. Initialize the StandardScaler
# It standardizes features by removing the mean and scaling to unit variance
scaler = StandardScaler()

# 2. Fit the scaler on the training data and transform it
# Only the training split is used for fitting so that no information from
# the test set leaks into the preprocessing
X_train_scaled = scaler.fit_transform(X_train)

# 3. Transform the test data with the already-fitted scaler
# Both splits are therefore scaled with the training mean and std
X_test_scaled = scaler.transform(X_test)

# 4. Wrap the scaled training array in a DataFrame with the feature names of X
# NOTE(review): no index is passed, so the rows get a fresh 0..n-1 index
# instead of X_train's original row labels. Pair this frame with y_train
# positionally (or reset y_train's index) before any label-aligned operation.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)

# 5. Show the first 5 rows of the scaled training data
X_train_scaled_df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-0.526397,-1.151398,-3.752683,-1.322774,-0.701206,-4.135256,-0.490735,-1.03594
1,1.588046,-0.276643,0.680345,0.233505,-0.701206,-0.489169,2.41503,1.487101
2,-0.82846,0.566871,-1.265862,-0.09072,0.013448,-0.424522,0.549161,-0.948939
3,-1.130523,1.254179,-1.049617,-1.322774,-0.701206,-1.30372,-0.639291,2.792122
4,0.681856,0.410665,0.572222,1.07649,2.484601,1.838121,-0.686829,1.139095


In [6]:
# Chain standardization and logistic regression into one estimator so that
# cross-validation refits the scaler together with the model on every fold
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# 5-fold cross-validated accuracy
# NOTE(review): this runs on the full X, y, which includes the rows held out
# as X_test / y_test -- switch to X_train, y_train if the test split is meant
# to stay untouched during model comparison.
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# Per-fold accuracy followed by its mean and spread
print("Accuracy for each fold:", scores)
print("Mean Accuracy:", np.mean(scores))
print("Standard Deviation:", np.std(scores))


Accuracy for each fold: [0.77272727 0.74675325 0.75324675 0.81699346 0.76470588]
Mean Accuracy: 0.7708853238265002
Standard Deviation: 0.024742737050396143


In [7]:
# 1. Chain standardization and a linear-kernel SVM into one estimator
# NOTE: this rebinds `pipeline`, replacing the logistic-regression pipeline
# built in the previous cell.
# NOTE(review): per the scikit-learn docs, SVC's random_state only matters
# when probability=True, so it presumably has no effect here -- confirm.
pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))

# 2. 5-fold cross-validated accuracy, computed on the full X, y
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# 3. Per-fold accuracy followed by its mean and spread
print(f"Accuracy for each fold: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean()}")
print(f"Standard Deviation: {cv_scores.std()}")


Accuracy for each fold: [0.76623377 0.75974026 0.75974026 0.81699346 0.76470588]
Mean Accuracy: 0.7734827264239028
Standard Deviation: 0.02191107247832647
