# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                           classification_report, f1_score, 
                           precision_score, recall_score)
from imblearn.over_sampling import RandomOverSampler, SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('diabetes.csv')

# Explore the data: first rows, summary statistics and per-column null counts
print(df.head())
print(df.describe())
print(df.isnull().sum())

# In these columns a literal 0 is treated as a missing measurement and is
# converted to NaN (Pregnancies and Outcome are left untouched).
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

# Fill missing values with the column median.
# The filled column is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` is chained assignment on a column
# selection, which pandas deprecates and which leaves `df` unchanged when
# Copy-on-Write is enabled.
# NOTE(review): the medians are computed on the full dataset before the
# train/test split, so test rows influence the imputed values; consider
# imputing with training-set medians only.
for col in cols_with_zeros:
    df[col] = df[col].fillna(df[col].median())

# Define features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Initial train-test split (70-30)
# NOTE(review): the split is not stratified on y; consider `stratify=y` if the
# class ratio should be preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features: the scaler is fitted on the training split only and the
# same transformation is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Building Functions

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and report its test-set metrics.

    The estimator is fitted in place on ``X_train``/``y_train`` (any earlier
    fit is replaced), predictions are made for ``X_test``, the four metrics
    are printed, and a confusion-matrix heatmap is displayed.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True test labels the predictions are compared against.

    Returns:
        dict: The keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` mapped to the corresponding test-set scores.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute every metric exactly once and reuse the values for both the
    # printed report and the returned dictionary.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
    }

    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")

    # confusion_matrix puts true labels on the rows and predicted labels on
    # the columns; label the heatmap axes accordingly.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title('Confusion Matrix')
    plt.show()

    return metrics

def grid_search_model(model, params, X_train, y_train, *, cv=5, scoring='f1', n_jobs=-1):
    """Run a cross-validated grid search and return the best estimator.

    Args:
        model: Unfitted estimator to tune.
        params: Parameter grid forwarded to ``GridSearchCV``.
        X_train: Training features used for the search.
        y_train: Training labels used for the search.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (defaults to 5, the value previously hard-coded).
        scoring: Single scoring metric used to rank candidates
            (defaults to ``'f1'``, the value previously hard-coded).
        n_jobs: Parallelism setting forwarded to ``GridSearchCV``
            (defaults to -1, the value previously hard-coded).

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    grid = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid.fit(X_train, y_train)

    # Keep the historical "F1" wording for the default metric; otherwise show
    # whichever scoring was requested so the label never misreports it.
    score_label = 'F1' if scoring == 'f1' else str(scoring)
    print(f"Best parameters: {grid.best_params_}")
    print(f"Best {score_label} score: {grid.best_score_:.4f}")
    return grid.best_estimator_

## Baseline models (default hyperparameters, no GridSearchCV)

# Logistic Regression -- trained and scored on the standardized features
lr = LogisticRegression(random_state=42)
print("\nLogistic Regression:")
lr_metrics = evaluate_model(
    lr, X_train_scaled, X_test_scaled, y_train, y_test
)

# Decision Tree -- trained and scored on the unscaled features
dt = DecisionTreeClassifier(random_state=42)
print("\nDecision Tree:")
dt_metrics = evaluate_model(
    dt, X_train, X_test, y_train, y_test
)

# Random Forest -- trained and scored on the unscaled features
rf = RandomForestClassifier(random_state=42)
print("\nRandom Forest:")
rf_metrics = evaluate_model(
    rf, X_train, X_test, y_train, y_test
)

## Hyperparameter tuning with GridSearchCV

# Logistic Regression: regularization strength and penalty type, searched on
# the standardized features.
lr_params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)
print("\nLogistic Regression with GridSearchCV:")
best_lr = grid_search_model(
    LogisticRegression(random_state=42), lr_params, X_train_scaled, y_train
)
lr_metrics_gs = evaluate_model(
    best_lr, X_train_scaled, X_test_scaled, y_train, y_test
)

# Decision Tree: depth and split/leaf size limits, searched on the unscaled
# features.
dt_params = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
print("\nDecision Tree with GridSearchCV:")
best_dt = grid_search_model(
    DecisionTreeClassifier(random_state=42), dt_params, X_train, y_train
)
dt_metrics_gs = evaluate_model(
    best_dt, X_train, X_test, y_train, y_test
)

# Random Forest: ensemble size plus the same kind of tree limits, searched on
# the unscaled features.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
print("\nRandom Forest with GridSearchCV:")
best_rf = grid_search_model(
    RandomForestClassifier(random_state=42), rf_params, X_train, y_train
)
rf_metrics_gs = evaluate_model(
    best_rf, X_train, X_test, y_train, y_test
)

## Sampling Techniques

# Both samplers are applied to the scaled training split only; the test split
# is never resampled.

# Random Over Sampling
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train_scaled, y_train)

# SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

## Models with Sampling + GridSearchCV
#
# evaluate_model() refits the estimator on the training data it is given, so
# it must receive the *resampled* training set; passing the original split
# would discard the effect of oversampling on the final model. The resampled
# sets are derived from the scaled features, so every model in this section
# (including the tree-based ones) is scored on X_test_scaled.
#
# NOTE(review): the grid search cross-validates on data that was oversampled
# beforehand, so duplicated/synthetic samples can sit in both the training and
# validation folds and the printed CV score is likely optimistic; consider
# resampling inside each fold (e.g. with an imblearn Pipeline).

# Logistic Regression with ROS
print("\nLogistic Regression with Random Over Sampling:")
best_lr_ros = grid_search_model(LogisticRegression(random_state=42), lr_params, X_train_ros, y_train_ros)
lr_metrics_ros = evaluate_model(best_lr_ros, X_train_ros, X_test_scaled, y_train_ros, y_test)

# Logistic Regression with SMOTE
print("\nLogistic Regression with SMOTE:")
best_lr_smote = grid_search_model(LogisticRegression(random_state=42), lr_params, X_train_smote, y_train_smote)
lr_metrics_smote = evaluate_model(best_lr_smote, X_train_smote, X_test_scaled, y_train_smote, y_test)

# Decision Tree with ROS
print("\nDecision Tree with Random Over Sampling:")
best_dt_ros = grid_search_model(DecisionTreeClassifier(random_state=42), dt_params, X_train_ros, y_train_ros)
dt_metrics_ros = evaluate_model(best_dt_ros, X_train_ros, X_test_scaled, y_train_ros, y_test)

# Decision Tree with SMOTE
print("\nDecision Tree with SMOTE:")
best_dt_smote = grid_search_model(DecisionTreeClassifier(random_state=42), dt_params, X_train_smote, y_train_smote)
dt_metrics_smote = evaluate_model(best_dt_smote, X_train_smote, X_test_scaled, y_train_smote, y_test)

# Random Forest with ROS
print("\nRandom Forest with Random Over Sampling:")
best_rf_ros = grid_search_model(RandomForestClassifier(random_state=42), rf_params, X_train_ros, y_train_ros)
rf_metrics_ros = evaluate_model(best_rf_ros, X_train_ros, X_test_scaled, y_train_ros, y_test)

# Random Forest with SMOTE
print("\nRandom Forest with SMOTE:")
best_rf_smote = grid_search_model(RandomForestClassifier(random_state=42), rf_params, X_train_smote, y_train_smote)
rf_metrics_smote = evaluate_model(best_rf_smote, X_train_smote, X_test_scaled, y_train_smote, y_test)

## Results Comparison

# Display label paired with the metrics dict of each run, in table row order.
labelled_runs = [
    ('Logistic Regression', lr_metrics),
    ('Decision Tree', dt_metrics),
    ('Random Forest', rf_metrics),
    ('LR + GridSearchCV', lr_metrics_gs),
    ('DT + GridSearchCV', dt_metrics_gs),
    ('RF + GridSearchCV', rf_metrics_gs),
    ('LR + ROS', lr_metrics_ros),
    ('LR + SMOTE', lr_metrics_smote),
    ('DT + ROS', dt_metrics_ros),
    ('DT + SMOTE', dt_metrics_smote),
    ('RF + ROS', rf_metrics_ros),
    ('RF + SMOTE', rf_metrics_smote),
]

# Column header -> key inside each metrics dict, in column order.
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1': 'f1',
}

# Build the comparison dataframe one column at a time.
results = pd.DataFrame({
    'Model': [label for label, _ in labelled_runs],
    **{
        header: [scores[key] for _, scores in labelled_runs]
        for header, key in metric_columns.items()
    },
})

# Show the table with the best F1 score first.
ranked_results = results.sort_values('F1', ascending=False)
print("\nModel Comparison (sorted by F1 score):")
print(ranked_results.to_string(index=False))
