# %% [markdown]
# # 💉 DIABETES PREDICTION PIPELINE
# ## Complete ML System - From Data to Deployment
# 
# **Objective:** Predict diabetes with 90%+ accuracy using patient medical data
# **Dataset:** Pima Indians Diabetes Database
# **Features:** Pregnancies, Glucose, Blood Pressure, BMI, Age, etc.
# **Algorithms:** SVM, Logistic Regression, Random Forest, XGBoost
# **Author:** Your Name | **Date:** 2024

In [None]:
# %% [markdown]
# ## 1. Environment Setup

# %%
import sys, os
sys.path.append('..')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.config.settings import ProjectConfig
from src.utils.logger import setup_logging

# Load the project settings registered under the "diabetes" key and create the
# logger that the rest of the notebook writes to.
config = ProjectConfig.load("diabetes")
logger = setup_logging("diabetes_pipeline")

logger.info("‚úÖ Diabetes prediction pipeline initialized")
working_dir = os.getcwd()
print(f"‚úÖ Working directory: {working_dir}")

In [None]:
# %% [markdown]
# ## 2. Data Loading & Validation

# %%
from src.data.loader import DataLoader
from src.data.validator import DataValidator

# Read the diabetes table through the project loader, then ask the project
# validator for a quality report on it (kept in `report` for inspection).
loader = DataLoader(config)
df = loader.load_diabetes()

validator = DataValidator(config)
report = validator.generate_quality_report(df)

n_patients, n_features = df.shape
feature_names = df.columns.tolist()
print(f"‚úÖ Dataset loaded: {n_patients} patients, {n_features} features")
print(f"üìä Features: {feature_names}")

In [None]:
# %% [markdown]
# ## 3. Initial Data Inspection

# %%
print("üìä FIRST 5 ROWS:")
display(df.head())

print("\nüìä DATA INFO:")
print(df.info())

print("\nüìä BASIC STATISTICS:")
display(df.describe().round(2))

In [None]:
# %% [markdown]
# ## 4. Missing Values Analysis

# %%
# Per-column NaN counts and percentages, sorted so the worst columns come first.
null_counts = df.isnull().sum()
missing = pd.DataFrame({
    'Column': df.columns,
    'Missing': null_counts.values,
    'Percentage': (null_counts / len(df) * 100).values
}).sort_values('Percentage', ascending=False)

print("üîç MISSING VALUES REPORT:")
# Branch explicitly: nesting print() inside display() would hand display() the
# None returned by print() and render a spurious "None" after the message.
if missing['Missing'].sum() > 0:
    display(missing[missing['Missing'] > 0])
else:
    print("‚úÖ No missing values found!")

print(f"\nüìä Total missing: {null_counts.sum()} cells")
print(f"üìä Complete rows: {len(df.dropna())}/{len(df)}")

In [None]:
# %% [markdown]
# ## 5. Zero Value Analysis (Medical Context)

# %%
# Count literal zeros in the measurement columns. The note printed at the end
# states that such zeros are impossible and will be treated as missing.
columns_with_zeros = ['glucose', 'bp', 'skin_thickness', 'insulin', 'bmi']

n_rows = len(df)
zero_counts = {
    col: (df[col] == 0).sum()
    for col in columns_with_zeros
    if col in df.columns  # silently skip columns the loaded table does not have
}
zero_report = {
    col: {'zeros': count, 'percentage': (count/n_rows)*100}
    for col, count in zero_counts.items()
}

zero_df = pd.DataFrame(zero_report).T.round(2)
print("‚ö†Ô∏è ZERO VALUES (Medically Impossible):")
display(zero_df)

print("\nüìå Note: Glucose, BP, BMI = 0 are impossible - will be treated as missing")

In [None]:
# %% [markdown]
# ## 6. Duplicate Check

# %%
# Count fully identical rows and report their share of the table.
duplicates = df.duplicated().sum()
duplicate_pct = (duplicates/len(df))*100

print(f"üìä DUPLICATE ROWS: {duplicates}")
print(f"Percentage: {duplicate_pct:.2f}%")

duplicate_message = (
    "‚ö†Ô∏è Duplicates found - will remove during preprocessing"
    if duplicates > 0
    else "‚úÖ No duplicates found"
)
print(duplicate_message)

In [None]:
# %% [markdown]
# ## 7. Target Variable Distribution

# %%
# Absolute and relative class frequencies of the `target` column
# (both Series are indexed by class label and reused by later cells).
target_col = df['target']
target_dist = target_col.value_counts()
target_pct = target_col.value_counts(normalize=True) * 100

print("üéØ TARGET DISTRIBUTION:")
print(f"No Diabetes: {target_dist[0]} ({target_pct[0]:.1f}%)")
print(f"Diabetes:    {target_dist[1]} ({target_pct[1]:.1f}%)")

# Warn when the smaller class holds less than 30% of the rows.
minority_pct = target_pct.min()
if minority_pct < 30:
    print("‚ö†Ô∏è Imbalanced dataset - will apply SMOTE")
    print(f"   Minority class: {minority_pct:.1f}%")

In [None]:
# %% [markdown]
# ## 8. Data Types Analysis

# %%
# One row per column: dtype, number of distinct non-null values, and up to
# three example values.
sample_values = [df[col].dropna().unique()[:3] for col in df.columns]
dtype_df = pd.DataFrame({
    'Column': df.columns,
    'Type': df.dtypes.values,
    'Unique': df.nunique().values,
    'Sample': sample_values
})

print("üìä DATA TYPES:")
display(dtype_df)

# `numeric_cols` is reused by the statistics and outlier cells further down.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
print(f"\n‚úÖ Numeric columns: {len(numeric_cols)}")
print(f"üìä Columns: {numeric_cols}")

In [None]:
# %% [markdown]
# ## 9. Detailed Statistics

# %%
# describe() transposed (one row per numeric column), extended with shape
# statistics and a count of literal zeros.
numeric_df = df[numeric_cols]
stats_df = df.describe().T
stats_df['skew'] = numeric_df.skew()
stats_df['kurtosis'] = numeric_df.kurtosis()
stats_df['zeros'] = numeric_df.eq(0).sum()

print("üìà STATISTICAL SUMMARY:")
display(stats_df.round(2))

print("\nüìä Features with high skew (>1):")
high_skew = stats_df.index[stats_df['skew'].abs() > 1].tolist()
print(high_skew if high_skew else "None")

In [None]:
# %% [markdown]
# ## 10. Outlier Detection (IQR Method)

# %%
def detect_outliers_iqr(data, col):
    """Summarise IQR-rule outliers in one column of a DataFrame.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.

    Args:
        data: DataFrame that contains ``col``.
        col: Name of the column to analyse.

    Returns:
        dict with keys ``'outliers'`` (row count), ``'percentage'`` (share of
        all rows in ``data``, 0-100), ``'lower'`` and ``'upper'`` (the fences).
    """
    values = data[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    lower = q1 - 1.5 * spread
    upper = q3 + 1.5 * spread

    # NaN compares False on both sides, so missing values are never counted.
    is_outlier = values.lt(lower) | values.gt(upper)
    n_outliers = int(is_outlier.sum())

    return {
        'outliers': n_outliers,
        'percentage': (n_outliers/len(data))*100,
        'lower': lower,
        'upper': upper
    }

# Run the IQR check on every numeric feature (the label column is excluded).
outlier_report = {
    col: detect_outliers_iqr(df, col)
    for col in numeric_cols
    if col != 'target'
}

outlier_df = pd.DataFrame(outlier_report).T.round(2)
print("üîç OUTLIER REPORT:")
display(outlier_df)

# Columns where more than 5% of the rows fall outside the IQR fences.
heavy_outlier_mask = outlier_df['percentage'] > 5
cols_with_outliers = outlier_df.index[heavy_outlier_mask].tolist()
print(f"\n‚ö†Ô∏è Columns with >5% outliers: {cols_with_outliers}")

In [None]:
# %% [markdown]
# ## 11. Target Distribution Visualization

# %%
# value_counts() orders classes by frequency, not by label, so pin the order
# to [0, 1] explicitly; otherwise the hard-coded 'No Diabetes' / 'Diabetes'
# labels would be swapped whenever class 1 is the majority.
class_labels = ['No Diabetes', 'Diabetes']
class_colors = ['#4ECDC4', '#FF6B6B']
class_counts = target_dist.reindex([0, 1], fill_value=0)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: absolute counts, annotated above each bar.
bars = ax1.bar(class_labels, class_counts.values, color=class_colors)
ax1.set_title('Diabetes Target Distribution', fontweight='bold', fontsize=14)
ax1.set_ylabel('Count')
for bar, val in zip(bars, class_counts.values):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height()+5,
             str(val), ha='center', fontweight='bold')

# Right panel: the same counts as percentages.
ax2.pie(class_counts.values, labels=class_labels,
        autopct='%1.1f%%', colors=class_colors,
        explode=(0, 0.05), startangle=90)
ax2.set_title('Target Distribution (%)', fontweight='bold', fontsize=14)

fig.suptitle('Diabetes Dataset Overview', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# %% [markdown]
# ## 12. Glucose Level Analysis

# %%
# The raw table encodes unknown glucose as 0 (see the zero-value analysis
# cell), and `.dropna()` alone does not remove those placeholders. Convert
# them to NaN and drop them so the plots and statistics describe real
# readings only, consistent with the blood-pressure and insulin cells.
glucose_df = df[['glucose', 'target']].copy()
glucose_df['glucose'] = glucose_df['glucose'].replace(0, np.nan)
glucose_df = glucose_df.dropna(subset=['glucose'])
class_colors = ['#4ECDC4', '#FF6B6B']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Distribution by target
ax = axes[0, 0]
for target in [0, 1]:
    subset = glucose_df.loc[glucose_df['target'] == target, 'glucose']
    ax.hist(subset, alpha=0.7, label=f'Target {target}', bins=20,
            color=class_colors[target])
ax.set_xlabel('Glucose Level')
ax.set_ylabel('Frequency')
ax.set_title('Glucose Distribution by Diabetes Status', fontweight='bold')
ax.legend(['No Diabetes', 'Diabetes'])

# Boxplot
ax = axes[0, 1]
glucose_df.boxplot(column='glucose', by='target', ax=ax)
ax.set_title('Glucose Boxplot', fontweight='bold')
ax.set_xlabel('Diabetes (0=No, 1=Yes)')
ax.set_ylabel('Glucose')

# Violin plot
ax = axes[1, 0]
sns.violinplot(x='target', y='glucose', data=glucose_df, ax=ax,
               palette=class_colors)
ax.set_title('Glucose Violin Plot', fontweight='bold')
ax.set_xticklabels(['No Diabetes', 'Diabetes'])

# Statistics: one groupby instead of re-filtering the frame for every number.
glucose_stats = glucose_df.groupby('target')['glucose'].agg(['mean', 'median', 'std'])
ax = axes[1, 1]
ax.axis('off')
stats_text = f"""
GLUCOSE STATISTICS:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
No Diabetes:
  Mean: {glucose_stats.loc[0, 'mean']:.1f}
  Median: {glucose_stats.loc[0, 'median']:.1f}
  Std: {glucose_stats.loc[0, 'std']:.1f}

Diabetes:
  Mean: {glucose_stats.loc[1, 'mean']:.1f}
  Median: {glucose_stats.loc[1, 'median']:.1f}
  Std: {glucose_stats.loc[1, 'std']:.1f}

Normal Range: 70-140 mg/dL
"""
ax.text(0.1, 0.5, stats_text, transform=ax.transAxes, fontsize=12,
        verticalalignment='center', family='monospace')

# boxplot(by=...) sets its own figure title; this call replaces it.
plt.suptitle('Glucose Level Analysis', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# %% [markdown]
# ## 13. BMI Analysis

# %%
# BMI == 0 is a missing-value placeholder (see the zero-value analysis cell);
# `.dropna()` alone does not remove it, so convert zeros to NaN first.
bmi_df = df[['bmi', 'target']].copy()
bmi_df['bmi'] = bmi_df['bmi'].replace(0, np.nan)
bmi_df = bmi_df.dropna(subset=['bmi'])
class_colors = ['#4ECDC4', '#FF6B6B']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Distribution by target
ax = axes[0, 0]
for target in [0, 1]:
    subset = bmi_df.loc[bmi_df['target'] == target, 'bmi']
    ax.hist(subset, alpha=0.7, label=f'Target {target}', bins=20,
            color=class_colors[target])
ax.set_xlabel('BMI')
ax.set_ylabel('Frequency')
ax.set_title('BMI Distribution by Diabetes Status', fontweight='bold')
ax.legend(['No Diabetes', 'Diabetes'])

# Boxplot
ax = axes[0, 1]
bmi_df.boxplot(column='bmi', by='target', ax=ax)
ax.set_title('BMI Boxplot', fontweight='bold')
ax.set_xlabel('Diabetes (0=No, 1=Yes)')
ax.set_ylabel('BMI')

# BMI Categories
# right=False makes the bins left-closed ([18.5, 25), [25, 30), [30, inf)) so
# they match the legend printed below ("Underweight: <18.5", "Obese: >30"
# boundary values go to the higher category instead of the lower one).
ax = axes[1, 0]
bmi_cats = pd.cut(bmi_df['bmi'], bins=[0, 18.5, 25, 30, np.inf], right=False,
                  labels=['Underweight', 'Normal', 'Overweight', 'Obese'])
bmi_cross = pd.crosstab(bmi_cats, bmi_df['target'])
bmi_cross.columns = ['No Diabetes', 'Diabetes']
bmi_cross.plot(kind='bar', stacked=True, ax=ax,
               color=class_colors)
ax.set_title('BMI Categories vs Diabetes', fontweight='bold')
ax.set_xlabel('BMI Category')
ax.set_ylabel('Count')
ax.legend(['No Diabetes', 'Diabetes'])
# Rotate the labels on THIS axes; plt.xticks() acts on pyplot's "current"
# axes, which after plt.subplots(2, 2) is the last-created (bottom-right) panel.
ax.tick_params(axis='x', rotation=45)

# Statistics
bmi_stats = bmi_df.groupby('target')['bmi'].agg(['mean', 'median'])
ax = axes[1, 1]
ax.axis('off')
stats_text = f"""
BMI STATISTICS:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
No Diabetes:
  Mean: {bmi_stats.loc[0, 'mean']:.1f}
  Median: {bmi_stats.loc[0, 'median']:.1f}

Diabetes:
  Mean: {bmi_stats.loc[1, 'mean']:.1f}
  Median: {bmi_stats.loc[1, 'median']:.1f}

BMI Categories:
  Underweight: <18.5
  Normal: 18.5-25
  Overweight: 25-30
  Obese: >30
"""
ax.text(0.1, 0.5, stats_text, transform=ax.transAxes, fontsize=12,
        verticalalignment='center', family='monospace')

plt.suptitle('BMI Analysis', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# %% [markdown]
# ## 14. Age Analysis

# %%
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
class_colors = ['#4ECDC4', '#FF6B6B']

# Age distribution
ax = axes[0]
for target in [0, 1]:
    subset = df.loc[df['target'] == target, 'age']
    ax.hist(subset, alpha=0.7, label=f'Target {target}', bins=15,
            color=class_colors[target])
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.set_title('Age Distribution', fontweight='bold')
ax.legend(['No Diabetes', 'Diabetes'])

# Age boxplot
ax = axes[1]
df.boxplot(column='age', by='target', ax=ax)
ax.set_title('Age Boxplot', fontweight='bold')
ax.set_xlabel('Diabetes Status')
ax.set_ylabel('Age')

# Age categories
# right=False gives left-closed bins ([0, 30), [30, 40), ..., [60, inf)) so a
# 30-year-old lands in '30-40' rather than '<30', and 60 lands in '60+'.
ax = axes[2]
age_cats = pd.cut(df['age'], bins=[0, 30, 40, 50, 60, np.inf], right=False,
                  labels=['<30', '30-40', '40-50', '50-60', '60+'])
age_cross = pd.crosstab(age_cats, df['target'])
age_cross.columns = ['No Diabetes', 'Diabetes']
age_cross.plot(kind='bar', stacked=True, ax=ax,
               color=class_colors)
ax.set_title('Age Groups vs Diabetes', fontweight='bold')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
ax.legend(['No Diabetes', 'Diabetes'])
# Target this axes explicitly instead of relying on pyplot's current axes.
ax.tick_params(axis='x', rotation=45)

# boxplot(by=...) sets its own figure title; this call replaces it.
plt.suptitle('Age Analysis', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# %% [markdown]
# ## 15. Pregnancies Analysis

# %%
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Distribution
# Series.hist() draws on pyplot's *current* axes unless told otherwise, and
# after plt.subplots(1, 2) that is the right-hand panel. Pass ax explicitly so
# the histograms land on the left panel that is labelled below.
ax = axes[0]
df.loc[df['target'] == 0, 'pregnancies'].hist(alpha=0.7, label='No Diabetes',
                                              bins=15, color='#4ECDC4', ax=ax)
df.loc[df['target'] == 1, 'pregnancies'].hist(alpha=0.7, label='Diabetes',
                                              bins=15, color='#FF6B6B', ax=ax)
ax.set_xlabel('Number of Pregnancies')
ax.set_ylabel('Frequency')
ax.set_title('Pregnancies Distribution', fontweight='bold')
ax.legend()

# Pregnancies vs Diabetes
ax = axes[1]
preg_cross = pd.crosstab(df['pregnancies'], df['target'])
preg_cross.columns = ['No Diabetes', 'Diabetes']
preg_cross.plot(kind='bar', stacked=True, ax=ax,
                color=['#4ECDC4', '#FF6B6B'])
ax.set_title('Pregnancies vs Diabetes', fontweight='bold')
ax.set_xlabel('Number of Pregnancies')
ax.set_ylabel('Count')
ax.legend(['No Diabetes', 'Diabetes'])

fig.suptitle('Pregnancies Analysis', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
# %% [markdown]
# ## 16. Blood Pressure Analysis

# %%
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
class_colors = ['#4ECDC4', '#FF6B6B']

# Left panel: per-class histogram of blood pressure with the zero
# placeholders turned into NaN and dropped.
ax = axes[0]
bp_without_zeros = df['bp'].replace(0, np.nan)
for target, color in enumerate(class_colors):
    subset = bp_without_zeros[df['target'] == target].dropna()
    ax.hist(subset, alpha=0.7, label=f'Target {target}', bins=20,
            color=color)
ax.set_xlabel('Blood Pressure (mm Hg)')
ax.set_ylabel('Frequency')
ax.set_title('Blood Pressure Distribution', fontweight='bold')
ax.legend(['No Diabetes', 'Diabetes'])

# Right panel: boxplot per class, restricted to rows with a positive reading.
ax = axes[1]
df_bp_clean = df[df['bp'] > 0]
df_bp_clean.boxplot(column='bp', by='target', ax=ax)
ax.set_title('Blood Pressure Boxplot', fontweight='bold')
ax.set_xlabel('Diabetes Status')
ax.set_ylabel('Blood Pressure')

plt.suptitle('Blood Pressure Analysis', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

bp_zero_count = (df['bp']==0).sum()
bp_zero_pct = (bp_zero_count/len(df))*100
print(f"Patients with BP=0 (invalid): {bp_zero_count} ({bp_zero_pct:.1f}%)")

In [None]:
# %% [markdown]
# ## 17. Insulin Analysis

# %%
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
class_colors = ['#4ECDC4', '#FF6B6B']

# Left panel: per-class histogram of strictly positive insulin values.
ax = axes[0]
for target, color in enumerate(class_colors):
    class_insulin = df.loc[df['target'] == target, 'insulin']
    subset = class_insulin[class_insulin > 0]
    ax.hist(subset, alpha=0.7, label=f'Target {target}', bins=20,
            color=color)
ax.set_xlabel('Insulin (mu U/ml)')
ax.set_ylabel('Frequency')
ax.set_title('Insulin Distribution (Non-zero)', fontweight='bold')
ax.legend(['No Diabetes', 'Diabetes'])

# Right panel: share of rows whose insulin value is exactly zero vs positive.
ax = axes[1]
insulin_zero = (df['insulin'] == 0).sum()
insulin_normal = (df['insulin'] > 0).sum()
ax.pie([insulin_zero, insulin_normal],
       labels=['Zero Insulin', 'Normal Insulin'],
       autopct='%1.1f%%', colors=['#FF6B6B', '#4ECDC4'])
ax.set_title('Zero Insulin Values', fontweight='bold')

plt.suptitle('Insulin Analysis', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

insulin_zero_pct = insulin_zero/len(df)*100
print(f"‚ö†Ô∏è Patients with Insulin=0: {insulin_zero} ({insulin_zero_pct:.1f}%)")
print("   These will be treated as missing values")

In [None]:
# %% [markdown]
# ## 18. Correlation Heatmap

# %%
# Pairwise correlation of all columns, drawn as a lower-triangle heatmap
# (the upper triangle, including the diagonal, is masked out).
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Diabetes Dataset - Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
plt.show()

In [None]:
# %% [markdown]
# ## 19. Feature Correlation with Target

# %%
# Correlation of every feature with the label, strongest positive first.
# `correlations` is reused by the pairplot cell to pick the top features.
correlations = df.corr()['target'].drop('target').sort_values(ascending=False)

plt.figure(figsize=(10, 6))
colors = ['#2E86AB' if x > 0 else '#A23B72' for x in correlations.values]
correlations.plot(kind='bar', color=colors)
plt.title('Feature Correlation with Diabetes', fontweight='bold', fontsize=14)
plt.xlabel('Features')
plt.ylabel('Correlation with Target')
plt.xticks(rotation=45, ha='right')
plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# Add value labels (above positive bars, below negative ones)
for i, v in enumerate(correlations.values):
    plt.text(i, v + 0.02 if v > 0 else v - 0.05, f'{v:.3f}',
             ha='center', fontsize=9)

plt.tight_layout()
plt.show()

# Select by sign rather than by position: head(3)/tail(3) of the sorted series
# are only "positive"/"negative" when the data happens to contain both, and a
# table with no negatively correlated feature would otherwise list positive
# values under the NEGATIVE heading.
# The loop variable is deliberately not called `corr`, so it cannot overwrite
# the correlation matrix of the same name built in the heatmap cell.
top_positive = correlations[correlations > 0].head(3)
top_negative = correlations[correlations < 0].sort_values().head(3)

print("üìä TOP 3 POSITIVE CORRELATIONS:")
if top_positive.empty:
    print("   None")
for feat, value in top_positive.items():
    print(f"   {feat}: {value:+.3f}")

print("\nüìä TOP 3 NEGATIVE CORRELATIONS:")
if top_negative.empty:
    print("   None")
for feat, value in top_negative.items():
    print(f"   {feat}: {value:+.3f}")

In [None]:
# %% [markdown]
# ## 20. Pairplot of Top Features

# %%
# Scatter-matrix of the four features at the head of `correlations`
# (sorted descending in the previous cell), coloured by class.
top_features = [*correlations.index[:4], 'target']
pairplot_df = df[top_features]
sns.pairplot(
    pairplot_df,
    hue='target',
    diag_kind='kde',
    palette=['#4ECDC4', '#FF6B6B'],
)
plt.suptitle('Pairplot of Top 4 Features', y=1.02, fontsize=16, fontweight='bold')
plt.show()

In [None]:
# %% [markdown]
# ## 21. Handle Missing Values and Invalid Zeros

# %%
# Work on a copy so the raw `df` used by the EDA cells stays untouched.
df_clean = df.copy()

# Replace zeros with NaN in medical features
zero_cols = ['glucose', 'bp', 'skin_thickness', 'insulin', 'bmi']
present_zero_cols = [col for col in zero_cols if col in df_clean.columns]
for col in present_zero_cols:
    zeros = (df_clean[col] == 0).sum()
    df_clean[col] = df_clean[col].replace(0, np.nan)
    print(f"{col:15}: Replaced {zeros:3} zeros with NaN ({zeros/len(df)*100:.1f}%)")

# Impute with median
# Assign the result back instead of calling fillna(inplace=True) on the
# `df_clean[col]` selection: that chained in-place form is deprecated in
# pandas 2.x and does not modify `df_clean` under copy-on-write.
# NOTE(review): the medians are computed on the full table, before the
# train/test split done in the preprocessing cell, so test rows influence the
# imputed values. Consider fitting the imputer on the training split only.
for col in present_zero_cols:
    median_val = df_clean[col].median()
    df_clean[col] = df_clean[col].fillna(median_val)

print("\n‚úÖ Missing values handled with median imputation")
print(f"‚úÖ Final shape: {df_clean.shape}")

In [None]:
# %% [markdown]
# ## 22. Data Preprocessing

# %%
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Split features and target
def _report_class_balance(labels):
    """Print the row count and percentage share of class 0 and class 1."""
    for cls in (0, 1):
        n_cls = (labels == cls).sum()
        print(f"   Class {cls}: {n_cls} ({n_cls/len(labels)*100:.1f}%)")


# Features are every column except the label.
y = df_clean['target']
X = df_clean.drop('target', axis=1)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"üìä BEFORE SMOTE:")
print(f"   Training set: {X_train.shape}")
_report_class_balance(y_train)

# Oversample the training split only; the test split is left as drawn.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"\nüìä AFTER SMOTE:")
print(f"   Training set: {X_train_resampled.shape}")
_report_class_balance(y_train_resampled)

# Standardise using statistics fitted on the resampled training data, then
# apply the same transform to the test split.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

print(f"\n‚úÖ Preprocessing complete")
print(f"‚úÖ Training set: {X_train_scaled.shape}")
print(f"‚úÖ Testing set:  {X_test_scaled.shape}")

In [None]:
# %% [markdown]
# ## 23. Training: Logistic Regression

# %%
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the scaled, resampled training data and keep both
# hard predictions and positive-class probabilities for the test split.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train_resampled)

lr_pred = lr_model.predict(X_test_scaled)
lr_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

lr_train_score = lr_model.score(X_train_scaled, y_train_resampled)
print("‚úÖ Logistic Regression trained")
print(f"   Training score: {lr_train_score:.4f}")

In [None]:
# %% [markdown]
# ## 24. Training: Support Vector Machine

# %%
from sklearn.svm import SVC

# Fit an SVC (default kernel) with probability estimates enabled so that
# predict_proba is available for ROC analysis and the single-sample test.
svm_model = SVC(
    class_weight='balanced',
    probability=True,
    random_state=42,
).fit(X_train_scaled, y_train_resampled)

svm_pred = svm_model.predict(X_test_scaled)
svm_proba = svm_model.predict_proba(X_test_scaled)[:, 1]

svm_train_score = svm_model.score(X_train_scaled, y_train_resampled)
print("‚úÖ SVM trained")
print(f"   Training score: {svm_train_score:.4f}")

In [None]:
# %% [markdown]
# ## 25. Training: Random Forest

# %%
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest using all available cores (n_jobs=-1).
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train_resampled)

rf_pred = rf_model.predict(X_test_scaled)
rf_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

rf_train_score = rf_model.score(X_train_scaled, y_train_resampled)
print("‚úÖ Random Forest trained")
print(f"   Training score: {rf_train_score:.4f}")

In [None]:
# %% [markdown]
# ## 26. Training: XGBoost

# %%
import xgboost as xgb

# scale_pos_weight must describe the data the model is actually fitted on.
# The fit below uses the SMOTE-resampled labels, so derive the negative/positive
# ratio from `y_train_resampled`; taking it from the pre-SMOTE `y_train` would
# up-weight the positive class a second time on top of the oversampling.
n_negative = (y_train_resampled == 0).sum()
n_positive = (y_train_resampled == 1).sum()

xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42,
                               eval_metric='logloss',
                               scale_pos_weight=n_negative / n_positive)
xgb_model.fit(X_train_scaled, y_train_resampled)
xgb_pred = xgb_model.predict(X_test_scaled)
xgb_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]

print("‚úÖ XGBoost trained")
print(f"   Training score: {xgb_model.score(X_train_scaled, y_train_resampled):.4f}")

In [None]:
# %% [markdown]
# ## 27. Model Comparison - All Metrics

# %%
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Parallel lists: position i in each list refers to the same model. Later
# cells rely on this ordering (e.g. `models.index(best_model)`).
models = ['Logistic Regression', 'SVM', 'Random Forest', 'XGBoost']
predictions = [lr_pred, svm_pred, rf_pred, xgb_pred]
probabilities = [lr_proba, svm_proba, rf_proba, xgb_proba]

# One metrics record per model, all evaluated on the held-out test split.
results = [
    {
        'Model': name,
        'Accuracy': accuracy_score(y_test, pred),
        'Precision': precision_score(y_test, pred),
        'Recall': recall_score(y_test, pred),
        'F1-Score': f1_score(y_test, pred),
        'ROC-AUC': roc_auc_score(y_test, proba)
    }
    for name, pred, proba in zip(models, predictions, probabilities)
]

results_df = pd.DataFrame(results).round(4)
print("üìä MODEL COMPARISON:")
display(results_df)

# The winner is the row with the highest (rounded) F1 score.
best_row_label = results_df['F1-Score'].idxmax()
best_model = results_df.loc[best_row_label, 'Model']
best_f1 = results_df.loc[best_row_label, 'F1-Score']
print(f"\nüèÜ BEST MODEL: {best_model} with F1-Score = {best_f1:.4f}")

In [None]:
# %% [markdown]
# ## 28. Confusion Matrix (Best Model)

# %%
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Position of the winning model in the parallel `models`/`predictions` lists;
# `best_idx` is reused by the save, metadata and summary cells.
best_idx = models.index(best_model)
best_pred = predictions[best_idx]
cm = confusion_matrix(y_test, best_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Diabetes', 'Diabetes'])
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix - {best_model}', fontweight='bold', fontsize=14)
plt.grid(False)
plt.show()

# Rows are the actual class, columns the predicted class.
(tn, fp), (fn, tp) = cm
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)

print(f"True Negatives:  {tn}  (Correctly predicted no diabetes)")
print(f"False Positives: {fp}  (Incorrectly predicted diabetes)")
print(f"False Negatives: {fn}  (Missed diabetes cases)")
print(f"True Positives:  {tp}  (Correctly predicted diabetes)")
print(f"\nSensitivity (Recall): {sensitivity:.4f} - Ability to detect diabetes")
print(f"Specificity: {specificity:.4f} - Ability to rule out diabetes")

In [None]:
# %% [markdown]
# ## 29. ROC Curves Comparison

# %%
from sklearn.metrics import roc_curve

# One ROC curve per model on shared axes, plus the chance diagonal.
fig, ax = plt.subplots(figsize=(10, 8))

for name, proba in zip(models, probabilities):
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc = roc_auc_score(y_test, proba)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})', linewidth=2)

ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', alpha=0.5)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves - Diabetes Prediction', fontweight='bold', fontsize=14)
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# %% [markdown]
# ## 30. Feature Importance Analysis

# %%
# Look the tree model up by name instead of by list-index arithmetic, which
# silently breaks if the order of `models` ever changes.
tree_models = {'Random Forest': rf_model, 'XGBoost': xgb_model}

if best_model in tree_models:
    best_model_obj = tree_models[best_model]
    importances = best_model_obj.feature_importances_

    # Ascending sort so the most important feature ends up at the top of the
    # horizontal bar chart.
    feat_imp = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    }).sort_values('Importance', ascending=True)

    plt.figure(figsize=(10, 8))
    colors = plt.cm.Blues(np.linspace(0.3, 1, len(feat_imp)))
    plt.barh(feat_imp['Feature'], feat_imp['Importance'], color=colors[::-1])
    plt.xlabel('Importance')
    plt.title(f'Feature Importance - {best_model}', fontweight='bold', fontsize=14)

    for i, (_, row) in enumerate(feat_imp.iterrows()):
        plt.text(row['Importance'] + 0.005, i, f'{row["Importance"]:.3f}',
                 va='center', fontsize=9)

    plt.tight_layout()
    plt.show()

    # The frame is sorted ascending, so reverse the last five rows to list the
    # most important feature first.
    print("üìä TOP 5 MOST IMPORTANT FEATURES:")
    for _, row in feat_imp.tail(5).iloc[::-1].iterrows():
        print(f"   {row['Feature']}: {row['Importance']:.4f}")
else:
    # Reached when the best model is Logistic Regression or SVM. In both cases
    # the logistic-regression coefficients are shown (absolute values, on
    # standardised features), as the chart title states.
    coef = np.abs(lr_model.coef_[0])
    coef_df = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': coef
    }).sort_values('Coefficient', ascending=True)

    plt.figure(figsize=(10, 6))
    plt.barh(coef_df['Feature'], coef_df['Coefficient'], color='#2E86AB')
    plt.xlabel('Absolute Coefficient')
    plt.title('Feature Importance - Logistic Regression', fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# %% [markdown]
# ## 31. Save Models for Deployment

# %%
import joblib
from datetime import datetime

# Save best model
# joblib.dump does not create parent directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('../models', exist_ok=True)

# Save best model (`best_idx` indexes the same ordering as `models`)
best_model_obj = [lr_model, svm_model, rf_model, xgb_model][best_idx]
model_path = f'../models/diabetes_{best_model.lower().replace(" ", "_")}_v1.0.0.pkl'
joblib.dump(best_model_obj, model_path)

# Save scaler and features: inference needs the fitted scaler and the training
# column order to rebuild the model's input.
joblib.dump(scaler, '../models/diabetes_scaler.pkl')
joblib.dump(X.columns.tolist(), '../models/diabetes_features.pkl')

print(f"‚úÖ Best model saved: {model_path}")
print(f"‚úÖ Scaler saved: ../models/diabetes_scaler.pkl")
print(f"‚úÖ Feature names saved: ../models/diabetes_features.pkl")
print(f"üìÖ Saved at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# %% [markdown]
# ## 32. Model Metadata

# %%
import json

# Metrics of the winning model; `best_idx` is both its position in `models`
# and its row label in `results_df` (default integer index).
best_metrics = results_df.loc[best_idx]

metadata = {
    'model_name': best_model,
    'version': '1.0.0',
    'training_date': datetime.now().isoformat(),
    'dataset': 'Pima Indians Diabetes Database',
    'samples': len(df),
    'features': X.columns.tolist(),
    # float() converts numpy scalars to plain floats so json can serialise them.
    'metrics': {
        'accuracy': float(best_metrics['Accuracy']),
        'precision': float(best_metrics['Precision']),
        'recall': float(best_metrics['Recall']),
        'f1_score': float(best_metrics['F1-Score']),
        'roc_auc': float(best_metrics['ROC-AUC'])
    },
    'preprocessing': {
        'scaler': 'StandardScaler',
        'balancing': 'SMOTE',
        'missing_handling': 'Median imputation'
    }
}

# Create the folder if needed and pin the encoding so the file is written the
# same way on every platform.
os.makedirs('../models', exist_ok=True)
with open('../models/diabetes_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print("‚úÖ Model metadata saved")

In [None]:
# %% [markdown]
# ## 33. Test Single Prediction

# %%
# Take first test sample
# Smoke test: score the first row of the scaled test split with the saved
# best model and compare against its true label.
sample = X_test_scaled[:1]          # 2-D slice, shape (1, n_features)
actual = y_test.iloc[0]
pred = best_model_obj.predict(sample)[0]
proba = best_model_obj.predict_proba(sample)[0]

actual_label = 'DIABETES' if actual == 1 else 'NO DIABETES'
pred_label = 'DIABETES' if pred == 1 else 'NO DIABETES'
risk_level = 'HIGH' if pred == 1 else 'LOW'
separator = "="*50

print("üîç SINGLE PREDICTION TEST:")
print(separator)
print(f"Actual:      {actual_label}")
print(f"Predicted:   {pred_label}")
print(f"Probability: No Diabetes: {proba[0]:.3f}, Diabetes: {proba[1]:.3f}")
print(f"Confidence:  {max(proba)*100:.1f}%")
print(f"Risk Level:  {risk_level}")
print(separator)

print("‚úÖ Prediction CORRECT" if pred == actual else "‚ùå Prediction INCORRECT")

In [None]:
# %% [markdown]
# ## 34. Export Results to CSV

# %%
# to_csv does not create missing parent directories; create the reports folder
# first so the export works on a fresh checkout.
os.makedirs('../reports', exist_ok=True)
results_df.to_csv('../reports/diabetes_model_comparison.csv', index=False)
print("‚úÖ Results exported to reports/diabetes_model_comparison.csv")

# Display styled table (last expression of the cell, rendered by the notebook)
styled = results_df.style.background_gradient(cmap='Blues', subset=['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'])
styled

In [None]:
# %% [markdown]
# ## 35. Pipeline Complete - Final Summary

# %%
# Text banner summarising the run; values come from earlier cells
# (`target_dist`, `target_pct`, `best_model`, `best_idx`, `results_df`).
summary = f"""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë              DIABETES PREDICTION - PIPELINE COMPLETE             ‚ïë
‚ï†‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï£
‚ïë  Dataset:        Pima Indians Diabetes                           ‚ïë
‚ïë  Patients:       {len(df):,}                                      ‚ïë
‚ïë  Features:       {len(X.columns)}                                 ‚ïë
‚ïë  Diabetes cases: {target_dist[1]} ({target_pct[1]:.1f}%)         ‚ïë
‚ïë  Best Model:     {best_model}                                     ‚ïë
‚ïë  F1-Score:       {results_df.loc[best_idx, 'F1-Score']:.4f}      ‚ïë
‚ïë  Accuracy:       {results_df.loc[best_idx, 'Accuracy']:.4f}      ‚ïë
‚ïë  Precision:      {results_df.loc[best_idx, 'Precision']:.4f}     ‚ïë
‚ïë  Recall:         {results_df.loc[best_idx, 'Recall']:.4f}        ‚ïë
‚ïë  ROC-AUC:        {results_df.loc[best_idx, 'ROC-AUC']:.4f}       ‚ïë
‚ïë  Model saved:    models/diabetes_{best_model.lower().replace(' ', '_')}_v1.0.0.pkl  ‚ïë
‚ïë  Reports:        reports/diabetes_model_comparison.csv           ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
"""

print(summary)

# The banner contains non-ASCII characters, so write it with an explicit UTF-8
# encoding: the platform default (e.g. cp1252 on Windows) can raise
# UnicodeEncodeError. Also create the reports folder if it is missing.
os.makedirs('../reports', exist_ok=True)
with open('../reports/diabetes_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

In [None]:
# %% [markdown]
# ## 36. Ready for Deployment

# %%
# Static hand-off notes for running and calling the prediction API. Nothing
# here is executed; the text is only printed.
deployment_instructions = """
üöÄ DEPLOYMENT INSTRUCTIONS:

1. Start Diabetes Prediction API:
   $ uvicorn api.main:app --reload

2. Test API endpoint:
   curl -X POST http://localhost:8000/predict/diabetes \\
        -H "Content-Type: application/json" \\
        -d '{
            "pregnancies": 2,
            "glucose": 120,
            "bp": 70,
            "skin_thickness": 20,
            "insulin": 80,
            "bmi": 25.1,
            "dpf": 0.5,
            "age": 33
        }'

3. Expected response:
   {
     "disease": "Diabetes",
     "prediction": 0,
     "probability": 0.15,
     "risk_level": "Low",
     "confidence": 0.85
   }

üéâ Diabetes prediction model is ready for production!
"""
print(deployment_instructions)