In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, 
                           precision_recall_fscore_support, roc_auc_score, roc_curve)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
import warnings
warnings.filterwarnings('ignore')

# Load the raw dataset; the relative path is resolved against the current
# working directory of the kernel.
df = pd.read_csv(filepath_or_buffer="Data/Multiclass Diabetes Dataset.csv")


In [6]:
def create_features(df):
    """Return a copy of ``df`` extended with engineered model features.

    The input frame is left untouched. Columns read from ``df``: ``Urea``,
    ``Cr``, ``VLDL``, ``TG``, ``HbA1c``, ``BMI``, ``AGE``, ``Chol``, ``HDL``
    and ``LDL``.

    Columns appended, in this order:
      * ``Urea_log``, ``Cr_log``, ``VLDL_log``, ``TG_log`` -- ``log1p`` of
        the corresponding raw column.
      * ``HbA1c_Risk`` (>= 6.5), ``BMI_Risk`` (>= 30), ``Age_Risk`` (>= 45),
        ``TG_Risk`` (> 150), ``Chol_Risk`` (> 200) -- 0/1 threshold flags.
      * ``Total_Risk_Score`` -- sum of the five flags.
      * ``Chol_HDL_Ratio``, ``LDL_HDL_Ratio``, ``TG_HDL_Ratio`` -- the lipid
        value divided by ``HDL + 1e-5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the new ones.
    """
    enriched = df.copy()

    # Log transformations for skewed features
    for raw_col in ('Urea', 'Cr', 'VLDL', 'TG'):
        enriched[f'{raw_col}_log'] = np.log1p(df[raw_col])

    # Risk indicators based on medical thresholds.
    # These three fire when the value reaches the cutoff (inclusive) ...
    at_or_above = {
        'HbA1c_Risk': ('HbA1c', 6.5),
        'BMI_Risk': ('BMI', 30),
        'Age_Risk': ('AGE', 45),
    }
    for flag, (raw_col, cutoff) in at_or_above.items():
        enriched[flag] = df[raw_col].ge(cutoff).astype(int)

    # ... while these two fire only strictly above the cutoff.
    strictly_above = {
        'TG_Risk': ('TG', 150),
        'Chol_Risk': ('Chol', 200),
    }
    for flag, (raw_col, cutoff) in strictly_above.items():
        enriched[flag] = df[raw_col].gt(cutoff).astype(int)

    # Composite risk score: running sum over the five flags, in the order
    # they were created.
    flag_names = list(at_or_above) + list(strictly_above)
    risk_total = enriched[flag_names[0]]
    for flag in flag_names[1:]:
        risk_total = risk_total + enriched[flag]
    enriched['Total_Risk_Score'] = risk_total

    # Ratios; the small offset keeps the denominator non-zero when HDL is 0.
    hdl_denominator = df['HDL'] + 1e-5
    for lipid in ('Chol', 'LDL', 'TG'):
        enriched[f'{lipid}_HDL_Ratio'] = df[lipid] / hdl_denominator

    return enriched

In [8]:
df_enhanced = create_features(df)

# Select features for modeling.
# Raw HDL and LDL are excluded (per the ANOVA results they showed no
# significance); note that the three *_HDL_Ratio columns derived from them
# are still part of the feature set.
feature_columns = (
    # raw measurements
    ['Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'VLDL', 'BMI']
    # log-transformed measurements
    + ['Urea_log', 'Cr_log', 'VLDL_log', 'TG_log']
    # threshold flags and their sum
    + ['HbA1c_Risk', 'BMI_Risk', 'Age_Risk', 'TG_Risk', 'Chol_Risk',
       'Total_Risk_Score']
    # lipid ratios
    + ['Chol_HDL_Ratio', 'LDL_HDL_Ratio', 'TG_HDL_Ratio']
)

# Design matrix and target vector
X = df_enhanced.loc[:, feature_columns]
y = df_enhanced.loc[:, 'Class']

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}\n")

Features shape: (264, 22)
Target distribution:
Class
2    128
0     96
1     40
Name: count, dtype: int64



In [9]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of y in both splits; `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Feature Scaling
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics enter the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create dataframes for easier handling.
# Carry over the row index of the unscaled splits: without it the frames get
# a fresh 0..n-1 index and no longer line up by label with y_train / y_test,
# which still carry the original row labels after the shuffle.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=feature_columns, index=X_train.index
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, columns=feature_columns, index=X_test.index
)

In [11]:
print("=== MODEL DEVELOPMENT ===\n")

# 4. Define models to evaluate
models = {
    # `multi_class` is intentionally not passed: the parameter is deprecated
    # in recent scikit-learn releases, and with the default lbfgs solver a
    # multiclass target is already fitted with the multinomial loss.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(objective='multi:softmax', num_class=3, random_state=42),
    # probability=True is required for predict_proba on an SVC
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# results:   per-model dict with the fitted estimator, its test predictions,
#            test accuracy and the mean / std of the CV accuracy.
# cv_scores: per-model array of the individual fold accuracies.
results = {}
cv_scores = {}

# NOTE(review): X_train_scaled was standardised on the whole training split
# before this loop, so every CV fold's validation rows contributed to the
# scaling statistics. Wrapping scaler + model in a Pipeline would remove
# that (probably small) optimism -- consider for a follow-up.
for name, model in models.items():
    # Cross-validation (5 folds, accuracy) on the training split only
    cv_score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    cv_scores[name] = cv_score

    # Train on the full training split and evaluate on the held-out test split
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = {
        'model': model,
        'predictions': y_pred,
        'accuracy': accuracy,
        'cv_mean': cv_score.mean(),
        'cv_std': cv_score.std()
    }

    print(f"{name}:")
    # The reported spread is two standard deviations of the fold scores
    print(f"  CV Accuracy: {cv_score.mean():.4f} (+/- {cv_score.std() * 2:.4f})")
    print(f"  Test Accuracy: {accuracy:.4f}")

=== MODEL DEVELOPMENT ===

Logistic Regression:
  CV Accuracy: 0.9193 (+/- 0.0577)
  Test Accuracy: 0.9245
Random Forest:
  CV Accuracy: 0.9762 (+/- 0.0426)
  Test Accuracy: 0.9623
Gradient Boosting:
  CV Accuracy: 0.9526 (+/- 0.0522)
  Test Accuracy: 0.9623
XGBoost:
  CV Accuracy: 0.9620 (+/- 0.0775)
  Test Accuracy: 0.9811
SVM:
  CV Accuracy: 0.8388 (+/- 0.0565)
  Test Accuracy: 0.9245
KNN:
  CV Accuracy: 0.7960 (+/- 0.1618)
  Test Accuracy: 0.8868
Decision Tree:
  CV Accuracy: 0.9480 (+/- 0.0461)
  Test Accuracy: 0.9434


  File "C:\Users\LapTop\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\LapTop\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\LapTop\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\LapTop\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


In [12]:
# Pick the model with the highest mean cross-validated accuracy; the test
# accuracy is deliberately not used for selection.
best_model_name = max(results, key=lambda candidate: results[candidate]['cv_mean'])
print(f"\nBest model based on CV: {best_model_name}")


Best model based on CV: Random Forest


In [14]:
print("\n=== HYPERPARAMETER TUNING ===\n")

# Sort models by CV performance (best first) and keep the top three
sorted_models = sorted(results.items(), key=lambda x: x[1]['cv_mean'], reverse=True)[:3]

# Define parameter grids. Only models listed here can be tuned; any other
# model that ranks in the top two is reported and skipped below.
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.8, 1.0]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.8, 1.0]
    },
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'saga']
    }
}

# Winner of the tuning round. All three names are initialised up front so
# that later cells never hit an undefined `best_tuned_name`.
best_tuned_model = None
best_tuned_name = None
best_tuned_score = float('-inf')

for model_name, model_results in sorted_models[:2]:  # Tune top 2 models
    if model_name not in param_grids:
        # Make the skip visible instead of silently ignoring the model
        print(f"Skipping {model_name}: no parameter grid defined")
        continue

    print(f"Tuning {model_name}...")

    # Exhaustive search over the grid with 5-fold CV accuracy; GridSearchCV
    # works on clones, so the estimator stored in `models` is not modified.
    grid_search = GridSearchCV(
        models[model_name],
        param_grids[model_name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_train_scaled, y_train)

    print(f"  Best parameters: {grid_search.best_params_}")
    print(f"  Best CV score: {grid_search.best_score_:.4f}")

    # Strict '>' keeps the earlier (higher-ranked) model on a tie
    if grid_search.best_score_ > best_tuned_score:
        best_tuned_score = grid_search.best_score_
        best_tuned_model = grid_search.best_estimator_
        best_tuned_name = model_name

# If none of the top models had a grid, fall back to the best untuned model
# (already fitted on the training split) so downstream cells still have a
# usable estimator.
if best_tuned_model is None and sorted_models:
    best_tuned_name, fallback_entry = sorted_models[0]
    best_tuned_model = fallback_entry['model']
    best_tuned_score = fallback_entry['cv_mean']
    print(f"No model was tuned; falling back to untuned {best_tuned_name}")



=== HYPERPARAMETER TUNING ===

Tuning Random Forest...
  Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
  Best CV score: 0.9810
Tuning XGBoost...
  Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
  Best CV score: 0.9620


In [15]:
print("\n=== FINAL MODEL EVALUATION ===\n")

# Use the best tuned model
final_model = best_tuned_model
if final_model is None:
    # Fail with an explicit message rather than an AttributeError on None
    raise RuntimeError(
        "No model was selected during hyperparameter tuning; nothing to evaluate."
    )

# No refit here: the estimator selected by the tuning step has already been
# fitted on the full training split (GridSearchCV refits `best_estimator_`
# on all the data it was given when `refit` is left at its default).
y_pred_final = final_model.predict(X_test_scaled)
# Class probabilities, one column per class
y_pred_proba = final_model.predict_proba(X_test_scaled)

print(f"Best Model: {best_tuned_name}")
print("\nClassification Report:")
# NOTE(review): target_names are matched positionally to the sorted class
# labels (0, 1, 2). Confirm against the dataset's encoding that label 1 is
# really 'Diabetic' and label 2 'Predict-Diabetic' -- the order is not
# verifiable from this notebook.
print(classification_report(y_test, y_pred_final, 
                          target_names=['Non-Diabetic', 'Diabetic', 'Predict-Diabetic']))


=== FINAL MODEL EVALUATION ===

Best Model: Random Forest

Classification Report:
                  precision    recall  f1-score   support

    Non-Diabetic       0.95      1.00      0.97        19
        Diabetic       1.00      1.00      1.00         8
Predict-Diabetic       1.00      0.96      0.98        26

        accuracy                           0.98        53
       macro avg       0.98      0.99      0.98        53
    weighted avg       0.98      0.98      0.98        53



In [16]:
import joblib

# Persist the fitted estimator together with the scaler it was trained
# behind; both are needed to score new data. Inputs must be passed through
# the same feature engineering and column order before scaling.
joblib.dump(value=final_model, filename='diabetes_multiclass_model.pkl')
joblib.dump(value=scaler, filename='diabetes_scaler.pkl')


['diabetes_scaler.pkl']