## Machine Learning - Assignment 2 (Semester 1)

***************************************************************************
BITS ID: **2020AB05241**

Name: **Manish Devraj**

Email: 2025ab05241@wilp.bits-pilani.ac.in

***************************************************************************

This notebook compares the following six models for a classification task:
1. Logistic Regression
2. Decision Tree Classifier
3. K-Nearest Neighbor Classifier
4. Naive Bayes Classifier
5. Random Forest (Ensemble)
6. XGBoost (Ensemble)

**Dataset**:\
"Early Stage Diabetes Risk Prediction" dataset from the UCI Machine Learning Repository:
<https://archive.ics.uci.edu/dataset/529/early+stage+diabetes+risk+prediction+dataset>

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
import joblib
import os

!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import StandardScaler, LabelEncoder


import warnings
warnings.filter_type = "ignore"



# 1. Load Dataset

In [41]:
# Retrieve dataset 529 from the UCI repository
early_stage_diabetes_risk_prediction = fetch_ucirepo(id=529)

# Unpack the fetched payload into the feature table (X) and the target table (y)
uci_payload = early_stage_diabetes_risk_prediction.data
X, y = uci_payload.features, uci_payload.targets

# 2. Preprocessing

In [42]:

# Assemble features and target in one working frame. Copy first: pd.DataFrame(X)
# can share its underlying data with X, so adding a column could otherwise leak
# back into the fetched features and make this cell unsafe to re-run.
df = pd.DataFrame(X).copy()
df['target'] = y

print(f"Dataset Shape: {df.shape}")

# Encode the target labels as integers
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])

# One-hot encode categorical features; drop_first drops one redundant level per feature
X_encoded = pd.get_dummies(df.drop('target', axis=1), drop_first=True)
y_encoded = df['target']

# Later cells read the model-ready data from X and y
X = X_encoded
y = y_encoded

# Train-Test Split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

# Scaling: fit on the training split only, then apply the same transform to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler. The models are trained on scaled features while
# test_data.csv (written below) holds the unscaled split, so anything that
# reloads the saved models needs this object to transform that data first.
joblib.dump(scaler, "scaler.pkl")

# Export the unscaled test split with the encoded target stored under 'class'
test_df = X_test.copy()
test_df['class'] = y_test

# Save to CSV
test_df.to_csv("test_data.csv", index=False)
print("✅ test_data.csv created successfully!")

Dataset Shape: (520, 17)
✅ test_data.csv created successfully!


# 3. Model Training and Evaluation

In [43]:
# Per-model metric dicts; evaluate_model() appends one entry per classifier
results = []

# Directory that receives the pickled models. exist_ok=True replaces the
# separate exists() check and avoids the check-then-create race.
os.makedirs('model', exist_ok=True)

# Classifiers to compare, keyed by the name used in the results table
# and (with spaces replaced by underscores) in the saved file name
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

def evaluate_model():
    """Fit, persist and score every classifier in the module-level ``models`` dict.

    Each model is trained on ``X_train_scaled`` / ``y_train``, saved to
    ``model/<name with spaces replaced by underscores>.pkl`` and scored on
    ``X_test_scaled`` / ``y_test``. One dict of metrics per model is appended
    to the module-level ``results`` list. Returns nothing.
    """
    print("Training Models...")

    # Ensure the output directory exists even when this function is called on its own
    os.makedirs('model', exist_ok=True)

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)

        # Save Model
        joblib.dump(model, f'model/{name.replace(" ", "_")}.pkl')

        # Predict
        y_pred = model.predict(X_test_scaled)

        # Positive-class probabilities for AUC: only taken when the model
        # exposes predict_proba and was fitted on exactly two classes
        if hasattr(model, "predict_proba") and len(model.classes_) == 2:
            y_prob = model.predict_proba(X_test_scaled)[:, 1]
        else:
            y_prob = None

        # Calculate Metrics
        # Precision/recall/F1 use average='weighted' (per-class scores weighted by support)
        metrics = {
            "Model name": name,
            "Accuracy": accuracy_score(y_test, y_pred),
            # Guard on y_prob itself so a model without usable probabilities
            # records the placeholder 0 instead of passing None to roc_auc_score
            "AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else 0,
            "Precision": precision_score(y_test, y_pred, average='weighted'),
            "Recall": recall_score(y_test, y_pred, average='weighted'),
            "F1 Score": f1_score(y_test, y_pred, average='weighted'),
            "MCC": matthews_corrcoef(y_test, y_pred)
        }
        results.append(metrics)

        print(f"Finished evaluating {name}")

evaluate_model()

Training Models...
Finished evaluating Logistic Regression
Finished evaluating Decision Tree
Finished evaluating KNN
Finished evaluating Naive Bayes
Finished evaluating Random Forest
Finished evaluating XGBoost


# 4. Compare models

In [44]:
# Tabulate the per-model metric dicts collected in `results`
results_df = pd.DataFrame(results)

# Console report: two banner lines followed by the full table
for banner in (
    "\n=== Evaluation Metrics for README ===",
    "\nComparison Table with the evaluation metrics for all 6 models\n",
):
    print(banner)
print(results_df)

# Keep a CSV copy of the table (row index omitted)
results_df.to_csv("model_metrics.csv", index=False)


=== Evaluation Metrics for README ===

Comparison Table with the evaluation metrics for all 6 models

            Model name  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.923077  0.977379   0.922533  0.923077  0.922409   
1        Decision Tree  0.951923  0.964789   0.958249  0.951923  0.952739   
2                  KNN  0.894231  0.977379   0.902167  0.894231  0.896025   
3          Naive Bayes  0.913462  0.960734   0.912927  0.913462  0.913098   
4        Random Forest  0.990385  1.000000   0.990667  0.990385  0.990422   
5              XGBoost  0.971154  1.000000   0.973558  0.971154  0.971470   

        MCC  
0  0.820358  
1  0.898479  
2  0.769771  
3  0.798823  
4  0.978222  
5  0.936981  
