<a href="https://colab.research.google.com/github/lynn-yg/Deakin-Unit-Page/blob/main/SIT719_pima_pipeline_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 1) Load the data
# The dataset is read from a CSV file named 'pima-indians-diabetes.csv' in the working
# directory; the code below expects it to contain an 'Outcome' column.
# Change the path if your copy of the file lives elsewhere.
try:
    df = pd.read_csv('pima-indians-diabetes.csv')
except FileNotFoundError as exc:
    # Fail fast with an actionable message. A tiny placeholder frame is not a usable
    # substitute here: the stratified 80/20 split and the 5-fold cross-validation that
    # follow need far more rows per class, so continuing would only surface a less
    # helpful error further down.
    raise FileNotFoundError(
        "Error: 'pima-indians-diabetes.csv' not found. Please upload the dataset or provide the correct path."
    ) from exc


# Separate the label column (y) from the predictor columns (X)
y = df['Outcome']
X = df.drop(columns='Outcome')

# 2) Hold out 20% of the rows as a test set, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3) Define the models and their parameter grids
def _build_pipeline(step_name, estimator):
    """Return a scaler -> median imputer -> estimator pipeline whose last step is `step_name`."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='median')),
        (step_name, estimator),
    ])


# Maps a display name to (pipeline, grid); grid keys use the "<step>__<param>" form so
# GridSearchCV routes each value to the final estimator inside the pipeline.
models = {
    "DecisionTree": (
        _build_pipeline('dt', DecisionTreeClassifier(random_state=42)),
        {'dt__max_depth': [3, 5, 7, 9], 'dt__min_samples_split': [2, 5, 10]},
    ),
    "SVM": (
        _build_pipeline('svc', SVC(probability=True, random_state=42)),
        {'svc__C': [0.1, 1, 10], 'svc__kernel': ['linear', 'rbf']},
    ),
    "RandomForest": (
        _build_pipeline('rf', RandomForestClassifier(random_state=42)),
        {'rf__n_estimators': [50, 100, 200], 'rf__max_depth': [3, 5, 7]},
    ),
}

# 4) Define the cross-validation strategy: 5 shuffled, stratified folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []         # one row of CV/test metrics per model, turned into res_df below
per_model_best = {}  # model name -> best params, best CV score and the refit estimator

# 5) Loop through models and perform GridSearchCV
# Each pipeline is tuned on the training split only, using the `skf` splitter and F1 as
# the selection metric; the refit best estimator is then scored once on the test split.
for name, (pipe, grid) in models.items():
    print(f"Training {name}...")
    gs = GridSearchCV(pipe, grid, cv=skf, n_jobs=-1, scoring='f1', verbose=0)  # Use f1 for primary scoring
    gs.fit(X_train, y_train)

    per_model_best[name] = {
        "best_params": gs.best_params_,
        "best_score": gs.best_score_,  # This is the mean cross-validated score
        "estimator": gs.best_estimator_,
    }

    # Evaluate on the test set
    y_pred = gs.best_estimator_.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a metric's denominator is empty
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    results.append({
        "Model": name,
        "CV_F1": gs.best_score_,
        "Test_Accuracy": acc,
        "Test_Precision": prec,
        "Test_Recall": rec,
        "Test_F1": f1,
        "BestParams": gs.best_params_,
    })

print("\n=== Cross-validation and Test Results ===")
res_df = pd.DataFrame(results)
# `display` is only available inside IPython/Jupyter sessions; when this code is run as a
# plain Python script, fall back to printing the full table instead of raising NameError.
try:
    display
except NameError:
    print(res_df.to_string())
else:
    display(res_df)


# 6) Model Selection Logic
# Select BEST model by cross-validated F1 (tie-break: CV Recall, then CV Accuracy).
# Every quantity used for ranking is computed on the training split with the same `skf`
# folds, so the held-out test set plays no part in choosing the model and the figures
# reported as "CV" are genuinely cross-validated.
cv_tiebreak = {}
for model_name, info in per_model_best.items():
    # cross_validate clones the estimator, so the already-fitted best estimator is untouched.
    fold_scores = cross_validate(
        info["estimator"],
        X_train,
        y_train,
        cv=skf,
        scoring=("recall", "accuracy"),
        n_jobs=-1,
    )
    # The "test_<metric>" keys hold per-fold validation scores (not the held-out test set).
    cv_tiebreak[model_name] = {
        "CV_Recall": float(fold_scores["test_recall"].mean()),
        "CV_Accuracy": float(fold_scores["test_accuracy"].mean()),
    }

# Rank on a copy so res_df keeps exactly the columns it had before this step.
res_df_cv = res_df.assign(
    CV_Recall=res_df["Model"].map(lambda m: cv_tiebreak[m]["CV_Recall"]),
    CV_Accuracy=res_df["Model"].map(lambda m: cv_tiebreak[m]["CV_Accuracy"]),
)
res_df_cv_sorted = res_df_cv.sort_values(
    by=["CV_F1", "CV_Recall", "CV_Accuracy"], ascending=[False, False, False]
)
ix_best = res_df_cv_sorted.index[0]
BEST_MODEL_NAME = res_df_cv_sorted.loc[ix_best, "Model"]

# Summary of the winning model: CV metrics (used for selection) and test metrics (reporting only).
BEST = {
    "name": BEST_MODEL_NAME,
    "cv_f1": float(res_df_cv_sorted.loc[ix_best, "CV_F1"]),
    "cv_recall": float(res_df_cv_sorted.loc[ix_best, "CV_Recall"]),
    "cv_accuracy": float(res_df_cv_sorted.loc[ix_best, "CV_Accuracy"]),
    "test_f1": float(res_df.loc[ix_best, "Test_F1"]),
    "test_recall": float(res_df.loc[ix_best, "Test_Recall"]),
    "test_accuracy": float(res_df.loc[ix_best, "Test_Accuracy"]),
    "best_params": per_model_best[BEST_MODEL_NAME]["best_params"],
    "estimator": per_model_best[BEST_MODEL_NAME]["estimator"],
}

print("\n=== Model Selection (based on CV) ===")
print(f"BEST model: {BEST['name']}")
print(f"CV: F1={BEST['cv_f1']:.4f}, Recall={BEST['cv_recall']:.4f}, Acc={BEST['cv_accuracy']:.4f}")
print(f"Test: F1={BEST['test_f1']:.4f}, Recall={BEST['test_recall']:.4f}, Acc={BEST['test_accuracy']:.4f}")
print(f"Best params: {BEST['best_params']}")

# Now 'best' is defined as the estimator from the BEST model
best = BEST["estimator"]

# 7) Final evaluation artifacts for BEST
# Predict once on the held-out test split, then print the confusion matrix and the
# per-class precision/recall/F1 report (4 decimal places) for the selected estimator.
y_pred_best = best.predict(X_test)

cm = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix (BEST):\n", cm)

report_text = classification_report(y_test, y_pred_best, digits=4)
print("\nClassification Report (BEST):\n", report_text)

# Export a compact per-model table of test metrics (for a "Table 4"-like section),
# ordered from highest to lowest Test_F1 and written without the index column.
table4_cols = ["Model", "Test_Accuracy", "Test_Precision", "Test_Recall", "Test_F1"]
table4_df = res_df[table4_cols].sort_values("Test_F1", ascending=False)
table4_df.to_csv("pima_results_table4_like.csv", index=False)

# Short run instructions written next to the results; the text begins with a blank line.
README_TEXT = """
How to run
----------
1) pip install -r requirements.txt
2) Edit the CSV path in pima_pipeline.py
3) python pima_pipeline.py
Outputs
-------
- pima_results_table4_like.csv: metrics table (Accuracy, Precision, Recall, F1)
- stdout: best params and confusion matrix
"""

# Opening in "w" mode overwrites any README_RUN.txt left over from a previous run.
with open("README_RUN.txt", "w") as readme_file:
    readme_file.write(README_TEXT)

Training DecisionTree...
Training SVM...
Training RandomForest...

=== Cross-validation and Test Results ===


Unnamed: 0,Model,CV_F1,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,BestParams
0,DecisionTree,0.571941,0.792208,0.703704,0.703704,0.703704,"{'dt__max_depth': 5, 'dt__min_samples_split': 2}"
1,SVM,0.659704,0.720779,0.622222,0.518519,0.565657,"{'svc__C': 1, 'svc__kernel': 'linear'}"
2,RandomForest,0.637795,0.74026,0.652174,0.555556,0.6,"{'rf__max_depth': 7, 'rf__n_estimators': 200}"



=== Model Selection (based on CV) ===
BEST model: SVM
CV: F1=0.6597, Recall=0.5185, Acc=0.7208
Test: F1=0.5657, Recall=0.5185, Acc=0.7208
Best params: {'svc__C': 1, 'svc__kernel': 'linear'}

Confusion Matrix (BEST):
 [[83 17]
 [26 28]]

Classification Report (BEST):
               precision    recall  f1-score   support

           0     0.7615    0.8300    0.7943       100
           1     0.6222    0.5185    0.5657        54

    accuracy                         0.7208       154
   macro avg     0.6918    0.6743    0.6800       154
weighted avg     0.7126    0.7208    0.7141       154

