In [6]:
# ─── 1) IMPORTS ─────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.utils.multiclass import type_of_target

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression



In [7]:
# Load built-in datasets. With as_frame=True, `.frame` is a single DataFrame
# holding the feature columns together with the label column "target".
iris_df = load_iris(as_frame=True).frame
wine_df = load_wine(as_frame=True).frame

# Load uploaded real-world dataset (path is relative to the working directory)
diabetes_df = pd.read_csv("diabetes.csv")

# Build dataset map: display name -> raw DataFrame (features + label column)
raw_datasets = {
    'Iris': iris_df,
    'Wine': wine_df,
    'Diabetes': diabetes_df
}

# Label column of each dataset, keyed exactly like `raw_datasets`.
# The evaluation cell looks names up here; defining the map next to the data
# keeps the notebook runnable from a fresh kernel instead of relying on a
# variable left over from an earlier session.
target_columns = {
    'Iris': 'target',
    'Wine': 'target',
    # NOTE(review): assumes diabetes.csv follows the Pima Indians schema whose
    # label column is "Outcome" — confirm against the uploaded file.
    'Diabetes': 'Outcome',
}



In [8]:
def preprocess_dataset(df, target_col, test_size=0.2, random_state=42):
    """Clean a raw dataset and split it into train/test features and target.

    Steps, in order:
      1. drop rows that are entirely NaN, exact duplicate rows, and rows whose
         target value is missing;
      2. fill remaining missing values (median for float columns, most frequent
         value for every other column);
      3. integer-encode object-dtype columns with ``LabelEncoder`` (this
         includes the target column when it is object-typed);
      4. split into train and test partitions.

    Note: fill values are computed on the whole cleaned frame *before* the
    split, so test rows influence the imputation statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data including the target column. It is not modified.
    target_col : str
        Name of the column to predict.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    ValueError
        If a non-float column contains missing values but has no observed
        value to impute from.
    """
    # Fail before doing any work, with a message that names the problem.
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in dataset")

    # Every step below returns a new frame, so the caller's data stays intact.
    cleaned = df.dropna(how='all').drop_duplicates()
    # A row without a label cannot be used for supervised training or scoring;
    # imputing it would invent a label, so such rows are removed instead.
    cleaned = cleaned.dropna(subset=[target_col]).copy()

    # Fill missing values
    for col in cleaned.columns:
        column = cleaned[col]
        if not column.isna().any():
            continue
        if pd.api.types.is_float_dtype(column):
            fill_value = column.median()
        else:
            modes = column.mode()
            if modes.empty:
                raise ValueError(
                    f"column {col!r} has no non-missing values to impute from"
                )
            fill_value = modes.iloc[0]
        # Assign the result back: calling fillna(inplace=True) on a selected
        # column is chained assignment and does not update the frame when
        # pandas Copy-on-Write is active.
        cleaned[col] = column.fillna(fill_value)

    # Encode categorical features (if any)
    for col in cleaned.select_dtypes(include='object').columns:
        cleaned[col] = LabelEncoder().fit_transform(cleaned[col])

    # Split into features and target
    y = cleaned[target_col]
    X = cleaned.drop(columns=[target_col])

    return train_test_split(X, y, test_size=test_size, random_state=random_state)


In [9]:
# Candidate classifiers keyed by display name. Each entry pairs an unfitted
# estimator with the hyper-parameter grid to search for it. Grid keys use the
# "clf__<param>" form because the estimator is placed in a Pipeline step
# named "clf" before the grid search runs.
models = {
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {
            "clf__max_depth": [None, 5, 10],
            "clf__min_samples_split": [2, 5, 10],
        },
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {
            "clf__n_estimators": [50, 100],
            "clf__max_depth": [None, 5, 10],
        },
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            "clf__n_neighbors": [3, 5, 7],
        },
    ),
    "Logistic Regression": (
        LogisticRegression(max_iter=1000, random_state=42),
        {
            "clf__C": [0.01, 0.1, 1, 10],
        },
    ),
}




In [12]:
results = []


def _safe_auc(estimator, X, y_true, target_kind):
    """Return the ROC AUC of a fitted estimator, or None when it is undefined.

    Parameters
    ----------
    estimator : fitted estimator (here a fitted ``GridSearchCV``)
    X : feature matrix to score
    y_true : true labels for ``X``
    target_kind : str
        ``type_of_target`` of the labels the estimator was trained on;
        ``'binary'`` selects the positive-class column, anything else uses
        one-vs-rest over all learned classes.
    """
    if not hasattr(estimator, 'predict_proba'):
        return None
    try:
        proba = estimator.predict_proba(X)
        if target_kind == 'binary':
            return roc_auc_score(y_true, proba[:, 1])
        # Pass the learned classes so the probability columns are matched to
        # labels explicitly rather than inferred from y_true.
        return roc_auc_score(y_true, proba, multi_class='ovr', labels=estimator.classes_)
    except (ValueError, IndexError):
        # AUC is undefined (e.g. a class is absent from y_true). Report None
        # instead of discarding the model's other metrics.
        return None


# `raw_datasets`, `target_columns`, `models` and `preprocess_dataset` must be
# defined by earlier cells; `target_columns` maps dataset name -> label column.
for dataset_name, df in raw_datasets.items():
    print(f"\nProcessing: {dataset_name}")

    try:
        X_train, X_test, y_train, y_test = preprocess_dataset(df, target_columns[dataset_name])
        min_class_count = int(y_train.value_counts().min())
        # Cross-validation needs at least 2 folds; cap at 5 and at the size of
        # the rarest training class.
        cv_folds = max(2, min(5, min_class_count))
        # Decide binary vs. multiclass from the labels the models are trained
        # on: a test split that happens to miss a class must not switch a
        # multiclass problem to the binary AUC formula.
        target_kind = type_of_target(y_train)
    except Exception as dataset_error:
        print(f"Error processing dataset '{dataset_name}': {dataset_error}")
        # Keep a trace of the failure in the results table as well, so a
        # dataset cannot silently vanish from the comparison.
        results.append({
            'Dataset': dataset_name,
            'Model': None,
            'Error': str(dataset_error)
        })
        continue

    for model_name, (model, param_grid) in models.items():
        try:
            pipe = Pipeline([
                ('scaler', StandardScaler()),
                ('clf', model)
            ])
            grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv_folds, scoring='accuracy')
            grid.fit(X_train, y_train)
            y_pred = grid.predict(X_test)

            results.append({
                'Dataset': dataset_name,
                'Model': model_name,
                'Best Params': grid.best_params_,
                'Accuracy': accuracy_score(y_test, y_pred),
                'F1': f1_score(y_test, y_pred, average='weighted'),
                'Precision': precision_score(y_test, y_pred, average='weighted'),
                'Recall': recall_score(y_test, y_pred, average='weighted'),
                'AUC': _safe_auc(grid, X_test, y_test, target_kind)
            })

        except Exception as model_error:
            # One failing model must not stop the remaining models/datasets;
            # record the message in place of the metrics.
            results.append({
                'Dataset': dataset_name,
                'Model': model_name,
                'Error': str(model_error)
            })

# Create and display final results DataFrame
results_df = pd.DataFrame(results)
print(results_df)



Processing: Iris

Processing: Wine

Processing: Diabetes
     Dataset                Model  \
0       Iris        Decision Tree   
1       Iris        Random Forest   
2       Iris                  KNN   
3       Iris  Logistic Regression   
4       Wine        Decision Tree   
5       Wine        Random Forest   
6       Wine                  KNN   
7       Wine  Logistic Regression   
8   Diabetes        Decision Tree   
9   Diabetes        Random Forest   
10  Diabetes                  KNN   
11  Diabetes  Logistic Regression   

                                          Best Params  Accuracy        F1  \
0   {'clf__max_depth': None, 'clf__min_samples_spl...  1.000000  1.000000   
1   {'clf__max_depth': None, 'clf__n_estimators': 50}  1.000000  1.000000   
2                             {'clf__n_neighbors': 3}  1.000000  1.000000   
3                                       {'clf__C': 1}  1.000000  1.000000   
4   {'clf__max_depth': None, 'clf__min_samples_spl...  0.944444  0.944856  