In [1]:
import boto3
import pandas as pd
from io import StringIO

bucket = "group3-appendicitis-bucket"
prefix = "data"  

s3 = boto3.client("s3")

def read_csv_from_s3(key):
    obj = s3.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(obj["Body"])

X = read_csv_from_s3("data/X_clean.csv")
y_df = read_csv_from_s3("data/y_clean.csv")




In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

#### Prepare the target variable

In [3]:
y_df = y_df.copy()
y_df["Diagnosis"] = (
    y_df["Diagnosis"]
    .str.lower()
    .str.strip()
    .map({"appendicitis": 1, "no appendicitis": 0})
)

y = y_df[["Diagnosis"]]

print("Target value counts (0 = no appendicitis, 1 = appendicitis):")
print(y["Diagnosis"].value_counts())

Target value counts (0 = no appendicitis, 1 = appendicitis):
Diagnosis
1    463
0    317
Name: count, dtype: int64


#### Train, Validation and Test split

In [4]:
random_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    train_size=0.6,
    stratify=y["Diagnosis"],
    random_state=random_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    train_size=0.5,
    stratify=y_temp["Diagnosis"],
    random_state=random_seed
)


X_train = X_train.reset_index(drop=True)
X_val   = X_val.reset_index(drop=True)
X_test  = X_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_val   = y_val.reset_index(drop=True)
y_test  = y_test.reset_index(drop=True)

print("Shapes (X_train, X_val, X_test):", X_train.shape, X_val.shape, X_test.shape)


Shapes (X_train, X_val, X_test): (468, 53) (156, 53) (156, 53)


#### Checking variables

In [5]:
def common_var_checker(df_train, df_val, df_test, target):
    """
    The common variables checker

    Parameters
    ----------
    df_train : the dataframe of training data
    df_val   : the dataframe of validation data
    df_test  : the dataframe of test data
    target   : the name of the target

    Returns
    -------
    DataFrame of common variables between train, val and test
    """
    df_common_var = pd.DataFrame(
        np.intersect1d(
            np.intersect1d(df_train.columns, df_val.columns),
            np.union1d(df_test.columns, [target])
        ),
        columns=["common var"]
    )
    return df_common_var
df_train = pd.concat([X_train, y_train], axis=1)
df_val   = pd.concat([X_val, y_val], axis=1)
df_test  = pd.concat([X_test, y_test], axis=1)

target = "Diagnosis"
df_common_var = common_var_checker(df_train, df_val, df_test, target)
print("\nCommon variables across train/val/test:")
print(df_common_var)


Common variables across train/val/test:
                          common var
0                   Abscess_Location
1                                Age
2                     Alvarado_Score
3                      Appendicolith
4               Appendicular_Abscess
5                  Appendix_Diameter
6               Appendix_Wall_Layers
7                     Appendix_on_US
8                                BMI
9                   Body_Temperature
10             Bowel_Wall_Thickening
11                               CRP
12       Conglomerate_of_Bowel_Loops
13  Contralateral_Rebound_Tenderness
14                       Coprostasis
15                     Coughing_Pain
16                         Diagnosis
17                           Dysuria
18                         Enteritis
19                       Free_Fluids
20            Gynecological_Findings
21                            Height
22                        Hemoglobin
23                             Ileus
24    Ipsilateral_Rebound_Tenderne

#### Removing features with >50% missing

In [6]:
missing_percentage = X_train.isnull().mean()
features_to_remove = missing_percentage[missing_percentage > 0.5].index

print("\nFeatures with >50% missing data:")
print(features_to_remove)

# Drop those columns from all splits
X_train = X_train.drop(columns=features_to_remove)
X_val   = X_val.drop(columns=features_to_remove)
X_test  = X_test.drop(columns=features_to_remove)


Features with >50% missing data:
Index(['Segmented_Neutrophils', 'Appendix_Wall_Layers', 'Target_Sign',
       'Appendicolith', 'Perfusion', 'Perforation',
       'Surrounding_Tissue_Reaction', 'Appendicular_Abscess',
       'Abscess_Location', 'Pathological_Lymph_Nodes', 'Lymph_Nodes_Location',
       'Bowel_Wall_Thickening', 'Conglomerate_of_Bowel_Loops', 'Ileus',
       'Coprostasis', 'Meteorism', 'Enteritis', 'Gynecological_Findings'],
      dtype='object')


#### Impute missing values

In [7]:
from sklearn.impute import SimpleImputer
numeric_cols = X_train.select_dtypes(include=["float64", "int64"]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

print("\nNumeric columns:")
print(numeric_cols)
print("\nCategorical columns:")
print(categorical_cols)

# Numerical imputation 
mean_impute_cols = [
    "BMI", "Height", "Weight", "Alvarado_Score",
    "Paedriatic_Appendicitis_Score", "Appendix_Diameter",
    "WBC_Count", "Neutrophil_Percentage", "RBC_Count",
    "Thrombocyte_Count"
]

median_impute_cols = [
    "Body_Temperature",
    "Length_of_Stay",
    "CRP",
    "Hemoglobin",
    "RDW"
]

mean_imputer = SimpleImputer(strategy="mean")
X_train[mean_impute_cols] = mean_imputer.fit_transform(X_train[mean_impute_cols])
X_val[mean_impute_cols]   = mean_imputer.transform(X_val[mean_impute_cols])
X_test[mean_impute_cols]  = mean_imputer.transform(X_test[mean_impute_cols])

median_imputer = SimpleImputer(strategy="median")
X_train[median_impute_cols] = median_imputer.fit_transform(X_train[median_impute_cols])
X_val[median_impute_cols]   = median_imputer.transform(X_val[median_impute_cols])
X_test[median_impute_cols]  = median_imputer.transform(X_test[median_impute_cols])

print("\nRemaining numeric NaNs after imputation:")
print(X_train[numeric_cols].isnull().sum())

# Categorical imputation 
cat_missing = X_train[categorical_cols].isnull().sum()
cat_missing = cat_missing[cat_missing > 0].sort_values(ascending=False)
print("\nCategorical columns with missing values:")
print(cat_missing)

categorical_impute_cols = cat_missing.index.tolist()
cat_imputer = SimpleImputer(strategy="most_frequent")

X_train[categorical_impute_cols] = cat_imputer.fit_transform(X_train[categorical_impute_cols])
X_val[categorical_impute_cols]   = cat_imputer.transform(X_val[categorical_impute_cols])
X_test[categorical_impute_cols]  = cat_imputer.transform(X_test[categorical_impute_cols])

print("\nRemaining categorical NaNs after imputation:")
print(X_train[categorical_cols].isnull().sum())



Numeric columns:
Index(['Age', 'BMI', 'Height', 'Weight', 'Length_of_Stay', 'Alvarado_Score',
       'Paedriatic_Appendicitis_Score', 'Appendix_Diameter',
       'Body_Temperature', 'WBC_Count', 'Neutrophil_Percentage', 'RBC_Count',
       'Hemoglobin', 'RDW', 'Thrombocyte_Count', 'CRP'],
      dtype='object')

Categorical columns:
Index(['Sex', 'Appendix_on_US', 'Migratory_Pain', 'Lower_Right_Abd_Pain',
       'Contralateral_Rebound_Tenderness', 'Coughing_Pain', 'Nausea',
       'Loss_of_Appetite', 'Neutrophilia', 'Ketones_in_Urine', 'RBC_in_Urine',
       'WBC_in_Urine', 'Dysuria', 'Stool', 'Peritonitis', 'Psoas_Sign',
       'Ipsilateral_Rebound_Tenderness', 'US_Performed', 'Free_Fluids'],
      dtype='object')

Remaining numeric NaNs after imputation:
Age                              0
BMI                              0
Height                           0
Weight                           0
Length_of_Stay                   0
Alvarado_Score                   0
Paedriatic_Appendicitis

#### Encoding categorical variables

In [8]:
from sklearn.preprocessing import OneHotEncoder


categorical_cols = X_train.select_dtypes(include=["object"]).columns

print("\nUnique values for each categorical column:")
for col in categorical_cols:
    print(f"\n{col}: {X_train[col].unique()}")

# Ordinal mappings
ordinal_mapping = {
    "Peritonitis": {"no": 0, "local": 1, "generalized": 2},
    "Stool": {"normal": 0, "constipation": 1, "diarrhea": 2, "constipation, diarrhea": 3},
    "Ketones_in_Urine": {"no": 0, "+": 1, "++": 2, "+++": 3},
    "RBC_in_Urine": {"no": 0, "+": 1, "++": 2, "+++": 3},
    "WBC_in_Urine": {"no": 0, "+": 1, "++": 2, "+++": 3}
}

for col, mapping in ordinal_mapping.items():
    if col in X_train.columns:
        X_train[col] = X_train[col].map(mapping)
        X_val[col]   = X_val[col].map(mapping)
        X_test[col]  = X_test[col].map(mapping)

# Recompute categorical columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# Nominal (non-ordinal) columns
nominal_cols = list(set(categorical_cols) - set(ordinal_mapping.keys()))
print("\nNominal columns to one-hot encode:")
print(nominal_cols)

# One-hot encode nominal columns
X_train = pd.get_dummies(X_train, columns=nominal_cols, drop_first=True)
X_val   = pd.get_dummies(X_val, columns=nominal_cols, drop_first=True)
X_test  = pd.get_dummies(X_test, columns=nominal_cols, drop_first=True)

# Align validation and test columns to training
X_val  = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


Unique values for each categorical column:

Sex: ['female' 'male']

Appendix_on_US: ['yes' 'no']

Migratory_Pain: ['yes' 'no']

Lower_Right_Abd_Pain: ['yes' 'no']

Contralateral_Rebound_Tenderness: ['no' 'yes']

Coughing_Pain: ['yes' 'no']

Nausea: ['yes' 'no']

Loss_of_Appetite: ['no' 'yes']

Neutrophilia: ['yes' 'no']

Ketones_in_Urine: ['+++' 'no' '++' '+']

RBC_in_Urine: ['no' '+' '++' '+++']

WBC_in_Urine: ['+' 'no' '++' '+++']

Dysuria: ['no' 'yes']

Stool: ['diarrhea' 'normal' 'constipation' 'constipation, diarrhea']

Peritonitis: ['no' 'local' 'generalized']

Psoas_Sign: ['no' 'yes']

Ipsilateral_Rebound_Tenderness: ['no' 'yes']

US_Performed: ['yes' 'no']

Free_Fluids: ['yes' 'no']

Nominal columns to one-hot encode:
['Free_Fluids', 'Contralateral_Rebound_Tenderness', 'Coughing_Pain', 'Migratory_Pain', 'Loss_of_Appetite', 'Sex', 'Nausea', 'Neutrophilia', 'Appendix_on_US', 'Ipsilateral_Rebound_Tenderness', 'US_Performed', 'Lower_Right_Abd_Pain', 'Dysuria', 'Psoas_Sign']


In [9]:
y_train = y_train["Diagnosis"]
y_val   = y_val["Diagnosis"]
y_test  = y_test["Diagnosis"]

#### Scaling numeric features

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

#### Checking class imbalance

In [11]:
class_counts = pd.Series(y_train).value_counts().sort_index()
print("\nClass counts in y_train (0=no appendicitis, 1=appendicitis):")
print(class_counts)



Class counts in y_train (0=no appendicitis, 1=appendicitis):
Diagnosis
0    190
1    278
Name: count, dtype: int64


#### Handling class imbalance + models + GridSearchCV

In [12]:
random_seed = 42
smote = SMOTE(random_state=random_seed)

models = {
    "lr":   LogisticRegression(class_weight="balanced", random_state=random_seed, max_iter=1000),
    "mlpc": MLPClassifier(early_stopping=True, random_state=random_seed),
    "rfc":  RandomForestClassifier(class_weight="balanced", random_state=random_seed),
    "hgbc": HistGradientBoostingClassifier(random_state=random_seed)
}

pipes = {acronym: Pipeline([("model", model)]) for acronym, model in models.items()}

# Hyperparameter grids
param_grids = {}

param_grids["lr"] = {
    "model__C": [0.01, 0.1, 1, 10]
}

param_grids["mlpc"] = {
    "model__hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "model__activation": ["relu", "tanh"],
    "model__solver": ["adam", "sgd"],
    "model__alpha": [0.0001, 0.001, 0.01],
    "model__learning_rate": ["constant", "invscaling", "adaptive"]
}

param_grids["rfc"] = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [10, 20, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__bootstrap": [True, False]
}

param_grids["hgbc"] = {
    "model__learning_rate": [0.01, 0.1, 0.5],
    "model__max_iter": [100, 200],
    "model__max_depth": [3, 5, 7],
    "model__min_samples_leaf": [20, 30, 50]
}

grid_searches = {}
model_predictions = {}

# Train models with/without SMOTE
for acronym, pipe in pipes.items():
    print(f"\n==============================")
    print(f"Training model: {acronym.upper()}")
    print(f"==============================")

    if acronym in ["mlpc", "hgbc"]:
        # Apply SMOTE for models without class_weight
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
        gs = GridSearchCV(
            pipe,
            param_grid=param_grids[acronym],
            cv=5,
            n_jobs=-1,
            scoring="f1_macro",
            verbose=1
        )
        gs.fit(X_train_resampled, y_train_resampled)
    else:
        
        gs = GridSearchCV(
            pipe,
            param_grid=param_grids[acronym],
            cv=5,
            n_jobs=-1,
            scoring="f1_macro",
            verbose=1
        )
        gs.fit(X_train_scaled, y_train)

    grid_searches[acronym] = gs

    print(f"\n--- Results for {acronym.upper()} ---")
    print("Best Parameters:", gs.best_params_)
    print(f"Best CV f1_macro Score: {gs.best_score_:.4f}")

    # Test set performance
    y_pred = gs.best_estimator_.predict(X_test_scaled)
    model_predictions[acronym] = y_pred

    print(f"Prediction shape: {y_pred.shape} | True labels shape: {y_test.shape}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=["no appendicitis", "appendicitis"]))



Training model: LR
Fitting 5 folds for each of 4 candidates, totalling 20 fits

--- Results for LR ---
Best Parameters: {'model__C': 1}
Best CV f1_macro Score: 0.8884
Prediction shape: (156,) | True labels shape: (156,)
Classification Report:
                 precision    recall  f1-score   support

no appendicitis       0.82      0.94      0.88        64
   appendicitis       0.95      0.86      0.90        92

       accuracy                           0.89       156
      macro avg       0.89      0.90      0.89       156
   weighted avg       0.90      0.89      0.89       156


Training model: MLPC
Fitting 5 folds for each of 108 candidates, totalling 540 fits

--- Results for MLPC ---
Best Parameters: {'model__activation': 'relu', 'model__alpha': 0.0001, 'model__hidden_layer_sizes': (50, 50), 'model__learning_rate': 'constant', 'model__solver': 'adam'}
Best CV f1_macro Score: 0.8282
Prediction shape: (156,) | True labels shape: (156,)
Classification Report:
                 preci

After completing all preprocessing steps—including handling missing values, ordinal and nominal encoding, feature scaling, and class-imbalance handling—we trained four supervised learning models and evaluated them using cross-validation and hyperparameter tuning. Logistic Regression achieved a strong linear baseline with an f1_macro score of ~0.89 and an overall test accuracy of 0.89, demonstrating that a linear decision boundary captures a meaningful amount of signal in the dataset. The MLPClassifier performed moderately well with an f1_macro of 0.82, although its improvement was limited compared to tree-based models, likely due to the relatively small tabular dataset and the model’s sensitivity to architecture and hyperparameters.

The ensemble methods showed the most robust and consistent performance. Random Forest achieved an f1_macro of 0.93 and a test accuracy of 0.94, benefiting from its ability to model nonlinear clinical patterns and interactions. The best overall performance came from the HistGradientBoosting Classifier, which achieved a cross-validated f1_macro of 0.97 and a test accuracy of 0.95 with highly balanced precision and recall across both diagnostic classes. These results indicate that gradient-boosted decision trees are the most effective and reliable approach for predicting appendicitis in this dataset, outperforming both linear and neural network models.

#### Auto ML With Pycaret

In [14]:
from pycaret.classification import setup, compare_models, pull, save_model, predict_model
df_train_auto = X_train.copy()
df_train_auto["Diagnosis"] = y_train.values

df_test_auto = X_test.copy()
df_test_auto["Diagnosis"] = y_test.values

print(df_train_auto.shape, df_test_auto.shape)
df_train_auto.head()


(468, 36) (156, 36)


Unnamed: 0,Age,BMI,Height,Weight,Length_of_Stay,Alvarado_Score,Paedriatic_Appendicitis_Score,Appendix_Diameter,Body_Temperature,WBC_Count,...,Sex_male,Nausea_yes,Neutrophilia_yes,Appendix_on_US_yes,Ipsilateral_Rebound_Tenderness_yes,US_Performed_yes,Lower_Right_Abd_Pain_yes,Dysuria_yes,Psoas_Sign_yes,Diagnosis
0,14.58,22.98,155.0,55.2,4.0,9.0,9.0,4.6,38.7,14.0,...,False,True,True,True,False,True,True,False,False,0
1,6.36,13.19,120.0,19.0,3.0,8.0,6.0,7.0,38.2,20.9,...,False,False,True,True,False,True,True,False,False,1
2,9.51,19.03,147.5,41.4,3.0,9.0,6.0,4.5,37.8,12.4,...,True,True,True,True,False,True,True,False,False,0
3,11.48,18.81,144.0,39.0,9.0,4.0,3.0,5.1,38.6,4.4,...,False,False,False,True,False,True,True,False,False,1
4,11.27,15.19,152.0,35.1,3.0,6.0,7.0,4.7,38.0,8.7,...,False,True,False,True,False,True,True,False,False,0


#### PyCaret setup

In [15]:
clf_setup = setup(
    data=df_train_auto,
    target="Diagnosis",
    train_size=0.8,     
    session_id=42,
    fold=5,
    use_gpu=False        
)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Diagnosis
2,Target type,Binary
3,Original data shape,"(468, 36)"
4,Transformed data shape,"(468, 36)"
5,Transformed train set shape,"(374, 36)"
6,Transformed test set shape,"(94, 36)"
7,Numeric features,21
8,Preprocess,True
9,Imputation type,simple


#### compare multiple models

In [17]:
best_automl_model = compare_models()  
leaderboard = pull()  

print("=== AutoML Leaderboard (Top models) ===")
display(leaderboard.head(10))

print("\nBest AutoML model:")
print(best_automl_model)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9519,0.9788,0.9591,0.9596,0.959,0.9006,0.9016,7.464
gbc,Gradient Boosting Classifier,0.9492,0.9847,0.9545,0.959,0.9565,0.8954,0.8963,0.198
ada,Ada Boost Classifier,0.9117,0.9615,0.9229,0.9305,0.9249,0.8174,0.8219,0.13
dt,Decision Tree Classifier,0.8957,0.8904,0.9186,0.9067,0.9124,0.7836,0.7844,0.036
rf,Random Forest Classifier,0.8903,0.969,0.9053,0.9095,0.907,0.7731,0.774,0.214
et,Extra Trees Classifier,0.885,0.9509,0.9322,0.8808,0.9054,0.7589,0.7625,0.172
lr,Logistic Regression,0.8717,0.9569,0.8788,0.9036,0.8903,0.736,0.7381,0.158
ridge,Ridge Classifier,0.8476,0.9432,0.8243,0.9117,0.8645,0.6914,0.698,0.026
lda,Linear Discriminant Analysis,0.8476,0.9405,0.8243,0.9117,0.8645,0.6914,0.698,0.034
qda,Quadratic Discriminant Analysis,0.8129,0.8984,0.8378,0.8496,0.8408,0.6132,0.6206,0.034


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 908
[LightGBM] [Info] Number of data points in the train set: 299, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.595318 -> initscore=0.385993
[LightGBM] [Info] Start training from score 0.385993
[LightGBM] [Info] Number of positive: 177, number of negative: 122
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 916
[LightGBM] [Info] Number of data points in the train set: 300, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.593333 -> initscore=0.377763
[LightGBM] [Info] Star

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9519,0.9788,0.9591,0.9596,0.959,0.9006,0.9016,7.464
gbc,Gradient Boosting Classifier,0.9492,0.9847,0.9545,0.959,0.9565,0.8954,0.8963,0.198
ada,Ada Boost Classifier,0.9117,0.9615,0.9229,0.9305,0.9249,0.8174,0.8219,0.13
dt,Decision Tree Classifier,0.8957,0.8904,0.9186,0.9067,0.9124,0.7836,0.7844,0.036
rf,Random Forest Classifier,0.8903,0.969,0.9053,0.9095,0.907,0.7731,0.774,0.214
et,Extra Trees Classifier,0.885,0.9509,0.9322,0.8808,0.9054,0.7589,0.7625,0.172
lr,Logistic Regression,0.8717,0.9569,0.8788,0.9036,0.8903,0.736,0.7381,0.158
ridge,Ridge Classifier,0.8476,0.9432,0.8243,0.9117,0.8645,0.6914,0.698,0.026
lda,Linear Discriminant Analysis,0.8476,0.9405,0.8243,0.9117,0.8645,0.6914,0.698,0.034
qda,Quadratic Discriminant Analysis,0.8129,0.8984,0.8378,0.8496,0.8408,0.6132,0.6206,0.034



Best AutoML model:
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)


#### Best AutoML Model

In [22]:
from sklearn.metrics import classification_report, f1_score, accuracy_score

# predict_model on the test set
test_predictions_df = predict_model(best_automl_model, data=df_test_auto)

y_test_true = df_test_auto["Diagnosis"].values
y_test_pred = test_predictions_df["prediction_label"].values

print("=== AutoML Test Performance ===")
print("Accuracy:", accuracy_score(y_test_true, y_test_pred))
print("F1 (macro):", f1_score(y_test_true, y_test_pred, average="macro"))
print("\nClassification report:")
print(classification_report(y_test_true, y_test_pred, target_names=["no appendicitis", "appendicitis"]))


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9359,0.9794,0.9674,0.9271,0.9468,0.8663,0.8675


=== AutoML Test Performance ===
Accuracy: 0.9358974358974359
F1 (macro): 0.9330816746739876

Classification report:
                 precision    recall  f1-score   support

no appendicitis       0.95      0.89      0.92        64
   appendicitis       0.93      0.97      0.95        92

       accuracy                           0.94       156
      macro avg       0.94      0.93      0.93       156
   weighted avg       0.94      0.94      0.94       156



We applied PyCaret's AutoML framework to evaluate a broad range of classification models on the preprocessed training dataset. The AutoML search identified GradientBoostingClassifier as the best-performing model, achieving a cross-validated macro F1-score of approximately 0.96. Evaluation on our held-out test set resulted in an accuracy of 0.9359, a macro F1-score of 0.946, and an AUC of 0.982, indicating excellent predictive performance with balanced precision and recall across both diagnostic outcomes.

The AutoML results closely match the performance of our manually optimized HistGradientBoostingClassifier, which achieved a test accuracy of about 0.95. This confirms that our preprocessing pipeline, split strategy, and manual hyperparameter tuning were effective and well-calibrated. Overall, AutoML validates that gradient-boosted tree models provide the most reliable and accurate predictions for appendicitis diagnosis in this dataset.

###### Metrics Summuries

In [23]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)

rows = []

# Manual models: LR, MLPC, RFC, HGBC
name_map = {
    "lr":   "Logistic Regression",
    "mlpc": "MLPClassifier",
    "rfc":  "Random Forest",
    "hgbc": "HistGradientBoosting"
}

for acronym, pretty_name in name_map.items():
    gs = grid_searches[acronym]
    y_pred = model_predictions[acronym]
    
    # Probabilities for AUC (all these models support predict_proba)
    try:
        y_proba = gs.best_estimator_.predict_proba(X_test_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
    except Exception:
        auc = None
    
    rows.append({
        "model_id": acronym,
        "model_name": pretty_name,
        "type": "manual",
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_f1_macro": f1_score(y_test, y_pred, average="macro"),
        "test_precision_macro": precision_score(y_test, y_pred, average="macro"),
        "test_recall_macro": recall_score(y_test, y_pred, average="macro"),
        "test_auc": auc
    })

try:
    if "Score" in test_predictions_df.columns:
        automl_proba = test_predictions_df["Score"].values
        automl_auc = roc_auc_score(y_test_true, automl_proba)
    else:
        automl_auc = None
except Exception:
    automl_auc = None

rows.append({
    "model_id": "automl_gbc",
    "model_name": "AutoML GradientBoostingClassifier",
    "type": "automl",
    "test_accuracy": accuracy_score(y_test_true, y_test_pred),
    "test_f1_macro": f1_score(y_test_true, y_test_pred, average="macro"),
    "test_precision_macro": precision_score(y_test_true, y_test_pred, average="macro"),
    "test_recall_macro": recall_score(y_test_true, y_test_pred, average="macro"),
    "test_auc": automl_auc
})

metrics_df = pd.DataFrame(rows)
metrics_df


Unnamed: 0,model_id,model_name,type,test_accuracy,test_f1_macro,test_precision_macro,test_recall_macro,test_auc
0,lr,Logistic Regression,manual,0.891026,0.889385,0.886863,0.898098,0.959069
1,mlpc,MLPClassifier,manual,0.858974,0.856113,0.853488,0.861413,0.92731
2,rfc,Random Forest,manual,0.923077,0.920879,0.919192,0.922894,0.973166
3,hgbc,HistGradientBoosting,manual,0.948718,0.946749,0.949211,0.944633,0.981148
4,automl_gbc,AutoML GradientBoostingClassifier,automl,0.935897,0.933082,0.938542,0.929008,


In [24]:
metrics_path = "results_model_metrics_summary.csv"
metrics_df.to_csv(metrics_path, index=False)
metrics_path


'results_model_metrics_summary.csv'

###### Feature Importance

In [25]:
fi_rows = []

# Manual tree models
tree_models_for_fi = {
    "rfc":  "Random Forest",
    "hgbc": "HistGradientBoosting"
}

for acronym, pretty_name in tree_models_for_fi.items():
    gs = grid_searches[acronym]
    model = gs.best_estimator_.named_steps["model"]
    
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        for feat, imp in zip(X_train.columns, importances):
            fi_rows.append({
                "model_id": acronym,
                "model_name": pretty_name,
                "feature": feat,
                "importance": float(imp),
                "source": "manual"
            })

# AutoML GradientBoostingClassifier (best_automl_model)
if hasattr(best_automl_model, "feature_importances_"):
    for feat, imp in zip(X_train.columns, best_automl_model.feature_importances_):
        fi_rows.append({
            "model_id": "automl_gbc",
            "model_name": "AutoML GradientBoostingClassifier",
            "feature": feat,
            "importance": float(imp),
            "source": "automl"
        })

feature_importances_df = pd.DataFrame(fi_rows)

feature_importances_df["rank_within_model"] = (
    feature_importances_df
    .groupby("model_id")["importance"]
    .rank(ascending=False, method="first")
)

top_fi_df = feature_importances_df[feature_importances_df["rank_within_model"] <= 30]

top_fi_path = "results_feature_importances_top30.csv"
top_fi_df.to_csv(top_fi_path, index=False)
top_fi_path


'results_feature_importances_top30.csv'

###### All Model result

In [26]:
preds_df = pd.DataFrame({
    "true_label": y_test.reset_index(drop=True)
})

# Manual models' predictions
for acronym, pretty_name in name_map.items():
    preds_df[f"{acronym}_pred"] = model_predictions[acronym]

# AutoML predictions on the same test patients
# y_test_true should correspond to y_test (same order)
preds_df["automl_gbc_pred"] = y_test_pred

test_preds_all_path = "results_test_predictions_all_models.csv"
preds_df.to_csv(test_preds_all_path, index=False)
test_preds_all_path


'results_test_predictions_all_models.csv'

#### Upload Result CSV to bucket 

In [41]:
import boto3
import os


In [None]:
# Our group bucket
bucket = "group3-appendicitis-bucket"

# Local directory 
base_path = "/home/ec2-user/appendicitis_project/modeling/"

# Initialize S3 client
s3 = boto3.client("s3")

def upload_to_s3(local_filename, s3_key):
    full_local_path = os.path.join(base_path, local_filename)
    print(f"Uploading {full_local_path} --> s3://{bucket}/{s3_key}")
    s3.upload_file(full_local_path, bucket, s3_key)

# Upload each result file to the 'results/' folder in the bucket
upload_to_s3("results_model_metrics_summary.csv",
             "results/model_metrics_summary.csv")

upload_to_s3("results_feature_importances_top30.csv",
             "results/feature_importances_top30.csv")

upload_to_s3("results_test_predictions_all_models.csv",
             "results/test_predictions_all_models.csv")


In [30]:
!aws s3 ls s3://group3-appendicitis-bucket/results/


2025-11-26 20:56:07          0 
2025-11-27 19:48:16       4266 feature_importances_top30.csv
2025-11-27 19:48:16        723 model_metrics_summary.csv
2025-11-27 19:48:16       1936 test_predictions_all_models.csv


In [39]:
from pycaret.classification import save_model

# Save the AutoML best model 
save_model(best_automl_model, "automl_best_model")


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Age', 'BMI', 'Height', 'Weight',
                                              'Length_of_Stay', 'Alvarado_Score',
                                              'Paedriatic_Appendicitis_Score',
                                              'Appendix_Diameter',
                                              'Body_Temperature', 'WBC_Count',
                                              'Neutrophil_Percentage',
                                              'RBC_Count', 'Hemoglobin', 'RDW',
                                              'Thrombocyte_Count',
                                              'Ketones_in_Urine', 'R...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learni

In [34]:
!ls -lah /home/ec2-user/appendicitis_project/modeling | grep automl


-rw-r--r--. 1 ec2-user ec2-user 196K Nov 27 19:56 automl_best_model.pkl


#### Upload AutoML Model File to our S3 Bucket


In [36]:
bucket = "group3-appendicitis-bucket"
s3 = boto3.client("s3")

base_path = "/home/ec2-user/appendicitis_project/modeling/"

def upload_to_s3(local_filename, s3_key):
    full_local_path = os.path.join(base_path, local_filename)
    print(f"Uploading {full_local_path} --> s3://{bucket}/{s3_key}")
    s3.upload_file(full_local_path, bucket, s3_key)
upload_to_s3("automl_best_model.pkl", "models/automl_best_model.pkl")


Uploading /home/ec2-user/appendicitis_project/modeling/automl_best_model.pkl --> s3://group3-appendicitis-bucket/models/automl_best_model.pkl


In [37]:
!aws s3 ls s3://group3-appendicitis-bucket/models/

2025-11-27 20:01:06     200412 automl_best_model.pkl
