# Training and Evaluating a Machine Learning Model

In [1]:
import pandas as pd
from IPython.display import HTML, display
import tabulate

In [2]:
# Load the pre-processed train/test frames and the sample submission CSV, in that order.
df_train_clean, df_test_clean, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("df_train_clean.csv", "df_test_clean.csv", "sample_submission.csv")
)

In [3]:
# Preview the first five training rows (head() defaults to n=5)
df_train_clean.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,surgery_no,surgery_yes,age_adult,...,lesion_1_6111,lesion_1_6112,lesion_1_6209,lesion_1_7111,lesion_1_7209,lesion_1_7400,lesion_1_8300,lesion_1_8400,lesion_1_9400,outcome
0,38.1,132.0,24.0,6.5,57.0,8.5,3.4,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,died
1,37.5,88.0,12.0,2.0,33.0,64.0,2.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,euthanized
2,38.3,120.0,28.0,3.5,37.0,6.4,3.4,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lived
3,37.1,72.0,30.0,2.0,53.0,7.0,3.9,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lived
4,38.0,52.0,48.0,7.0,47.0,7.3,2.6,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lived


In [4]:
# Preview the first five test rows (head() defaults to n=5)
df_test_clean.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,surgery_no,surgery_yes,age_adult,...,lesion_1_5400,lesion_1_6111,lesion_1_6112,lesion_1_6209,lesion_1_7111,lesion_1_7209,lesion_1_7400,lesion_1_8300,lesion_1_8400,lesion_1_9400
0,38.6,40.0,20.0,7.0,42.0,7.5,2.3,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.2,112.0,48.0,3.5,44.0,6.0,2.6,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37.7,66.0,12.0,3.0,31.5,6.0,1.6,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,37.1,88.0,20.0,2.0,75.0,81.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,38.3,50.0,12.0,3.0,37.0,6.8,2.6,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the first five rows of the sample submission (head() defaults to n=5)
submission.head()

Unnamed: 0,id,outcome
0,1235,lived
1,1236,lived
2,1237,lived
3,1238,lived
4,1239,lived


Let's label encode our outcome column before we train the model.

In [6]:
from sklearn.preprocessing import LabelEncoder

# Create the encoder that maps outcome labels to integer codes
le = LabelEncoder()

# Learn the labels and overwrite `outcome` with their codes.
# The column is rewritten in place: re-running this cell would fit the encoder
# on the integer codes instead of the original labels.
df_train_clean["outcome"] = le.fit_transform(df_train_clean["outcome"])

# Show which integer each original label was assigned
label_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(label_mapping)

{'died': 0, 'euthanized': 1, 'lived': 2}


In [7]:
# Preview the training rows again; `outcome` was overwritten with integer codes above
df_train_clean.head()

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,surgery_no,surgery_yes,age_adult,...,lesion_1_6111,lesion_1_6112,lesion_1_6209,lesion_1_7111,lesion_1_7209,lesion_1_7400,lesion_1_8300,lesion_1_8400,lesion_1_9400,outcome
0,38.1,132.0,24.0,6.5,57.0,8.5,3.4,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,37.5,88.0,12.0,2.0,33.0,64.0,2.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,38.3,120.0,28.0,3.5,37.0,6.4,3.4,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,37.1,72.0,30.0,2.0,53.0,7.0,3.9,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,38.0,52.0,48.0,7.0,47.0,7.3,2.6,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


For this challenge, I'll be training an XGBoost model. It's been shown to be one of the best-performing models on tabular data (see this [paper](https://arxiv.org/pdf/2207.08815)), it's robust against outliers, and it doesn't require scaling or normalization. We'll start with a baseline model, but I'll make sure to set the objective to 'multi:softmax' since it enables the model to handle multi-class classification. The paper I'm referencing will also be linked in the README.md file.

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# Model evaluation function
def evaluate_model(model_name, y_test, y_pred):
    """Print the micro-averaged F1 score of `y_pred` against `y_test`.

    Args:
        model_name: Label placed at the start of the printed line.
        y_test: True class labels.
        y_pred: Predicted class labels.
    """
    micro_f1 = f1_score(y_test, y_pred, average="micro")
    print(f"{model_name} Micro-Averaged F1 Score:", micro_f1)
    
# Separate the target column from the feature matrix
y = df_train_clean["outcome"]
X = df_train_clean.drop(columns=["outcome"])

# Hold out 20% for validation; stratified on y because the classes are imbalanced
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline model: only the multi-class objective and the seed are set
xgb = XGBClassifier(objective="multi:softmax", random_state=42)

# Train on the training split, then predict the validation split
y_pred = xgb.fit(X_train, y_train).predict(X_val)

# Report the validation score
evaluate_model("Baseline XGBoost", y_val, y_pred)

Baseline XGBoost Micro-Averaged F1 Score: 0.6963562753036437


Now that we have our baseline model, we can begin hyperparameter tuning using Grid Search.

In [9]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; Grid Search evaluates every combination
param_grid = {
    "n_estimators": [100, 200, 300],    # Number of trees
    "max_depth": [3, 5, 7],             # Depth of the trees
    "learning_rate": [0.01, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    "max_delta_step": [1, 3, 5],        # Cap on each leaf's output step; can help with imbalanced classes
    "gamma": [0.1, 0.3, 0.5],           # Minimum loss reduction to make a split
}

# Fresh base model, tuned with 5-fold cross-validated Grid Search scored on micro F1
xgb = XGBClassifier(objective="multi:softmax", random_state=42)
grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring="f1_micro",
    cv=5,
    n_jobs=-1,  # run the search on all available processors
)
grid_xgb.fit(X_train, y_train)

# Print best parameters and best micro F1 score
print("Best XGBoost Params:", grid_xgb.best_params_)
print("Best XGBoost Score:", grid_xgb.best_score_)

Best XGBoost Params: {'gamma': 0.3, 'learning_rate': 0.1, 'max_delta_step': 3, 'max_depth': 3, 'n_estimators': 200}
Best XGBoost Score: 0.7155822181202891


Let's see how the model performs on the validation set with the optimized hyperparameters.

In [10]:
# Initialize the optimized model straight from the Grid Search result rather than
# hand-copying the printed values, so the two can never drift out of sync.
final_xgb = XGBClassifier(objective='multi:softmax',
                          random_state=42,
                          **grid_xgb.best_params_)

# Fit on the training split only, so the validation split stays unseen
final_xgb.fit(X_train, y_train)

# Create the predictions for the held-out validation split
y_pred_fin = final_xgb.predict(X_val)

# Evaluate model; the label has no trailing colon, matching the baseline call
evaluate_model("Optimized XGBoost", y_val, y_pred_fin)

Optimized XGBoost: Micro-Averaged F1 Score: 0.7004048582995951


Now that we have the best hyperparameters, we can use our model to predict the outcomes for the test dataset and generate a submission file to be evaluated by Kaggle.

In [11]:
# Initialize the optimized model straight from the Grid Search result rather than
# hand-copying the printed values, so the two can never drift out of sync.
final_xgb = XGBClassifier(objective='multi:softmax',
                          random_state=42,
                          **grid_xgb.best_params_)

# Fit the model to the entire training set now
final_xgb.fit(X, y)

# Create the predictions (still label-encoded integers at this point)
y_test_pred = final_xgb.predict(df_test_clean)

# Replace the example predictions on the sample with our real predictions,
# decoding the integer codes back to the original string labels
submission["outcome"] = le.inverse_transform(y_test_pred)
submission.to_csv("submission.csv", index=False)

According to Kaggle, the Micro-Averaged F1 Score on the test dataset was 0.73484.

In [12]:
# Preview the submission after `outcome` was replaced with the model's predictions
submission.head()

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived


Now let's create a table to neatly display and compare all of our model scores against each other.

In [13]:
# Score both validation runs the same way, then add the test-set score as a fixed entry
table = [
    [label, round(f1_score(y_val, preds, average="micro"), 5)]
    for label, preds in (
        ("Baseline: Validation Set", y_pred),
        ("Optimized Hyperparameters: Validation Set", y_pred_fin),
    )
]
table.append(["Optimized Hyperparameters: Test Set", "0.73484"])

# Render the comparison as an HTML table
display(HTML(tabulate.tabulate(
    table,
    headers=["Model Iteration", "Micro-Averaged F1-Score"],
    tablefmt="html",
)))

Model Iteration,Micro-Averaged F1-Score
Baseline: Validation Set,0.69636
Optimized Hyperparameters: Validation Set,0.7004
Optimized Hyperparameters: Test Set,0.73484
