In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint, uniform

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so the
# data files and prediction CSVs used below live on Drive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Project folder on the mounted Drive.
directory_path = '/content/gdrive/MyDrive/Lim_ShiBin_ML/'
# Create the folder if it does not exist yet; exist_ok=True makes re-running
# this cell a no-op instead of an error.
os.makedirs(directory_path, exist_ok=True)

In [None]:
# Make the project folder the working directory so that files written with
# relative names (the prediction CSVs in later cells) are saved there.
%cd '/content/gdrive/MyDrive/Lim_ShiBin_ML/'

/content/gdrive/MyDrive/Lim_ShiBin_ML


## Preprocessing

In [None]:
# Load the labelled training set and the unlabelled test set from the project
# folder defined earlier in the notebook (directory_path), instead of
# repeating the absolute Drive path.
train_data = pd.read_csv(os.path.join(directory_path, 'train.csv'))
test_data = pd.read_csv(os.path.join(directory_path, 'test.csv'))

# Feature-engineering experiment, currently disabled.
# # Feature engineering
# def engineer_features(df):
#     df = df.copy()

#     # Create combinations of important features
#     df['temp_pulse_ratio'] = df['rectal_temp'] / df['pulse']
#     df['temp_resp_ratio'] = df['rectal_temp'] / df['respiratory_rate']
#     df['pulse_resp_ratio'] = df['pulse'] / df['respiratory_rate']

#     # Create interaction terms
#     df['protein_ratio'] = df['total_protein'] / df['packed_cell_volume']

#     # Bin continuous variables
#     df['temp_category'] = pd.qcut(df['rectal_temp'], q=5, labels=['very_low', 'low', 'normal', 'high', 'very_high'], duplicates='drop')
#     df['pulse_category'] = pd.qcut(df['pulse'], q=5, labels=['very_low', 'low', 'normal', 'high', 'very_high'], duplicates='drop')

#     return df

# # Apply feature engineering
# train_data = engineer_features(train_data)
# test_data = engineer_features(test_data)

# Identifier columns are removed from both feature matrices; the training
# frame additionally loses its target column ("outcome").
ID_COLUMNS = ["id", "hospital_number"]
X_train = train_data.drop(columns=ID_COLUMNS + ["outcome"])
y_train = train_data["outcome"]

# The test data has no target column, so only the identifiers are dropped.
X_test = test_data.drop(columns=ID_COLUMNS)

# Partition the feature columns into numeric and non-numeric. Splitting on
# "number" / "not number" (rather than listing exact dtypes such as float64,
# int64 and object) guarantees that every column lands in exactly one list:
# ColumnTransformer drops any column that is not assigned to a transformer,
# so a column with e.g. an int32, category or string dtype would otherwise be
# discarded silently.
num_features = X_train.select_dtypes(include='number').columns.tolist()
cat_features = X_train.select_dtypes(exclude='number').columns.tolist()

# Numeric columns: fill missing values with the column median, then
# standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Non-numeric columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that only occur in data not seen during fit.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Shared preprocessor reused by every model pipeline in the notebook.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

# Encode the string outcome labels as integers; the fitted encoder is kept so
# predictions can be mapped back with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

## Random Forest

Public Score: 0.78658 (741/1,543)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library defaults; the fixed seed keeps runs repeatable.
rf = RandomForestClassifier(random_state=42)

# Chain the shared preprocessing with the classifier and train it right away
# (Pipeline.fit returns the fitted pipeline itself).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf),
    ]
).fit(X_train, y_train_encoded)

# Integer-encoded predictions for the test rows, then the same predictions
# mapped back to the original outcome labels.
y_test_pred = pipeline.predict(X_test)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Two-column table (id, predicted_outcome) saved as CSV in the working
# directory.
test_predictions = test_data[["id"]].assign(predicted_outcome=y_test_pred_labels)
test_predictions.to_csv("test_predictions_rf.csv", index=False)

Sample predictions:
      id predicted_outcome
0  1235             lived
1  1236              died
2  1237             lived
3  1238        euthanized
4  1239             lived


## Gradient Boosting (XGBoost)

Public Score: 0.79878 (568/1,543)

In [None]:
from xgboost import XGBClassifier

# Baseline XGBoost model behind the shared preprocessor.
# `use_label_encoder` is no longer passed: the installed XGBoost does not
# recognise it and only emitted a "Parameters ... are not used" warning. The
# target is already integer-encoded by label_encoder, so nothing else changes.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier(
                            n_estimators=100,
                            eval_metric='mlogloss',
                            random_state=42))])

# Train on the full labelled training set.
model.fit(X_train, y_train_encoded)

# Predict the test rows and map the integer classes back to outcome labels.
y_test_pred = model.predict(X_test)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Two-column table (id, predicted_outcome) saved as CSV in the working
# directory.
test_predictions = pd.DataFrame({
    "id": test_data["id"],
    "predicted_outcome": y_test_pred_labels
})

test_predictions.to_csv("test_predictions.csv", index=False)

Parameters: { "use_label_encoder" } are not used.



Sample predictions:
      id predicted_outcome
0  1235             lived
1  1236              died
2  1237             lived
3  1238        euthanized
4  1239             lived


## Gradient Boosting (XGBoost) with Hyperparameter Tuning and Cross-validation

Public Score: 0.82317 (314/1,543)

In [None]:
# Base estimator for the search. `use_label_encoder` is no longer passed: the
# installed XGBoost does not recognise it and only emitted a
# "Parameters ... are not used" warning on every fit.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Search space for RandomizedSearchCV. Keys are prefixed with the pipeline
# step name ("classifier__"). scipy's randint(low, high) excludes `high`, and
# uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'classifier__n_estimators': randint(50, 300),          # Boosting rounds: 50..299
    'classifier__max_depth': randint(3, 15),               # Tree depth: 3..14
    'classifier__learning_rate': uniform(0.01, 0.3),       # Learning rate: 0.01..0.31
    'classifier__subsample': uniform(0.5, 0.5),            # Row fraction per boosting round: 0.5..1.0
    'classifier__colsample_bytree': uniform(0.5, 0.5),     # Feature fraction per tree: 0.5..1.0
    'classifier__min_child_weight': randint(1, 10)         # Minimum child weight: 1..9
}

# Preprocessing and model in one pipeline, so the preprocessor is re-fitted on
# the training part of every cross-validation fold.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb)])

# 100 random candidates, each scored by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

# Run the search on the labelled training data.
random_search.fit(X_train, y_train_encoded)

# Report the winning configuration and its mean cross-validation accuracy.
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation accuracy: ", random_search.best_score_)

# best_estimator_ is the best pipeline refitted on the whole training set
# (RandomizedSearchCV's default refit=True).
best_xgb_model = random_search.best_estimator_
y_test_pred = best_xgb_model.predict(X_test)

# Map the integer classes back to the original outcome labels.
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Two-column table (id, predicted_outcome) saved as CSV in the working
# directory.
test_predictions = pd.DataFrame({
    "id": test_data["id"],
    "predicted_outcome": y_test_pred_labels
})

test_predictions.to_csv("test_predictions_xgb.csv", index=False)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters found:  {'classifier__colsample_bytree': 0.3936964831604432, 'classifier__learning_rate': 0.015468785930798914, 'classifier__max_depth': 8, 'classifier__min_child_weight': 12, 'classifier__n_estimators': 296, 'classifier__subsample': 0.9998588366430653}
Best cross-validation accuracy:  0.7206477732793524


In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Base learners of the stack. `use_label_encoder` is not passed to XGBoost
# because the installed version does not recognise it and only warns about it.
estimators = [
    ('xgb', XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, eval_metric='mlogloss')),
    ('lgbm', LGBMClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42))
]

# Stacking ensemble: the base learners' out-of-fold predictions (cv=5) feed a
# logistic-regression meta-model.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

# The raw feature frame contains string categoricals and missing values, so
# the stack must sit behind the same preprocessor as the other models.
stacking_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', stacking_clf)])

# The test set has no labels (there is no `y_test`), so accuracy is estimated
# with 5-fold cross-validation on the labelled training data instead.
cv_scores = cross_val_score(stacking_pipeline, X_train, y_train_encoded, cv=5, scoring='accuracy')
accuracy = cv_scores.mean()
print("Stacking Model Accuracy:", accuracy)

# Fit on all labelled data and predict the unlabelled test rows; the integer
# classes are mapped back to the original outcome labels.
stacking_pipeline.fit(X_train, y_train_encoded)
y_test_pred = stacking_pipeline.predict(X_test)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)