# Hepatitis Challenge

## Plan
### 1. Clean dataset
+ Categoricals
    + SEX -> homogenize values to M or F
    + Missing values '?' on multiple columns
        + Drop column
        + Drop row
        + Fill with mode
        + Impute with KNN
+ Numericals
    + Missing values '?' -> Impute with KNN
### 2. Dataset Split
+ X/y split
+ Train / Test Split
### 3. Scaling with Standard Scaler
### 4. Fix Class Imbalance
+ Upsampling
+ Downsampling
+ SMOTE
+ Tomek
### 5. Feature selection?
### 6. Model selection
+ Random Forest
+ Logistic Regression
+ KNN
### 7. Model evaluation
+ Optimize -> Kappa
### 8. Model tuning
+ RandomSearchCV + GridSearchCV


### Submission variations:
1. Dataset 1
2. Dataset 2 -> Not including 'PRO TIME'
3. Dataset 3 -> Tomek vs. SMOTE balancing methods?
4. ?
5. ?

# Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks 
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Load Clean Data

In [2]:
# Load data
df = pd.read_csv('../data/clean_data/train_clean.csv')
df_no_class = pd.read_csv('../data/clean_data/test_no_class_clean.csv')

# Show data & shape
print(df.shape)
display(df.head())

(102, 20)


Unnamed: 0,AGE,SEX,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,STEROID,Class
0,48.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,4.8,123.0,157.0,2.7,31.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
1,51.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,79.6,20.0,3.0,63.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0
2,40.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,0.6,62.0,166.0,4.0,63.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0
3,25.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.3,181.0,181.0,4.5,57.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0
4,34.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,72.0,46.0,4.4,57.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0


In [3]:
display(df.head())

Unnamed: 0,AGE,SEX,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,STEROID,Class
0,48.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,4.8,123.0,157.0,2.7,31.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
1,51.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,79.6,20.0,3.0,63.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0
2,40.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,0.6,62.0,166.0,4.0,63.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0
3,25.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.3,181.0,181.0,4.5,57.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0
4,34.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,72.0,46.0,4.4,57.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0


# Explore Data

In [4]:
# Show value counts for each column
#for col in df.columns:
#    display(df[col].value_counts())

# Split Features / Target

In [5]:
# Separate the features from the target
y = df['Class']
X = df.drop(['Class'], axis=1)

# Train-Test Split

In [6]:
# Generate the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale Numerical Columns

In [7]:
# Select StandardScaler
scaler = StandardScaler()

# Fit scaler with TRAIN data
scaler.fit(X_train)

# Scale X_train_num_transformed with fitted scaler. Output is a np.array.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add columns to np.array to create a DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, 
                                 columns=X_train.columns, 
                                 index=X_train.index)

X_test_scaled_df = pd.DataFrame(X_test_scaled, 
                                columns=X_test.columns,
                                index=X_test.index)

# Balance Target Column

Four possible methods:
+ Upsampling minority class
+ Downsampling majority class
+ SMOTE
+ TOMEK

In [8]:
# Show imablance ratio
count_classes = df['Class'].value_counts()
print("The class imbalance ratio is: {:.2f}".format((count_classes[2]-count_classes[1])/(count_classes[2]+count_classes[1])))

The class imbalance ratio is: 0.59


In [9]:
# Upsampling minority class
ros = RandomOverSampler(random_state=42)

X_train_over, y_train_over = ros.fit_resample(X_train_scaled_df, y_train)

count_classes = y_train_over.value_counts()

print("The class imbalance ratio is: {:.2f}".format((count_classes[2]-count_classes[1])/(count_classes[2]+count_classes[1])))

The class imbalance ratio is: 0.00


In [10]:
# Downsampling majority class
rus = RandomUnderSampler(random_state=42)

X_train_under, y_train_under = rus.fit_resample(X_train_scaled_df, y_train)

count_classes = y_train_over.value_counts()

print("The class imbalance ratio is: {:.2f}".format((count_classes[2]-count_classes[1])/(count_classes[2]+count_classes[1])))

The class imbalance ratio is: 0.00


In [11]:
# SMOTE
sm = SMOTE(random_state=100,k_neighbors=3)

X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_scaled_df, y_train)

count_classes = y_train_over.value_counts()

print("The class imbalance ratio is: {:.2f}".format((count_classes[2]-count_classes[1])/(count_classes[2]+count_classes[1])))

The class imbalance ratio is: 0.00


In [12]:
# TomekLinks
tl = TomekLinks()

X_train_tl, y_train_tl = tl.fit_resample(X_train_scaled_df, y_train)

count_classes = y_train_over.value_counts()

print("The class imbalance ratio is: {:.2f}".format((count_classes[2]-count_classes[1])/(count_classes[2]+count_classes[1])))

The class imbalance ratio is: 0.00


In [13]:
# Select balancing method
#X_train_balanced = X_train_SMOTE
#y_train_balanced = y_train_SMOTE

#X_train_balanced = X_train_tl
#y_train_balanced = y_train_tl

#X_train_balanced = X_train_under
#y_train_balanced = y_train_under

X_train_balanced = X_train_over 
y_train_balanced = y_train_over

# Model Selection

In [14]:
# Create models
model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = KNeighborsClassifier()

model_pipeline = [model1, model2, model3]
model_names = ['Logistic Regression', 'Random Forest Classifier', 'KNN']

scores = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_balanced, y_train_balanced, cv=5))
    scores[model_name] = mean_score

print(scores)

{'Logistic Regression': 0.9568840579710145, 'Random Forest Classifier': 0.9565217391304348, 'KNN': 0.8967391304347826}


# Model Tuning

In [15]:

param_grid = {
    'n_estimators': np.arange(20, 200, 10), # (50, 200, 10)
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(5, 15), # np.arange(5, 15)
    'min_samples_split': np.arange(2, 11), # np.arange(2, 11)
    'min_samples_leaf': np.arange(1, 11), # np.arange(1, 11)
    'max_features': ['sqrt', 'log2', None], #  ['sqrt', 'log2', None]
    'bootstrap': [True, False] # [True, False]
}

# Create a RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Define Kappa score as the scoring metric
kappa_scorer = make_scorer(cohen_kappa_score)

# Create RandomizedSearchCV object with Kappa score as the scoring metric
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=5,
                                   scoring=kappa_scorer, random_state=42, n_jobs=-1)

random_search.fit(X_train_balanced, y_train_balanced)

best_params_from_random_search = random_search.best_params_

# Define a smaller range around the best parameters for GridSearchCV
param_grid_for_grid_search = {
    'n_estimators': np.arange(best_params_from_random_search['n_estimators'] - 20,
                               best_params_from_random_search['n_estimators'] + 20, 5),
    'criterion': [best_params_from_random_search['criterion']],
    'max_depth': np.arange(best_params_from_random_search['max_depth'] - 2,
                           best_params_from_random_search['max_depth'] + 5),
    'min_samples_split': np.arange(best_params_from_random_search['min_samples_split'],#  - 1,
                                   best_params_from_random_search['min_samples_split'] + 5),
    'min_samples_leaf': np.arange(best_params_from_random_search['min_samples_leaf'], # - 1,
                                  best_params_from_random_search['min_samples_leaf'] + 5),
    'max_features': [best_params_from_random_search['max_features']],
    'bootstrap': [best_params_from_random_search['bootstrap']]
}

# Create GridSearchCV object with Kappa score as the scoring metric
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_for_grid_search, cv=5,
                           scoring=kappa_scorer, n_jobs=-1)

# Fit the GridSearchCV on the training data
grid_search.fit(X_train_balanced, y_train_balanced)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the model on the test data
y_pred = best_model.predict(X_test_scaled_df)
kappa_score_test = cohen_kappa_score(y_test, y_pred)

print("Best Model Parameters:")
print(best_params)
print("Best Model Kappa Score on Test Data:", kappa_score_test)


Best Model Parameters:
{'bootstrap': False, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 175}
Best Model Kappa Score on Test Data: 0.5201238390092879


### Feature Importance Rank

In [16]:
# Get the feature importances
importances = best_model.feature_importances_

# Sort feature importances and corresponding feature names together
feature_importance_names = list(zip(importances, X_train_balanced.columns))
feature_importance_names.sort(reverse=True)

# Print the feature rankings
print("Feature Rankings:")
for i, (importance, feature_name) in enumerate(feature_importance_names):
    print(f"{i + 1}. Feature {feature_name}: Importance = {importance}")

Feature Rankings:
1. Feature ALBUMIN: Importance = 0.24349968623436136
2. Feature BILIRUBIN: Importance = 0.22957043294851773
3. Feature PROTIME: Importance = 0.0747714182757526
4. Feature ASCITES: Importance = 0.06401213926207194
5. Feature ALK PHOSPHATE: Importance = 0.0599576067084904
6. Feature AGE: Importance = 0.0552128641895017
7. Feature FATIGUE: Importance = 0.050281683848777944
8. Feature MALAISE: Importance = 0.04761205294715983
9. Feature HISTOLOGY: Importance = 0.046162763865283396
10. Feature SGOT: Importance = 0.03711546166618276
11. Feature SPIDERS: Importance = 0.03243609897104653
12. Feature SPLEEN PALPABLE: Importance = 0.017302966628040855
13. Feature SEX: Importance = 0.012034496628982152
14. Feature ANOREXIA: Importance = 0.006631807522857179
15. Feature STEROID: Importance = 0.006443107804144109
16. Feature LIVER BIG: Importance = 0.005089436174430484
17. Feature ANTIVIRALS: Importance = 0.004406138580653675
18. Feature LIVER FIRM: Importance = 0.0039397958165910

# Transform and Predict Submission Dataset

## Scale Numerical Columns

In [17]:
# Select StandardScaler
scaler = StandardScaler()

# Fit scaler with TRAIN data
scaler.fit(df_no_class)

# Scale X_train_num_transformed with fitted scaler. Output is a np.array.
df_no_class_scaled = scaler.transform(df_no_class)

# Add columns to np.array to create a DataFrame
df_no_class_scaled_df = pd.DataFrame(df_no_class_scaled, 
                                     columns=df_no_class.columns, 
                                     index=df_no_class.index)

## Make Predictions

In [18]:
y_pred_no_class = best_model.predict(df_no_class_scaled_df)

In [19]:
# Convert into DataFrame
y_pred_no_class_df = pd.DataFrame(y_pred_no_class, columns=['Class'])

In [20]:
# Replace 1,2 with DIE, LIVE
y_pred_no_class_df['Class'].replace({1:'DIE', 2:'LIVE'}, inplace=True)

In [21]:
# Save as CSV
y_pred_no_class_df.to_csv('../data/submission_data/group_2.csv', index=False)

### Check Submission Kappa with test dataset

In [22]:
# Best result
test_df = pd.read_csv('../data/raw_data/test.csv')

y_no_class_test = test_df['Class']
y_pred_no_class = y_pred_no_class_df['Class']

final_kappa = cohen_kappa_score(y_no_class_test, y_pred_no_class)
final_kappa

0.6310904872389791

In [23]:
# Submission 1
sub_1_df = pd.read_csv('../data/submission_data/group_2_sub_1.csv')
y_pred_no_class = sub_1_df['Class']

final_kappa = cohen_kappa_score(y_no_class_test, y_pred_no_class)
final_kappa

0.42640692640692635

In [24]:
# Submission 2
sub_1_df = pd.read_csv('../data/submission_data/group_2_sub_2.csv')
y_pred_no_class = sub_1_df['Class']

final_kappa = cohen_kappa_score(y_no_class_test, y_pred_no_class)
final_kappa

0.42599277978339356

In [25]:
# Submission 3
sub_1_df = pd.read_csv('../data/submission_data/group_2_sub_3.csv')
y_pred_no_class = sub_1_df['Class']

final_kappa = cohen_kappa_score(y_no_class_test, y_pred_no_class)
final_kappa

0.462474645030426