In [1]:
import os 
import sys
sys.path.append('..')
sys.path.append('../..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
import gc
import pickle

from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Data paths
data_path = f'../../data/'
train_path = data_path + 'train.csv'
test_path = data_path + 'test.csv'
submission_path = data_path + 'sample_submission.csv'   

In [3]:
# Load and prepare data
train = pd.read_csv(train_path)
# Handle missing values in the training set
missing_values_intrain = train.isnull().mean() * 100
cols_drop = missing_values_intrain[missing_values_intrain > 95].index
ds = train.drop(columns=cols_drop)

In [4]:
# Drop the 'id' column
#ds = ds.drop(columns=['id'])

# Encode target labels
le = LabelEncoder()
ds['class'] = le.fit_transform(ds['class'])

# Split features and target
X = ds.drop(columns=['class'])
y = ds['class']


In [5]:
X.drop(columns=['id'], inplace=True)

In [6]:

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# TODO: Is this the best approabch
X_train = X_train.fillna('NaN')
X_test = X_test.fillna('NaN')

In [8]:
categorical_features_indices = np.where(X_train.dtypes == 'object')[0]
for col in X_train.columns[categorical_features_indices]:
    X_train[col] = X_train[col].astype(str)
    X_test[col] = X_test[col].astype(str)

In [9]:
X_train.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
1252551,2.58,s,w,n,t,d,c,n,2.87,5.69,,,n,,,f,f,,d,a
1799166,1.83,b,g,o,f,,c,n,5.36,2.7,,,n,,,f,f,,m,a
1936146,5.22,x,i,o,f,e,,y,7.32,7.41,,,k,,,f,f,,m,a
1464811,4.52,f,,y,f,s,d,y,5.85,12.74,,i,y,,,f,f,,d,a
767639,6.18,f,,n,f,d,c,y,6.33,10.29,b,,n,,,f,f,,l,w


In [8]:
def objective(params):
    model = CatBoostClassifier(
        depth=int(params['depth']),
        learning_rate=params['learning_rate'],
        l2_leaf_reg=params['l2_leaf_reg'],
        iterations=int(params['iterations']),
        random_state=42,
        cat_features=categorical_features_indices,
        verbose=0
    )
    
    model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10, use_best_model=True)
    
    y_pred = model.predict(X_test)
    score = matthews_corrcoef(y_test, y_pred)
    return {'loss': -score, 'status': STATUS_OK}

# Define the hyperparameter search space
space = {
    'depth': scope.int(hp.quniform('depth', 3, 12, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),  # equivalent to learning_rate in range (exp(-5), exp(0))
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 50))
}

# Run the optimization process
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(42)  # Ensure reproducibility
)

# Save the best parameters
with open('best_params_catboost.pkl', 'wb') as f:
    pickle.dump(best_params, f)


100%|██████████| 20/20 [2:57:39<00:00, 533.00s/trial, best loss: -0.9834800626901276]  


In [10]:
import pickle
with open('best_params_catboost.pkl', 'rb') as f:
    best_params = pickle.load(f)

# Train with split

In [11]:
# Train the final model with the best parameters

best_model = CatBoostClassifier(
    depth=int(best_params['depth']),
    learning_rate=best_params['learning_rate'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    iterations=int(best_params['iterations']),
    random_state=42,
    cat_features=categorical_features_indices,
    verbose=0
)

best_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10, use_best_model=True)


<catboost.core.CatBoostClassifier at 0x29908a2b0>

In [12]:
# Evaluate the final model
y_pred = best_model.predict(X_test)
final_score = matthews_corrcoef(y_test, y_pred)
print('Best MCC:', final_score)

# Save the final model
with open('../../models/best_model_catboost.pkl', 'wb') as f:
    pickle.dump(best_model, f)



Best MCC: 0.9835319722796011


# Train with the full data

In [13]:
# TODO: Is this the best approabch
X = X.fillna('NaN')
for col in X.columns[categorical_features_indices]:
    X[col] = X[col].astype(str)

In [16]:
# Train the final model with the best parameters

best_model = CatBoostClassifier(
    depth=int(best_params['depth']),
    learning_rate=best_params['learning_rate'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    iterations=int(best_params['iterations']),
    random_state=42,
    cat_features=categorical_features_indices,
    verbose=0
)

best_model.fit(X, y)


<catboost.core.CatBoostClassifier at 0x29d552d60>

# Test

In [17]:
# Process the test set
df = pd.read_csv(test_path)

# Ensure the same columns are dropped in the test set
#df = df.drop(columns=cols_drop.intersection(df.columns))
df = df.drop(columns=cols_drop)

df = df.fillna('NaN')

for col in df.columns[categorical_features_indices]:
    df[col] = df[col].astype(str)

# Drop the 'id' column if it exists in the test set
if 'id' in df.columns:
    df = df.drop(columns=['id'])


In [18]:
df['id'] = df.index.astype(str)
df

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,...,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,id
0,8.64,x,,n,t,,,w,11.13,17.12,...,,w,u,w,t,g,,d,a,0
1,6.9,o,t,o,f,,c,y,1.27,10.75,...,,n,,,f,f,,d,a,1
2,2.0,b,g,n,f,,c,n,6.18,3.14,...,,n,,,f,f,,d,s,2
3,3.47,x,t,n,f,s,c,n,4.98,8.51,...,,w,,n,t,z,,d,u,3
4,6.17,x,h,y,f,p,,y,6.73,13.7,...,,y,,y,t,,,d,u,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,0.88,x,g,w,f,a,d,w,2.67,1.35,...,,e,,,f,f,,d,u,2077959
2077960,3.12,x,s,w,f,d,c,w,2.69,7.38,...,,w,,,f,f,,g,a,2077960
2077961,5.73,x,e,e,f,a,,w,6.16,9.74,...,,y,,w,t,z,,d,a,2077961
2077962,5.03,b,g,n,f,a,d,g,6.0,3.46,...,s,g,,,f,f,,d,a,2077962


In [19]:
# Predict on the test set
y_final = best_model.predict(df)
y_final2 = le.inverse_transform(y_final)



In [20]:
# Create a submission file
sub_sample = pd.read_csv(submission_path)
df_submission = pd.DataFrame({'id': sub_sample['id'], 'class': y_final2})

df_submission.to_csv('../../data/prediction_catboost.csv', index=False)
gc.collect()

# Display the submission
df_submission


Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e
