In [7]:
%load_ext autoreload
%autoreload 2

import os 
import sys
sys.path.append('..')
sys.path.append('../..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import matthews_corrcoef

import category_encoders as ce

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from category_encoders import TargetEncoder, CatBoostEncoder


import gc


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
data_path = f'../../data/'
train_path = data_path + 'train.csv'
test_path = data_path + 'test.csv'
submission_path = data_path + 'sample_submission.csv'   

In [9]:
train = pd.read_csv(train_path)

In [5]:
# Handle missing values in the training set
missing_values_intrain = train.isnull().mean() * 100
cols_drop = missing_values_intrain[missing_values_intrain > 95].index
ds = train.drop(columns=cols_drop)

###
# Select categorical columns for encoding
cols_train = ds.select_dtypes(include=['object']).columns
cols_train = cols_train[cols_train != 'class']

# Encode categorical features
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ds[cols_train] = ordinal_encoder.fit_transform(ds[cols_train].astype(str))
ds[cols_train] = ds[cols_train].astype(int)
###

# Drop the 'id' column
ds = ds.drop(columns=['id'])

# Encode target labels
le = LabelEncoder()
ds['class'] = le.fit_transform(ds['class'])

# Split features and target
X = ds.drop(columns=['class'])
y = ds['class']


In [6]:
X

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.80,53,73,73,8,44,28,60,4.51,15.39,27,48,56,15,15,5,18,21,25,0
1,4.51,72,56,65,8,44,28,46,4.79,6.48,27,59,48,15,15,19,40,21,25,3
2,6.94,53,73,49,8,76,28,60,6.85,9.93,27,52,46,15,15,5,18,21,36,3
3,3.88,53,82,57,8,71,41,37,4.16,6.53,27,48,56,15,15,5,18,21,25,2
4,5.85,72,65,75,8,47,41,60,3.37,8.36,27,48,56,15,15,5,18,21,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,9.29,53,68,63,21,65,41,60,12.14,18.81,15,48,56,20,22,19,19,21,25,2
3116941,10.88,68,68,75,21,47,28,49,6.65,26.97,27,48,56,15,15,5,18,21,25,2
3116942,7.82,72,53,55,8,44,41,60,9.51,11.06,27,48,58,15,22,19,40,21,25,0
3116943,9.45,65,59,63,21,52,41,49,9.13,17.77,27,59,56,15,15,19,28,21,25,2


# XGB with hyperopt

In [12]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting py4j (from hyperopt)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the MCC metric for evaluation within the hyperopt objective
def mcc_metric(y_pred, dmatrix):
    y_true = dmatrix.get_label()
    y_pred = (y_pred > 0.5).astype(int)
    mcc = matthews_corrcoef(y_true, y_pred)
    return mcc

# Define the objective function for Hyperopt
def objective(params):
    model = XGBClassifier(
        alpha=params['alpha'],                    
        subsample=params['subsample'],     
        colsample_bytree=params['colsample_bytree'],  
        objective='binary:logistic',
        max_depth=int(params['max_depth']),             
        min_child_weight=params['min_child_weight'],         
        gamma=params['gamma'],                
        random_state=42,                 
        n_estimators=int(params['n_estimators']),
        learning_rate=params['learning_rate']
    )
    
    model.fit(
        X_train, 
        y_train, 
        eval_set=[(X_test, y_test)],
        eval_metric=lambda y_pred, dmatrix: [('mcc', mcc_metric(y_pred, dmatrix))],
        verbose=False,
        early_stopping_rounds=10
    )

    y_pred = model.predict(X_test)
    score = matthews_corrcoef(y_test, y_pred)
    return {'loss': -score, 'status': STATUS_OK}

# Define the hyperparameter search space
space = {
    'alpha': hp.loguniform('alpha', -5, 0),  # equivalent to alpha in range (exp(-5), exp(0))
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    'min_child_weight': hp.uniform('min_child_weight', 1, 10),
    'gamma': hp.loguniform('gamma', -8, 0),  # equivalent to gamma in range (exp(-8), exp(0))
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 300, 10)),
    'learning_rate': hp.loguniform('learning_rate', -5, 0)  # equivalent to learning_rate in range (exp(-5), exp(0))
}

# Run the optimization process
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)  # Ensure reproducibility
)

import pickle
with open('best_params_xgb.pkl', 'wb') as f:
    pickle.dump(best_params, f)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]





  2%|▏         | 1/50 [00:17<14:06, 17.27s/trial, best loss: -0.8701685725683594]





  4%|▍         | 2/50 [00:25<09:33, 11.95s/trial, best loss: -0.8701685725683594]





  6%|▌         | 3/50 [00:34<08:26, 10.78s/trial, best loss: -0.8701685725683594]





  8%|▊         | 4/50 [00:49<09:21, 12.21s/trial, best loss: -0.8701685725683594]





 10%|█         | 5/50 [01:00<08:50, 11.80s/trial, best loss: -0.8701685725683594]





 12%|█▏        | 6/50 [01:17<09:58, 13.59s/trial, best loss: -0.8701685725683594]





 14%|█▍        | 7/50 [01:32<10:10, 14.21s/trial, best loss: -0.8894619260119795]





 16%|█▌        | 8/50 [01:44<09:26, 13.48s/trial, best loss: -0.8894619260119795]





 18%|█▊        | 9/50 [01:57<08:58, 13.13s/trial, best loss: -0.8894619260119795]





 20%|██        | 10/50 [02:11<08:54, 13.37s/trial, best loss: -0.8894619260119795]





 22%|██▏       | 11/50 [02:33<10:25, 16.05s/trial, best loss: -0.9382650187058447]





 24%|██▍       | 12/50 [02:43<09:05, 14.36s/trial, best loss: -0.9382650187058447]





 26%|██▌       | 13/50 [02:53<07:55, 12.84s/trial, best loss: -0.9382650187058447]





 28%|██▊       | 14/50 [03:13<09:07, 15.21s/trial, best loss: -0.9382650187058447]





 30%|███       | 15/50 [03:36<10:07, 17.35s/trial, best loss: -0.9469710163101517]





 32%|███▏      | 16/50 [03:46<08:44, 15.42s/trial, best loss: -0.9469710163101517]





 34%|███▍      | 17/50 [03:56<07:34, 13.77s/trial, best loss: -0.9469710163101517]





 36%|███▌      | 18/50 [04:16<08:14, 15.45s/trial, best loss: -0.9469710163101517]





 38%|███▊      | 19/50 [04:29<07:35, 14.68s/trial, best loss: -0.9469710163101517]





 40%|████      | 20/50 [04:39<06:42, 13.40s/trial, best loss: -0.9469710163101517]





 42%|████▏     | 21/50 [04:59<07:23, 15.29s/trial, best loss: -0.9469710163101517]





 44%|████▍     | 22/50 [05:15<07:15, 15.57s/trial, best loss: -0.9469710163101517]





 46%|████▌     | 23/50 [05:33<07:20, 16.30s/trial, best loss: -0.9469710163101517]





 48%|████▊     | 24/50 [05:51<07:16, 16.79s/trial, best loss: -0.9469710163101517]





 50%|█████     | 25/50 [06:10<07:17, 17.50s/trial, best loss: -0.9469710163101517]





 52%|█████▏    | 26/50 [06:36<07:59, 19.99s/trial, best loss: -0.9472974295979041]





 54%|█████▍    | 27/50 [06:56<07:43, 20.16s/trial, best loss: -0.9472974295979041]





 56%|█████▌    | 28/50 [07:12<06:51, 18.73s/trial, best loss: -0.9472974295979041]





 58%|█████▊    | 29/50 [07:28<06:15, 17.87s/trial, best loss: -0.9472974295979041]





 60%|██████    | 30/50 [07:48<06:09, 18.46s/trial, best loss: -0.9472974295979041]





 62%|██████▏   | 31/50 [07:54<04:40, 14.76s/trial, best loss: -0.9472974295979041]





 64%|██████▍   | 32/50 [08:17<05:11, 17.32s/trial, best loss: -0.9472974295979041]





 66%|██████▌   | 33/50 [08:42<05:34, 19.68s/trial, best loss: -0.9494162658031088]





 68%|██████▊   | 34/50 [08:58<04:56, 18.51s/trial, best loss: -0.9494162658031088]





 70%|███████   | 35/50 [09:05<03:47, 15.18s/trial, best loss: -0.9494162658031088]





 72%|███████▏  | 36/50 [09:27<03:59, 17.13s/trial, best loss: -0.9494162658031088]





 74%|███████▍  | 37/50 [09:46<03:50, 17.70s/trial, best loss: -0.9494162658031088]





 76%|███████▌  | 38/50 [10:03<03:29, 17.46s/trial, best loss: -0.9494162658031088]





 78%|███████▊  | 39/50 [10:25<03:26, 18.81s/trial, best loss: -0.9494162658031088]





 80%|████████  | 40/50 [10:44<03:07, 18.78s/trial, best loss: -0.9494162658031088]





 82%|████████▏ | 41/50 [11:00<02:41, 17.92s/trial, best loss: -0.9494162658031088]





 84%|████████▍ | 42/50 [11:19<02:26, 18.34s/trial, best loss: -0.9494162658031088]





 86%|████████▌ | 43/50 [11:42<02:17, 19.67s/trial, best loss: -0.9494162658031088]





 88%|████████▊ | 44/50 [11:51<01:38, 16.46s/trial, best loss: -0.9494162658031088]





 90%|█████████ | 45/50 [12:10<01:26, 17.34s/trial, best loss: -0.9494162658031088]





 92%|█████████▏| 46/50 [12:22<01:03, 15.80s/trial, best loss: -0.9494162658031088]





 94%|█████████▍| 47/50 [12:39<00:48, 16.19s/trial, best loss: -0.9494162658031088]





 96%|█████████▌| 48/50 [12:55<00:32, 16.13s/trial, best loss: -0.9494162658031088]





 98%|█████████▊| 49/50 [13:15<00:17, 17.35s/trial, best loss: -0.9494162658031088]





100%|██████████| 50/50 [13:37<00:00, 16.34s/trial, best loss: -0.9494162658031088]


In [15]:
# Train the final model with the best parameters
best_model = XGBClassifier(
    alpha=best_params['alpha'],                    
    subsample=best_params['subsample'],     
    colsample_bytree=best_params['colsample_bytree'],  
    objective='binary:logistic',
    max_depth=int(best_params['max_depth']),             
    min_child_weight=best_params['min_child_weight'],         
    gamma=best_params['gamma'],                
    random_state=42,                 
    n_estimators=int(best_params['n_estimators']),
    learning_rate=best_params['learning_rate']
)

best_model.fit(
    X_train, 
    y_train, 
    eval_set=[(X_test, y_test)],
    eval_metric=lambda y_pred, dmatrix: [('mcc', mcc_metric(y_pred, dmatrix))],
    verbose=True
)

# Evaluate the final model
y_pred = best_model.predict(X_test)
final_score = matthews_corrcoef(y_test, y_pred)
print('Best MCC:', final_score)





[0]	validation_0-logloss:0.59087	validation_0-mcc:0.94942
[1]	validation_0-logloss:0.50738	validation_0-mcc:0.96548
[2]	validation_0-logloss:0.44175	validation_0-mcc:0.97115
[3]	validation_0-logloss:0.38585	validation_0-mcc:0.97252
[4]	validation_0-logloss:0.33962	validation_0-mcc:0.97198
[5]	validation_0-logloss:0.30028	validation_0-mcc:0.97242
[6]	validation_0-logloss:0.26669	validation_0-mcc:0.97254
[7]	validation_0-logloss:0.23885	validation_0-mcc:0.97338
[8]	validation_0-logloss:0.21448	validation_0-mcc:0.97357
[9]	validation_0-logloss:0.19264	validation_0-mcc:0.97453
[10]	validation_0-logloss:0.17402	validation_0-mcc:0.97465
[11]	validation_0-logloss:0.15831	validation_0-mcc:0.97511
[12]	validation_0-logloss:0.14374	validation_0-mcc:0.97565
[13]	validation_0-logloss:0.13108	validation_0-mcc:0.97654
[14]	validation_0-logloss:0.12059	validation_0-mcc:0.97680
[15]	validation_0-logloss:0.11100	validation_0-mcc:0.97723
[16]	validation_0-logloss:0.10256	validation_0-mcc:0.97753
[17]	va

In [16]:
import pickle
with open('../../models/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# XGB

In [19]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Define the MCC metric for evaluation
def mcc_metric(y_pred, dmatrix):
    y_true = dmatrix.get_label()
    y_pred = (y_pred > 0.5).astype(int)
    mcc = matthews_corrcoef(y_true, y_pred)
    return 'mcc', mcc


In [21]:
# Initialize the XGBClassifier
model = XGBClassifier(                    
    alpha=0.1,                   
    subsample=0.8,     
    colsample_bytree=0.55,  
    objective='binary:logistic',
    max_depth=14,             
    min_child_weight=7,         
    gamma=1e-6,                
    random_state=42,                 
    n_estimators=100             
)

# Train the model
XGB = model.fit(
    X_train, 
    y_train, 
    eval_set=[(X_test, y_test)],
    eval_metric=mcc_metric
)

# Evaluate the model
y_pred = XGB.predict(X_test)
score = matthews_corrcoef(y_test, y_pred)
print('MCC:', score)

# Process the test set
df = pd.read_csv(test_path)

# Ensure the same columns are dropped in the test set
df = df.drop(columns=cols_drop.intersection(df.columns))

# Apply the same ordinal encoding
df[cols_train] = ordinal_encoder.transform(df[cols_train].astype(str))

# Convert all columns to appropriate numeric types
df[cols_train] = df[cols_train].astype(int)

# Drop the 'id' column if it exists in the test set
if 'id' in df.columns:
    df = df.drop(columns=['id'])





[0]	validation_0-logloss:0.46945	validation_0-mcc:0.91872
[1]	validation_0-logloss:0.36374	validation_0-mcc:0.95269
[2]	validation_0-logloss:0.27142	validation_0-mcc:0.96986
[3]	validation_0-logloss:0.20504	validation_0-mcc:0.97476
[4]	validation_0-logloss:0.16220	validation_0-mcc:0.97578
[5]	validation_0-logloss:0.13477	validation_0-mcc:0.97682
[6]	validation_0-logloss:0.11449	validation_0-mcc:0.97722
[7]	validation_0-logloss:0.09570	validation_0-mcc:0.97928
[8]	validation_0-logloss:0.08436	validation_0-mcc:0.97978
[9]	validation_0-logloss:0.07728	validation_0-mcc:0.97999
[10]	validation_0-logloss:0.07022	validation_0-mcc:0.98030
[11]	validation_0-logloss:0.06425	validation_0-mcc:0.98040
[12]	validation_0-logloss:0.05759	validation_0-mcc:0.98075
[13]	validation_0-logloss:0.05406	validation_0-mcc:0.98106
[14]	validation_0-logloss:0.04912	validation_0-mcc:0.98184
[15]	validation_0-logloss:0.04575	validation_0-mcc:0.98237
[16]	validation_0-logloss:0.04355	validation_0-mcc:0.98269
[17]	va

In [22]:
# Evaluate the model
y_pred = XGB.predict(X_test)
score = matthews_corrcoef(y_test, y_pred)
print('MCC:', score)

MCC: 0.9836332001545705


# Test set

In [17]:
# Process the test set
df = pd.read_csv(test_path)

# Ensure the same columns are dropped in the test set
df = df.drop(columns=cols_drop.intersection(df.columns))

# Apply the same ordinal encoding
df[cols_train] = ordinal_encoder.transform(df[cols_train].astype(str))

# Convert all columns to appropriate numeric types
df[cols_train] = df[cols_train].astype(int)

# Drop the 'id' column if it exists in the test set
if 'id' in df.columns:
    df = df.drop(columns=['id'])



In [18]:
y_final = best_model.predict(df)

In [19]:
y_final2 = le.inverse_transform(y_final)

In [20]:
sub_sample = pd.read_csv(submission_path)
df_submission = pd.DataFrame({'id': sub_sample['id'], 'class': y_final2})

df_submission.to_csv('../../data/prediction.csv', index=False)
gc.collect()

685

In [21]:
df_submission

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e


In [22]:
sub_sample

Unnamed: 0,id,class
0,3116945,e
1,3116946,e
2,3116947,e
3,3116948,e
4,3116949,e
...,...,...
2077959,5194904,e
2077960,5194905,e
2077961,5194906,e
2077962,5194907,e
