In [139]:
import os
import pickle
from time import time

import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier

DATA_DIR = os.path.join(CURR_DIR, 'data')
PROCESSED_FILE_NAME = os.path.join(DATA_DIR, 'processed_data.pickle')

# Load the data from the pickle file
with open(PROCESSED_FILE_NAME, 'rb') as handle:
    data_dict = pickle.load(handle)

X_train = data_dict['X_train']
X_test = data_dict['X_test']
y_train = data_dict['y_train']
y_test = data_dict['y_test']

RANDOM_CV_SPLIT = 5
RANDOM_STATE = 42

cv_strategy = StratifiedKFold(n_splits=RANDOM_CV_SPLIT, shuffle=True, random_state=RANDOM_STATE)

In [97]:
pipeline = Pipeline(steps=[('classifier', AdaBoostClassifier())])

# RandomForestClassifier
RandomForestClassifier_param = {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': [50, 100],         
    'classifier__max_depth': [5, 10],                  
    'classifier__class_weight': ['balanced'],          
    'classifier__min_samples_split': [2, 5],           
    'classifier__min_samples_leaf': [1, 2],           
    'classifier__random_state': [RANDOM_STATE]
}

# AdaBoostClassifier
AdaBoost_param = {
    'classifier': [AdaBoostClassifier(algorithm='SAMME')],
    'classifier__n_estimators': [50, 100],             
    'classifier__learning_rate': [0.5, 1],             
    'classifier__random_state': [RANDOM_STATE]
}

# XGBClassifier
XGBClassifier_param = {
    'classifier': [xgb.XGBClassifier()],
    'classifier__eta': [0.05, 0.1],                 
    'classifier__gamma': [0.5, 1.5],               
    'classifier__max_depth': [4, 6],                   
    'classifier__subsample': [0.8, 1.0],               
    'classifier__min_child_weight': [1],              
    'classifier__eval_metric': ['mlogloss'],
    'classifier__seed': [RANDOM_STATE]
}

params = [AdaBoost_param, XGBClassifier_param, RandomForestClassifier_param]

gs = GridSearchCV(pipeline, param_grid=params, refit='f1', return_train_score=True, cv=cv_strategy, n_jobs=-1, scoring='f1_micro')

In [140]:
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

columns_to_encode = ['f_0','f_1','f_2','f_4','f_6','f_8','f_9','f_14', 'f_41','f_42','f_43','f_44','f_45','f_46','f_47','f_48','f_49', 'f_59', 'f_68_outlier', 'f_70_outlier', 'f_74_outlier', 'f_83_outlier', 'f_87_outlier', 'f_2_no_signature','f_9_no_signature']

# Initialize LabelEncoder
label_encoders_train = {col: LabelEncoder() for col in columns_to_encode}
label_encoders_test = {col: LabelEncoder() for col in columns_to_encode}

# Function to apply label encoding to the specified columns
def encode_columns(df, columns, encoders):
    for col in columns:
        if col in df.columns:  # Check if the column exists in the DataFrame
            df[col] = encoders[col].fit_transform(df[col].astype(str))  # Convert values to strings and encode

# Apply encoding to X_train and X_test
encode_columns(X_train_encoded, columns_to_encode, label_encoders_train)
encode_columns(X_test_encoded, columns_to_encode, label_encoders_test)

X_train_encoded = X_train_encoded.drop(['Unnamed: 0', 'broccoli_encoded', 'broccoli'], axis=1)
X_test_encoded = X_test_encoded.drop(['Unnamed: 0', 'broccoli_encoded', 'broccoli'], axis=1)

In [99]:
tic = time()
print('df_train shape:',X_train_encoded.shape)
gs.fit(X_train_encoded, y_train)
toc = time()
print(f'GridSearchCV time(mintues): {round((toc-tic)/60,2)}')

best_params = gs.best_params_

df_train shape: (47984, 110)


  _data = np.array(data, dtype=dtype, copy=copy,


GridSearchCV time(mintues): 18.83


In [100]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__learning_rate,param_classifier__n_estimators,param_classifier__random_state,param_classifier__eta,param_classifier__eval_metric,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,13.692948,4.969483,0.326343,0.278031,AdaBoostClassifier(algorithm='SAMME'),0.5,50.0,42.0,,,...,0.793473,0.004008,36,0.793446,0.794201,0.791596,0.79441,0.80038,0.794807,0.002958
1,18.25786,2.356325,0.354144,0.156017,AdaBoostClassifier(algorithm='SAMME'),0.5,100.0,42.0,,,...,0.803518,0.002977,34,0.802277,0.808034,0.806002,0.800219,0.807205,0.804747,0.003001
2,9.562062,1.443313,0.185705,0.082922,AdaBoostClassifier(algorithm='SAMME'),1.0,50.0,42.0,,,...,0.801913,0.005069,35,0.810352,0.803579,0.798057,0.8041,0.803663,0.80395,0.003898
3,15.49448,1.373331,0.23511,0.015296,AdaBoostClassifier(algorithm='SAMME'),1.0,100.0,42.0,,,...,0.809103,0.004597,25,0.81822,0.81327,0.801495,0.809884,0.813562,0.811286,0.005568
4,3.119313,1.161037,0.327451,0.377618,"XGBClassifier(base_score=None, booster=None, c...",,,,0.05,mlogloss,...,0.859536,0.003579,22,0.861646,0.864329,0.868992,0.862714,0.866703,0.864877,0.002672
5,5.48123,3.005825,0.16807,0.043722,"XGBClassifier(base_score=None, booster=None, c...",,,,0.05,mlogloss,...,0.859474,0.003661,23,0.863886,0.862896,0.867377,0.862271,0.86501,0.864288,0.001802
6,5.18829,1.711053,0.190363,0.085198,"XGBClassifier(base_score=None, booster=None, c...",,,,0.05,mlogloss,...,0.885503,0.002298,6,0.895251,0.896762,0.898247,0.894886,0.896817,0.896393,0.001211
7,4.233853,1.080822,0.1153,0.01242,"XGBClassifier(base_score=None, booster=None, c...",,,,0.05,mlogloss,...,0.883128,0.003676,8,0.894287,0.894496,0.894886,0.895199,0.896765,0.895126,0.000877
8,3.155534,1.029478,0.136063,0.052634,"XGBClassifier(base_score=None, booster=None, c...",,,,0.05,mlogloss,...,0.859828,0.003426,21,0.863938,0.864277,0.867794,0.863261,0.866703,0.865195,0.001743
9,2.576718,0.256176,0.103719,0.014925,"XGBClassifier(base_score=None, booster=None, c...",,,,0.05,mlogloss,...,0.858244,0.00421,24,0.863443,0.860995,0.8667,0.862245,0.864072,0.863491,0.001919


In [103]:
print(f"Best score: {gs.best_score_}")
best_model = gs.best_estimator_
best_model

Best score: 0.898612052371688


In [129]:
importances = best_model['classifier'].feature_importances_

importance_df = pd.DataFrame({
    'Feature': X_train_encoded.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

target_features = ['f_59','f_68_outlier', 'f_70_outlier', 'f_74_outlier', 'f_83_outlier', 'f_87_outlier', 'f_2_no_signature','f_9_no_signature']
found_features = importance_df[importance_df['Feature'].isin(target_features)]

positions = {feature: importance_df.index.get_loc(importance_df[importance_df['Feature'] == feature].index[0])
             for feature in target_features if feature in importance_df['Feature'].values}
positions_df = pd.DataFrame(list(positions.items()), columns=['Feature', 'Position'])
combined_df = pd.merge(importance_df, positions_df, on='Feature')

print(' -------------------------- All features importance --------------------------')
print(importance_df.iloc[:5])

#found_features
print(' -------------------------- Found correlated features importance --------------------------')
print(combined_df)

 -------------------------- All features importance --------------------------
    Feature  Importance
58     f_59    0.354940
36     f_37    0.033018
41     f_42    0.032921
2       f_2    0.029512
102   f_103    0.022611
 -------------------------- Found correlated features importance --------------------------
            Feature  Importance  Position
0              f_59    0.354940         0
1  f_2_no_signature    0.006145        38
2      f_70_outlier    0.000000       104
3      f_68_outlier    0.000000       105
4      f_74_outlier    0.000000       106
5      f_83_outlier    0.000000       107
6      f_87_outlier    0.000000       108
7  f_9_no_signature    0.000000       109


### Surprsingly, only two found features are relatively important - f_59, f_2_no_signature

In [145]:
test_predictions = best_model.predict(X_test_encoded)
test_f1_macro =  f1_score(y_test, test_predictions, average='macro')
test_accuracy = accuracy_score(y_test, test_predictions)

train_predictions = best_model.predict(X_train_encoded)
train_f1_macro =  f1_score(y_train, train_predictions, average='macro')
train_accuracy = accuracy_score(y_train, train_predictions)

print(f'F1 Train Score: {train_f1_macro}. Accuracy: {train_accuracy}')
print(f'F1 Test Score: {test_f1_macro}. Accuracy: {test_accuracy}')

F1 Train Score: 0.9133910789290367. Accuracy: 0.9146382127375792
F1 Test Score: 0.8891068276274676. Accuracy: 0.8905468489496499
