In [63]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
import pandas as pd
import numpy as np
import xgboost
import lightgbm as lgb
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
# from lazypredict.Supervised import LazyClassifier


# X = pd.read_csv('data/X.csv')
# y = pd.read_csv('data/y.csv')

# #train test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # used for kaggle submission
# X_kaggle = pd.read_csv('data/X_kaggle.csv')


X = pd.read_csv('data/X_better.csv', header=None, sep=";").to_numpy()
y = pd.read_csv('data/y_better.csv').to_numpy().ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_kaggle = pd.read_csv('data/X_kaggle.csv', header=None).to_numpy()

### XGBoost ###

In [66]:
param_grid = {
    'n_estimators': [50, 100, 150, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.75, 0.9, 1],
    'colsample_bytree': [0.5, 0.75, 0.8, 1.0],
    'eval_metric': ['mlogloss', 'logloss'],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

def custom_score(y_true, y_pred):
    class_1_ratio = np.mean(y_pred)
    imbalance_penalty = np.abs(class_1_ratio - 0.5)
    accuracy = np.mean(y_true == y_pred)
    # you can change 10 to any other number to adjust the penality strength
    return accuracy - imbalance_penalty*7

acc5050 = make_scorer(custom_score)


xgb_model = xgboost.XGBClassifier()

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
# dependent on scoring function not actually the acurracy but the custom score including imbalance penalty
print("Best (accuracy) score: ", grid_search.best_score_)

Fitting 3 folds for each of 12960 candidates, totalling 38880 fits
Best parameters found:  {'colsample_bytree': 0.5, 'eval_metric': 'mlogloss', 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.6}
Best (accuracy) score:  nan


In [68]:
y_pred_test_xgb = grid_search.best_estimator_.predict(X_test)
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred_test_xgb)}")

Accuracy on test set: 0.8806818181818182


In [69]:
y_scores = grid_search.best_estimator_.predict_proba(X_kaggle)[:, 1]

sorted_scores = np.sort(y_scores)
threshold_index = int(len(sorted_scores) * 0.51)
threshold = sorted_scores[threshold_index]

y_pred_thresholded = (y_scores >= threshold).astype(int)

y_pred_thresholded = pd.DataFrame(y_pred_thresholded, columns=['predictions'])
y_pred_thresholded["ID"] = y_pred_thresholded.index
print(y_pred_thresholded["predictions"].value_counts())
y_pred_thresholded.to_csv('data/thresholding_y_xgb.csv', index=False)

predictions
0    746
1    720
Name: count, dtype: int64


In [70]:
y_pred_xgb = grid_search.best_estimator_.predict(X_kaggle)
y_pred_xgb = pd.DataFrame(y_pred_xgb, columns=['predictions'])
y_pred_xgb["ID"] = y_pred_xgb.index
y_pred_xgb.to_csv('data/y_pred_xgb_try.csv', index=False)

In [71]:
f1_xgb = f1_score(y_test, y_pred_test_xgb)
f1_xgb

0.8810872027180068

In [72]:
y_pred_xgb["predictions"].value_counts()

predictions
0    767
1    699
Name: count, dtype: int64

### LazyClassifier ###

We can check which top 10 models are the best performing and furtherly train them ourselves with more delicate steps, meaning GridSearch and kFold :)

In [182]:
def custom_score(y_true, y_pred):
    class_1_ratio = np.mean(y_pred)
    imbalance_penalty = np.abs(class_1_ratio - 0.5)
    accuracy = np.mean(y_true == y_pred)
    # you can change 10 to any other number to adjust the penality strength
    score = accuracy - imbalance_penalty*7
    return score

acc5050 = make_scorer(custom_score)

In [199]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=accuracy_score)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|███████████████████████████████████████████| 29/29 [00:30<00:00,  1.04s/it]

[LightGBM] [Info] Number of positive: 5280, number of negative: 5280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4024
[LightGBM] [Info] Number of data points in the train set: 10560, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.90               0.90     0.90      0.90   
XGBClassifier                      0.90               0.90     0.90      0.90   
SVC                                0.89               0.89     0.89      0.89   
ExtraTreesClassifier               0.89               0.89     0.89      0.89   
RandomForestClassifier             0.89               0.89     0.89      0.89




### Random Forest ###

In [201]:
# ditch acc5050, do it again with 'accuracy'

# Define the parameter grid
# kf = KFold(n_splits=5)

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2']
}

def custom_score(y_true, y_pred):
    class_1_ratio = np.mean(y_pred)
    imbalance_penalty = np.abs(class_1_ratio - 0.5)
    accuracy = np.mean(y_true == y_pred)
    # you can change 10 to any other number to adjust the penality strength
    return accuracy - imbalance_penalty*7

acc5050 = make_scorer(custom_score)

# Initialize the Random Forest
rf_model = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(
    estimator=rf_model, 
    param_grid=param_grid_rf, 
    scoring=acc5050, 
    cv=5, 
    verbose=1, 
    n_jobs=-1
)

# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search_rf.best_params_)
# dependent on scoring function not actually the acurracy but the custom score including imbalance penalty
print("Best (accuracy) score: ", grid_search_rf.best_score_)

y_pred_test_rf = grid_search_rf.best_estimator_.predict(X_test)
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred_test_rf)}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found:  {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
Best (accuracy) score:  0.7857954545454547
Accuracy on test set: 0.8890151515151515


In [208]:
y_scores_rf = grid_search_rf.best_estimator_.predict_proba(X_kaggle)[:,1]

sorted_scores_rf = np.sort(y_scores_rf)
threshold_index_rf = int(len(sorted_scores_rf) * 0.5)
threshold_rf = sorted_scores_rf[threshold_index_rf]

y_pred_thresholded_rf = (y_scores_rf >= threshold_rf).astype(int)

print(f"Class distribution after thresholding: {np.bincount(y_pred_thresholded_rf)}")

y_pred_thresholded_rf = pd.DataFrame(y_pred_thresholded_rf, columns=['predictions'])
y_pred_thresholded_rf["ID"] = y_pred_thresholded_rf.index

y_pred_rf = grid_search_rf.best_estimator_.predict(X_kaggle)
y_pred_rf = pd.DataFrame(y_pred_rf, columns=['predictions'])
y_pred_rf["ID"] = y_pred_rf.index

Class distribution after thresholding: [729 736]


In [209]:
y_pred_thresholded_rf.to_csv('data/thresholding_rf_y.csv', index=False)
y_pred_thresholded_rf['predictions'].value_counts()

predictions
1    736
0    729
Name: count, dtype: int64

In [None]:
y_pred_rf.to_csv('data/y_pred_rf_new.csv', index=False)
y_pred_rf['predictions'].value_counts()

In [None]:
f1_rf = f1_score(y_test, y_pred_test_rf)
print(f"F1-score on test set: {f1_rf}")

### ExtraTreesClassifier ###

In [211]:
param_grid_etc = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2']
}

def custom_score(y_true, y_pred):
    class_1_ratio = np.mean(y_pred)
    imbalance_penalty = np.abs(class_1_ratio - 0.5)
    accuracy = np.mean(y_true == y_pred)
    # you can change 10 to any other number to adjust the penality strength
    return accuracy - imbalance_penalty*7

acc5050 = make_scorer(custom_score)

# Initialize the ExtraTreeClassifier
etc_model = ExtraTreesClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_etc = GridSearchCV(estimator=etc_model, param_grid=param_grid_etc, cv=kf, scoring=acc5050, verbose=1)

# Fit GridSearchCV
grid_search_etc.fit(X_train, y_train)

best_etc_model = grid_search_etc.best_estimator_
print(grid_search_etc.best_score_)

y_pred_etc = best_etc_model.predict(X_kaggle)

y_pred_etc = pd.DataFrame(y_pred_etc, columns=['prediction'])
y_pred_etc['ID'] = y_pred_etc.index
y_pred_etc

Fitting 5 folds for each of 108 candidates, totalling 540 fits
0.7375


Unnamed: 0,prediction,ID
0,1,0
1,0,1
2,0,2
3,1,3
4,0,4
...,...,...
1460,0,1460
1461,1,1461
1462,0,1462
1463,0,1463


In [212]:
y_pred_test_etc = grid_search_etc.best_estimator_.predict(X_test)
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred_test_etc)}")

f1_etc = f1_score(y_test, y_pred_test_etc)
f1_etc

Accuracy on test set: 0.8878787878787879


0.8899628252788104

### LGBMClassifier ###

In [214]:
param_grid_lgb = {
    'num_leaves': [25, 30, 50],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

def custom_score(y_true, y_pred):
    class_1_ratio = np.mean(y_pred)
    imbalance_penalty = np.abs(class_1_ratio - 0.5)
    accuracy = np.mean(y_true == y_pred)
    # you can change 10 to any other number to adjust the penality strength
    return accuracy - imbalance_penalty*7

acc5050 = make_scorer(custom_score)

# Initialize the LGBMClassifier
lgb_model = lgb.LGBMClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_lgb = GridSearchCV(
    estimator=lgb_model, 
    param_grid=param_grid_lgb, 
    scoring=acc5050, 
    cv=3, 
    verbose=1, 
    n_jobs=-1
)

# Fit GridSearchCV
grid_search_lgb.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search_lgb.best_params_)
# dependent on scoring function not actually the acurracy but the custom score including imbalance penalty
print("Best (accuracy) score: ", grid_search_lgb.best_score_)

y_pred_test_lgb = grid_search_lgb.best_estimator_.predict(X_test)
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred_test_lgb)}")

y_pred_lgb = grid_search_lgb.best_estimator_.predict(X_kaggle)
y_pred_lgb = pd.DataFrame(y_pred_lgb, columns=['predictions'])
y_pred_lgb["ID"] = y_pred_lgb.index
# y_pred_lgb.to_csv('data/y_pred_lgb.csv', index=False)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[LightGBM] [Info] Number of positive: 5280, number of negative: 5280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4064
[LightGBM] [Info] Number of data points in the train set: 10560, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best parameters found:  {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'n_estimators': 200, 'num_leaves': 25, 'subsample': 0.8}
Best (accuracy) score:  0.7761363636363635
Accuracy on test set: 0.8992424242424243


In [215]:
y_scores_lgb = grid_search_lgb.best_estimator_.predict_proba(X_kaggle)[:, 1]

sorted_scores_lgm = np.sort(y_scores_lgb)
threshold_index_lgm = int(len(sorted_scores_lgm) * 0.5)
threshold_lgm = sorted_scores_lgm[threshold_index_lgm]

y_pred_thresholded_lgm = (y_scores_lgb >= threshold_lgm).astype(int)

print(f"Class distribution after thresholding: {np.bincount(y_pred_thresholded_lgm)}")

y_pred_thresholded_lgm = pd.DataFrame(y_pred_thresholded_lgm, columns=['predictions'])
y_pred_thresholded_lgm["ID"] = y_pred_thresholded_lgm.index

y_pred_lgm = grid_search_lgb.best_estimator_.predict(X_kaggle)
y_pred_lgm = pd.DataFrame(y_pred_lgm, columns=['predictions'])
y_pred_lgm["ID"] = y_pred_lgm.index

Class distribution after thresholding: [732 733]


In [216]:
y_pred_lgb['predictions'].value_counts()

#y_pred_lgb.to_csv('y_pred_lgb.csv', index=False)

predictions
0    795
1    670
Name: count, dtype: int64

In [221]:
f1_lgm = f1_score(y_test, y_pred_test_lgb)
print(f"F1-score on test set: {f1_lgm}")

F1-score on test set: 0.9007462686567165


In [None]:
y_pred_thresholded_lgm.to_csv('data/thresholding_lgm_y.csv', index=False)

### AdaBoostClassifier ###

In [32]:
param_grid_ada = {
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'n_estimators': [50, 100, 200, 300],
    'algorithm': ['SAMME', 'SAMME.R'],
}

def custom_score(y_true, y_pred):
    class_1_ratio = np.mean(y_pred)
    imbalance_penalty = np.abs(class_1_ratio - 0.5)
    accuracy = np.mean(y_true == y_pred)
    # you can change 10 to any other number to adjust the penality strength
    return accuracy - imbalance_penalty*7

acc5050 = make_scorer(custom_score)

# Initialize the AdaBoostClassifier
ada_model = AdaBoostClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_ada = GridSearchCV(estimator=ada_model, param_grid=param_grid_ada, cv=3, scoring=acc5050, verbose=1)

# Fit GridSearchCV
grid_search_ada.fit(X_train, y_train)

best_ada_model = grid_search_ada.best_estimator_
print(grid_search_ada.best_score_)

y_pred_ada = best_ada_model.predict(X_kaggle)

y_pred_ada = pd.DataFrame(y_pred_ada, columns=['prediction'])
y_pred_ada['ID'] = y_pred_ada.index
y_pred_ada

Fitting 3 folds for each of 32 candidates, totalling 96 fits
0.7312499999999997


Unnamed: 0,prediction,ID
0,0,0
1,1,1
2,0,2
3,0,3
4,1,4
...,...,...
1461,0,1461
1462,1,1462
1463,0,1463
1464,0,1464


In [33]:
y_pred_test_ada = grid_search_ada.best_estimator_.predict(X_test)
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred_test_ada)}")

f1_ada = f1_score(y_test, y_pred_test_ada)
f1_ada

Accuracy on test set: 0.8852272727272728


0.8874860750092833

In [34]:
y_pred_ada = grid_search_ada.best_estimator_.predict(X_kaggle)
y_pred_ada = pd.DataFrame(y_pred_ada, columns=['predictions'])
y_pred_ada["ID"] = y_pred_ada.index
y_pred_ada.value_counts("predictions")

predictions
1    749
0    717
Name: count, dtype: int64

In [35]:
y_pred_ada.to_csv('data/ada.csv', index=False)

In [36]:
y_scores = grid_search_ada.best_estimator_.predict_proba(X_kaggle)[:, 1]

sorted_scores = np.sort(y_scores)
threshold_index = int(len(sorted_scores) * 0.5)
threshold = sorted_scores[threshold_index]

y_pred_thresholded = (y_scores >= threshold).astype(int)

print(f"Class distribution after thresholding: {np.bincount(y_pred_thresholded)}")

y_pred_thresholded = pd.DataFrame(y_pred_thresholded, columns=['predictions'])
y_pred_thresholded["ID"] = y_pred_thresholded.index
y_pred_thresholded.to_csv('data/thresholding_y_ada.csv', index=False)

Class distribution after thresholding: [733 733]


### Final files to submit (per model) ###

In [42]:
# XGBoost
y_pred_xgb.to_csv('data/y_pred_xgb.csv', index=False)

# Ramdom Forest
y_pred_rf.to_csv('data/y_pred_rf.csv', index=False)

# LightGBM
y_pred_lgb.to_csv('data/y_pred_lgb.csv', index=False)