In [None]:
import numpy as np
import pandas as pd

In [None]:

# NOTE(review): unpickling executes arbitrary code — only load this file if it
# comes from a trusted source.
with open('data/ass2.pickle', 'rb') as pickle_file:
    data = pd.read_pickle(pickle_file)

# Each split is stored as a (features, labels) pair.
(X_train, y_train), (X_dev, y_dev) = data['train'], data['dev']

In [None]:

# Report size and per-class counts for each split, train first and then dev.
for split_title, split_labels in (('Train data:', y_train), ('Dev data:', y_dev)):
    print(split_title)
    print(f'Number of Rows: {len(split_labels)}')
    print(f'The Classes are: {np.unique(split_labels)}')
    for class_id in (0, 1):
        print(f'Class {class_id}: {np.count_nonzero(split_labels == class_id)}')

We can see that the dataset is imbalanced, so we will balance it.
We can also see that this is a binary classification problem, so we will use algorithms that are well suited to this kind of problem.

## First we run LazyPredict to choose the few best algorithms to continue with

In [None]:
from lazypredict.Supervised import LazyClassifier

# Fit LazyClassifier on the train split and score it on the dev split;
# `models` holds the resulting comparison table that is printed below.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_dev, y_train, y_dev)

print(models)

## Balance the data in two ways, Over-Sampling and Under-Sampling

Note: the following cell sometimes fails on its first run; if it does, run it a second time.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Seeded samplers so the resampled training sets are reproducible.
rus = RandomUnderSampler(random_state=42, replacement=True)
ros = RandomOverSampler(random_state=42)

# Only the training split is resampled; the dev split is left untouched.
x_rus, y_rus = rus.fit_resample(X_train, y_train)
x_ros, y_ros = ros.fit_resample(X_train, y_train)

for sample_tag, resampled_labels in (('Under sampled', y_rus), ('Over sampled', y_ros)):
    print(f'{sample_tag}: {len(resampled_labels)}')

Our data has 14 features; we might need to drop some of them based on their importance.

In [None]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2

def pca(k, X, X_eval=None):
    """Project the data onto its first ``k`` principal components.

    The PCA is fitted on ``X`` only and then applied to both ``X`` and the
    evaluation set, so the evaluation data never influences the projection.

    Args:
        k: Number of principal components to keep.
        X: Feature matrix used to fit the PCA.
        X_eval: Optional feature matrix to project with the fitted PCA.
            Defaults to the notebook-level ``X_dev`` when omitted, which
            preserves the original two-argument behaviour.

    Returns:
        Tuple ``(X_projected, X_eval_projected)``.
    """
    if X_eval is None:
        # Backward-compatible fallback to the global dev split.
        X_eval = X_dev
    reducer = PCA(n_components=k)  # renamed so it no longer shadows pca()
    reducer.fit(X)
    return (reducer.transform(X), reducer.transform(X_eval))

def select_k_best(k, X, y, X_eval=None):
    """Keep the ``k`` features with the highest chi-squared score.

    The selector is fitted on ``(X, y)`` only and then applied to both ``X``
    and the evaluation set, so both end up with the same columns.

    Args:
        k: Number of features to keep.
        X: Training feature matrix used to score the features.
        y: Training labels.
        X_eval: Optional feature matrix to reduce with the fitted selector.
            Defaults to the notebook-level ``X_dev`` when omitted, which
            preserves the original three-argument behaviour.

    Returns:
        Tuple ``(X_selected, X_eval_selected)``.
    """
    if X_eval is None:
        # Backward-compatible fallback to the global dev split.
        X_eval = X_dev
    # NOTE(review): chi2 is documented for non-negative features — confirm
    # that the data satisfies this.
    selector = SelectKBest(score_func=chi2, k=k)
    selector.fit(X, y)
    return (selector.transform(X), selector.transform(X_eval))

We plot the pairwise feature distributions, coloured by class, to see whether any features visibly separate the two classes.

In [None]:
import seaborn as sns

import warnings
# Silences all warnings for the rest of the session (kept from the original cell).
warnings.filterwarnings('ignore')

# Build the plotting frame on a copy: pd.DataFrame(X_train) may share data with
# X_train when that is already a DataFrame, and adding the 'target' column
# below must not leak into the training features.
X = pd.DataFrame(X_train).copy()
y = np.array(y_train)
# Assign a plain 1-D array so the labels are matched by position; assigning a
# DataFrame would align on the index and yield NaN for any non-default index.
X['target'] = y.reshape(-1)

# Only the last expression of a cell is rendered automatically, so the
# intermediate checks are displayed explicitly.
display(X.head(5))

# check the null values
display(X.isnull().sum())

# pairplot for distribution
sns.pairplot(X, hue="target", palette='Set1')

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

def cross_validation(clf, X, y, k):
    """Return the mean k-fold cross-validation score of ``clf`` on scaled data.

    The scaler is placed in a pipeline with the classifier so that it is
    re-fitted on the training part of every fold. Scaling the full matrix
    before splitting (as was done previously) lets each held-out fold
    influence the scaling statistics.

    Args:
        clf: Unfitted scikit-learn compatible classifier.
        X: Feature matrix.
        y: Labels.
        k: Number of folds (passed as ``cv``).

    Returns:
        Mean score over the folds, rounded to 3 decimals.
    """
    # Local import keeps this function usable without touching the import cell.
    from sklearn.pipeline import make_pipeline

    scaled_clf = make_pipeline(StandardScaler(), clf)
    return round(cross_val_score(scaled_clf, X, y, cv=k).mean(), 3)

def fit_predict(classifier, X_train, y_train, X_eval=None, y_eval=None):
    """Fit ``classifier`` on scaled training data and report accuracy.

    The scaler is fitted on the training data only and reused for the
    evaluation data.

    Args:
        classifier: scikit-learn compatible classifier; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_eval: Optional evaluation features. Defaults to the notebook-level
            ``X_dev`` when omitted (original behaviour).
        y_eval: Optional evaluation labels. Defaults to the notebook-level
            ``y_dev`` when omitted (original behaviour).

    Returns:
        A summary string with the train accuracy, the evaluation accuracy
        (labelled "test score") and their difference.
    """
    if X_eval is None:
        X_eval = X_dev
    if y_eval is None:
        y_eval = y_dev

    sc = StandardScaler()
    scaled_X_train = sc.fit_transform(X_train)
    classifier.fit(scaled_X_train, y_train)

    # accuracy_score expects (y_true, y_pred); the order does not change the
    # accuracy value but is kept conventional.
    train_score = accuracy_score(y_train, classifier.predict(scaled_X_train))
    dev_score = accuracy_score(y_eval, classifier.predict(sc.transform(X_eval)))
    over_fitting = train_score - dev_score

    return f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}'

def fit_predict_poly(classifier, X_train, y_train, X_dev, y_dev):
    """Fit ``classifier`` on scaled degree-2 polynomial features and report accuracy.

    Both the polynomial expansion and the scaler are fitted on the training
    data only and then applied to the dev data, instead of fitting a second
    expansion on the dev set.

    Args:
        classifier: scikit-learn compatible classifier; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_dev: Dev features, with the same columns as ``X_train``.
        y_dev: Dev labels.

    Returns:
        A summary string with the train accuracy, the dev accuracy (labelled
        "test score") and their difference.
    """
    poly = PolynomialFeatures(2)
    transformed_train = poly.fit_transform(X_train)
    transformed_dev = poly.transform(X_dev)

    sc = StandardScaler()
    scaled_X_train = sc.fit_transform(transformed_train)

    classifier.fit(scaled_X_train, y_train)

    # accuracy_score expects (y_true, y_pred).
    train_score = accuracy_score(y_train, classifier.predict(scaled_X_train))
    dev_score = accuracy_score(y_dev, classifier.predict(sc.transform(transformed_dev)))
    over_fitting = train_score - dev_score

    return f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}'

## Initial test of the best algorithms using cross-validation

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def check_models(X, y, k):
    """Evaluate every model in the global ``clfs`` for each feature-subset size.

    For every subset size from 1 up to the number of columns in ``X``, the
    best features are chosen with ``select_k_best`` and each classifier is
    scored twice: by k-fold cross-validation, and by a train/dev fit on
    degree-2 polynomial features.

    Args:
        X: Training feature matrix (2-D).
        y: Training labels.
        k: Number of cross-validation folds.

    Returns:
        None. Results are printed.
    """
    # Derive the upper bound from the data instead of hard-coding 14 features.
    n_features = np.shape(X)[1]
    for model_name, model in clfs.items():
        print(model_name)
        for i in range(1, n_features + 1):
            X_sel, X_dev_sel = select_k_best(i, X, y)
            cv_score = cross_validation(model, np.array(X_sel), np.array(y), k)
            # y_dev is the notebook-level dev label vector.
            poly_report = fit_predict_poly(model, X_sel, y, X_dev_sel, y_dev)
            print(f'Iteration {i}: cv {cv_score} Polynomial {poly_report}')

# Candidate models for the first comparison; every model is seeded so that
# repeated runs of the notebook give the same numbers.
clfs = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Histogram Gradient Boosting': HistGradientBoostingClassifier(random_state=42),
    # The wrapped estimator is passed positionally: the keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases, and
    # the positional form works with both. The seed was previously missing.
    'Bagging Classifier': BaggingClassifier(SVC(), random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Light GBM': LGBMClassifier(random_state=42)}

print('Regular Data:')
check_models(X_train, y_train, 5)
print('Undersampled Data:')
check_models(x_rus, y_rus, 5)
print('Oversampled Data:')
check_models(x_ros, y_ros, 5)


## Find best hyperparameters for each model using RandomizedSearch

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Forest sizes to try: ten evenly spaced values between 200 and 2000
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# Feature-subset rules considered at each split
max_features = ['log2', 'sqrt']
# Tree depth limits: 10, 20, ..., 110, plus None
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Smallest number of samples that still allows a node to be split
min_samples_split = [2, 5, 10]
# Smallest number of samples allowed in a leaf
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Search space handed to the randomized search
rf_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# 5 sampled configurations, 3-fold CV, ranked by ROC AUC, on the over-sampled set
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_grid,
    scoring='roc_auc',
    n_iter=5,
    cv=3,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(x_ros, y_ros)
print(rf_random.best_params_)
print(rf_random.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the support vector classifier
svc_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'linear'],
)

# 5 sampled configurations, 3-fold CV, ranked by ROC AUC, on the over-sampled set
svc_random = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=svc_grid,
    scoring='roc_auc',
    n_iter=5,
    cv=3,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)
svc_random.fit(x_ros, y_ros)
print(svc_random.best_params_)
print(svc_random.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for XGBoost
xgb_grid = {
    "learning_rate"     : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3] ,
    "max_depth"         : [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight"  : [1, 3, 5, 7],
    "gamma"             : [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree"  : [0.3, 0.4, 0.5, 0.7]
}

# 50 sampled configurations, 5-fold CV, ranked by ROC AUC.
# random_state seeds the parameter sampling so the search is reproducible,
# matching the random-forest and SVC searches.
xgb_random = RandomizedSearchCV(XGBClassifier(random_state=42), param_distributions=xgb_grid, n_iter=50, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=42)
xgb_random.fit(x_ros, y_ros)
print(xgb_random.best_params_)
print(xgb_random.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space for LightGBM; scipy distributions are sampled, lists are
# sampled uniformly.
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` is non-zero — confirm whether tuning it here is intended.
lgbm_grid = {'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# 50 sampled configurations, 5-fold CV, ranked by ROC AUC.
# random_state seeds the draws from the distributions above so the search is
# reproducible, matching the random-forest and SVC searches.
lgbm_random = RandomizedSearchCV(LGBMClassifier(random_state=42), param_distributions=lgbm_grid, n_iter=50, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=42)

lgbm_random.fit(x_ros, y_ros)
print(lgbm_random.best_params_)
print(lgbm_random.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for histogram-based gradient boosting
hgb_grid = {
    'learning_rate': (0.01, 0.1, 1, 10),
    'max_leaf_nodes': (3, 10, 30, 60),
    'min_samples_leaf': (5, 10, 20, 40),
    'max_depth': (5, 6, 7, 8)}

# 50 sampled configurations, 5-fold CV, ranked by ROC AUC.
# random_state seeds the parameter sampling so the search is reproducible,
# matching the random-forest and SVC searches.
hgb_random = RandomizedSearchCV(HistGradientBoostingClassifier(random_state=42), param_distributions=hgb_grid, n_iter=50, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=42)

hgb_random.fit(x_ros, y_ros)
print(hgb_random.best_params_)
print(hgb_random.best_estimator_)

## Test the new models with the 'dev' data

In [None]:
# Individual models for the over-sampled training set, with hard-coded
# hyperparameters.
xgb_ros = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.5,
    gamma=0.4,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.3,
    max_delta_step=0,
    max_depth=15,
    min_child_weight=1,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=42,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)

lgbm_ros = LGBMClassifier(
    colsample_bytree=0.8027564890191548,
    min_child_samples=206,
    num_leaves=49,
    random_state=42,
    reg_alpha=0,
    reg_lambda=0.1,
    subsample=0.7676961204705843,
)

hgb_ros = HistGradientBoostingClassifier(
    max_depth=8,
    max_leaf_nodes=30,
    min_samples_leaf=5,
    random_state=42,
)

rf = RandomForestClassifier(random_state=42, n_estimators=120)

# Soft-voting ensembles built from different subsets of the models above.
soft_vote = dict(voting='soft', flatten_transform=False)
evc_ros1 = VotingClassifier(estimators=[('xgb', xgb_ros), ('hgb', hgb_ros)], **soft_vote)
evc_ros2 = VotingClassifier(estimators=[('xgb', xgb_ros), ('hgb', hgb_ros), ('rf', rf)], **soft_vote)
evc_ros3 = VotingClassifier(estimators=[('xgb', xgb_ros), ('hgb', hgb_ros), ('lgbm', lgbm_ros)], **soft_vote)
evc_ros4 = VotingClassifier(estimators=[('xgb', xgb_ros), ('hgb', hgb_ros), ('lgbm', lgbm_ros), ('rf', rf)], **soft_vote)

clfs = {
    'xgb_ros': xgb_ros,
    'lgbm_ros': lgbm_ros,
    'hgb_ros': hgb_ros,
    'rf': rf,
    'evc_ros1': evc_ros1,
    'evc_ros2': evc_ros2,
    'evc_ros3': evc_ros3,
    'evc_ros4': evc_ros4,
}

# Train each model on the over-sampled data and report train/dev accuracy.
for model_name, model in clfs.items():
    print(f'{model_name}: {fit_predict(model, x_ros, y_ros)}')

In [None]:
# Individual models for the original (non-resampled) training set, with
# hard-coded hyperparameters.
xgb_reg = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.5,
    gamma=0.4,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.15,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=42,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)

lgbm_reg = LGBMClassifier(
    colsample_bytree=0.8104721917981516,
    min_child_samples=118,
    min_child_weight=1,
    num_leaves=30,
    random_state=42,
    reg_alpha=0,
    reg_lambda=0.1,
    subsample=0.8243603847850702,
)

hgb_reg = HistGradientBoostingClassifier(
    max_depth=5,
    max_leaf_nodes=30,
    min_samples_leaf=5,
    random_state=42,
)

rf = RandomForestClassifier(random_state=42, n_estimators=120)

# Soft-voting ensembles built from different subsets of the models above.
soft_vote = dict(voting='soft', flatten_transform=False)
evc_reg1 = VotingClassifier(estimators=[('xgb', xgb_reg), ('hgb', hgb_reg)], **soft_vote)
evc_reg2 = VotingClassifier(estimators=[('xgb', xgb_reg), ('hgb', hgb_reg), ('rf', rf)], **soft_vote)
evc_reg3 = VotingClassifier(estimators=[('xgb', xgb_reg), ('hgb', hgb_reg), ('lgbm', lgbm_reg)], **soft_vote)
evc_reg4 = VotingClassifier(estimators=[('xgb', xgb_reg), ('hgb', hgb_reg), ('lgbm', lgbm_reg), ('rf', rf)], **soft_vote)

clfs = {
    'xgb_reg': xgb_reg,
    'lgbm_reg': lgbm_reg,
    'hgb_reg': hgb_reg,
    'rf': rf,
    'evc_reg1': evc_reg1,
    'evc_reg2': evc_reg2,
    'evc_reg3': evc_reg3,
    'evc_reg4': evc_reg4,
}

# Train each model on the original data, then report train/dev accuracy
# followed by the 5-fold cross-validation score.
for model_name, model in clfs.items():
    print(f'{model_name}: {fit_predict(model, X_train, y_train)} cv {cross_validation(model, X_train, y_train, 5)}')