In [None]:
import numpy as np
import pandas as pd

In [None]:

# Load the pickled data set from a path relative to the notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('data/ass2.pickle', 'rb') as handle:
    data = pd.read_pickle(handle)

# Each split is stored under its own key and unpacks into a (features, labels) pair.
X_train, y_train = data['train']
X_dev, y_dev = data['dev']

In [None]:

def print_class_summary(split_name, labels):
    """Print the size and class distribution of one label vector.

    Args:
        split_name: label printed as a header so the output of different
            splits can be told apart.
        labels: label vector of the split; it is compared element-wise
            against 0 and 1 to count the members of each class.
    """
    print(f'--- {split_name} ---')
    print(f'Number of Rows: {len(labels)}')
    print(f'The Classes are: {np.unique(labels)}')
    print(f'Class 0: {np.count_nonzero(labels == 0)}')
    print(f'Class 1: {np.count_nonzero(labels == 1)}')


# Same report for both splits; previously the two outputs were indistinguishable.
print_class_summary('train', y_train)
print()
print_class_summary('dev', y_dev)

The class counts above show that the dataset is imbalanced, so we will rebalance it before training.
This is also a binary classification problem, so we will use algorithms that are well suited to binary classification.

## First we run LazyPredict to choose the few best algorithms to continue with

In [None]:
# from lazypredict.Supervised import LazyClassifier

# clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# models,predictions = clf.fit(X_train, X_dev, y_train, y_dev)

# print(models)

## Balance the data in two ways, Over-Sampling and Under-Sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Both samplers are seeded with random_state=42.
ros = RandomOverSampler(random_state=42)
# NOTE(review): replacement=True presumably allows the same row to be drawn more
# than once while under-sampling -- confirm that this is intended.
rus = RandomUnderSampler(random_state=42, replacement=True)
# Only the training split is resampled; the dev split is not passed to either sampler.
x_rus, y_rus = rus.fit_resample(X_train, y_train)
x_ros, y_ros = ros.fit_resample(X_train, y_train)

# Report the number of labels in each resampled training set.
print(f'Under sampled: {len(y_rus)}')
print(f'Over sampled: {len(y_ros)}')

Our data has 14 features; we may want to reduce their number based on feature importance.

In [None]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression

def pca(k, X_train, X_test=None):
    """Project a training set and a held-out set onto `k` principal components.

    The projection is fitted on `X_train` only and then applied to both sets.

    Args:
        k: number of principal components to keep.
        X_train: training features used to fit the projection.
        X_test: held-out features to project. When omitted, the notebook-level
            `X_dev` is used, which is what the two-argument form always did.

    Returns:
        dict with the keys 'train' and 'test', each holding a DataFrame with
        the projected features of the corresponding set.
    """
    if X_test is None:
        # Backward-compatible fallback to the notebook global.
        X_test = X_dev
    # Use a distinct local name; the original local shadowed this function's own name.
    reducer = PCA(n_components=k)
    reducer.fit(X_train)
    return {
        'train': pd.DataFrame(reducer.transform(X_train)),
        'test': pd.DataFrame(reducer.transform(X_test)),
    }

def select_k_best(k, X_train, y_train, X_test=None):
    """Keep the `k` highest-scoring features of a training set and a held-out set.

    The selector is fitted on `X_train`/`y_train` only and then applied to both sets.

    Args:
        k: number of features to keep.
        X_train: training features used to score the columns.
        y_train: training labels used to score the columns.
        X_test: held-out features to reduce. When omitted, the notebook-level
            `X_dev` is used, which is what the three-argument form always did.

    Returns:
        dict with the keys 'train' and 'test', each holding a DataFrame with
        the selected columns of the corresponding set.
    """
    if X_test is None:
        # Backward-compatible fallback to the notebook global.
        X_test = X_dev
    # NOTE(review): f_regression is a regression scorer although the target is a
    # class label; f_classif may be the intended ANOVA scorer -- confirm.
    anova_filter = SelectKBest(score_func=f_regression, k=k)
    anova_filter.fit(X_train, y_train)
    return {
        'train': pd.DataFrame(anova_filter.transform(X_train)),
        'test': pd.DataFrame(anova_filter.transform(X_test)),
    }

In [None]:
# # import libraries
# import seaborn as sns

# import warnings
# warnings.filterwarnings('ignore')

# # create pandas df
# X = pd.DataFrame(X_train_pca)
# y = np.array(y_train)
# X['target'] = pd.DataFrame(y.reshape(-1, 1), columns=["target"])
# X.head(5)

# # check the null values
# X.isnull().sum()

# # pairplot for distribution
# sns.pairplot(X ,hue="target", palette='Set1')

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score


def cross_validation(clf, X, y, k):
    """Return the mean k-fold cross-validation score of `clf` on standardised features.

    The scaler is chained with the classifier in a pipeline, so it is re-fitted
    on the training part of every fold. Scaling the whole matrix up front would
    let the statistics of the validation rows leak into training.

    Args:
        clf: unfitted classifier to evaluate.
        X: feature matrix.
        y: label vector.
        k: value passed to `cross_val_score` as `cv`.

    Returns:
        Mean of the per-fold scores.
    """
    # Local import so the function works without touching the notebook's import cell.
    from sklearn.pipeline import make_pipeline

    scaled_clf = make_pipeline(StandardScaler(), clf)
    return cross_val_score(scaled_clf, X, y, cv=k).mean()

def fit_predict(classifier, X_train, y_train, X_test=None, y_test=None):
    """Fit `classifier` on standardised training data and report train/held-out scores.

    The scaler is fitted on `X_train` only and reused to transform the held-out set.

    Args:
        classifier: estimator exposing `fit` and `score`; it is fitted in place.
        X_train: training features.
        y_train: training labels.
        X_test: held-out features. When omitted, the notebook-level `X_dev` is
            used, which is what the three-argument form always did.
        y_test: held-out labels. When omitted, the notebook-level `y_dev` is used.

    Returns:
        A string with the train score, the held-out score and their difference,
        each rounded to three decimals.
    """
    # Backward-compatible fallbacks to the notebook globals.
    if X_test is None:
        X_test = X_dev
    if y_test is None:
        y_test = y_dev

    sc = StandardScaler()
    scaled_X_train = sc.fit_transform(X_train)
    classifier.fit(scaled_X_train, y_train)

    train_score = classifier.score(scaled_X_train, y_train)
    dev_score = classifier.score(sc.transform(X_test), y_test)
    # A positive gap means the model scores better on the data it was trained on.
    over_fitting = train_score - dev_score

    return f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}'

def fit_predict_poly(classifier, X_train, y_train, X_dev, y_dev):
    """Fit `classifier` on degree-2 polynomial features and report train/dev scores.

    The polynomial expansion and the scaler are both fitted on the training data
    only and then reused for the dev data, so the two sets are guaranteed to go
    through the same transformation.

    Args:
        classifier: estimator exposing `fit` and `score`; it is fitted in place.
        X_train: training features.
        y_train: training labels.
        X_dev: dev features.
        y_dev: dev labels.

    Returns:
        A string with the train score, the dev score and their difference,
        each rounded to three decimals.
    """
    # One transformer for both sets instead of fitting a second one on the dev data.
    poly = PolynomialFeatures(2)
    transformed_train = poly.fit_transform(X_train)
    transformed_dev = poly.transform(X_dev)

    sc = StandardScaler()
    scaled_X_train = sc.fit_transform(transformed_train)

    classifier.fit(scaled_X_train, y_train)

    train_score = classifier.score(scaled_X_train, y_train)
    dev_score = classifier.score(sc.transform(transformed_dev), y_dev)
    # A positive gap means the model scores better on the data it was trained on.
    over_fitting = train_score - dev_score

    return f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}'

## Initial test of the best algorithms using cross-validation

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def check_models(X_train, y_train, k):
    """Print the mean cross-validation score of each candidate classifier.

    Args:
        X_train: training features; must expose `.to_numpy()`.
        y_train: training labels; must expose `.to_numpy()`.
        k: number of folds forwarded to `cross_validation`.
    """
    # Convert once instead of once per model.
    features = X_train.to_numpy()
    labels = y_train.to_numpy()

    candidates = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'SVC': SVC(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        # Seeded like every other candidate so repeated runs are comparable.
        'Histogram Gradient Boosting': HistGradientBoostingClassifier(random_state=42),
        # The wrapped estimator is passed positionally because its keyword was
        # renamed from `base_estimator` to `estimator` between scikit-learn releases.
        'Bagging Classifier': BaggingClassifier(SVC(), random_state=42),
        'XGBoost': XGBClassifier(random_state=42),
        'Light GBM': LGBMClassifier(random_state=42),
    }

    for name, model in candidates.items():
        print(f'{name}: {cross_validation(model, features, labels, k)}')

# Score every candidate with 3-fold cross-validation, first on the under-sampled
# training set and then on the over-sampled one.
for sample_title, sample_X, sample_y in (
    ('Undersampled Data:', x_rus, y_rus),
    ('Oversampled Data:', x_ros, y_ros),
):
    print(sample_title)
    check_models(sample_X, sample_y, 3)


## Find best hyperparameters for each model using RandomizedSearch

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# --- Search space for the random forest ---
# Forest sizes: ten evenly spaced values between 200 and 2000 trees.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# How many features each split may consider.
max_features = ['log2', 'sqrt']
# Depth limits: eleven evenly spaced values between 10 and 110, plus "no limit".
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Smallest node size that may still be split.
min_samples_split = [2, 5, 10]
# Smallest number of samples allowed in a leaf.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the grid the randomized search samples from.
rf_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# Five sampled configurations, 3-fold CV each, ranked by ROC AUC on the over-sampled data.
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_grid,
    scoring='roc_auc',
    n_iter=5,
    cv=3,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(x_ros, y_ros)
pprint(rf_random.best_params_)
print(rf_random.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the support vector classifier.
svc_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'linear'],
)

# Five sampled configurations, 3-fold CV each, ranked by ROC AUC on the over-sampled data.
svc_random = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=svc_grid,
    scoring='roc_auc',
    n_iter=5,
    cv=3,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)
svc_random.fit(x_ros, y_ros)
pprint(svc_random.best_params_)
print(svc_random.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the XGBoost classifier.
xgb_grid = {
    "learning_rate"     : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3] ,
    "max_depth"         : [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight"  : [1, 3, 5, 7],
    "gamma"             : [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree"  : [0.3, 0.4, 0.5, 0.7]
}

# Five sampled configurations, 5-fold CV each, ranked by ROC AUC on the over-sampled data.
# random_state seeds the sampling of configurations so that re-running the cell
# evaluates the same candidates, as the random forest and SVC searches already do.
xgb_random = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_distributions=xgb_grid,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)
xgb_random.fit(x_ros, y_ros)
pprint(xgb_random.best_params_)
print(xgb_random.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space for the LightGBM classifier: scipy distributions are sampled,
# plain lists are chosen from.
# NOTE(review): LightGBM reportedly ignores `subsample` unless `subsample_freq`
# is greater than 0 -- confirm whether this dimension has any effect.
lgbm_grid = {'num_leaves': sp_randint(6, 50),
             'min_child_samples': sp_randint(100, 500),
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8),
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# Five sampled configurations, 5-fold CV each, ranked by ROC AUC on the under-sampled data.
# random_state seeds the draws from the distributions above so that re-running
# the cell evaluates the same candidates.
lgbm_random = RandomizedSearchCV(
    LGBMClassifier(random_state=42),
    param_distributions=lgbm_grid,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)
lgbm_random.fit(x_rus, y_rus)
pprint(lgbm_random.best_params_)
print(lgbm_random.best_estimator_)

## Test the new models with the 'dev' data

In [None]:
# Final candidate models with hard-coded hyperparameters.
# NOTE(review): the keyword lists look like they were pasted from the
# `best_estimator_` output of the searches above -- confirm against that output.
# NOTE(review): the `_ros` / `_rus` suffixes presumably name the resampled set the
# hyperparameters were tuned on; every model is fitted on x_ros in the loop below.

# XGBoost configuration labelled "ros".
xgb_ros = XGBClassifier(
        base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bynode=1, colsample_bytree=0.4, gamma=0.3, gpu_id=-1,
        importance_type='gain', interaction_constraints='',
        learning_rate=0.2, max_delta_step=0, max_depth=12,
        min_child_weight=3, monotone_constraints='()',
        n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
        tree_method='exact', validate_parameters=1, verbosity=None)

# XGBoost configuration labelled "rus"; it differs from xgb_ros in colsample_bytree,
# learning_rate, max_depth and min_child_weight.
xgb_rus = XGBClassifier(
        base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bynode=1, colsample_bytree=0.5, gamma=0.3, gpu_id=-1,
        importance_type='gain', interaction_constraints='',
        learning_rate=0.15, max_delta_step=0, max_depth=5,
        min_child_weight=1, monotone_constraints='()',
        n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
        tree_method='exact', validate_parameters=1, verbosity=None)

# LightGBM configuration labelled "ros".
lgbm_ros = LGBMClassifier(
        colsample_bytree=0.7077667509501917, min_child_samples=424,
        min_child_weight=1e-05, num_leaves=33, random_state=42,
        reg_alpha=2, reg_lambda=10, subsample=0.366858596641466)

# LightGBM configuration labelled "rus".
lgbm_rus = LGBMClassifier(
        colsample_bytree=0.41365111596542925, min_child_samples=347,
        min_child_weight=0.1, num_leaves=33, random_state=42,
        reg_alpha=0.1, reg_lambda=1, subsample=0.6512961625220872)

# Random forest with 120 trees; all other settings are left at their defaults.
rf = RandomForestClassifier(random_state=42, n_estimators=120)

# Hard-voting ensemble of the "ros" XGBoost model and the random forest.
evc = VotingClassifier(estimators=[('xgb',xgb_ros), ('rf', rf)],voting='hard')

# Name -> model lookup iterated by the evaluation loop.
clfs = {'xgb_ros':xgb_ros, 'xgb_rus':xgb_rus, 'lgbm_ros':lgbm_ros, 'lgbm_rus':lgbm_rus, 'rf':rf, 'evc': evc}

# Five-column versions of the over-sampled training data and of the dev data:
# one via univariate feature selection, one via PCA.
k_best_sets = select_k_best(5, x_ros, y_ros)
best_X_train, best_X_dev = k_best_sets['train'], k_best_sets['test']
pca_sets = pca(5, x_ros)
pca_X_train, pca_X_dev = pca_sets['train'], pca_sets['test']

# Evaluate every model on the raw features and on three polynomial variants.
for clf_name, model in clfs.items():
    print(f'{clf_name}: {fit_predict(model, x_ros, y_ros)}')
    print(f'poly {clf_name}: {fit_predict_poly(model, x_ros, y_ros, X_dev, y_dev)}')
    print(f'poly k best {clf_name}: {fit_predict_poly(model, best_X_train, y_ros, best_X_dev, y_dev)}')
    print(f'poly pca {clf_name}: {fit_predict_poly(model, pca_X_train, y_ros, pca_X_dev, y_dev)}')