In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import bz2
import _pickle as cPickle
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [None]:
# Load the bz2-compressed, pickled preprocessed frame from ./data (relative to
# the current working directory).
current_dir = Path.cwd()
relative_path = 'data/compressed_preprocessed.pbz2'
frame_path = current_dir.joinpath(relative_path)
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the compressed file handle once the frame is read
# (previously the handle was left open and `df` was temporarily the file object).
with bz2.BZ2File(str(frame_path), 'rb') as compressed_file:
    df = cPickle.load(compressed_file)

In [None]:
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# def load_df(name):
#     current_dir = Path.cwd()
#     relative_path = 'data/preprocessed_' + name + '.pbz2'
#     frame_path = current_dir.joinpath(relative_path)
#     df = bz2.BZ2File(str(frame_path), 'rb')
#     df = cPickle.load(df)
    
#     return df

In [None]:
# #df_map = {'x_train': X_train, 'x_test': X_test, 'y_train': y_train, 'y_test': y_test}
# X_train = load_df('x_train')
# X_test = load_df('x_test')
# y_train = load_df('y_train')
# y_test = load_df('y_test')

In [None]:
# Separate the target column from the feature columns.
target_column = 'is_goal'
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Hold out 20% of the rows for testing, stratified on the target column so both
# splits keep the same class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
    stratify=df['is_goal'],
)

In [None]:
def goal_prob(y):
    """Return the share of positive labels in ``y``: ``sum(y) / len(y)``."""
    n_goals = sum(y)
    n_samples = len(y)
    return n_goals / n_samples

In [None]:
# Report the base rate of the positive class over the full sample.
goal_rate_message = f"Probability of goal in sample: {goal_prob(y):.3f}"
print(goal_rate_message)

In [None]:
def evaluate_classifier(clf, df_scores, clf_name=None):
    """Fit ``clf`` on the training split and append its test-set scores.

    The data are not parameters: the function reads the notebook-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so the cell that
    creates the split must have been run first.

    Parameters
    ----------
    clf : estimator or Pipeline
        Classifier to evaluate; it is fitted in place.
    df_scores : pandas.DataFrame
        Scores collected so far, one column per model. A new frame is
        returned; the argument itself is not modified.
    clf_name : str, optional
        Column label for this model. Defaults to the class name of ``clf``,
        or of its last step when ``clf`` is a ``Pipeline``.

    Returns
    -------
    pandas.DataFrame
        ``df_scores`` with one extra column holding accuracy, balanced
        accuracy, F1, precision and recall; all values rounded to 3 decimals.
    """
    # Local import: only needed to look up the final step's name.
    from sklearn.pipeline import Pipeline

    if clf_name is None:
        final_estimator = clf[-1] if isinstance(clf, Pipeline) else clf
        clf_name = final_estimator.__class__.__name__

    clf.fit(X_train, y_train)
    # Predict once and derive every metric from the same predictions. For
    # scikit-learn classifiers ``score`` is the mean accuracy of ``predict``,
    # so this yields the same number without a second pass over the test set.
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    bal_acc = metrics.balanced_accuracy_score(y_test, y_pred)
    # zero_division=0 on all three: a model that never predicts the positive
    # class scores 0 (the library default value) without emitting a warning.
    f1 = metrics.f1_score(y_test, y_pred, zero_division=0)
    precision = metrics.precision_score(y_test, y_pred, zero_division=0)
    recall = metrics.recall_score(y_test, y_pred, zero_division=0)

    clf_score = pd.DataFrame(
        {clf_name: [acc, bal_acc, f1, precision, recall]},
        index=['Accuracy', 'Balanced accuracy', 'F1 score', 'Precision', 'Recall']
    )
    return pd.concat([df_scores, clf_score], axis=1).round(decimals=3)


# Empty score table; each evaluation below adds one column to it.
df_scores = pd.DataFrame()

In [None]:
# Baseline: dummy classifier using the "most_frequent" strategy.
dummy_mostfreq_clf = DummyClassifier(strategy="most_frequent")
df_scores = evaluate_classifier(
    clf=dummy_mostfreq_clf,
    df_scores=df_scores,
    clf_name="Dummy (Most Frequent)",
)

In [None]:
# Baseline: dummy classifier using the "stratified" strategy with a fixed seed.
dummy_strat_clf = DummyClassifier(random_state=0, strategy="stratified")
df_scores = evaluate_classifier(
    clf=dummy_strat_clf,
    df_scores=df_scores,
    clf_name="Dummy (Stratified)",
)

In [None]:
# Grid-search the regularisation strength and class weighting for logistic
# regression on the training split, selecting by balanced accuracy.
lr_iterations = 400
param_grid = [
    {
        'class_weight': [None, 'balanced'],
        # The list previously ended with 100 twice, which only refitted an
        # identical candidate. NOTE(review): if 1000 was intended, add it here.
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    }
]
clf = GridSearchCV(
    LogisticRegression(max_iter=lr_iterations),
    param_grid,
    scoring='balanced_accuracy',
)
clf.fit(X_train, y_train)
# Keep the selected hyper-parameters for the final logistic-regression model.
C = clf.best_params_['C']
class_weight = clf.best_params_['class_weight']

In [None]:
# Show the hyper-parameters chosen by the most recent grid search.
selected_lr_params = clf.best_params_
print(selected_lr_params)

In [None]:
# Logistic regression with the current C / class_weight values and a larger
# iteration budget than the one used during the search.
lr_iterations = 1000
tuned_lr = LogisticRegression(
    C=C,
    class_weight=class_weight,
    max_iter=lr_iterations,
)
lr_clf = make_pipeline(tuned_lr)
df_scores = evaluate_classifier(lr_clf, df_scores, clf_name="LR")

In [None]:
# Random forest with default class weighting and a fixed seed.
rf_estimator = RandomForestClassifier(n_jobs=2, random_state=0)
rf_clf = make_pipeline(rf_estimator)
df_scores = evaluate_classifier(rf_clf, df_scores, clf_name="RF")

In [None]:
# Grid-search C and class weighting for a linear SVM on the training split,
# selecting by balanced accuracy.
svm_iterations = 1000
param_grid = [
    {
        'class_weight': [None, 'balanced'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1],
    }
]
svm_search_base = LinearSVC(
    random_state=0,
    tol=1e-5,
    max_iter=svm_iterations,
    dual=False,
)
clf = GridSearchCV(svm_search_base, param_grid, scoring='balanced_accuracy')
clf.fit(X_train, y_train)
# Keep the selected hyper-parameters for the final SVM model.
svm_search_winner = clf.best_params_
C = svm_search_winner['C']
class_weight = svm_search_winner['class_weight']

In [None]:
# Show the hyper-parameters chosen by the most recent grid search.
selected_svm_params = clf.best_params_
print(selected_svm_params)

In [None]:
# Linear SVM configured with the current C / class_weight values.
tuned_svm = LinearSVC(
    C=C,
    class_weight=class_weight,
    random_state=0,
    tol=1e-5,
    max_iter=svm_iterations,
    dual=False,
)
svm_clf = make_pipeline(tuned_svm)
df_scores = evaluate_classifier(svm_clf, df_scores, clf_name="SVM")

In [None]:
# Random forest with balanced class weights. A fresh pipeline is built instead
# of calling set_params on the existing ``rf_clf``: that name is rebound to
# other pipelines in later cells, so mutating it in place made this cell depend
# on execution order. The configuration is otherwise the same as the plain RF.
rf_clf = make_pipeline(
    RandomForestClassifier(random_state=0, n_jobs=2, class_weight="balanced")
)
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)

In [None]:
# Logistic regression (default C and class weights) preceded by a random
# under-sampling step.
lr_sampler = RandomUnderSampler(random_state=0)
lr_after_sampling = LogisticRegression(max_iter=lr_iterations)
lr_clf = make_pipeline_with_sampler(lr_sampler, lr_after_sampling)
df_scores = evaluate_classifier(lr_clf, df_scores, clf_name="LR with under-sampling")

In [None]:
# Random forest preceded by a random under-sampling step.
rf_sampler = RandomUnderSampler(random_state=0)
rf_after_sampling = RandomForestClassifier(n_jobs=2, random_state=0)
rf_clf = make_pipeline_with_sampler(rf_sampler, rf_after_sampling)

df_scores = evaluate_classifier(rf_clf, df_scores, clf_name="RF with under-sampling")

In [None]:
# Linear SVM (default C and class weights) preceded by a random under-sampling step.
svm_sampler = RandomUnderSampler(random_state=0)
svm_after_sampling = LinearSVC(
    random_state=0,
    tol=1e-5,
    max_iter=svm_iterations,
    dual=False,
)
svm_clf = make_pipeline_with_sampler(svm_sampler, svm_after_sampling)

df_scores = evaluate_classifier(svm_clf, df_scores, clf_name="SVM with under-sampling")

In [None]:
# Balanced random forest from imbalanced-learn, wrapped in a one-step pipeline.
balanced_rf = BalancedRandomForestClassifier(n_jobs=2, random_state=0)
rf_clf = make_pipeline(balanced_rf)

df_scores = evaluate_classifier(rf_clf, df_scores, clf_name="Balanced RF")

In [None]:
# Sweep the number of bagged estimators (5, 10, ..., 25) for balanced bagging
# over histogram gradient-boosting base learners; one score column per setting.
for i in range(5, 30, 5):
    print(f"Evaluating n = {i}")
    bagging = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=0),
        n_estimators=i,
        random_state=0,
        n_jobs=2,
    )
    bag_clf = make_pipeline(bagging)

    t = "Balanced bagging (n=" + str(i) + ")"
    df_scores = evaluate_classifier(bag_clf, df_scores, clf_name=t)


In [None]:
# Sweep the number of boosting rounds (50, 100, ..., 250) for RUSBoost;
# one score column per setting.
for i in range(50, 300, 50):
    print(f"Evaluating n = {i}")
    rus_boost = RUSBoostClassifier(
        n_estimators=i,
        algorithm='SAMME.R',
        random_state=0,
    )
    rus_clf = make_pipeline(rus_boost)

    t = "RUSBoost (n=" + str(i) + ")"
    df_scores = evaluate_classifier(rus_clf, df_scores, clf_name=t)


In [None]:
# Easy-ensemble classifier with default settings and a fixed seed.
easy_ensemble = EasyEnsembleClassifier(random_state=0)
eec_clf = make_pipeline(easy_ensemble)

df_scores = evaluate_classifier(eec_clf, df_scores, clf_name="EEC")

In [None]:
# Display the accumulated score table: one column per evaluated model,
# one row per metric (accuracy, balanced accuracy, F1, precision, recall).
df_scores