#### Imports & Downloads

In [3]:
import pandas as pd
import os
from gensim.models import Word2Vec
import numpy as np
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import time

from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

### Einladen der Daten

#### Originaldaten ohne umfassende Vorverarbeitung

In [4]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_basic_cleaned.csv'))
df = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [5]:
df['label'].value_counts()
positive = len(df[df['label'] == 1])
negative = len(df[df['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 2013
Negative: 27517
Verhältnis: 13.669647292598112


#### Vorverarbeitete Daten

In [6]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df_cleaned.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,@user when a father is dysfunctional and is s...,father selfish drag kid run,1,['#run'],
2,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use cause offer van,2,"['#lyft', '#disapointed', '#getthanked']",
3,0,bihday your majesty,bihday majesty,0,[],
4,0,#model i love u take with u all the time in ...,model take time mobile phone kiss sunglass mou...,0,['#model'],":mobile_phone:,:kissing_face_with_smiling_eyes..."
5,0,factsguide: society now #motivation,factsguide society motivation,0,['#motivation'],


In [7]:
df_cleaned['label'].value_counts()
positive = len(df_cleaned[df_cleaned['label'] == 1])
negative = len(df_cleaned[df_cleaned['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 1811
Negative: 25839
Verhältnis: 14.26780784097184


Verhältnis hat sich durch die Bereinigung sogar noch weiter verschlechter

### Vektorisierung der Daten
https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#Unsupervised_text_classification_Word2Vec

In [8]:
def vectorize(sentence, w2v_model):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [9]:
def vectorize_df(df):
    X = df["tweet"]
    y = df["label"]

    sentences_base = [sentence.split() for sentence in X]
    w2v_model_base = Word2Vec(sentences_base, window=5, min_count=5, workers=-1)

    x_data = np.array([vectorize(sentence, w2v_model_base) for sentence in X])
    return x_data, y

In [10]:
X_base, y_base = vectorize_df(df)

In [11]:
X_base

array([[ 1.30374171e-03, -6.65663800e-04,  3.21442029e-03, ...,
        -2.68328539e-03,  1.41547900e-03, -2.84765265e-03],
       [ 3.36569443e-04, -9.82483965e-04,  9.05304274e-04, ...,
        -4.07109677e-04,  5.64215006e-04,  1.94166932e-04],
       [ 1.23032148e-03, -2.46845209e-03,  2.51201470e-03, ...,
         2.88697542e-03,  4.11413144e-03,  4.88469563e-03],
       ...,
       [-9.87032778e-04,  5.68386691e-04, -1.85755307e-05, ...,
         3.09016541e-05, -3.14187346e-04, -8.47426010e-04],
       [ 1.40571001e-03,  4.23042104e-03,  2.36720010e-03, ...,
        -4.67062840e-04,  2.66610738e-03, -7.61655625e-04],
       [ 8.88627619e-05, -6.70738227e-04,  2.82955565e-03, ...,
        -3.34652583e-03,  8.70123040e-04,  3.40033951e-03]])

In [12]:
y_base

id
1        0
2        0
3        0
4        0
5        0
        ..
31957    0
31958    0
31959    0
31960    0
31962    0
Name: label, Length: 29530, dtype: int64

In [13]:
X_clean, y_clean = vectorize_df(df_cleaned)

In [14]:
X_clean

array([[ 0.000164  , -0.00097227,  0.00330843, ..., -0.00523365,
         0.00096938, -0.00285136],
       [-0.00232347, -0.0016694 ,  0.00038817, ..., -0.00020569,
         0.00022666,  0.0038221 ],
       [-0.00620588, -0.00316428,  0.00138663, ...,  0.00165645,
         0.00093876, -0.00017074],
       ...,
       [ 0.0017181 ,  0.00186271,  0.00110266, ..., -0.0023023 ,
        -0.00016485,  0.00787419],
       [-0.00182129,  0.00049515,  0.0007572 , ..., -0.00087936,
        -0.00065907, -0.00146304],
       [-0.00335515, -0.00040929, -0.00032946, ..., -0.00268573,
         0.00366877, -0.0030683 ]])

In [15]:
y_clean

id
1        0
2        0
3        0
4        0
5        0
        ..
31956    0
31957    0
31958    0
31959    0
31960    0
Name: label, Length: 27650, dtype: int64

### Train/Test Split

In [16]:
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base)

In [17]:
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean)

### 1. Resampling Methods
https://www.analyticsvidhya.com/blog/2022/05/handling-imbalanced-data-with-imbalance-learn-in-python/#h-techniques-for-handling-imbalanced-data

In [18]:
def print_data(y_before, y_after, sampling_technique):
    counter_before = Counter(y_before)
    counter_after = Counter(y_after)
    print("Before sampling with:", sampling_technique, counter_before)
    print("After sampling with:", sampling_technique, counter_after)

#### 1.1 Oversampling

##### 1.1.1 SMOTE

In [19]:
os_smote = SMOTE()

In [20]:
X_train_base_s, y_train_base_s = os_smote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 20638, 1: 1509})
After sampling with: SMOTE Counter({0: 20638, 1: 20638})


In [21]:
X_train_clean_s, y_train_clean_s = os_smote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 19373, 1: 1364})
After sampling with: SMOTE Counter({0: 19373, 1: 19373})


##### 1.1.2 Borderline-SMOTE

In [22]:
os_bsmote = BorderlineSMOTE()

In [23]:
X_train_base_bs, y_train_base_bs = os_bsmote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 20638, 1: 1509})
After sampling with: BorderlineSMOTE Counter({0: 20638, 1: 20638})


In [24]:
X_train_clean_bs, y_train_clean_bs = os_bsmote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 19373, 1: 1364})
After sampling with: BorderlineSMOTE Counter({0: 19373, 1: 19373})


##### 1.1.3 ADASYN

In [25]:
os_ada = ADASYN()

In [26]:
X_train_base_a, y_train_base_a = os_ada.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 20638, 1: 1509})
After sampling with: ADASYN Counter({1: 20681, 0: 20638})


In [27]:
X_train_clean_a, y_train_clean_a = os_ada.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 19373, 1: 1364})
After sampling with: ADASYN Counter({0: 19373, 1: 18998})


#### 1.2 Undersampling

##### 1.2.1 NearMiss

In [28]:
us_near_miss = NearMiss(version=3, n_neighbors_ver3=3)

In [29]:
X_train_base_nm, y_train_base_nm = us_near_miss.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 20638, 1: 1509})
After sampling with: NearMiss Counter({0: 1509, 1: 1509})


In [30]:
X_train_clean_nm, y_train_clean_nm = us_near_miss.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 19373, 1: 1364})
After sampling with: NearMiss Counter({0: 1364, 1: 1364})


##### 1.2.2 Condensed Nearest Neighbor

In [31]:
us_cnn = CondensedNearestNeighbour(n_neighbors=1, n_jobs=-1)  # sehr langsam

In [32]:
# X_train_base_cnn, y_train_base_cnn = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_cnn, "CondensedNearestNeighbour")

In [33]:
# X_train_clean_cnn, y_train_clean_cnn = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_cnn, "CondensedNearestNeighbour")

##### 1.2.3 Neighborhood Cleaning

In [34]:
us_cnn_cr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5, n_jobs=-1)  # sehr langsam

In [35]:
# X_train_base_ncr, y_train_base_ncr = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_ncr, "NeighbourhoodCleaningRule")

In [36]:
# X_train_clean_ncr, y_train_clean_ncr = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_ncr, "NeighbourhoodCleaningRule")

##### 1.2.4 Tomek Links Undersampler

In [37]:
us_tomek = TomekLinks()

In [38]:
X_train_base_t, y_train_base_t = us_tomek.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 20638, 1: 1509})
After sampling with: TomekLinks Counter({0: 20581, 1: 1509})


In [39]:
X_train_clean_t, y_train_clean_t = us_tomek.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 19373, 1: 1364})
After sampling with: TomekLinks Counter({0: 19326, 1: 1364})


### 2. Ensemble Models

In [40]:
evaluation = pd.DataFrame(
    columns=["model", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec",
             "test_rec", "test_f1"])

In [41]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_recall = recall_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_f1 = f1_score(y_data_train, model.predict(x_data_train), average="weighted")

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_recall = recall_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_f1 = f1_score(y_data_test, model.predict(x_data_test), average="weighted")

    evaluation.loc[len(evaluation.index)] = [model_name, variant, train_acc, train_precision, train_recall, train_f1,
                                             test_acc, test_precision, test_recall, test_f1]

In [42]:
def evaluate_model(model, x_test, y_test, sampling_method):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{sampling_method} model accuracy for classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

In [43]:
def fit_model(model, modelName):
    print("Starting model fitting.")

    start = time.time()
    print("1/12 Fitting: ", modelName, "BASE started...")
    model.fit(X_train_base, y_train_base)
    add_to_eval_df(model, modelName, "base", X_train_base, y_train_base, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("2/12 Fitting: ", modelName, "CLEAN started...")
    model.fit(X_train_clean, y_train_clean)
    add_to_eval_df(model, modelName, "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("3/12 Fitting: ", modelName, "SMOTE-base started...")
    model.fit(X_train_base_s, y_train_base_s)
    add_to_eval_df(model, modelName, "SMOTE base", X_train_base_s, y_train_base_s, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("4/12 Fitting: ", modelName, "SMOTE-clean started...")
    model.fit(X_train_clean_s, y_train_clean_s)
    add_to_eval_df(model, modelName, "SMOTE clean", X_train_clean_s, y_train_clean_s, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("5/12 Fitting: ", modelName, "BorderlineSMOTE-base started...")
    model.fit(X_train_base_bs, y_train_base_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE base", X_train_base_bs, y_train_base_bs, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("6/12 Fitting: ", modelName, "BorderlineSMOTE-clean started...")
    model.fit(X_train_clean_bs, y_train_clean_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE clean", X_train_clean_bs, y_train_clean_bs, X_test_clean,
                   y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("7/12 Fitting: ", modelName, "ADASYN-base started...")
    model.fit(X_train_base_a, y_train_base_a)
    add_to_eval_df(model, modelName, "ADASYN base", X_train_base_a, y_train_base_a, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("8/12 Fitting: ", modelName, "ADASYN-clean started...")
    model.fit(X_train_clean_a, y_train_clean_a)
    add_to_eval_df(model, modelName, "ADASYN clean", X_train_clean_a, y_train_clean_a, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("9/12 Fitting: ", modelName, "NearMiss-base started...")
    model.fit(X_train_base_nm, y_train_base_nm)
    add_to_eval_df(model, modelName, "NearMiss base", X_train_base_nm, y_train_base_nm, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("10/12 Fitting: ", modelName, "NearMiss-clean started...")
    model.fit(X_train_clean_nm, y_train_clean_nm)
    add_to_eval_df(model, modelName, "NearMiss clean", X_train_clean_nm, y_train_clean_nm, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("11/12 Fitting: ", modelName, "TomekLink-base started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink clean", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("12/12 Fitting: ", modelName, "TomekLink-clean started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink clean", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    print("Model fitting finished.")

#### 2.1 Bagging

In [44]:
rf = RandomForestClassifier(n_jobs=-1)

In [45]:
fit_model(rf, "RandomForest")

Starting model fitting.
1/12 Fitting:  RandomForest BASE started...
Fitting:  RandomForest finished. Elapsed time:  7 Seconds
2/12 Fitting:  RandomForest CLEAN started...
Fitting:  RandomForest finished. Elapsed time:  7 Seconds
3/12 Fitting:  RandomForest SMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  13 Seconds
4/12 Fitting:  RandomForest SMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  13 Seconds
5/12 Fitting:  RandomForest BorderlineSMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  13 Seconds
6/12 Fitting:  RandomForest BorderlineSMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  12 Seconds
7/12 Fitting:  RandomForest ADASYN-base started...
Fitting:  RandomForest finished. Elapsed time:  17 Seconds
8/12 Fitting:  RandomForest ADASYN-clean started...
Fitting:  RandomForest finished. Elapsed time:  14 Seconds
9/12 Fitting:  RandomForest NearMiss-base started...
Fitting:  RandomForest finished. Elapse

In [46]:
brf = BalancedRandomForestClassifier(n_jobs=-1)

In [47]:
# BASE
brf.fit(X_train_base, y_train_base)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED base", X_train_base, y_train_base, X_test_base, y_test_base)



In [48]:
# CLEANED
brf.fit(X_train_clean, y_train_clean)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)



In [49]:
evaluation[(evaluation.model == "RandomForest") | (evaluation.model == "BalancedRandomForest")].sort_values(
    by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
1,RandomForest,clean,0.999711,0.999711,0.999711,0.99971,0.935918,0.924098,0.935918,0.906055
10,RandomForest,TomekLink clean,0.99971,0.99971,0.99971,0.99971,0.935773,0.921662,0.935773,0.905707
11,RandomForest,TomekLink clean,0.99971,0.99971,0.99971,0.99971,0.935773,0.918701,0.935773,0.905979
0,RandomForest,base,0.999729,0.999729,0.999729,0.999729,0.933496,0.937926,0.933496,0.903062
5,RandomForest,BorderlineSMOTE clean,0.999871,0.999871,0.999871,0.999871,0.922176,0.896158,0.922176,0.906817
4,RandomForest,BorderlineSMOTE base,0.999176,0.999178,0.999176,0.999176,0.918326,0.895816,0.918326,0.904959
3,RandomForest,SMOTE clean,0.999794,0.999794,0.999794,0.999794,0.917981,0.896207,0.917981,0.905666
7,RandomForest,ADASYN clean,0.999739,0.999739,0.999739,0.999739,0.913786,0.893155,0.913786,0.902476
2,RandomForest,SMOTE base,0.999443,0.999443,0.999443,0.999443,0.911147,0.893146,0.911147,0.901177
6,RandomForest,ADASYN base,0.999443,0.999444,0.999443,0.999443,0.909251,0.895514,0.909251,0.901808


#### 2.2 Boosting

##### 2.2.1a XGBClassifier

In [50]:
xgb = XGBClassifier()

In [51]:
fit_model(xgb, "XGBClassifier")

Starting model fitting.
1/12 Fitting:  XGBClassifier BASE started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
2/12 Fitting:  XGBClassifier CLEAN started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
3/12 Fitting:  XGBClassifier SMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
4/12 Fitting:  XGBClassifier SMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
5/12 Fitting:  XGBClassifier BorderlineSMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
6/12 Fitting:  XGBClassifier BorderlineSMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
7/12 Fitting:  XGBClassifier ADASYN-base started...
Fitting:  XGBClassifier finished. Elapsed time:  2 Seconds
8/12 Fitting:  XGBClassifier ADASYN-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  2 Seconds
9/12 Fitting:  XGBClassifier NearMiss-base started...
Fitting:  XGBClassifier fini

In [52]:
evaluation[evaluation.model == "XGBClassifier"].sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
24,XGBClassifier,TomekLink clean,0.99942,0.99942,0.99942,0.999419,0.936352,0.9168,0.936352,0.909647
25,XGBClassifier,TomekLink clean,0.99942,0.99942,0.99942,0.999419,0.936352,0.9168,0.936352,0.909647
15,XGBClassifier,clean,0.999566,0.999566,0.999566,0.999565,0.935773,0.912286,0.935773,0.909072
14,XGBClassifier,base,0.999142,0.999142,0.999142,0.99914,0.935663,0.931619,0.935663,0.909167
19,XGBClassifier,BorderlineSMOTE clean,0.997651,0.997651,0.997651,0.997651,0.905396,0.904404,0.905396,0.904897
18,XGBClassifier,BorderlineSMOTE base,0.994549,0.99455,0.994549,0.994549,0.896113,0.901739,0.896113,0.898845
21,XGBClassifier,ADASYN clean,0.996534,0.996551,0.996534,0.996534,0.890352,0.899949,0.890352,0.894982
16,XGBClassifier,SMOTE base,0.994743,0.99477,0.994743,0.994743,0.887986,0.901167,0.887986,0.894202
17,XGBClassifier,SMOTE clean,0.99698,0.996992,0.99698,0.99698,0.887892,0.900993,0.887892,0.894133
20,XGBClassifier,ADASYN base,0.995789,0.995808,0.995789,0.995789,0.88785,0.900422,0.88785,0.8938


##### 2.2.1b XGBClassifier tuning

In [71]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [82]:
param_test1 = {
    'max_depth': range(3, 10, 2),
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [83]:
gsearch1.fit(X_train_clean, y_train_clean)
gsearch1.best_score_, gsearch1.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


(0.7424466598112309, {'max_depth': 9})

In [84]:
param_test2 = {
    'min_child_weight': range(1, 6, 2),
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test2, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [85]:
gsearch2.fit(X_train_clean, y_train_clean)
gsearch2.best_score_, gsearch2.best_params_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


(0.7459381685237177, {'min_child_weight': 5})

In [88]:
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch3 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test3, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [89]:
gsearch3.fit(X_train_clean, y_train_clean)
gsearch3.best_score_, gsearch3.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


(0.7473632876235745, {'gamma': 0.1})

In [90]:
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test4, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [91]:
gsearch4.fit(X_train_clean, y_train_clean)
gsearch4.best_score_, gsearch4.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


(0.7473632876235745, {'subsample': 0.8})

In [92]:
param_test5 = {
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [93]:
gsearch5.fit(X_train_clean, y_train_clean)
gsearch5.best_score_, gsearch5.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


(0.7473632876235745, {'colsample_bytree': 0.8})

In [94]:
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test6, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [95]:
gsearch6.fit(X_train_clean, y_train_clean)
gsearch6.best_score_, gsearch6.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


(0.7473632876235745, {'reg_alpha': 1e-05})

In [96]:
xgb_tune = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [97]:
fit_model(xgb_tune, "XGBClassifier-tuned")
evaluation[evaluation.model == "XGBClassifier-tuned"].sort_values(by=["test_rec"], ascending=False)

Starting model fitting.
1/12 Fitting:  XGBClassifier-tuned BASE started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  5 Seconds
2/12 Fitting:  XGBClassifier-tuned CLEAN started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  5 Seconds
3/12 Fitting:  XGBClassifier-tuned SMOTE-base started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  8 Seconds
4/12 Fitting:  XGBClassifier-tuned SMOTE-clean started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  9 Seconds
5/12 Fitting:  XGBClassifier-tuned BorderlineSMOTE-base started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  6 Seconds
6/12 Fitting:  XGBClassifier-tuned BorderlineSMOTE-clean started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  6 Seconds
7/12 Fitting:  XGBClassifier-tuned ADASYN-base started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  6 Seconds
8/12 Fitting:  XGBClassifier-tuned ADASYN-clean started...
Fitting:  XGBClassifier-tuned finished. Elapsed tim

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
60,XGBClassifier-tuned,TomekLink clean,0.998115,0.998117,0.998115,0.998103,0.93722,0.931315,0.93722,0.909395
61,XGBClassifier-tuned,TomekLink clean,0.998115,0.998117,0.998115,0.998103,0.93722,0.931315,0.93722,0.909395
51,XGBClassifier-tuned,clean,0.997782,0.997785,0.997782,0.997765,0.936786,0.931787,0.936786,0.908122
50,XGBClassifier-tuned,base,0.996839,0.99685,0.996839,0.996804,0.93485,0.93074,0.93485,0.907133
55,XGBClassifier-tuned,BorderlineSMOTE clean,0.998787,0.998788,0.998787,0.998787,0.911326,0.904257,0.911326,0.907633
54,XGBClassifier-tuned,BorderlineSMOTE base,0.997383,0.997387,0.997383,0.997383,0.90749,0.902133,0.90749,0.904715
53,XGBClassifier-tuned,SMOTE clean,0.999406,0.999406,0.999406,0.999406,0.901924,0.898574,0.901924,0.900225
57,XGBClassifier-tuned,ADASYN clean,0.999166,0.999166,0.999166,0.999166,0.898308,0.898835,0.898308,0.898571
52,XGBClassifier-tuned,SMOTE base,0.998449,0.99845,0.998449,0.998449,0.896925,0.900538,0.896925,0.898699
56,XGBClassifier-tuned,ADASYN base,0.998427,0.998427,0.998427,0.998427,0.894352,0.897957,0.894352,0.896125


##### 2.2.1 CatBoostClassifier

In [53]:
cat = CatBoostClassifier(iterations=2,
                         depth=2,
                         learning_rate=1,
                         loss_function='Logloss',
                         verbose=True)

In [54]:
fit_model(cat, "CatBoostClassifier")

Starting model fitting.
1/12 Fitting:  CatBoostClassifier BASE started...
0:	learn: 0.2482938	total: 158ms	remaining: 158ms
1:	learn: 0.2445480	total: 166ms	remaining: 0us


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  7 Seconds
2/12 Fitting:  CatBoostClassifier CLEAN started...
0:	learn: 0.2422224	total: 9.27ms	remaining: 9.27ms
1:	learn: 0.2381026	total: 18.3ms	remaining: 0us


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  6 Seconds
3/12 Fitting:  CatBoostClassifier SMOTE-base started...
0:	learn: 0.6573267	total: 11.8ms	remaining: 11.8ms
1:	learn: 0.6294410	total: 22.5ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  9 Seconds
4/12 Fitting:  CatBoostClassifier SMOTE-clean started...
0:	learn: 0.6523384	total: 9.7ms	remaining: 9.7ms
1:	learn: 0.6275114	total: 18.3ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  8 Seconds
5/12 Fitting:  CatBoostClassifier BorderlineSMOTE-base started...
0:	learn: 0.6531919	total: 9.98ms	remaining: 9.98ms
1:	learn: 0.6279220	total: 20.3ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  8 Seconds
6/12 Fitting:  CatBoostClassifier BorderlineSMOTE-clean started...
0:	learn: 0.6451640	total: 15.1ms	remaining: 15.1ms
1:	learn: 0.6177823	total: 28.4ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  8 Seconds
7/12 Fitting:  CatBoostClassifie

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  5 Seconds
12/12 Fitting:  CatBoostClassifier TomekLink-clean started...
0:	learn: 0.2426231	total: 7.63ms	remaining: 7.63ms
1:	learn: 0.2389774	total: 15.1ms	remaining: 0us


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  4 Seconds
Model fitting finished.


In [55]:
evaluation[evaluation.model == "CatBoostClassifier"].sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
27,CatBoostClassifier,clean,0.934224,0.872774,0.934224,0.902454,0.935339,0.874859,0.935339,0.904089
36,CatBoostClassifier,TomekLink clean,0.934074,0.872495,0.934074,0.902235,0.935339,0.874859,0.935339,0.904089
37,CatBoostClassifier,TomekLink clean,0.934074,0.872495,0.934074,0.902235,0.935339,0.874859,0.935339,0.904089
26,CatBoostClassifier,base,0.931864,0.868371,0.931864,0.898998,0.931735,0.86813,0.931735,0.898809
30,CatBoostClassifier,BorderlineSMOTE base,0.661498,0.663853,0.661498,0.660278,0.693485,0.884205,0.693485,0.767804
31,CatBoostClassifier,BorderlineSMOTE clean,0.669488,0.670451,0.669488,0.669021,0.684363,0.890649,0.684363,0.763719
29,CatBoostClassifier,SMOTE clean,0.659604,0.659836,0.659604,0.65948,0.656878,0.891162,0.656878,0.743972
32,CatBoostClassifier,ADASYN base,0.657688,0.657701,0.657688,0.657684,0.645537,0.888848,0.645537,0.733368
35,CatBoostClassifier,NearMiss clean,0.58761,0.605143,0.58761,0.56967,0.63923,0.878632,0.63923,0.73083
28,CatBoostClassifier,SMOTE base,0.662443,0.6633,0.662443,0.662,0.614791,0.890127,0.614791,0.709803


##### 2.2.1 LGBMClassifier

In [56]:
light = LGBMClassifier()

In [57]:
fit_model(light, "LGBMClassifier")

Starting model fitting.
1/12 Fitting:  LGBMClassifier BASE started...
[LightGBM] [Info] Number of positive: 1509, number of negative: 20638
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 22147, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068136 -> initscore=-2.615687
[LightGBM] [Info] Start training from score -2.615687
Fitting:  LGBMClassifier finished. Elapsed time:  1 Seconds
2/12 Fitting:  LGBMClassifier CLEAN started...
[LightGBM] [Info] Number of positive: 1364, number of negative: 19373
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 207

In [58]:
evaluation[evaluation.model == "LGBMClassifier"].sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
48,LGBMClassifier,TomekLink clean,0.965926,0.967125,0.965926,0.960293,0.936062,0.922873,0.936062,0.906672
49,LGBMClassifier,TomekLink clean,0.965926,0.967125,0.965926,0.960293,0.936062,0.922873,0.936062,0.906672
39,LGBMClassifier,clean,0.966485,0.967646,0.966485,0.961052,0.935773,0.916859,0.935773,0.90625
38,LGBMClassifier,base,0.962568,0.964014,0.962568,0.955849,0.933496,0.922201,0.933496,0.904505
43,LGBMClassifier,BorderlineSMOTE clean,0.948072,0.948609,0.948072,0.948057,0.857949,0.900116,0.857949,0.87685
42,LGBMClassifier,BorderlineSMOTE base,0.941806,0.942448,0.941806,0.941785,0.848977,0.898356,0.848977,0.870578
41,LGBMClassifier,SMOTE clean,0.931967,0.932416,0.931967,0.93195,0.841603,0.902923,0.841603,0.868058
45,LGBMClassifier,ADASYN clean,0.932214,0.932896,0.932214,0.932202,0.835817,0.900703,0.835817,0.863911
40,LGBMClassifier,SMOTE base,0.928893,0.929438,0.928893,0.928871,0.834078,0.898682,0.834078,0.861683
44,LGBMClassifier,ADASYN base,0.929669,0.930422,0.929669,0.929637,0.829744,0.897243,0.829744,0.858625


#### 2.3 Stacking

In [59]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(probability=True, random_state=42))
]
stack = StackingClassifier(estimators=base_models)

In [60]:
# stack.fit(X_train_base, y_train_base)

#### Model comparision

In [61]:
evaluation.sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
25,XGBClassifier,TomekLink clean,0.99942,0.99942,0.99942,0.999419,0.936352,0.9168,0.936352,0.909647
24,XGBClassifier,TomekLink clean,0.99942,0.99942,0.99942,0.999419,0.936352,0.9168,0.936352,0.909647
48,LGBMClassifier,TomekLink clean,0.965926,0.967125,0.965926,0.960293,0.936062,0.922873,0.936062,0.906672
49,LGBMClassifier,TomekLink clean,0.965926,0.967125,0.965926,0.960293,0.936062,0.922873,0.936062,0.906672
1,RandomForest,clean,0.999711,0.999711,0.999711,0.99971,0.935918,0.924098,0.935918,0.906055
39,LGBMClassifier,clean,0.966485,0.967646,0.966485,0.961052,0.935773,0.916859,0.935773,0.90625
10,RandomForest,TomekLink clean,0.99971,0.99971,0.99971,0.99971,0.935773,0.921662,0.935773,0.905707
11,RandomForest,TomekLink clean,0.99971,0.99971,0.99971,0.99971,0.935773,0.918701,0.935773,0.905979
15,XGBClassifier,clean,0.999566,0.999566,0.999566,0.999565,0.935773,0.912286,0.935773,0.909072
14,XGBClassifier,base,0.999142,0.999142,0.999142,0.99914,0.935663,0.931619,0.935663,0.909167
