#### Imports & Downloads

In [349]:
import pandas as pd
import os
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import time
from sklearn.model_selection import GridSearchCV,

from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

SyntaxError: trailing comma not allowed without surrounding parentheses (3800709767.py, line 10)

### Einladen der Daten

#### Originaldaten ohne umfassende Vorverarbeitung

In [2]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_basic_cleaned.csv'))
df = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [3]:
df['label'].value_counts()
positive = len(df[df['label'] == 1])
negative = len(df[df['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 2013
Negative: 27517
Verhältnis: 13.669647292598112


#### Vorverarbeitete Daten

In [4]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df_cleaned.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,@user when a father is dysfunctional and is s...,father selfish drag kid run,1,['#run'],
2,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use cause offer van,2,"['#lyft', '#disapointed', '#getthanked']",
3,0,bihday your majesty,bihday majesty,0,[],
4,0,#model i love u take with u all the time in ...,model take time mobile phone kiss sunglass mou...,0,['#model'],":mobile_phone:,:kissing_face_with_smiling_eyes..."
5,0,factsguide: society now #motivation,factsguide society motivation,0,['#motivation'],


In [5]:
df_cleaned['label'].value_counts()
positive = len(df_cleaned[df_cleaned['label'] == 1])
negative = len(df_cleaned[df_cleaned['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 1811
Negative: 25839
Verhältnis: 14.26780784097184


Verhältnis hat sich durch die Bereinigung sogar noch weiter verschlechter

### Train/Test Split

In [207]:
def print_pos_neg(y_train, y_test):
    positive = np.count_nonzero(y_train == 1)
    negative = np.count_nonzero(y_train == 0)
    print("Train")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

    positive = np.count_nonzero(y_test == 1)
    negative = np.count_nonzero(y_test == 0)
    print("Test")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

In [267]:
X_base = df["tweet"]
y_base = df["label"]

In [268]:
# Ohne stratify
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.3, random_state=42)
print_pos_neg(y_train_base, y_test_base)

Train
- Positive: 1401
- Negative: 19270
- Verhältnis: 13.754461099214847
Test
- Positive: 612
- Negative: 8247
- Verhältnis: 13.47549019607843


In [269]:
# mit stratify
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.3, stratify=y_base,
                                                                        random_state=42)
print_pos_neg(y_train_base, y_test_base)

Train
- Positive: 1409
- Negative: 19262
- Verhältnis: 13.67068843151171
Test
- Positive: 604
- Negative: 8255
- Verhältnis: 13.667218543046358


In [270]:
X_clean = df_cleaned["tweet_cleaned"]
y_clean = df_cleaned["label"]

In [271]:
# Ohne stratify
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3,
                                                                            random_state=42)
print_pos_neg(y_train_clean, y_test_clean)

Train
- Positive: 66
- Negative: 4071
- Verhältnis: 61.68181818181818
Test
- Positive: 26
- Negative: 1748
- Verhältnis: 67.23076923076923


In [272]:
# mit stratify
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3,
                                                                            stratify=y_clean, random_state=42)
print_pos_neg(y_train_clean, y_test_clean)

Train
- Positive: 64
- Negative: 4073
- Verhältnis: 63.640625
Test
- Positive: 28
- Negative: 1746
- Verhältnis: 62.357142857142854


### Vektorisierung

In [None]:
# https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#Unsupervised_text_classification_Word2Vec

# def vectorize(sentence, w2v_model):
#     words = sentence.split()
#     words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
#     if len(words_vecs) == 0:
#         return np.zeros(100)
#     words_vecs = np.array(words_vecs)
#     return words_vecs.mean(axis=0)


# def vectorize_df(df):
#     X = df["tweet"]
#     y = df["label"]
#
#     sentences_base = [sentence.split() for sentence in X]
#     w2v_model_base = Word2Vec(sentences_base, window=5, min_count=5, workers=-1)
#
#     x_data = np.array([vectorize(sentence, w2v_model_base) for sentence in X])
#     return x_data, y


# def vectorize_df_cleaned(df):
#     X = df["tweet_cleaned"]
#     y = df["label"]
#
#     sentences_base = [sentence.split() for sentence in X]
#     w2v_model_base = Word2Vec(sentences_base, window=5, min_count=5, workers=-1, )
#
#     x_data = np.array([vectorize(sentence, w2v_model_base) for sentence in X])
#     return x_data, y


# X_base, y_base = vectorize_df(df)
# X_clean, y_clean = vectorize_df(df_cleaned)


# nicht richtig genutzt, führt zu falschen train/test Daten

In [273]:
vectorizer = TfidfVectorizer(max_features=5000)

X_train_base = vectorizer.fit_transform(X_train_base)
X_test_base = vectorizer.transform(X_test_base)

In [274]:
print("X_train_base shape", X_train_base.shape)
print("y_train_base shape", y_train_base.shape)

print("X_test_base shape", X_test_base.shape)
print("y_test_base shape", y_test_base.shape)

X_train_base shape (20671, 5000)
y_train_base shape (20671,)
X_test_base shape (8859, 5000)
y_test_base shape (8859,)


In [275]:
# df_cleaned.dropna(inplace=True)  #irgendwoher kommt eine na row in tweet_cleaned, dadurch funktioniert der Vectorizer nicht
vectorizer_clean = TfidfVectorizer(max_features=5000)

X_train_clean = vectorizer_clean.fit_transform(X_train_clean)
X_test_clean = vectorizer_clean.transform(X_test_clean)

In [277]:
print("X_train_clean shape", X_train_clean.shape)
print("y_train_clean shape", y_train_clean.shape)

print("X_test_clean shape", X_test_clean.shape)
print("y_test_clean shape", y_test_clean.shape)

X_train_clean shape (4137, 4093)
y_train_clean shape (4137,)
X_test_clean shape (1774, 4093)
y_test_clean shape (1774,)


### 1. Resampling Methods

In [278]:
def print_data(y_before, y_after, sampling_technique):
    counter_before = Counter(y_before)
    counter_after = Counter(y_after)
    print("Before sampling with:", sampling_technique, counter_before)
    print("After sampling with:", sampling_technique, counter_after)

#### 1.1 Oversampling

##### 1.1.1 SMOTE

In [279]:
os_smote = SMOTE()

In [280]:
X_train_base_s, y_train_base_s = os_smote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 19262, 1: 1409})
After sampling with: SMOTE Counter({1: 19262, 0: 19262})


In [281]:
X_train_clean_s, y_train_clean_s = os_smote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 4073, 1: 64})
After sampling with: SMOTE Counter({0: 4073, 1: 4073})


##### 1.1.2 Borderline-SMOTE

In [282]:
os_bsmote = BorderlineSMOTE()

In [283]:
X_train_base_bs, y_train_base_bs = os_bsmote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 19262, 1: 1409})
After sampling with: BorderlineSMOTE Counter({1: 19262, 0: 19262})


In [284]:
X_train_clean_bs, y_train_clean_bs = os_bsmote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 4073, 1: 64})
After sampling with: BorderlineSMOTE Counter({0: 4073, 1: 4073})


##### 1.1.3 ADASYN

In [285]:
os_ada = ADASYN()

In [286]:
X_train_base_a, y_train_base_a = os_ada.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 19262, 1: 1409})
After sampling with: ADASYN Counter({0: 19262, 1: 18876})


In [287]:
X_train_clean_a, y_train_clean_a = os_ada.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 4073, 1: 64})
After sampling with: ADASYN Counter({0: 4073, 1: 4071})


#### 1.2 Undersampling

##### 1.2.1 NearMiss

In [288]:
us_near_miss = NearMiss(version=3, n_neighbors_ver3=3)

In [289]:
X_train_base_nm, y_train_base_nm = us_near_miss.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 19262, 1: 1409})
After sampling with: NearMiss Counter({1: 1409, 0: 185})




In [290]:
X_train_clean_nm, y_train_clean_nm = us_near_miss.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 4073, 1: 64})
After sampling with: NearMiss Counter({0: 64, 1: 64})


##### 1.2.2 Condensed Nearest Neighbor

In [291]:
us_cnn = CondensedNearestNeighbour(n_neighbors=1, n_jobs=-1)  # sehr langsam

In [292]:
# X_train_base_cnn, y_train_base_cnn = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_cnn, "CondensedNearestNeighbour")

In [293]:
# X_train_clean_cnn, y_train_clean_cnn = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_cnn, "CondensedNearestNeighbour")

##### 1.2.3 Neighborhood Cleaning

In [294]:
us_cnn_cr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5, n_jobs=-1)  # sehr langsam

In [295]:
# X_train_base_ncr, y_train_base_ncr = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_ncr, "NeighbourhoodCleaningRule")

In [296]:
# X_train_clean_ncr, y_train_clean_ncr = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_ncr, "NeighbourhoodCleaningRule")

##### 1.2.4 Tomek Links Undersampler

In [297]:
us_tomek = TomekLinks()

In [298]:
X_train_base_t, y_train_base_t = us_tomek.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 19262, 1: 1409})
After sampling with: TomekLinks Counter({0: 19223, 1: 1409})


In [299]:
X_train_clean_t, y_train_clean_t = us_tomek.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 4073, 1: 64})
After sampling with: TomekLinks Counter({0: 4070, 1: 64})


### 2. Ensemble Models

In [300]:
evaluation = pd.DataFrame(
    columns=["model", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec",
             "test_rec", "test_f1"])

In [301]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train))
    train_recall = recall_score(y_data_train, model.predict(x_data_train))
    train_f1 = f1_score(y_data_train, model.predict(x_data_train))

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test))
    test_recall = recall_score(y_data_test, model.predict(x_data_test))
    test_f1 = f1_score(y_data_test, model.predict(x_data_test))

    evaluation.loc[len(evaluation.index)] = [model_name, variant, train_acc, train_precision, train_recall, train_f1,
                                             test_acc, test_precision, test_recall, test_f1]

In [302]:
def evaluate_model(model, x_test, y_test, sampling_method):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{sampling_method} model accuracy for classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

In [303]:
def fit_model(model, modelName):
    print("Starting model fitting.")

    start = time.time()
    print("1/12 Fitting: ", modelName, "BASE started...")
    model.fit(X_train_base, y_train_base)
    add_to_eval_df(model, modelName, "base", X_train_base, y_train_base, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("2/12 Fitting: ", modelName, "CLEAN started...")
    model.fit(X_train_clean, y_train_clean)
    add_to_eval_df(model, modelName, "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("3/12 Fitting: ", modelName, "SMOTE-base started...")
    model.fit(X_train_base_s, y_train_base_s)
    add_to_eval_df(model, modelName, "SMOTE base", X_train_base_s, y_train_base_s, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("4/12 Fitting: ", modelName, "SMOTE-clean started...")
    model.fit(X_train_clean_s, y_train_clean_s)
    add_to_eval_df(model, modelName, "SMOTE clean", X_train_clean_s, y_train_clean_s, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("5/12 Fitting: ", modelName, "BorderlineSMOTE-base started...")
    model.fit(X_train_base_bs, y_train_base_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE base", X_train_base_bs, y_train_base_bs, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("6/12 Fitting: ", modelName, "BorderlineSMOTE-clean started...")
    model.fit(X_train_clean_bs, y_train_clean_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE clean", X_train_clean_bs, y_train_clean_bs, X_test_clean,
                   y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("7/12 Fitting: ", modelName, "ADASYN-base started...")
    model.fit(X_train_base_a, y_train_base_a)
    add_to_eval_df(model, modelName, "ADASYN base", X_train_base_a, y_train_base_a, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("8/12 Fitting: ", modelName, "ADASYN-clean started...")
    model.fit(X_train_clean_a, y_train_clean_a)
    add_to_eval_df(model, modelName, "ADASYN clean", X_train_clean_a, y_train_clean_a, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("9/12 Fitting: ", modelName, "NearMiss-base started...")
    model.fit(X_train_base_nm, y_train_base_nm)
    add_to_eval_df(model, modelName, "NearMiss base", X_train_base_nm, y_train_base_nm, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("10/12 Fitting: ", modelName, "NearMiss-clean started...")
    model.fit(X_train_clean_nm, y_train_clean_nm)
    add_to_eval_df(model, modelName, "NearMiss clean", X_train_clean_nm, y_train_clean_nm, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("11/12 Fitting: ", modelName, "TomekLink-base started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink base", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("12/12 Fitting: ", modelName, "TomekLink-clean started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink clean", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    print("Model fitting finished.")

#### 2.1 Bagging

In [304]:
rf = RandomForestClassifier(n_jobs=-1)

In [305]:
fit_model(rf, "RandomForest")

Starting model fitting.
1/12 Fitting:  RandomForest BASE started...
Fitting:  RandomForest finished. Elapsed time:  3 Seconds
2/12 Fitting:  RandomForest CLEAN started...
Fitting:  RandomForest finished. Elapsed time:  1 Seconds
3/12 Fitting:  RandomForest SMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  8 Seconds
4/12 Fitting:  RandomForest SMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  1 Seconds
5/12 Fitting:  RandomForest BorderlineSMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  5 Seconds
6/12 Fitting:  RandomForest BorderlineSMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  1 Seconds
7/12 Fitting:  RandomForest ADASYN-base started...
Fitting:  RandomForest finished. Elapsed time:  6 Seconds
8/12 Fitting:  RandomForest ADASYN-clean started...
Fitting:  RandomForest finished. Elapsed time:  1 Seconds
9/12 Fitting:  RandomForest NearMiss-base started...
Fitting:  RandomForest finished. Elapsed time

In [306]:
brf = BalancedRandomForestClassifier(n_jobs=-1)

In [307]:
# BASE
brf.fit(X_train_base, y_train_base)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED base", X_train_base, y_train_base, X_test_base, y_test_base)



In [308]:
# CLEANED
brf.fit(X_train_clean, y_train_clean)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)



In [309]:
evaluation[(evaluation.model == "RandomForest") | (evaluation.model == "BalancedRandomForest")].sort_values(
    by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
3,RandomForest,SMOTE clean,1.0,1.0,1.0,1.0,0.988162,1.0,0.25,0.4
5,RandomForest,BorderlineSMOTE clean,1.0,1.0,1.0,1.0,0.988162,1.0,0.25,0.4
11,RandomForest,TomekLink clean,1.0,1.0,1.0,1.0,0.986471,1.0,0.142857,0.25
0,RandomForest,base,0.999855,1.0,0.997871,0.998934,0.956203,0.880282,0.413907,0.563063
7,RandomForest,ADASYN clean,1.0,1.0,1.0,1.0,0.987599,0.875,0.25,0.388889
4,RandomForest,BorderlineSMOTE base,0.999922,1.0,0.999844,0.999922,0.950785,0.868421,0.327815,0.475962
10,RandomForest,TomekLink base,1.0,1.0,1.0,1.0,0.985908,0.8,0.142857,0.242424
6,RandomForest,ADASYN base,0.999921,0.999947,0.999894,0.999921,0.952703,0.782875,0.423841,0.549946
2,RandomForest,SMOTE base,0.999922,1.0,0.999844,0.999922,0.952026,0.75942,0.433775,0.55216
1,RandomForest,clean,1.0,1.0,1.0,1.0,0.985344,0.75,0.107143,0.1875


#### 2.2 Boosting

##### 2.2.1a XGBClassifier

In [310]:
xgb = XGBClassifier()

In [311]:
fit_model(xgb, "XGBClassifier")

Starting model fitting.
1/12 Fitting:  XGBClassifier BASE started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
2/12 Fitting:  XGBClassifier CLEAN started...
Fitting:  XGBClassifier finished. Elapsed time:  1 Seconds
3/12 Fitting:  XGBClassifier SMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  12 Seconds
4/12 Fitting:  XGBClassifier SMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  1 Seconds
5/12 Fitting:  XGBClassifier BorderlineSMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  7 Seconds
6/12 Fitting:  XGBClassifier BorderlineSMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  1 Seconds
7/12 Fitting:  XGBClassifier ADASYN-base started...
Fitting:  XGBClassifier finished. Elapsed time:  10 Seconds
8/12 Fitting:  XGBClassifier ADASYN-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  2 Seconds
9/12 Fitting:  XGBClassifier NearMiss-base started...
Fitting:  XGBClassifier fi

In [312]:
evaluation[evaluation.model == "XGBClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
21,XGBClassifier,ADASYN clean,0.998404,1.0,0.996807,0.998401,0.988726,0.9,0.321429,0.473684
14,XGBClassifier,base,0.96512,0.972527,0.502484,0.662611,0.95101,0.824427,0.357616,0.498845
19,XGBClassifier,BorderlineSMOTE clean,0.997545,1.0,0.99509,0.997539,0.988162,0.818182,0.321429,0.461538
17,XGBClassifier,SMOTE clean,0.998527,1.0,0.997054,0.998525,0.987599,0.8,0.285714,0.421053
18,XGBClassifier,BorderlineSMOTE base,0.978221,0.993359,0.96288,0.977882,0.946947,0.681081,0.417219,0.517454
15,XGBClassifier,clean,0.992265,1.0,0.5,0.666667,0.985344,0.666667,0.142857,0.235294
20,XGBClassifier,ADASYN base,0.969374,0.991507,0.946228,0.968338,0.944012,0.644385,0.399007,0.492843
16,XGBClassifier,SMOTE base,0.969422,0.990347,0.948084,0.968755,0.944802,0.64268,0.428808,0.514399
24,XGBClassifier,TomekLink base,0.992259,1.0,0.5,0.666667,0.98478,0.6,0.107143,0.181818
25,XGBClassifier,TomekLink clean,0.992259,1.0,0.5,0.666667,0.98478,0.6,0.107143,0.181818


##### 2.2.1b XGBClassifier tuning


In [314]:
# todo erneut machen

param_test1 = {
    'max_depth': range(3, 10, 2),
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [315]:
# gsearch1.fit(X_train_clean, y_train_clean)
# gsearch1.best_score_, gsearch1.best_params_

In [316]:
param_test2 = {
    'min_child_weight': range(1, 6, 2),
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test2, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [317]:
# gsearch2.fit(X_train_clean, y_train_clean)
# gsearch2.best_score_, gsearch2.best_params_

In [318]:
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch3 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test3, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [319]:
# gsearch3.fit(X_train_clean, y_train_clean)
# gsearch3.best_score_, gsearch3.best_params_

In [320]:
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test4, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [321]:
# gsearch4.fit(X_train_clean, y_train_clean)
# gsearch4.best_score_, gsearch4.best_params_

In [322]:
param_test5 = {
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [323]:
# gsearch5.fit(X_train_clean, y_train_clean)
# gsearch5.best_score_, gsearch5.best_params_

In [324]:
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test6, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [325]:
# gsearch6.fit(X_train_clean, y_train_clean)
# gsearch6.best_score_, gsearch6.best_params_

In [326]:
xgb_tune = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [None]:
# fit_model(xgb_tune, "XGBClassifier-tuned")
# evaluation[evaluation.model == "XGBClassifier-tuned"].sort_values(by=["test_prec"], ascending=False)

##### 2.2.1 CatBoostClassifier

In [328]:
cat = CatBoostClassifier(iterations=2,
                         depth=2,
                         learning_rate=1,
                         loss_function='Logloss',
                         verbose=True)

In [329]:
fit_model(cat, "CatBoostClassifier")

Starting model fitting.
1/12 Fitting:  CatBoostClassifier BASE started...
0:	learn: 0.2487701	total: 16.5ms	remaining: 16.5ms
1:	learn: 0.2343139	total: 33.1ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  0 Seconds
2/12 Fitting:  CatBoostClassifier CLEAN started...
0:	learn: 0.0796309	total: 6.26ms	remaining: 6.26ms
1:	learn: 0.0749569	total: 12.1ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  0 Seconds
3/12 Fitting:  CatBoostClassifier SMOTE-base started...
0:	learn: 0.6007060	total: 25.2ms	remaining: 25.2ms
1:	learn: 0.5466845	total: 46.5ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  1 Seconds
4/12 Fitting:  CatBoostClassifier SMOTE-clean started...
0:	learn: 0.6029243	total: 8.2ms	remaining: 8.2ms
1:	learn: 0.5223909	total: 16.3ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  0 Seconds
5/12 Fitting:  CatBoostClassifier BorderlineSMOTE-base started...
0:	learn: 0.5941689	total: 22.5ms	rema

In [330]:
evaluation[evaluation.model == "CatBoostClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
38,CatBoostClassifier,base,0.936191,0.736842,0.099361,0.175109,0.936787,0.744444,0.110927,0.193084
41,CatBoostClassifier,SMOTE clean,0.717162,0.987865,0.439725,0.608563,0.981398,0.272727,0.107143,0.153846
43,CatBoostClassifier,BorderlineSMOTE clean,0.757918,0.979461,0.526884,0.685185,0.974634,0.206897,0.214286,0.210526
40,CatBoostClassifier,SMOTE base,0.715606,0.686216,0.794518,0.736407,0.639914,0.11403,0.63245,0.193222
44,CatBoostClassifier,ADASYN base,0.712098,0.68034,0.789044,0.730671,0.639914,0.11403,0.63245,0.193222
42,CatBoostClassifier,BorderlineSMOTE base,0.714464,0.665518,0.86232,0.751244,0.576363,0.107063,0.710265,0.186077
45,CatBoostClassifier,ADASYN clean,0.73502,0.926818,0.510194,0.658112,0.947012,0.076923,0.214286,0.113208
46,CatBoostClassifier,NearMiss base,0.891468,0.891635,0.998581,0.942082,0.072017,0.068148,0.995033,0.12756
47,CatBoostClassifier,NearMiss clean,0.570312,0.846154,0.171875,0.285714,0.967869,0.032258,0.035714,0.033898
39,CatBoostClassifier,clean,0.985738,1.0,0.078125,0.144928,0.983653,0.0,0.0,0.0


##### 2.2.1 LGBMClassifier

In [331]:
light = LGBMClassifier()

In [332]:
fit_model(light, "LGBMClassifier")

Starting model fitting.
1/12 Fitting:  LGBMClassifier BASE started...
[LightGBM] [Info] Number of positive: 1409, number of negative: 19262
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45936
[LightGBM] [Info] Number of data points in the train set: 20671, number of used features: 1379
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068163 -> initscore=-2.615254
[LightGBM] [Info] Start training from score -2.615254
Fitting:  LGBMClassifier finished. Elapsed time:  9 Seconds
2/12 Fitting:  LGBMClassifier CLEAN started...
[LightGBM] [Info] Number of positive: 64, number of negative: 4073
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  LGBMClassifier finished. Elapsed time:  0 Seconds
12/12 Fitting:  LGBMClassifier TomekLink-clean started...
[LightGBM] [Info] Number of positive: 64, number of negative: 4070
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7829
[LightGBM] [Info] Number of data points in the train set: 4134, number of used features: 356
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015481 -> initscore=-4.152515
[LightGBM] [Info] Start training from score -4.152515
Fitting:  LGBMClassifier finished. Elapsed time:  0 Seconds
Model fitting finished.


In [333]:
evaluation[evaluation.model == "LGBMClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
55,LGBMClassifier,BorderlineSMOTE clean,1.0,1.0,1.0,1.0,0.987599,0.8,0.285714,0.421053
50,LGBMClassifier,base,0.964298,0.944371,0.506033,0.658965,0.948301,0.753472,0.359272,0.486547
53,LGBMClassifier,SMOTE clean,1.0,1.0,1.0,1.0,0.987035,0.727273,0.285714,0.410256
57,LGBMClassifier,ADASYN clean,1.0,1.0,1.0,1.0,0.987035,0.727273,0.285714,0.410256
54,LGBMClassifier,BorderlineSMOTE base,0.976846,0.984645,0.968799,0.976658,0.945592,0.632035,0.483444,0.547842
52,LGBMClassifier,SMOTE base,0.969733,0.97873,0.960336,0.969446,0.941641,0.581006,0.516556,0.546889
56,LGBMClassifier,ADASYN base,0.969138,0.978688,0.958519,0.968498,0.940738,0.574388,0.504967,0.537445
51,LGBMClassifier,clean,0.999758,1.0,0.984375,0.992126,0.981398,0.222222,0.071429,0.108108
60,LGBMClassifier,TomekLink base,0.999758,0.984615,1.0,0.992248,0.981398,0.222222,0.071429,0.108108
61,LGBMClassifier,TomekLink clean,0.999758,0.984615,1.0,0.992248,0.981398,0.222222,0.071429,0.108108


#### 2.3 Stacking

In [334]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', LinearSVC(random_state=42))
]
stack = StackingClassifier(estimators=base_models)

In [335]:
# stack.fit(X_train_base, y_train_base)

#### Model comparision

In [336]:
evaluation.sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
11,RandomForest,TomekLink clean,1.000000,1.000000,1.000000,1.000000,0.986471,1.000000,0.142857,0.250000
3,RandomForest,SMOTE clean,1.000000,1.000000,1.000000,1.000000,0.988162,1.000000,0.250000,0.400000
5,RandomForest,BorderlineSMOTE clean,1.000000,1.000000,1.000000,1.000000,0.988162,1.000000,0.250000,0.400000
21,XGBClassifier,ADASYN clean,0.998404,1.000000,0.996807,0.998401,0.988726,0.900000,0.321429,0.473684
0,RandomForest,base,0.999855,1.000000,0.997871,0.998934,0.956203,0.880282,0.413907,0.563063
...,...,...,...,...,...,...,...,...,...,...
49,CatBoostClassifier,TomekLink clean,0.985728,0.857143,0.093750,0.169014,0.983089,0.000000,0.000000,0.000000
36,XGBClassifier-tuned,TomekLink base,0.984519,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000
39,CatBoostClassifier,clean,0.985738,1.000000,0.078125,0.144928,0.983653,0.000000,0.000000,0.000000
37,XGBClassifier-tuned,TomekLink clean,0.984519,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000


In [337]:
evaluation.sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
46,CatBoostClassifier,NearMiss base,0.891468,0.891635,0.998581,0.942082,0.072017,0.068148,0.995033,0.127560
34,XGBClassifier-tuned,NearMiss base,0.911543,0.912760,0.995032,0.952122,0.083644,0.067763,0.975166,0.126721
8,RandomForest,NearMiss base,0.998118,1.000000,0.997871,0.998934,0.222711,0.078502,0.968543,0.145233
22,XGBClassifier,NearMiss base,0.966123,0.969508,0.992903,0.981066,0.137600,0.070976,0.963576,0.132213
58,LGBMClassifier,NearMiss base,0.971142,0.975576,0.992193,0.983814,0.138052,0.070801,0.960265,0.131878
...,...,...,...,...,...,...,...,...,...,...
49,CatBoostClassifier,TomekLink clean,0.985728,0.857143,0.093750,0.169014,0.983089,0.000000,0.000000,0.000000
48,CatBoostClassifier,TomekLink base,0.985728,0.857143,0.093750,0.169014,0.983089,0.000000,0.000000,0.000000
36,XGBClassifier-tuned,TomekLink base,0.984519,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000
37,XGBClassifier-tuned,TomekLink clean,0.984519,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000


In [338]:
evaluation.sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
0,RandomForest,base,0.999855,1.000000,0.997871,0.998934,0.956203,0.880282,0.413907,0.563063
2,RandomForest,SMOTE base,0.999922,1.000000,0.999844,0.999922,0.952026,0.759420,0.433775,0.552160
6,RandomForest,ADASYN base,0.999921,0.999947,0.999894,0.999921,0.952703,0.782875,0.423841,0.549946
54,LGBMClassifier,BorderlineSMOTE base,0.976846,0.984645,0.968799,0.976658,0.945592,0.632035,0.483444,0.547842
52,LGBMClassifier,SMOTE base,0.969733,0.978730,0.960336,0.969446,0.941641,0.581006,0.516556,0.546889
...,...,...,...,...,...,...,...,...,...,...
37,XGBClassifier-tuned,TomekLink clean,0.984519,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000
36,XGBClassifier-tuned,TomekLink base,0.984519,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000
35,XGBClassifier-tuned,NearMiss clean,0.500000,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000
59,LGBMClassifier,NearMiss clean,0.500000,0.000000,0.000000,0.000000,0.984216,0.000000,0.000000,0.000000


#### 3. Einfache SVM / Bayes-Classifier zum Vergleich

##### 3.1 LinearSVC

In [339]:
svc = LinearSVC()

In [340]:
fit_model(svc, "LinearSVC")

Starting model fitting.
1/12 Fitting:  LinearSVC BASE started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
2/12 Fitting:  LinearSVC CLEAN started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
3/12 Fitting:  LinearSVC SMOTE-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
4/12 Fitting:  LinearSVC SMOTE-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
5/12 Fitting:  LinearSVC BorderlineSMOTE-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
6/12 Fitting:  LinearSVC BorderlineSMOTE-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
7/12 Fitting:  LinearSVC ADASYN-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
8/12 Fitting:  LinearSVC ADASYN-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
9/12 Fitting:  LinearSVC NearMiss-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
10/12 Fitting:  LinearSVC NearMiss-clean 

In [341]:
evaluation[evaluation.model == "LinearSVC"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
62,LinearSVC,base,0.980891,0.967712,0.7445,0.841556,0.957896,0.8,0.509934,0.622851
63,LinearSVC,clean,0.99855,1.0,0.90625,0.95082,0.986471,0.75,0.214286,0.333333
65,LinearSVC,SMOTE clean,1.0,1.0,1.0,1.0,0.987599,0.75,0.321429,0.45
69,LinearSVC,ADASYN clean,1.0,1.0,1.0,1.0,0.987599,0.75,0.321429,0.45
72,LinearSVC,TomekLink base,0.998549,1.0,0.90625,0.95082,0.986471,0.75,0.214286,0.333333
73,LinearSVC,TomekLink clean,0.998549,1.0,0.90625,0.95082,0.986471,0.75,0.214286,0.333333
67,LinearSVC,BorderlineSMOTE clean,1.0,1.0,1.0,1.0,0.987035,0.727273,0.285714,0.410256
66,LinearSVC,BorderlineSMOTE base,0.989643,0.991096,0.988163,0.989627,0.947963,0.623489,0.597682,0.610313
64,LinearSVC,SMOTE base,0.987956,0.978467,0.997871,0.988074,0.93453,0.514563,0.701987,0.593838
68,LinearSVC,ADASYN base,0.988201,0.977655,0.998993,0.988209,0.933175,0.507126,0.706954,0.590595


In [342]:
svc = LinearSVC(
    class_weight="balanced")  # ohne balanced wird die SVC in einigen Fällen nur auf 0 trainiert, d.h. precision = 0

In [343]:
fit_model(svc, "LinearSVC-balanced")

Starting model fitting.
1/12 Fitting:  LinearSVC-balanced BASE started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
2/12 Fitting:  LinearSVC-balanced CLEAN started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
3/12 Fitting:  LinearSVC-balanced SMOTE-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
4/12 Fitting:  LinearSVC-balanced SMOTE-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
5/12 Fitting:  LinearSVC-balanced BorderlineSMOTE-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
6/12 Fitting:  LinearSVC-balanced BorderlineSMOTE-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
7/12 Fitting:  LinearSVC-balanced ADASYN-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
8/12 Fitting:  LinearSVC-balanced ADASYN-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
9/

In [351]:
evaluation[evaluation.model == "LinearSVC-balanced"].sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
78,LinearSVC-balanced,BorderlineSMOTE base,0.989643,0.991096,0.988163,0.989627,0.947963,0.623489,0.597682,0.610313
76,LinearSVC-balanced,SMOTE base,0.987956,0.978467,0.997871,0.988074,0.93453,0.514563,0.701987,0.593838
74,LinearSVC-balanced,base,0.969619,0.69227,0.997871,0.817442,0.930128,0.491803,0.745033,0.592495
80,LinearSVC-balanced,ADASYN base,0.987965,0.9771,0.999099,0.987977,0.932498,0.503538,0.706954,0.588154
75,LinearSVC-balanced,clean,0.999758,0.984615,1.0,0.992248,0.987599,0.75,0.321429,0.45
77,LinearSVC-balanced,SMOTE clean,1.0,1.0,1.0,1.0,0.987599,0.75,0.321429,0.45
81,LinearSVC-balanced,ADASYN clean,1.0,1.0,1.0,1.0,0.987599,0.75,0.321429,0.45
84,LinearSVC-balanced,TomekLink base,0.999758,0.984615,1.0,0.992248,0.987599,0.75,0.321429,0.45
85,LinearSVC-balanced,TomekLink clean,0.999758,0.984615,1.0,0.992248,0.987599,0.75,0.321429,0.45
79,LinearSVC-balanced,BorderlineSMOTE clean,1.0,1.0,1.0,1.0,0.987035,0.727273,0.285714,0.410256


##### 3.2 Bayes-Classifier

In [345]:
from sklearn.naive_bayes import GaussianNB

In [346]:
bayes = GaussianNB()

In [347]:
fit_model(svc, "GaussianNB")

Starting model fitting.
1/12 Fitting:  GaussianNB BASE started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
2/12 Fitting:  GaussianNB CLEAN started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
3/12 Fitting:  GaussianNB SMOTE-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
4/12 Fitting:  GaussianNB SMOTE-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
5/12 Fitting:  GaussianNB BorderlineSMOTE-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
6/12 Fitting:  GaussianNB BorderlineSMOTE-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
7/12 Fitting:  GaussianNB ADASYN-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
8/12 Fitting:  GaussianNB ADASYN-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
9/12 Fitting:  GaussianNB NearMiss-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
10/12 Fitting:  Gaussia

In [350]:
evaluation[evaluation.model == "GaussianNB"].sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
90,GaussianNB,BorderlineSMOTE base,0.989643,0.991096,0.988163,0.989627,0.947963,0.623489,0.597682,0.610313
88,GaussianNB,SMOTE base,0.987956,0.978467,0.997871,0.988074,0.93453,0.514563,0.701987,0.593838
86,GaussianNB,base,0.969619,0.69227,0.997871,0.817442,0.930128,0.491803,0.745033,0.592495
92,GaussianNB,ADASYN base,0.987965,0.9771,0.999099,0.987977,0.932498,0.503538,0.706954,0.588154
87,GaussianNB,clean,0.999758,0.984615,1.0,0.992248,0.987599,0.75,0.321429,0.45
89,GaussianNB,SMOTE clean,1.0,1.0,1.0,1.0,0.987599,0.75,0.321429,0.45
93,GaussianNB,ADASYN clean,1.0,1.0,1.0,1.0,0.987599,0.75,0.321429,0.45
96,GaussianNB,TomekLink base,0.999758,0.984615,1.0,0.992248,0.987599,0.75,0.321429,0.45
97,GaussianNB,TomekLink clean,0.999758,0.984615,1.0,0.992248,0.987599,0.75,0.321429,0.45
91,GaussianNB,BorderlineSMOTE clean,1.0,1.0,1.0,1.0,0.987035,0.727273,0.285714,0.410256
