#### Imports & Downloads

In [2]:
import pandas as pd
import os
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import time
from sklearn.model_selection import GridSearchCV

from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

### Einladen der Daten

#### Originaldaten ohne umfassende Vorverarbeitung

In [3]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_basic_cleaned.csv'))
df = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [4]:
df['label'].value_counts()
positive = len(df[df['label'] == 1])
negative = len(df[df['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 2013
Negative: 27517
Verhältnis: 13.669647292598112


#### Vorverarbeitete Daten

In [5]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df_cleaned.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,@user when a father is dysfunctional and is s...,father selfish drag kid run,1,['#run'],
2,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use cause offer van,2,"['#lyft', '#disapointed', '#getthanked']",
3,0,bihday your majesty,bihday majesty,0,[],
4,0,#model i love u take with u all the time in ...,model take time mobile phone kiss sunglass mou...,0,['#model'],":mobile_phone:,:kissing_face_with_smiling_eyes..."
5,0,factsguide: society now #motivation,factsguide society motivation,0,['#motivation'],


In [6]:
df_cleaned.dropna(subset=['tweet_cleaned'], inplace=True)  #irgendwoher kommt eine na row in tweet_cleaned, dadurch funktioniert der Vectorizer nicht

In [7]:
df_cleaned['label'].value_counts()
positive = len(df_cleaned[df_cleaned['label'] == 1])
negative = len(df_cleaned[df_cleaned['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 1811
Negative: 25838
Verhältnis: 14.267255659856433


Verhältnis hat sich durch die Bereinigung sogar noch weiter verschlechter

### Train/Test Split

In [8]:
def print_pos_neg(y_train, y_test):
    positive = np.count_nonzero(y_train == 1)
    negative = np.count_nonzero(y_train == 0)
    print("Train")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

    positive = np.count_nonzero(y_test == 1)
    negative = np.count_nonzero(y_test == 0)
    print("Test")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

In [9]:
X_base = df["tweet"]
y_base = df["label"]

In [10]:
# Ohne stratify
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.3, random_state=42)
print_pos_neg(y_train_base, y_test_base)

Train
- Positive: 1401
- Negative: 19270
- Verhältnis: 13.754461099214847
Test
- Positive: 612
- Negative: 8247
- Verhältnis: 13.47549019607843


In [11]:
# mit stratify
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.3, stratify=y_base,
                                                                        random_state=42)
print_pos_neg(y_train_base, y_test_base)

Train
- Positive: 1409
- Negative: 19262
- Verhältnis: 13.67068843151171
Test
- Positive: 604
- Negative: 8255
- Verhältnis: 13.667218543046358


In [12]:
X_clean = df_cleaned["tweet_cleaned"]
y_clean = df_cleaned["label"]

In [13]:
# Ohne stratify
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3,
                                                                            random_state=42)
print_pos_neg(y_train_clean, y_test_clean)

Train
- Positive: 1270
- Negative: 18084
- Verhältnis: 14.239370078740157
Test
- Positive: 541
- Negative: 7754
- Verhältnis: 14.33271719038817


In [14]:
# mit stratify
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3,
                                                                            stratify=y_clean, random_state=42)
print_pos_neg(y_train_clean, y_test_clean)

Train
- Positive: 1268
- Negative: 18086
- Verhältnis: 14.263406940063092
Test
- Positive: 543
- Negative: 7752
- Verhältnis: 14.276243093922652


### Vektorisierung

In [15]:
# https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#Unsupervised_text_classification_Word2Vec

# def vectorize(sentence, w2v_model):
#     words = sentence.split()
#     words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
#     if len(words_vecs) == 0:
#         return np.zeros(100)
#     words_vecs = np.array(words_vecs)
#     return words_vecs.mean(axis=0)


# def vectorize_df(df):
#     X = df["tweet"]
#     y = df["label"]
#
#     sentences_base = [sentence.split() for sentence in X]
#     w2v_model_base = Word2Vec(sentences_base, window=5, min_count=5, workers=-1)
#
#     x_data = np.array([vectorize(sentence, w2v_model_base) for sentence in X])
#     return x_data, y


# def vectorize_df_cleaned(df):
#     X = df["tweet_cleaned"]
#     y = df["label"]
#
#     sentences_base = [sentence.split() for sentence in X]
#     w2v_model_base = Word2Vec(sentences_base, window=5, min_count=5, workers=-1, )
#
#     x_data = np.array([vectorize(sentence, w2v_model_base) for sentence in X])
#     return x_data, y


# X_base, y_base = vectorize_df(df)
# X_clean, y_clean = vectorize_df(df_cleaned)


# nicht richtig genutzt, führt zu falschen train/test Daten

In [16]:
vectorizer = TfidfVectorizer(max_features=5000)

X_train_base = vectorizer.fit_transform(X_train_base)
X_test_base = vectorizer.transform(X_test_base)

In [17]:
print("X_train_base shape", X_train_base.shape)
print("y_train_base shape", y_train_base.shape)

print("X_test_base shape", X_test_base.shape)
print("y_test_base shape", y_test_base.shape)

X_train_base shape (20671, 5000)
y_train_base shape (20671,)
X_test_base shape (8859, 5000)
y_test_base shape (8859,)


In [18]:
vectorizer_clean = TfidfVectorizer(max_features=5000)

X_train_clean = vectorizer_clean.fit_transform(X_train_clean)
X_test_clean = vectorizer_clean.transform(X_test_clean)

In [19]:
print("X_train_clean shape", X_train_clean.shape)
print("y_train_clean shape", y_train_clean.shape)

print("X_test_clean shape", X_test_clean.shape)
print("y_test_clean shape", y_test_clean.shape)

X_train_clean shape (19354, 5000)
y_train_clean shape (19354,)
X_test_clean shape (8295, 5000)
y_test_clean shape (8295,)


### 1. Resampling Methods

In [20]:
def print_data(y_before, y_after, sampling_technique):
    counter_before = Counter(y_before)
    counter_after = Counter(y_after)
    print("Before sampling with:", sampling_technique, counter_before)
    print("After sampling with:", sampling_technique, counter_after)

#### 1.1 Oversampling

##### 1.1.1 SMOTE

In [21]:
os_smote = SMOTE()

In [22]:
X_train_base_s, y_train_base_s = os_smote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 19262, 1: 1409})
After sampling with: SMOTE Counter({1: 19262, 0: 19262})


In [23]:
X_train_clean_s, y_train_clean_s = os_smote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 18086, 1: 1268})
After sampling with: SMOTE Counter({0: 18086, 1: 18086})


##### 1.1.2 Borderline-SMOTE

In [24]:
os_bsmote = BorderlineSMOTE()

In [25]:
X_train_base_bs, y_train_base_bs = os_bsmote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 19262, 1: 1409})
After sampling with: BorderlineSMOTE Counter({1: 19262, 0: 19262})


In [26]:
X_train_clean_bs, y_train_clean_bs = os_bsmote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 18086, 1: 1268})
After sampling with: BorderlineSMOTE Counter({0: 18086, 1: 18086})


##### 1.1.3 ADASYN

In [27]:
os_ada = ADASYN()

In [28]:
X_train_base_a, y_train_base_a = os_ada.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 19262, 1: 1409})
After sampling with: ADASYN Counter({0: 19262, 1: 18876})


In [29]:
X_train_clean_a, y_train_clean_a = os_ada.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 18086, 1: 1268})
After sampling with: ADASYN Counter({1: 18656, 0: 18086})


#### 1.2 Undersampling

##### 1.2.1 NearMiss

In [30]:
us_near_miss = NearMiss(version=3, n_neighbors_ver3=3)

In [31]:
X_train_base_nm, y_train_base_nm = us_near_miss.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 19262, 1: 1409})
After sampling with: NearMiss Counter({1: 1409, 0: 185})




In [32]:
X_train_clean_nm, y_train_clean_nm = us_near_miss.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 18086, 1: 1268})
After sampling with: NearMiss Counter({1: 1268, 0: 357})




##### 1.2.2 Condensed Nearest Neighbor

In [33]:
us_cnn = CondensedNearestNeighbour(n_neighbors=1, n_jobs=-1)  # sehr langsam

In [34]:
# X_train_base_cnn, y_train_base_cnn = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_cnn, "CondensedNearestNeighbour")

In [35]:
# X_train_clean_cnn, y_train_clean_cnn = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_cnn, "CondensedNearestNeighbour")

##### 1.2.3 Neighborhood Cleaning

In [36]:
us_cnn_cr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5, n_jobs=-1)  # sehr langsam

In [37]:
# X_train_base_ncr, y_train_base_ncr = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_ncr, "NeighbourhoodCleaningRule")

In [38]:
# X_train_clean_ncr, y_train_clean_ncr = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_ncr, "NeighbourhoodCleaningRule")

##### 1.2.4 Tomek Links Undersampler

In [39]:
us_tomek = TomekLinks()

In [40]:
X_train_base_t, y_train_base_t = us_tomek.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 19262, 1: 1409})
After sampling with: TomekLinks Counter({0: 19223, 1: 1409})


In [41]:
X_train_clean_t, y_train_clean_t = us_tomek.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 18086, 1: 1268})
After sampling with: TomekLinks Counter({0: 18022, 1: 1268})


### 2. Ensemble Models

In [42]:
evaluation = pd.DataFrame(
    columns=["model", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec",
             "test_rec", "test_f1"])

In [43]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train))
    train_recall = recall_score(y_data_train, model.predict(x_data_train))
    train_f1 = f1_score(y_data_train, model.predict(x_data_train))

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test))
    test_recall = recall_score(y_data_test, model.predict(x_data_test))
    test_f1 = f1_score(y_data_test, model.predict(x_data_test))

    evaluation.loc[len(evaluation.index)] = [model_name, variant, train_acc, train_precision, train_recall, train_f1,
                                             test_acc, test_precision, test_recall, test_f1]

In [44]:
def evaluate_model(model, x_test, y_test, sampling_method):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{sampling_method} model accuracy for classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

In [45]:
def fit_model(model, modelName):
    print("Starting model fitting.")

    start = time.time()
    print("1/12 Fitting: ", modelName, "BASE started...")
    model.fit(X_train_base, y_train_base)
    add_to_eval_df(model, modelName, "base", X_train_base, y_train_base, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("2/12 Fitting: ", modelName, "CLEAN started...")
    model.fit(X_train_clean, y_train_clean)
    add_to_eval_df(model, modelName, "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("3/12 Fitting: ", modelName, "SMOTE-base started...")
    model.fit(X_train_base_s, y_train_base_s)
    add_to_eval_df(model, modelName, "SMOTE base", X_train_base_s, y_train_base_s, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("4/12 Fitting: ", modelName, "SMOTE-clean started...")
    model.fit(X_train_clean_s, y_train_clean_s)
    add_to_eval_df(model, modelName, "SMOTE clean", X_train_clean_s, y_train_clean_s, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("5/12 Fitting: ", modelName, "BorderlineSMOTE-base started...")
    model.fit(X_train_base_bs, y_train_base_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE base", X_train_base_bs, y_train_base_bs, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("6/12 Fitting: ", modelName, "BorderlineSMOTE-clean started...")
    model.fit(X_train_clean_bs, y_train_clean_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE clean", X_train_clean_bs, y_train_clean_bs, X_test_clean,
                   y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("7/12 Fitting: ", modelName, "ADASYN-base started...")
    model.fit(X_train_base_a, y_train_base_a)
    add_to_eval_df(model, modelName, "ADASYN base", X_train_base_a, y_train_base_a, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("8/12 Fitting: ", modelName, "ADASYN-clean started...")
    model.fit(X_train_clean_a, y_train_clean_a)
    add_to_eval_df(model, modelName, "ADASYN clean", X_train_clean_a, y_train_clean_a, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("9/12 Fitting: ", modelName, "NearMiss-base started...")
    model.fit(X_train_base_nm, y_train_base_nm)
    add_to_eval_df(model, modelName, "NearMiss base", X_train_base_nm, y_train_base_nm, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("10/12 Fitting: ", modelName, "NearMiss-clean started...")
    model.fit(X_train_clean_nm, y_train_clean_nm)
    add_to_eval_df(model, modelName, "NearMiss clean", X_train_clean_nm, y_train_clean_nm, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("11/12 Fitting: ", modelName, "TomekLink-base started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink base", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("12/12 Fitting: ", modelName, "TomekLink-clean started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink clean", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    print("Model fitting finished.")

#### 2.1 Bagging

In [46]:
rf = RandomForestClassifier(n_jobs=-1)

In [47]:
fit_model(rf, "RandomForest")

Starting model fitting.
1/12 Fitting:  RandomForest BASE started...
Fitting:  RandomForest finished. Elapsed time:  4 Seconds
2/12 Fitting:  RandomForest CLEAN started...
Fitting:  RandomForest finished. Elapsed time:  7 Seconds
3/12 Fitting:  RandomForest SMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  16 Seconds
4/12 Fitting:  RandomForest SMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  7 Seconds
5/12 Fitting:  RandomForest BorderlineSMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  5 Seconds
6/12 Fitting:  RandomForest BorderlineSMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  6 Seconds
7/12 Fitting:  RandomForest ADASYN-base started...
Fitting:  RandomForest finished. Elapsed time:  6 Seconds
8/12 Fitting:  RandomForest ADASYN-clean started...
Fitting:  RandomForest finished. Elapsed time:  9 Seconds
9/12 Fitting:  RandomForest NearMiss-base started...
Fitting:  RandomForest finished. Elapsed tim

In [48]:
brf = BalancedRandomForestClassifier(n_jobs=-1)

In [49]:
# BASE
brf.fit(X_train_base, y_train_base)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED base", X_train_base, y_train_base, X_test_base, y_test_base)



In [50]:
# CLEANED
brf.fit(X_train_clean, y_train_clean)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)



In [51]:
evaluation[(evaluation.model == "RandomForest") | (evaluation.model == "BalancedRandomForest")].sort_values(
    by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
4,RandomForest,BorderlineSMOTE base,0.999922,0.999948,0.999896,0.999922,0.951913,0.886957,0.337748,0.489209
0,RandomForest,base,0.999855,1.0,0.997871,0.998934,0.956316,0.867797,0.423841,0.569522
10,RandomForest,TomekLink base,0.999844,1.0,0.997634,0.998816,0.955515,0.786184,0.440147,0.564345
2,RandomForest,SMOTE base,0.999922,0.999948,0.999896,0.999922,0.952478,0.776435,0.425497,0.549733
1,RandomForest,clean,0.999845,1.0,0.997634,0.998816,0.955395,0.774603,0.449355,0.568765
11,RandomForest,TomekLink clean,0.999793,1.0,0.996845,0.99842,0.95443,0.770492,0.432781,0.554245
6,RandomForest,ADASYN base,0.999921,0.999947,0.999894,0.999921,0.950672,0.749254,0.415563,0.534611
5,RandomForest,BorderlineSMOTE clean,0.999917,0.999889,0.999945,0.999917,0.9478,0.612705,0.550645,0.580019
7,RandomForest,ADASYN clean,0.999918,0.999946,0.999893,0.99992,0.94563,0.591633,0.546961,0.568421
3,RandomForest,SMOTE clean,0.999917,0.999889,0.999945,0.999917,0.945389,0.586873,0.559853,0.573044


#### 2.2 Boosting

##### 2.2.1a XGBClassifier

In [52]:
xgb = XGBClassifier()

In [53]:
fit_model(xgb, "XGBClassifier")

Starting model fitting.
1/12 Fitting:  XGBClassifier BASE started...
Fitting:  XGBClassifier finished. Elapsed time:  4 Seconds
2/12 Fitting:  XGBClassifier CLEAN started...
Fitting:  XGBClassifier finished. Elapsed time:  4 Seconds
3/12 Fitting:  XGBClassifier SMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  11 Seconds
4/12 Fitting:  XGBClassifier SMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  13 Seconds
5/12 Fitting:  XGBClassifier BorderlineSMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  10 Seconds
6/12 Fitting:  XGBClassifier BorderlineSMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  6 Seconds
7/12 Fitting:  XGBClassifier ADASYN-base started...
Fitting:  XGBClassifier finished. Elapsed time:  6 Seconds
8/12 Fitting:  XGBClassifier ADASYN-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  6 Seconds
9/12 Fitting:  XGBClassifier NearMiss-base started...
Fitting:  XGBClassifier f

In [54]:
evaluation[evaluation.model == "XGBClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
14,XGBClassifier,base,0.96512,0.972527,0.502484,0.662611,0.95101,0.824427,0.357616,0.498845
15,XGBClassifier,clean,0.961455,0.945392,0.436909,0.597627,0.949367,0.78341,0.313076,0.447368
24,XGBClassifier,TomekLink base,0.961172,0.954466,0.429811,0.592713,0.948764,0.780952,0.302026,0.435591
25,XGBClassifier,TomekLink clean,0.961172,0.954466,0.429811,0.592713,0.948764,0.780952,0.302026,0.435591
18,XGBClassifier,BorderlineSMOTE base,0.978169,0.993252,0.96288,0.97783,0.947511,0.692521,0.413907,0.518135
16,XGBClassifier,SMOTE base,0.968877,0.990869,0.946475,0.968163,0.944463,0.636585,0.432119,0.514793
20,XGBClassifier,ADASYN base,0.969427,0.990364,0.947446,0.96843,0.942996,0.625954,0.407285,0.49348
19,XGBClassifier,BorderlineSMOTE clean,0.947418,0.975273,0.918113,0.94583,0.940567,0.549213,0.513812,0.530923
17,XGBClassifier,SMOTE clean,0.942248,0.974379,0.908382,0.940224,0.937553,0.524178,0.499079,0.511321
21,XGBClassifier,ADASYN clean,0.942083,0.974452,0.909788,0.94101,0.937673,0.52381,0.526703,0.525253


##### 2.2.1b XGBClassifier tuning


In [55]:
# todo erneut machen

param_test1 = {
    'max_depth': range(3, 10, 2),
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [56]:
# gsearch1.fit(X_train_clean, y_train_clean)
# gsearch1.best_score_, gsearch1.best_params_

In [57]:
param_test2 = {
    'min_child_weight': range(1, 6, 2),
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test2, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [58]:
# gsearch2.fit(X_train_clean, y_train_clean)
# gsearch2.best_score_, gsearch2.best_params_

In [59]:
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch3 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test3, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [60]:
# gsearch3.fit(X_train_clean, y_train_clean)
# gsearch3.best_score_, gsearch3.best_params_

In [61]:
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test4, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [62]:
# gsearch4.fit(X_train_clean, y_train_clean)
# gsearch4.best_score_, gsearch4.best_params_

In [63]:
param_test5 = {
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [64]:
# gsearch5.fit(X_train_clean, y_train_clean)
# gsearch5.best_score_, gsearch5.best_params_

In [65]:
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test6, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [66]:
# gsearch6.fit(X_train_clean, y_train_clean)
# gsearch6.best_score_, gsearch6.best_params_

In [67]:
xgb_tune = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [68]:
# fit_model(xgb_tune, "XGBClassifier-tuned")
# evaluation[evaluation.model == "XGBClassifier-tuned"].sort_values(by=["test_prec"], ascending=False)

##### 2.2.1 CatBoostClassifier

In [69]:
cat = CatBoostClassifier(iterations=100,
                         depth=5,
                         learning_rate=0.1,
                         loss_function='Logloss',
                         verbose=True)

In [70]:
fit_model(cat, "CatBoostClassifier")

Starting model fitting.
1/12 Fitting:  CatBoostClassifier BASE started...
0:	learn: 0.5886354	total: 183ms	remaining: 18.1s
1:	learn: 0.5090445	total: 226ms	remaining: 11.1s
2:	learn: 0.4469016	total: 265ms	remaining: 8.58s
3:	learn: 0.3988917	total: 304ms	remaining: 7.29s
4:	learn: 0.3609651	total: 343ms	remaining: 6.51s
5:	learn: 0.3310584	total: 381ms	remaining: 5.97s
6:	learn: 0.3071056	total: 424ms	remaining: 5.63s
7:	learn: 0.2889219	total: 464ms	remaining: 5.33s
8:	learn: 0.2748976	total: 504ms	remaining: 5.1s
9:	learn: 0.2627787	total: 545ms	remaining: 4.91s
10:	learn: 0.2534025	total: 585ms	remaining: 4.73s
11:	learn: 0.2455747	total: 627ms	remaining: 4.6s
12:	learn: 0.2389367	total: 668ms	remaining: 4.47s
13:	learn: 0.2337709	total: 708ms	remaining: 4.35s
14:	learn: 0.2294589	total: 749ms	remaining: 4.24s
15:	learn: 0.2254072	total: 790ms	remaining: 4.14s
16:	learn: 0.2221890	total: 830ms	remaining: 4.05s
17:	learn: 0.2197272	total: 870ms	remaining: 3.96s
18:	learn: 0.2174933

In [71]:
evaluation[evaluation.model == "CatBoostClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
26,CatBoostClassifier,base,0.946834,0.914439,0.242725,0.383623,0.945705,0.901961,0.228477,0.364597
36,CatBoostClassifier,TomekLink base,0.948834,0.916914,0.243691,0.385047,0.945389,0.835821,0.206262,0.330871
37,CatBoostClassifier,TomekLink clean,0.948834,0.916914,0.243691,0.385047,0.945389,0.835821,0.206262,0.330871
27,CatBoostClassifier,clean,0.948693,0.920489,0.237382,0.377429,0.945027,0.832061,0.200737,0.323442
30,CatBoostClassifier,BorderlineSMOTE base,0.9546,0.967687,0.940608,0.953956,0.933965,0.517958,0.453642,0.483672
32,CatBoostClassifier,ADASYN base,0.930778,0.959423,0.898125,0.927762,0.930579,0.490401,0.465232,0.477485
28,CatBoostClassifier,SMOTE base,0.931705,0.959699,0.901256,0.92956,0.928434,0.475248,0.476821,0.476033
29,CatBoostClassifier,SMOTE clean,0.88845,0.936014,0.833905,0.882014,0.912236,0.386781,0.581952,0.464706
31,CatBoostClassifier,BorderlineSMOTE clean,0.888698,0.935025,0.835453,0.882439,0.911513,0.383961,0.581952,0.462665
33,CatBoostClassifier,ADASYN clean,0.884437,0.9354,0.829706,0.879389,0.909946,0.379147,0.589319,0.461428


##### 2.2.1 LGBMClassifier

In [72]:
light = LGBMClassifier()

In [73]:
fit_model(light, "LGBMClassifier")

Starting model fitting.
1/12 Fitting:  LGBMClassifier BASE started...
[LightGBM] [Info] Number of positive: 1409, number of negative: 19262
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45936
[LightGBM] [Info] Number of data points in the train set: 20671, number of used features: 1379
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068163 -> initscore=-2.615254
[LightGBM] [Info] Start training from score -2.615254
Fitting:  LGBMClassifier finished. Elapsed time:  2 Seconds
2/12 Fitting:  LGBMClassifier CLEAN started...
[LightGBM] [Info] Number of positive: 1268, number of negative: 18086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33446
[LightGBM] [Info] Number of data points in the train set: 19

In [74]:
evaluation[evaluation.model == "LGBMClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
38,LGBMClassifier,base,0.964298,0.944371,0.506033,0.658965,0.948301,0.753472,0.359272,0.486547
48,LGBMClassifier,TomekLink base,0.960342,0.926995,0.430599,0.588045,0.947559,0.714286,0.331492,0.45283
49,LGBMClassifier,TomekLink clean,0.960342,0.926995,0.430599,0.588045,0.947559,0.714286,0.331492,0.45283
39,LGBMClassifier,clean,0.960938,0.92953,0.436909,0.594421,0.946594,0.69685,0.325967,0.444166
42,LGBMClassifier,BorderlineSMOTE base,0.976975,0.984803,0.968903,0.976788,0.945366,0.631004,0.478477,0.544256
40,LGBMClassifier,SMOTE base,0.970278,0.979058,0.961115,0.970003,0.940738,0.573832,0.508278,0.539069
44,LGBMClassifier,ADASYN base,0.970423,0.979158,0.960691,0.969836,0.940738,0.573284,0.511589,0.540682
41,LGBMClassifier,SMOTE clean,0.950404,0.962946,0.936857,0.949723,0.933213,0.491971,0.620626,0.54886
43,LGBMClassifier,BorderlineSMOTE clean,0.952366,0.964199,0.939622,0.951752,0.932731,0.489019,0.615101,0.544861
45,LGBMClassifier,ADASYN clean,0.947798,0.962427,0.933641,0.947815,0.931887,0.484286,0.624309,0.545455


#### 2.3 Stacking

In [75]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', LinearSVC(random_state=42))
]
stack = StackingClassifier(estimators=base_models)

In [76]:
# stack.fit(X_train_base, y_train_base)

#### Model comparision

In [77]:
evaluation.sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
26,CatBoostClassifier,base,0.946834,0.914439,0.242725,0.383623,0.945705,0.901961,0.228477,0.364597
4,RandomForest,BorderlineSMOTE base,0.999922,0.999948,0.999896,0.999922,0.951913,0.886957,0.337748,0.489209
0,RandomForest,base,0.999855,1.0,0.997871,0.998934,0.956316,0.867797,0.423841,0.569522
37,CatBoostClassifier,TomekLink clean,0.948834,0.916914,0.243691,0.385047,0.945389,0.835821,0.206262,0.330871
36,CatBoostClassifier,TomekLink base,0.948834,0.916914,0.243691,0.385047,0.945389,0.835821,0.206262,0.330871
27,CatBoostClassifier,clean,0.948693,0.920489,0.237382,0.377429,0.945027,0.832061,0.200737,0.323442
14,XGBClassifier,base,0.96512,0.972527,0.502484,0.662611,0.95101,0.824427,0.357616,0.498845
10,RandomForest,TomekLink base,0.999844,1.0,0.997634,0.998816,0.955515,0.786184,0.440147,0.564345
15,XGBClassifier,clean,0.961455,0.945392,0.436909,0.597627,0.949367,0.78341,0.313076,0.447368
24,XGBClassifier,TomekLink base,0.961172,0.954466,0.429811,0.592713,0.948764,0.780952,0.302026,0.435591


In [78]:
evaluation.sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
34,CatBoostClassifier,NearMiss base,0.915307,0.9131,0.99929,0.954253,0.085111,0.069057,0.995033,0.12915
35,CatBoostClassifier,NearMiss clean,0.827692,0.819534,0.999211,0.900498,0.121037,0.068653,0.98895,0.128392
47,LGBMClassifier,NearMiss clean,0.814154,0.815274,0.985016,0.892143,0.126462,0.067827,0.968692,0.126778
23,XGBClassifier,NearMiss clean,0.871385,0.865424,0.988959,0.923077,0.236769,0.076901,0.968692,0.14249
22,XGBClassifier,NearMiss base,0.966123,0.969508,0.992903,0.981066,0.1376,0.070976,0.963576,0.132213
8,RandomForest,NearMiss base,0.998118,0.998582,0.99929,0.998936,0.229936,0.07884,0.963576,0.145755
46,LGBMClassifier,NearMiss base,0.971142,0.975576,0.992193,0.983814,0.138052,0.070801,0.960265,0.131878
9,RandomForest,NearMiss clean,0.998154,1.0,0.997634,0.998816,0.477034,0.099261,0.865562,0.178098
12,BalancedRandomForest,BALANCED base,0.853998,0.318274,1.0,0.482865,0.811604,0.24163,0.824503,0.373734
13,BalancedRandomForest,BALANCED clean,0.89196,0.37742,0.999211,0.547892,0.851236,0.275795,0.782689,0.407869


In [79]:
evaluation.sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
5,RandomForest,BorderlineSMOTE clean,0.999917,0.999889,0.999945,0.999917,0.9478,0.612705,0.550645,0.580019
3,RandomForest,SMOTE clean,0.999917,0.999889,0.999945,0.999917,0.945389,0.586873,0.559853,0.573044
0,RandomForest,base,0.999855,1.0,0.997871,0.998934,0.956316,0.867797,0.423841,0.569522
1,RandomForest,clean,0.999845,1.0,0.997634,0.998816,0.955395,0.774603,0.449355,0.568765
7,RandomForest,ADASYN clean,0.999918,0.999946,0.999893,0.99992,0.94563,0.591633,0.546961,0.568421
10,RandomForest,TomekLink base,0.999844,1.0,0.997634,0.998816,0.955515,0.786184,0.440147,0.564345
11,RandomForest,TomekLink clean,0.999793,1.0,0.996845,0.99842,0.95443,0.770492,0.432781,0.554245
2,RandomForest,SMOTE base,0.999922,0.999948,0.999896,0.999922,0.952478,0.776435,0.425497,0.549733
41,LGBMClassifier,SMOTE clean,0.950404,0.962946,0.936857,0.949723,0.933213,0.491971,0.620626,0.54886
45,LGBMClassifier,ADASYN clean,0.947798,0.962427,0.933641,0.947815,0.931887,0.484286,0.624309,0.545455


#### 3. Einfache SVM / Bayes-Classifier zum Vergleich

##### 3.1 LinearSVC

In [80]:
svc = LinearSVC()

In [81]:
fit_model(svc, "LinearSVC")

Starting model fitting.
1/12 Fitting:  LinearSVC BASE started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
2/12 Fitting:  LinearSVC CLEAN started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
3/12 Fitting:  LinearSVC SMOTE-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
4/12 Fitting:  LinearSVC SMOTE-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
5/12 Fitting:  LinearSVC BorderlineSMOTE-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
6/12 Fitting:  LinearSVC BorderlineSMOTE-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
7/12 Fitting:  LinearSVC ADASYN-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
8/12 Fitting:  LinearSVC ADASYN-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
9/12 Fitting:  LinearSVC NearMiss-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
10/12 Fitting:  LinearSVC NearMiss-clean 

In [82]:
evaluation[evaluation.model == "LinearSVC"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
50,LinearSVC,base,0.980891,0.967712,0.7445,0.841556,0.957896,0.8,0.509934,0.622851
51,LinearSVC,clean,0.981089,0.956478,0.745268,0.837766,0.95443,0.733711,0.47698,0.578125
60,LinearSVC,TomekLink base,0.981804,0.962664,0.752366,0.844622,0.95443,0.731092,0.480663,0.58
61,LinearSVC,TomekLink clean,0.981804,0.962664,0.752366,0.844622,0.95443,0.731092,0.480663,0.58
54,LinearSVC,BorderlineSMOTE base,0.989695,0.990841,0.988527,0.989683,0.946608,0.611205,0.596026,0.603521
52,LinearSVC,SMOTE base,0.988319,0.978482,0.998598,0.988438,0.934417,0.513973,0.700331,0.592852
56,LinearSVC,ADASYN base,0.988358,0.977661,0.999311,0.988368,0.933288,0.507729,0.706954,0.591003
53,LinearSVC,SMOTE clean,0.983274,0.970552,0.996793,0.983497,0.911031,0.396166,0.685083,0.502024
55,LinearSVC,BorderlineSMOTE clean,0.98344,0.970815,0.996848,0.983659,0.91091,0.39485,0.677716,0.498983
57,LinearSVC,ADASYN clean,0.983017,0.969828,0.997588,0.983512,0.907052,0.382474,0.683241,0.490416


In [83]:
svc = LinearSVC(
    class_weight="balanced")  # ohne balanced wird die SVC in einigen Fällen nur auf 0 trainiert, d.h. precision = 0

In [84]:
fit_model(svc, "LinearSVC-balanced")

Starting model fitting.
1/12 Fitting:  LinearSVC-balanced BASE started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
2/12 Fitting:  LinearSVC-balanced CLEAN started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
3/12 Fitting:  LinearSVC-balanced SMOTE-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
4/12 Fitting:  LinearSVC-balanced SMOTE-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
5/12 Fitting:  LinearSVC-balanced BorderlineSMOTE-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
6/12 Fitting:  LinearSVC-balanced BorderlineSMOTE-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
7/12 Fitting:  LinearSVC-balanced ADASYN-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
8/12 Fitting:  LinearSVC-balanced ADASYN-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  0 Seconds
9/

In [85]:
evaluation[evaluation.model == "LinearSVC-balanced"].sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
66,LinearSVC-balanced,BorderlineSMOTE base,0.989695,0.990841,0.988527,0.989683,0.946608,0.611205,0.596026,0.603521
64,LinearSVC-balanced,SMOTE base,0.988319,0.978482,0.998598,0.988438,0.934417,0.513973,0.700331,0.592852
62,LinearSVC-balanced,base,0.969619,0.69227,0.997871,0.817442,0.930128,0.491803,0.745033,0.592495
68,LinearSVC-balanced,ADASYN base,0.988253,0.97736,0.999417,0.988265,0.933175,0.507126,0.706954,0.590595
72,LinearSVC-balanced,TomekLink base,0.968844,0.679012,0.997634,0.808049,0.917661,0.420455,0.6814,0.520028
73,LinearSVC-balanced,TomekLink clean,0.968844,0.679012,0.997634,0.808049,0.917661,0.420455,0.6814,0.520028
63,LinearSVC-balanced,clean,0.967655,0.670201,0.996845,0.801522,0.918143,0.421839,0.675875,0.519462
65,LinearSVC-balanced,SMOTE clean,0.983274,0.970552,0.996793,0.983497,0.911031,0.396166,0.685083,0.502024
67,LinearSVC-balanced,BorderlineSMOTE clean,0.98344,0.970815,0.996848,0.983659,0.91091,0.39485,0.677716,0.498983
69,LinearSVC-balanced,ADASYN clean,0.983289,0.970579,0.99732,0.983768,0.90862,0.387435,0.6814,0.493992


##### 3.2 Bayes-Classifier

In [86]:
from sklearn.naive_bayes import GaussianNB

In [87]:
bayes = GaussianNB()

In [88]:
fit_model(svc, "GaussianNB")

Starting model fitting.
1/12 Fitting:  GaussianNB BASE started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
2/12 Fitting:  GaussianNB CLEAN started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
3/12 Fitting:  GaussianNB SMOTE-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
4/12 Fitting:  GaussianNB SMOTE-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
5/12 Fitting:  GaussianNB BorderlineSMOTE-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
6/12 Fitting:  GaussianNB BorderlineSMOTE-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
7/12 Fitting:  GaussianNB ADASYN-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
8/12 Fitting:  GaussianNB ADASYN-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
9/12 Fitting:  GaussianNB NearMiss-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
10/12 Fitting:  Gaussia

In [89]:
evaluation[evaluation.model == "GaussianNB"].sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
78,GaussianNB,BorderlineSMOTE base,0.989695,0.990841,0.988527,0.989683,0.946608,0.611205,0.596026,0.603521
76,GaussianNB,SMOTE base,0.988319,0.978482,0.998598,0.988438,0.934417,0.513973,0.700331,0.592852
74,GaussianNB,base,0.969619,0.69227,0.997871,0.817442,0.930128,0.491803,0.745033,0.592495
80,GaussianNB,ADASYN base,0.988253,0.97736,0.999417,0.988265,0.933175,0.507126,0.706954,0.590595
84,GaussianNB,TomekLink base,0.968844,0.679012,0.997634,0.808049,0.917661,0.420455,0.6814,0.520028
85,GaussianNB,TomekLink clean,0.968844,0.679012,0.997634,0.808049,0.917661,0.420455,0.6814,0.520028
75,GaussianNB,clean,0.967655,0.670201,0.996845,0.801522,0.918143,0.421839,0.675875,0.519462
77,GaussianNB,SMOTE clean,0.983274,0.970552,0.996793,0.983497,0.911031,0.396166,0.685083,0.502024
79,GaussianNB,BorderlineSMOTE clean,0.98344,0.970815,0.996848,0.983659,0.91091,0.39485,0.677716,0.498983
81,GaussianNB,ADASYN clean,0.983289,0.970579,0.99732,0.983768,0.90862,0.387435,0.6814,0.493992
