#### Imports & Downloads

In [1]:
import pandas as pd
import os
from gensim.models import Word2Vec
import numpy as np
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import time

from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

### Einladen der Daten

#### Originaldaten ohne umfassende Vorverarbeitung

In [2]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_basic_cleaned.csv'))
df = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [3]:
df['label'].value_counts()
positive = len(df[df['label'] == 1])
negative = len(df[df['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 2013
Negative: 27517
Verhältnis: 13.669647292598112


#### Vorverarbeitete Daten

In [4]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df_cleaned.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,@user when a father is dysfunctional and is s...,father selfish drag kid run,1,['#run'],
2,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use cause offer van,2,"['#lyft', '#disapointed', '#getthanked']",
3,0,bihday your majesty,bihday majesty,0,[],
4,0,#model i love u take with u all the time in ...,model take time mobile phone kiss sunglass mou...,0,['#model'],":mobile_phone:,:kissing_face_with_smiling_eyes..."
5,0,factsguide: society now #motivation,factsguide society motivation,0,['#motivation'],


In [5]:
df_cleaned['label'].value_counts()
positive = len(df_cleaned[df_cleaned['label'] == 1])
negative = len(df_cleaned[df_cleaned['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 1811
Negative: 25839
Verhältnis: 14.26780784097184


Verhältnis hat sich durch die Bereinigung sogar noch weiter verschlechter

### Vektorisierung der Daten
https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#Unsupervised_text_classification_Word2Vec

In [6]:
def vectorize(sentence, w2v_model):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [7]:
def vectorize_df(df):
    X = df["tweet"]
    y = df["label"]

    sentences_base = [sentence.split() for sentence in X]
    w2v_model_base = Word2Vec(sentences_base, window=5, min_count=5, workers=-1)

    x_data = np.array([vectorize(sentence, w2v_model_base) for sentence in X])
    return x_data, y

In [8]:
X_base, y_base = vectorize_df(df)

In [9]:
X_base

array([[ 1.30374171e-03, -6.65663800e-04,  3.21442029e-03, ...,
        -2.68328539e-03,  1.41547900e-03, -2.84765265e-03],
       [ 3.36569443e-04, -9.82483965e-04,  9.05304274e-04, ...,
        -4.07109677e-04,  5.64215006e-04,  1.94166932e-04],
       [ 1.23032148e-03, -2.46845209e-03,  2.51201470e-03, ...,
         2.88697542e-03,  4.11413144e-03,  4.88469563e-03],
       ...,
       [-9.87032778e-04,  5.68386691e-04, -1.85755307e-05, ...,
         3.09016541e-05, -3.14187346e-04, -8.47426010e-04],
       [ 1.40571001e-03,  4.23042104e-03,  2.36720010e-03, ...,
        -4.67062840e-04,  2.66610738e-03, -7.61655625e-04],
       [ 8.88627619e-05, -6.70738227e-04,  2.82955565e-03, ...,
        -3.34652583e-03,  8.70123040e-04,  3.40033951e-03]])

In [10]:
y_base

id
1        0
2        0
3        0
4        0
5        0
        ..
31957    0
31958    0
31959    0
31960    0
31962    0
Name: label, Length: 29530, dtype: int64

In [11]:
X_clean, y_clean = vectorize_df(df_cleaned)

In [12]:
X_clean

array([[ 0.000164  , -0.00097227,  0.00330843, ..., -0.00523365,
         0.00096938, -0.00285136],
       [-0.00232347, -0.0016694 ,  0.00038817, ..., -0.00020569,
         0.00022666,  0.0038221 ],
       [-0.00620588, -0.00316428,  0.00138663, ...,  0.00165645,
         0.00093876, -0.00017074],
       ...,
       [ 0.0017181 ,  0.00186271,  0.00110266, ..., -0.0023023 ,
        -0.00016485,  0.00787419],
       [-0.00182129,  0.00049515,  0.0007572 , ..., -0.00087936,
        -0.00065907, -0.00146304],
       [-0.00335515, -0.00040929, -0.00032946, ..., -0.00268573,
         0.00366877, -0.0030683 ]])

In [13]:
y_clean

id
1        0
2        0
3        0
4        0
5        0
        ..
31956    0
31957    0
31958    0
31959    0
31960    0
Name: label, Length: 27650, dtype: int64

### Train/Test Split

In [14]:
def print_pos_neg(y_train, y_test):
    positive = np.count_nonzero(y_train == 1)
    negative = np.count_nonzero(y_train == 0)
    print("Train")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

    positive = np.count_nonzero(y_test == 1)
    negative = np.count_nonzero(y_test == 0)
    print("Test")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

In [15]:
# Ohne stratify
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.3, random_state=42)
print_pos_neg(y_train_base, y_test_base)

Train
- Positive: 1401
- Negative: 19270
- Verhältnis: 13.754461099214847
Test
- Positive: 612
- Negative: 8247
- Verhältnis: 13.47549019607843


In [16]:
# mit stratify
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.3, stratify=y_base, random_state=42)
print_pos_neg(y_train_base, y_test_base)

Train
- Positive: 1409
- Negative: 19262
- Verhältnis: 13.67068843151171
Test
- Positive: 604
- Negative: 8255
- Verhältnis: 13.667218543046358


In [17]:
# Ohne stratify
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3, random_state=42)
print_pos_neg(y_train_clean, y_test_clean)

Train
- Positive: 1273
- Negative: 18082
- Verhältnis: 14.204241948153967
Test
- Positive: 538
- Negative: 7757
- Verhältnis: 14.4182156133829


In [18]:
# mit stratify
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3, stratify=y_clean, random_state=42)
print_pos_neg(y_train_clean, y_test_clean)

Train
- Positive: 1268
- Negative: 18087
- Verhältnis: 14.264195583596214
Test
- Positive: 543
- Negative: 7752
- Verhältnis: 14.276243093922652


### 1. Resampling Methods
https://www.analyticsvidhya.com/blog/2022/05/handling-imbalanced-data-with-imbalance-learn-in-python/#h-techniques-for-handling-imbalanced-data

In [19]:
def print_data(y_before, y_after, sampling_technique):
    counter_before = Counter(y_before)
    counter_after = Counter(y_after)
    print("Before sampling with:", sampling_technique, counter_before)
    print("After sampling with:", sampling_technique, counter_after)

#### 1.1 Oversampling

##### 1.1.1 SMOTE

In [20]:
os_smote = SMOTE()

In [21]:
X_train_base_s, y_train_base_s = os_smote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 19262, 1: 1409})
After sampling with: SMOTE Counter({1: 19262, 0: 19262})


In [22]:
X_train_clean_s, y_train_clean_s = os_smote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 18087, 1: 1268})
After sampling with: SMOTE Counter({0: 18087, 1: 18087})


##### 1.1.2 Borderline-SMOTE

In [23]:
os_bsmote = BorderlineSMOTE()

In [24]:
X_train_base_bs, y_train_base_bs = os_bsmote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 19262, 1: 1409})
After sampling with: BorderlineSMOTE Counter({1: 19262, 0: 19262})


In [25]:
X_train_clean_bs, y_train_clean_bs = os_bsmote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 18087, 1: 1268})
After sampling with: BorderlineSMOTE Counter({0: 18087, 1: 18087})


##### 1.1.3 ADASYN

In [26]:
os_ada = ADASYN()

In [27]:
X_train_base_a, y_train_base_a = os_ada.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 19262, 1: 1409})
After sampling with: ADASYN Counter({1: 19358, 0: 19262})


In [28]:
X_train_clean_a, y_train_clean_a = os_ada.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 18087, 1: 1268})
After sampling with: ADASYN Counter({0: 18087, 1: 17645})


#### 1.2 Undersampling

##### 1.2.1 NearMiss

In [29]:
us_near_miss = NearMiss(version=3, n_neighbors_ver3=3)

In [30]:
X_train_base_nm, y_train_base_nm = us_near_miss.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 19262, 1: 1409})
After sampling with: NearMiss Counter({0: 1409, 1: 1409})


In [31]:
X_train_clean_nm, y_train_clean_nm = us_near_miss.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 18087, 1: 1268})
After sampling with: NearMiss Counter({0: 1268, 1: 1268})


##### 1.2.2 Condensed Nearest Neighbor

In [32]:
us_cnn = CondensedNearestNeighbour(n_neighbors=1, n_jobs=-1)  # sehr langsam

In [33]:
# X_train_base_cnn, y_train_base_cnn = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_cnn, "CondensedNearestNeighbour")

In [34]:
# X_train_clean_cnn, y_train_clean_cnn = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_cnn, "CondensedNearestNeighbour")

##### 1.2.3 Neighborhood Cleaning

In [35]:
us_cnn_cr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5, n_jobs=-1)  # sehr langsam

In [36]:
# X_train_base_ncr, y_train_base_ncr = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_ncr, "NeighbourhoodCleaningRule")

In [37]:
# X_train_clean_ncr, y_train_clean_ncr = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_ncr, "NeighbourhoodCleaningRule")

##### 1.2.4 Tomek Links Undersampler

In [38]:
us_tomek = TomekLinks()

In [39]:
X_train_base_t, y_train_base_t = us_tomek.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 19262, 1: 1409})
After sampling with: TomekLinks Counter({0: 19210, 1: 1409})


In [40]:
X_train_clean_t, y_train_clean_t = us_tomek.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_t, "TomekLinks")

Before sampling with: TomekLinks Counter({0: 18087, 1: 1268})
After sampling with: TomekLinks Counter({0: 18036, 1: 1268})


### 2. Ensemble Models

In [41]:
evaluation = pd.DataFrame(
    columns=["model", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec",
             "test_rec", "test_f1"])

In [42]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_recall = recall_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_f1 = f1_score(y_data_train, model.predict(x_data_train), average="weighted")

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_recall = recall_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_f1 = f1_score(y_data_test, model.predict(x_data_test), average="weighted")

    evaluation.loc[len(evaluation.index)] = [model_name, variant, train_acc, train_precision, train_recall, train_f1,
                                             test_acc, test_precision, test_recall, test_f1]

In [43]:
def evaluate_model(model, x_test, y_test, sampling_method):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{sampling_method} model accuracy for classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

In [44]:
def fit_model(model, modelName):
    print("Starting model fitting.")

    start = time.time()
    print("1/12 Fitting: ", modelName, "BASE started...")
    model.fit(X_train_base, y_train_base)
    add_to_eval_df(model, modelName, "base", X_train_base, y_train_base, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("2/12 Fitting: ", modelName, "CLEAN started...")
    model.fit(X_train_clean, y_train_clean)
    add_to_eval_df(model, modelName, "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("3/12 Fitting: ", modelName, "SMOTE-base started...")
    model.fit(X_train_base_s, y_train_base_s)
    add_to_eval_df(model, modelName, "SMOTE base", X_train_base_s, y_train_base_s, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("4/12 Fitting: ", modelName, "SMOTE-clean started...")
    model.fit(X_train_clean_s, y_train_clean_s)
    add_to_eval_df(model, modelName, "SMOTE clean", X_train_clean_s, y_train_clean_s, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("5/12 Fitting: ", modelName, "BorderlineSMOTE-base started...")
    model.fit(X_train_base_bs, y_train_base_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE base", X_train_base_bs, y_train_base_bs, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("6/12 Fitting: ", modelName, "BorderlineSMOTE-clean started...")
    model.fit(X_train_clean_bs, y_train_clean_bs)
    add_to_eval_df(model, modelName, "BorderlineSMOTE clean", X_train_clean_bs, y_train_clean_bs, X_test_clean,
                   y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("7/12 Fitting: ", modelName, "ADASYN-base started...")
    model.fit(X_train_base_a, y_train_base_a)
    add_to_eval_df(model, modelName, "ADASYN base", X_train_base_a, y_train_base_a, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("8/12 Fitting: ", modelName, "ADASYN-clean started...")
    model.fit(X_train_clean_a, y_train_clean_a)
    add_to_eval_df(model, modelName, "ADASYN clean", X_train_clean_a, y_train_clean_a, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("9/12 Fitting: ", modelName, "NearMiss-base started...")
    model.fit(X_train_base_nm, y_train_base_nm)
    add_to_eval_df(model, modelName, "NearMiss base", X_train_base_nm, y_train_base_nm, X_test_base, y_test_base)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("10/12 Fitting: ", modelName, "NearMiss-clean started...")
    model.fit(X_train_clean_nm, y_train_clean_nm)
    add_to_eval_df(model, modelName, "NearMiss clean", X_train_clean_nm, y_train_clean_nm, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("11/12 Fitting: ", modelName, "TomekLink-base started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink clean", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    start = time.time()
    print("12/12 Fitting: ", modelName, "TomekLink-clean started...")
    model.fit(X_train_clean_t, y_train_clean_t)
    add_to_eval_df(model, modelName, "TomekLink clean", X_train_clean_t, y_train_clean_t, X_test_clean, y_test_clean)
    end = time.time()
    elapsed_time = round(end - start)
    print("Fitting: ", modelName, "finished. Elapsed time: ", elapsed_time, "Seconds")

    print("Model fitting finished.")

#### 2.1 Bagging

In [45]:
rf = RandomForestClassifier(n_jobs=-1)

In [46]:
fit_model(rf, "RandomForest")

Starting model fitting.
1/12 Fitting:  RandomForest BASE started...
Fitting:  RandomForest finished. Elapsed time:  7 Seconds
2/12 Fitting:  RandomForest CLEAN started...
Fitting:  RandomForest finished. Elapsed time:  5 Seconds
3/12 Fitting:  RandomForest SMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  12 Seconds
4/12 Fitting:  RandomForest SMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  12 Seconds
5/12 Fitting:  RandomForest BorderlineSMOTE-base started...
Fitting:  RandomForest finished. Elapsed time:  12 Seconds
6/12 Fitting:  RandomForest BorderlineSMOTE-clean started...
Fitting:  RandomForest finished. Elapsed time:  12 Seconds
7/12 Fitting:  RandomForest ADASYN-base started...
Fitting:  RandomForest finished. Elapsed time:  13 Seconds
8/12 Fitting:  RandomForest ADASYN-clean started...
Fitting:  RandomForest finished. Elapsed time:  12 Seconds
9/12 Fitting:  RandomForest NearMiss-base started...
Fitting:  RandomForest finished. Elapse

In [47]:
brf = BalancedRandomForestClassifier(n_jobs=-1)

In [48]:
# BASE
brf.fit(X_train_base, y_train_base)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED base", X_train_base, y_train_base, X_test_base, y_test_base)



In [49]:
# CLEANED
brf.fit(X_train_clean, y_train_clean)
add_to_eval_df(brf, "BalancedRandomForest", "BALANCED clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)



In [50]:
evaluation[(evaluation.model == "RandomForest") | (evaluation.model == "BalancedRandomForest")].sort_values(
    by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
0,RandomForest,base,0.99971,0.99971,0.99971,0.999709,0.933853,0.938237,0.933853,0.903827
10,RandomForest,TomekLink clean,0.999793,0.999793,0.999793,0.999793,0.934901,0.920633,0.934901,0.904267
11,RandomForest,TomekLink clean,0.999741,0.999741,0.999741,0.999741,0.93478,0.914781,0.93478,0.904205
1,RandomForest,clean,0.999742,0.999742,0.999742,0.999741,0.934659,0.910228,0.934659,0.904142
13,BalancedRandomForest,BALANCED clean,0.775614,0.949218,0.775614,0.831134,0.681254,0.909012,0.681254,0.761686
12,BalancedRandomForest,BALANCED base,0.792269,0.948473,0.792269,0.841936,0.699627,0.90567,0.699627,0.7735
2,RandomForest,SMOTE base,0.999689,0.999689,0.999689,0.999689,0.917146,0.899655,0.917146,0.907058
6,RandomForest,ADASYN base,0.999612,0.999612,0.999612,0.999612,0.916808,0.898265,0.916808,0.906098
4,RandomForest,BorderlineSMOTE base,0.999663,0.999663,0.999663,0.999663,0.919743,0.897874,0.919743,0.906571
3,RandomForest,SMOTE clean,0.999502,0.999503,0.999502,0.999502,0.913924,0.890929,0.913924,0.901218


#### 2.2 Boosting

##### 2.2.1a XGBClassifier

In [51]:
xgb = XGBClassifier()

In [52]:
fit_model(xgb, "XGBClassifier")

Starting model fitting.
1/12 Fitting:  XGBClassifier BASE started...
Fitting:  XGBClassifier finished. Elapsed time:  7 Seconds
2/12 Fitting:  XGBClassifier CLEAN started...
Fitting:  XGBClassifier finished. Elapsed time:  5 Seconds
3/12 Fitting:  XGBClassifier SMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  3 Seconds
4/12 Fitting:  XGBClassifier SMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  4 Seconds
5/12 Fitting:  XGBClassifier BorderlineSMOTE-base started...
Fitting:  XGBClassifier finished. Elapsed time:  5 Seconds
6/12 Fitting:  XGBClassifier BorderlineSMOTE-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  5 Seconds
7/12 Fitting:  XGBClassifier ADASYN-base started...
Fitting:  XGBClassifier finished. Elapsed time:  2 Seconds
8/12 Fitting:  XGBClassifier ADASYN-clean started...
Fitting:  XGBClassifier finished. Elapsed time:  2 Seconds
9/12 Fitting:  XGBClassifier NearMiss-base started...
Fitting:  XGBClassifier fini

In [53]:
evaluation[evaluation.model == "XGBClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
14,XGBClassifier,base,0.999226,0.999227,0.999226,0.999224,0.936675,0.931477,0.936675,0.911829
15,XGBClassifier,clean,0.99969,0.99969,0.99969,0.99969,0.935503,0.919527,0.935503,0.906815
24,XGBClassifier,TomekLink clean,0.999482,0.999482,0.999482,0.999481,0.935744,0.91846,0.935744,0.908218
25,XGBClassifier,TomekLink clean,0.999482,0.999482,0.999482,0.999481,0.935744,0.91846,0.935744,0.908218
20,XGBClassifier,ADASYN base,0.996556,0.996568,0.996556,0.996556,0.890958,0.90101,0.890958,0.895756
16,XGBClassifier,SMOTE base,0.996314,0.99633,0.996314,0.996314,0.890394,0.900566,0.890394,0.89525
18,XGBClassifier,BorderlineSMOTE base,0.997326,0.997327,0.997326,0.997326,0.899086,0.899395,0.899086,0.89924
17,XGBClassifier,SMOTE clean,0.997097,0.997103,0.997097,0.997097,0.888728,0.896905,0.888728,0.892701
21,XGBClassifier,ADASYN clean,0.996726,0.996746,0.996726,0.996726,0.886679,0.89679,0.886679,0.891564
19,XGBClassifier,BorderlineSMOTE clean,0.997401,0.997402,0.997401,0.997401,0.897529,0.89593,0.897529,0.896725


##### 2.2.1b XGBClassifier tuning

In [54]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [55]:
param_test1 = {
    'max_depth': range(3, 10, 2),
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [56]:
# gsearch1.fit(X_train_clean, y_train_clean)
# gsearch1.best_score_, gsearch1.best_params_

In [57]:
param_test2 = {
    'min_child_weight': range(1, 6, 2),
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test2, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [58]:
# gsearch2.fit(X_train_clean, y_train_clean)
# gsearch2.best_score_, gsearch2.best_params_

In [59]:
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch3 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test3, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [60]:
# gsearch3.fit(X_train_clean, y_train_clean)
# gsearch3.best_score_, gsearch3.best_params_

In [61]:
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test4, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [62]:
# gsearch4.fit(X_train_clean, y_train_clean)
# gsearch4.best_score_, gsearch4.best_params_

In [63]:
param_test5 = {
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [64]:
# gsearch5.fit(X_train_clean, y_train_clean)
# gsearch5.best_score_, gsearch5.best_params_

In [65]:
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test6, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [66]:
# gsearch6.fit(X_train_clean, y_train_clean)
# gsearch6.best_score_, gsearch6.best_params_

In [67]:
xgb_tune = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [68]:
fit_model(xgb_tune, "XGBClassifier-tuned")
evaluation[evaluation.model == "XGBClassifier-tuned"].sort_values(by=["test_prec"], ascending=False)

Starting model fitting.
1/12 Fitting:  XGBClassifier-tuned BASE started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  5 Seconds
2/12 Fitting:  XGBClassifier-tuned CLEAN started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  5 Seconds
3/12 Fitting:  XGBClassifier-tuned SMOTE-base started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  6 Seconds
4/12 Fitting:  XGBClassifier-tuned SMOTE-clean started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  7 Seconds
5/12 Fitting:  XGBClassifier-tuned BorderlineSMOTE-base started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  6 Seconds
6/12 Fitting:  XGBClassifier-tuned BorderlineSMOTE-clean started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  6 Seconds
7/12 Fitting:  XGBClassifier-tuned ADASYN-base started...
Fitting:  XGBClassifier-tuned finished. Elapsed time:  6 Seconds
8/12 Fitting:  XGBClassifier-tuned ADASYN-clean started...
Fitting:  XGBClassifier-tuned finished. Elapsed tim

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
26,XGBClassifier-tuned,base,0.997484,0.997489,0.997484,0.997463,0.93532,0.932893,0.93532,0.907973
36,XGBClassifier-tuned,TomekLink clean,0.998291,0.998294,0.998291,0.99828,0.935865,0.928727,0.935865,0.906797
37,XGBClassifier-tuned,TomekLink clean,0.998291,0.998294,0.998291,0.99828,0.935865,0.928727,0.935865,0.906797
27,XGBClassifier-tuned,clean,0.997158,0.997167,0.997158,0.997129,0.935624,0.926975,0.935624,0.906226
32,XGBClassifier-tuned,ADASYN base,0.998809,0.998809,0.998809,0.998809,0.90473,0.902957,0.90473,0.903833
28,XGBClassifier-tuned,SMOTE base,0.999195,0.999195,0.999195,0.999195,0.903827,0.902187,0.903827,0.902999
30,XGBClassifier-tuned,BorderlineSMOTE base,0.998053,0.998056,0.998053,0.998053,0.909019,0.901107,0.909019,0.904848
31,XGBClassifier-tuned,BorderlineSMOTE clean,0.997844,0.997844,0.997844,0.997844,0.908137,0.897597,0.908137,0.902581
29,XGBClassifier-tuned,SMOTE clean,0.998894,0.998894,0.998894,0.998894,0.900181,0.896672,0.900181,0.898401
33,XGBClassifier-tuned,ADASYN clean,0.999048,0.99905,0.999048,0.999048,0.899458,0.895557,0.899458,0.897477


##### 2.2.1 CatBoostClassifier

In [69]:
cat = CatBoostClassifier(iterations=2,
                         depth=2,
                         learning_rate=1,
                         loss_function='Logloss',
                         verbose=True)

In [70]:
fit_model(cat, "CatBoostClassifier")

Starting model fitting.
1/12 Fitting:  CatBoostClassifier BASE started...
0:	learn: 0.2482778	total: 188ms	remaining: 188ms
1:	learn: 0.2451450	total: 195ms	remaining: 0us


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  6 Seconds
2/12 Fitting:  CatBoostClassifier CLEAN started...
0:	learn: 0.2412269	total: 7.77ms	remaining: 7.77ms
1:	learn: 0.2376897	total: 15.5ms	remaining: 0us


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  5 Seconds
3/12 Fitting:  CatBoostClassifier SMOTE-base started...
0:	learn: 0.6582918	total: 7.94ms	remaining: 7.94ms
1:	learn: 0.6348875	total: 16.2ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  7 Seconds
4/12 Fitting:  CatBoostClassifier SMOTE-clean started...
0:	learn: 0.6556356	total: 7.71ms	remaining: 7.71ms
1:	learn: 0.6311927	total: 15.3ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  7 Seconds
5/12 Fitting:  CatBoostClassifier BorderlineSMOTE-base started...
0:	learn: 0.6547329	total: 9.39ms	remaining: 9.39ms
1:	learn: 0.6274111	total: 17.7ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  10 Seconds
6/12 Fitting:  CatBoostClassifier BorderlineSMOTE-clean started...
0:	learn: 0.6483474	total: 9.79ms	remaining: 9.79ms
1:	learn: 0.6229336	total: 20ms	remaining: 0us
Fitting:  CatBoostClassifier finished. Elapsed time:  7 Seconds
7/12 Fitting:  CatBoostClassifi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  5 Seconds
12/12 Fitting:  CatBoostClassifier TomekLink-clean started...
0:	learn: 0.2417537	total: 6.83ms	remaining: 6.83ms
1:	learn: 0.2392659	total: 13.4ms	remaining: 0us


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  CatBoostClassifier finished. Elapsed time:  5 Seconds
Model fitting finished.


In [71]:
evaluation[evaluation.model == "CatBoostClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
41,CatBoostClassifier,SMOTE clean,0.66194,0.665942,0.66194,0.659889,0.577818,0.896762,0.577818,0.681677
43,CatBoostClassifier,BorderlineSMOTE clean,0.661331,0.661358,0.661331,0.661318,0.6522,0.895979,0.6522,0.740074
42,CatBoostClassifier,BorderlineSMOTE base,0.663872,0.664034,0.663872,0.663789,0.637092,0.892341,0.637092,0.727028
45,CatBoostClassifier,ADASYN clean,0.664307,0.666074,0.664307,0.663793,0.603617,0.892305,0.603617,0.702841
40,CatBoostClassifier,SMOTE base,0.654683,0.655249,0.654683,0.654368,0.617338,0.891524,0.617338,0.711806
44,CatBoostClassifier,ADASYN base,0.651139,0.651863,0.651139,0.650785,0.659217,0.886768,0.659217,0.74352
47,CatBoostClassifier,NearMiss clean,0.5903,0.597886,0.5903,0.582205,0.55648,0.878745,0.55648,0.66548
46,CatBoostClassifier,NearMiss base,0.601136,0.6014,0.601136,0.600875,0.494525,0.877348,0.494525,0.608945
39,CatBoostClassifier,clean,0.934487,0.873266,0.934487,0.90284,0.934539,0.873363,0.934539,0.902916
48,CatBoostClassifier,TomekLink clean,0.934314,0.872943,0.934314,0.902586,0.934539,0.873363,0.934539,0.902916


##### 2.2.1 LGBMClassifier

In [72]:
light = LGBMClassifier()

In [73]:
fit_model(light, "LGBMClassifier")

Starting model fitting.
1/12 Fitting:  LGBMClassifier BASE started...
[LightGBM] [Info] Number of positive: 1409, number of negative: 19262
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 20671, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068163 -> initscore=-2.615254
[LightGBM] [Info] Start training from score -2.615254
Fitting:  LGBMClassifier finished. Elapsed time:  1 Seconds
2/12 Fitting:  LGBMClassifier CLEAN started...
[LightGBM] [Info] Number of positive: 1268, number of negative: 18087
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 193

In [74]:
evaluation[evaluation.model == "LGBMClassifier"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
50,LGBMClassifier,base,0.966233,0.967414,0.966233,0.960975,0.934078,0.9262,0.934078,0.905538
51,LGBMClassifier,clean,0.969827,0.970722,0.969827,0.965572,0.935262,0.919491,0.935262,0.90581
60,LGBMClassifier,TomekLink clean,0.970576,0.971474,0.970576,0.966558,0.934539,0.907075,0.934539,0.905202
61,LGBMClassifier,TomekLink clean,0.970576,0.971474,0.970576,0.966558,0.934539,0.907075,0.934539,0.905202
52,LGBMClassifier,SMOTE base,0.934768,0.935281,0.934768,0.934749,0.843097,0.899642,0.843097,0.867461
54,LGBMClassifier,BorderlineSMOTE base,0.950239,0.951003,0.950239,0.950218,0.85811,0.898887,0.85811,0.876185
56,LGBMClassifier,ADASYN base,0.934076,0.935013,0.934076,0.934036,0.837115,0.898519,0.837115,0.863496
53,LGBMClassifier,SMOTE clean,0.938298,0.939119,0.938298,0.938269,0.837854,0.89759,0.837854,0.864063
55,LGBMClassifier,BorderlineSMOTE clean,0.954691,0.955218,0.954691,0.954678,0.859433,0.89728,0.859433,0.87661
57,LGBMClassifier,ADASYN clean,0.937171,0.938041,0.937171,0.937158,0.833876,0.895523,0.833876,0.86105


#### 2.3 Stacking

In [75]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', LinearSVC(random_state=42))
]
stack = StackingClassifier(estimators=base_models)

In [76]:
# stack.fit(X_train_base, y_train_base)

#### Model comparision

In [90]:
evaluation.sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
0,RandomForest,base,0.999710,0.999710,0.999710,0.999709,0.933853,0.938237,0.933853,0.903827
26,XGBClassifier-tuned,base,0.997484,0.997489,0.997484,0.997463,0.935320,0.932893,0.935320,0.907973
14,XGBClassifier,base,0.999226,0.999227,0.999226,0.999224,0.936675,0.931477,0.936675,0.911829
37,XGBClassifier-tuned,TomekLink clean,0.998291,0.998294,0.998291,0.998280,0.935865,0.928727,0.935865,0.906797
36,XGBClassifier-tuned,TomekLink clean,0.998291,0.998294,0.998291,0.998280,0.935865,0.928727,0.935865,0.906797
...,...,...,...,...,...,...,...,...,...,...
39,CatBoostClassifier,clean,0.934487,0.873266,0.934487,0.902840,0.934539,0.873363,0.934539,0.902916
85,GaussianNB,TomekLink clean,0.934314,0.872943,0.934314,0.902586,0.934539,0.873363,0.934539,0.902916
74,GaussianNB,base,0.931837,0.868320,0.931837,0.898958,0.931821,0.868290,0.931821,0.898934
38,CatBoostClassifier,base,0.931837,0.868320,0.931837,0.898958,0.931821,0.868290,0.931821,0.898934


In [91]:
evaluation.sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
14,XGBClassifier,base,0.999226,0.999227,0.999226,0.999224,0.936675,0.931477,0.936675,0.911829
36,XGBClassifier-tuned,TomekLink clean,0.998291,0.998294,0.998291,0.998280,0.935865,0.928727,0.935865,0.906797
37,XGBClassifier-tuned,TomekLink clean,0.998291,0.998294,0.998291,0.998280,0.935865,0.928727,0.935865,0.906797
25,XGBClassifier,TomekLink clean,0.999482,0.999482,0.999482,0.999481,0.935744,0.918460,0.935744,0.908218
24,XGBClassifier,TomekLink clean,0.999482,0.999482,0.999482,0.999481,0.935744,0.918460,0.935744,0.908218
...,...,...,...,...,...,...,...,...,...,...
59,LGBMClassifier,NearMiss clean,1.000000,1.000000,1.000000,1.000000,0.449427,0.890356,0.449427,0.566033
70,LinearSVC,NearMiss base,0.584812,0.586065,0.584812,0.583295,0.440795,0.882601,0.440795,0.556070
82,GaussianNB,NearMiss base,0.584812,0.586065,0.584812,0.583295,0.439892,0.882476,0.439892,0.555178
8,RandomForest,NearMiss base,0.999290,0.999291,0.999290,0.999290,0.423411,0.884953,0.423411,0.537736


In [92]:
evaluation.sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
14,XGBClassifier,base,0.999226,0.999227,0.999226,0.999224,0.936675,0.931477,0.936675,0.911829
25,XGBClassifier,TomekLink clean,0.999482,0.999482,0.999482,0.999481,0.935744,0.918460,0.935744,0.908218
24,XGBClassifier,TomekLink clean,0.999482,0.999482,0.999482,0.999481,0.935744,0.918460,0.935744,0.908218
26,XGBClassifier-tuned,base,0.997484,0.997489,0.997484,0.997463,0.935320,0.932893,0.935320,0.907973
2,RandomForest,SMOTE base,0.999689,0.999689,0.999689,0.999689,0.917146,0.899655,0.917146,0.907058
...,...,...,...,...,...,...,...,...,...,...
59,LGBMClassifier,NearMiss clean,1.000000,1.000000,1.000000,1.000000,0.449427,0.890356,0.449427,0.566033
70,LinearSVC,NearMiss base,0.584812,0.586065,0.584812,0.583295,0.440795,0.882601,0.440795,0.556070
82,GaussianNB,NearMiss base,0.584812,0.586065,0.584812,0.583295,0.439892,0.882476,0.439892,0.555178
8,RandomForest,NearMiss base,0.999290,0.999291,0.999290,0.999290,0.423411,0.884953,0.423411,0.537736


#### 3. Einfache SVM / Bayes-Classifier zum Vergleich

##### 3.1 LinearSVC

In [98]:
svc = LinearSVC()

In [99]:
fit_model(svc, "LinearSVC")

Starting model fitting.
1/12 Fitting:  LinearSVC BASE started...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
2/12 Fitting:  LinearSVC CLEAN started...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  LinearSVC finished. Elapsed time:  1 Seconds
3/12 Fitting:  LinearSVC SMOTE-base started...
Fitting:  LinearSVC finished. Elapsed time:  1 Seconds
4/12 Fitting:  LinearSVC SMOTE-clean started...
Fitting:  LinearSVC finished. Elapsed time:  3 Seconds
5/12 Fitting:  LinearSVC BorderlineSMOTE-base started...
Fitting:  LinearSVC finished. Elapsed time:  1 Seconds
6/12 Fitting:  LinearSVC BorderlineSMOTE-clean started...
Fitting:  LinearSVC finished. Elapsed time:  1 Seconds
7/12 Fitting:  LinearSVC ADASYN-base started...
Fitting:  LinearSVC finished. Elapsed time:  1 Seconds
8/12 Fitting:  LinearSVC ADASYN-clean started...
Fitting:  LinearSVC finished. Elapsed time:  1 Seconds
9/12 Fitting:  LinearSVC NearMiss-base started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
10/12 Fitting:  LinearSVC NearMiss-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
11/12 Fitting:  LinearSVC TomekLink-base started...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
12/12 Fitting:  LinearSVC TomekLink-clean started...
Fitting:  LinearSVC finished. Elapsed time:  0 Seconds
Model fitting finished.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
evaluation[evaluation.model == "LinearSVC"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
86,LinearSVC,base,0.627352,0.907659,0.627352,0.719082,0.629191,0.908366,0.629191,0.720492
64,LinearSVC,SMOTE base,0.657486,0.657511,0.657486,0.657473,0.655492,0.908212,0.655492,0.740861
112,LinearSVC,SMOTE base,0.657512,0.657537,0.657512,0.657499,0.655492,0.908212,0.655492,0.740861
100,LinearSVC,SMOTE base,0.657486,0.657511,0.657486,0.657473,0.655492,0.908212,0.655492,0.740861
116,LinearSVC,ADASYN base,0.658053,0.658324,0.658053,0.657868,0.640027,0.908105,0.640027,0.728972
68,LinearSVC,ADASYN base,0.658079,0.658352,0.658079,0.657892,0.640027,0.908105,0.640027,0.728972
104,LinearSVC,ADASYN base,0.65738,0.657478,0.65738,0.657301,0.645558,0.906955,0.645558,0.733276
98,LinearSVC,base,0.655314,0.905175,0.655314,0.740765,0.655379,0.905144,0.655379,0.740805
101,LinearSVC,SMOTE clean,0.672057,0.672159,0.672057,0.672009,0.646775,0.904783,0.646775,0.735926
113,LinearSVC,SMOTE clean,0.67203,0.672132,0.67203,0.671981,0.646775,0.904783,0.646775,0.735926


In [102]:
svc = LinearSVC(class_weight="balanced") # ohne balanced wird die SVC in einigen Fällen nur auf 0 trainiert, d.h. precision = 0

In [103]:
fit_model(svc, "LinearSVC-balanced")

Starting model fitting.
1/12 Fitting:  LinearSVC-balanced BASE started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  2 Seconds
2/12 Fitting:  LinearSVC-balanced CLEAN started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  2 Seconds
3/12 Fitting:  LinearSVC-balanced SMOTE-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  1 Seconds
4/12 Fitting:  LinearSVC-balanced SMOTE-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  1 Seconds
5/12 Fitting:  LinearSVC-balanced BorderlineSMOTE-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  1 Seconds
6/12 Fitting:  LinearSVC-balanced BorderlineSMOTE-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  1 Seconds
7/12 Fitting:  LinearSVC-balanced ADASYN-base started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  1 Seconds
8/12 Fitting:  LinearSVC-balanced ADASYN-clean started...
Fitting:  LinearSVC-balanced finished. Elapsed time:  1 Seconds
9/

In [104]:
evaluation[evaluation.model == "LinearSVC-balanced"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
124,LinearSVC-balanced,SMOTE base,0.657486,0.657511,0.657486,0.657473,0.655492,0.908212,0.655492,0.740861
128,LinearSVC-balanced,ADASYN base,0.65738,0.657476,0.65738,0.657302,0.645558,0.906955,0.645558,0.733276
122,LinearSVC-balanced,base,0.655314,0.905175,0.655314,0.740765,0.655379,0.905144,0.655379,0.740805
125,LinearSVC-balanced,SMOTE clean,0.67203,0.672132,0.67203,0.671981,0.646775,0.904783,0.646775,0.735926
123,LinearSVC-balanced,clean,0.666494,0.909782,0.666494,0.750728,0.658469,0.904086,0.658469,0.74476
132,LinearSVC-balanced,TomekLink clean,0.667116,0.909612,0.667116,0.751092,0.658348,0.904076,0.658348,0.744669
133,LinearSVC-balanced,TomekLink clean,0.667116,0.909612,0.667116,0.751092,0.658348,0.904076,0.658348,0.744669
129,LinearSVC-balanced,ADASYN clean,0.662515,0.662915,0.662515,0.662471,0.634237,0.903982,0.634237,0.726333
126,LinearSVC-balanced,BorderlineSMOTE base,0.67018,0.670192,0.67018,0.670174,0.671069,0.903379,0.671069,0.75257
127,LinearSVC-balanced,BorderlineSMOTE clean,0.702162,0.702171,0.702162,0.702158,0.676432,0.901332,0.676432,0.758002


##### 3.2 Bayes-Classifier

In [82]:
from sklearn.naive_bayes import GaussianNB

In [83]:
bayes = GaussianNB()

In [84]:
fit_model(svc, "GaussianNB")

Starting model fitting.
1/12 Fitting:  GaussianNB BASE started...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  GaussianNB finished. Elapsed time:  1 Seconds
2/12 Fitting:  GaussianNB CLEAN started...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
3/12 Fitting:  GaussianNB SMOTE-base started...
Fitting:  GaussianNB finished. Elapsed time:  1 Seconds
4/12 Fitting:  GaussianNB SMOTE-clean started...
Fitting:  GaussianNB finished. Elapsed time:  1 Seconds
5/12 Fitting:  GaussianNB BorderlineSMOTE-base started...
Fitting:  GaussianNB finished. Elapsed time:  1 Seconds
6/12 Fitting:  GaussianNB BorderlineSMOTE-clean started...
Fitting:  GaussianNB finished. Elapsed time:  1 Seconds
7/12 Fitting:  GaussianNB ADASYN-base started...
Fitting:  GaussianNB finished. Elapsed time:  1 Seconds
8/12 Fitting:  GaussianNB ADASYN-clean started...
Fitting:  GaussianNB finished. Elapsed time:  1 Seconds
9/12 Fitting:  GaussianNB NearMiss-base started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
10/12 Fitting:  GaussianNB NearMiss-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
11/12 Fitting:  GaussianNB TomekLink-base started...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
12/12 Fitting:  GaussianNB TomekLink-clean started...
Fitting:  GaussianNB finished. Elapsed time:  0 Seconds
Model fitting finished.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
evaluation[evaluation.model == "GaussianNB"].sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
76,GaussianNB,SMOTE base,0.657486,0.657511,0.657486,0.657473,0.655492,0.908212,0.655492,0.740861
80,GaussianNB,ADASYN base,0.658105,0.658377,0.658105,0.657919,0.640027,0.908105,0.640027,0.728972
77,GaussianNB,SMOTE clean,0.672057,0.672159,0.672057,0.672009,0.646775,0.904783,0.646775,0.735926
78,GaussianNB,BorderlineSMOTE base,0.670206,0.670217,0.670206,0.6702,0.670843,0.903361,0.670843,0.752402
81,GaussianNB,ADASYN clean,0.660724,0.660814,0.660724,0.660504,0.666667,0.901573,0.666667,0.750843
79,GaussianNB,BorderlineSMOTE clean,0.702162,0.702171,0.702162,0.702158,0.676432,0.901332,0.676432,0.758002
83,GaussianNB,NearMiss clean,0.604101,0.604319,0.604101,0.603894,0.465341,0.882856,0.465341,0.583016
82,GaussianNB,NearMiss base,0.584812,0.586065,0.584812,0.583295,0.439892,0.882476,0.439892,0.555178
75,GaussianNB,clean,0.934487,0.873266,0.934487,0.90284,0.934539,0.873363,0.934539,0.902916
84,GaussianNB,TomekLink clean,0.934314,0.872943,0.934314,0.902586,0.934539,0.873363,0.934539,0.902916
