#### Imports & Downloads

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score

import Vectorizer

### Einladen der Daten

#### Originaldaten ohne umfassende Vorverarbeitung

In [2]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_basic_cleaned.csv'))
df = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [3]:
df['label'].value_counts()
positive = len(df[df['label'] == 1])
negative = len(df[df['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 2013
Negative: 27517
Verhältnis: 13.669647292598112


#### Vorverarbeitete Daten

In [4]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df_cleaned.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,@user when a father is dysfunctional and is s...,father selfish drag kid run,1,['#run'],
2,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use cause offer van,2,"['#lyft', '#disapointed', '#getthanked']",
3,0,bihday your majesty,bihday majesty,0,[],
4,0,#model i love u take with u all the time in ...,model take time mobile phone kiss sunglass mou...,0,['#model'],":mobile_phone:,:kissing_face_with_smiling_eyes..."
5,0,factsguide: society now #motivation,factsguide society motivation,0,['#motivation'],


In [5]:
df_cleaned.dropna(subset=['tweet_cleaned'], inplace=True)  #irgendwoher kommt eine na row in tweet_cleaned, dadurch funktioniert der Vectorizer nicht

In [6]:
df_cleaned['label'].value_counts()
positive = len(df_cleaned[df_cleaned['label'] == 1])
negative = len(df_cleaned[df_cleaned['label'] == 0])
print("Positive:", positive)
print("Negative:", negative)
print("Verhältnis:", negative / positive)

Positive: 1811
Negative: 25838
Verhältnis: 14.267255659856433


Verhältnis hat sich durch die Bereinigung sogar noch weiter verschlechter

### Train/Test Split

In [7]:
def print_pos_neg(y_train, y_test):
    positive = np.count_nonzero(y_train == 1)
    negative = np.count_nonzero(y_train == 0)
    print("Train")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

    positive = np.count_nonzero(y_test == 1)
    negative = np.count_nonzero(y_test == 0)
    print("Test")
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

In [8]:
X_base = df["tweet"]
y_base = df["label"]

In [9]:
# mit stratify
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, test_size=0.3, stratify=y_base,
                                                                        random_state=42)
print_pos_neg(y_train_base, y_test_base)

Train
- Positive: 1409
- Negative: 19262
- Verhältnis: 13.67068843151171
Test
- Positive: 604
- Negative: 8255
- Verhältnis: 13.667218543046358


In [10]:
X_clean = df_cleaned["tweet_cleaned"]
y_clean = df_cleaned["label"]

In [11]:
# mit stratify
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean, test_size=0.3,
                                                                            stratify=y_clean, random_state=42)
print_pos_neg(y_train_clean, y_test_clean)

Train
- Positive: 1268
- Negative: 18086
- Verhältnis: 14.263406940063092
Test
- Positive: 543
- Negative: 7752
- Verhältnis: 14.276243093922652


### Vektorisierung

In [12]:
X_train_base, X_test_base = Vectorizer.get_count_vectorized(X_train_base, X_test_base)

In [13]:
print("X_train_base shape", X_train_base.shape)
print("y_train_base shape", y_train_base.shape)

print("X_test_base shape", X_test_base.shape)
print("y_test_base shape", y_test_base.shape)

X_train_base shape (20671, 5000)
y_train_base shape (20671,)
X_test_base shape (8859, 5000)
y_test_base shape (8859,)


In [14]:
X_train_clean, X_test_clean = Vectorizer.get_count_vectorized(X_train_clean, X_test_clean)

In [15]:
print("X_train_clean shape", X_train_clean.shape)
print("y_train_clean shape", y_train_clean.shape)

print("X_test_clean shape", X_test_clean.shape)
print("y_test_clean shape", y_test_clean.shape)

X_train_clean shape (19354, 5000)
y_train_clean shape (19354,)
X_test_clean shape (8295, 5000)
y_test_clean shape (8295,)


### 2. Ensemble Models

In [16]:
evaluation = pd.DataFrame(
    columns=["model", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec",
             "test_rec", "test_f1"])

In [17]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train))
    train_recall = recall_score(y_data_train, model.predict(x_data_train))
    train_f1 = f1_score(y_data_train, model.predict(x_data_train))

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test))
    test_recall = recall_score(y_data_test, model.predict(x_data_test))
    test_f1 = f1_score(y_data_test, model.predict(x_data_test))

    evaluation.loc[len(evaluation.index)] = [model_name, variant, train_acc, train_precision, train_recall, train_f1,
                                             test_acc, test_precision, test_recall, test_f1]

In [18]:
def evaluate_model(model, x_test, y_test, sampling_method):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{sampling_method} model accuracy for classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

#### 2.1 Bagging

##### 2.1.1a RandomForest untuned

In [19]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train_clean, y_train_clean)

In [20]:
evaluate_model(rf, X_test_clean, y_test_clean, "none")
add_to_eval_df(rf, "RandomForestClassifier", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

none model accuracy for classification is = 95.24%
------------------------------------------------
Confusion Matrix:
      0    1
0  7653   99
1   296  247
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      7752
           1       0.71      0.45      0.56       543

    accuracy                           0.95      8295
   macro avg       0.84      0.72      0.77      8295
weighted avg       0.95      0.95      0.95      8295



In [21]:
cv = cross_val_score(rf, X_train_clean, y_train_clean, cv=10, scoring='f1', verbose=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END ................................ score: (test=0.525) total time=  10.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.8s remaining:    0.0s


[CV] END ................................ score: (test=0.515) total time=   2.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.6s remaining:    0.0s


[CV] END ................................ score: (test=0.559) total time=   2.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.2s remaining:    0.0s


[CV] END ................................ score: (test=0.551) total time=   2.5s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   18.8s remaining:    0.0s


[CV] END ................................ score: (test=0.462) total time=   2.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   21.6s remaining:    0.0s


[CV] END ................................ score: (test=0.567) total time=   2.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   24.3s remaining:    0.0s


[CV] END ................................ score: (test=0.529) total time=   2.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   27.0s remaining:    0.0s


[CV] END ................................ score: (test=0.500) total time=   2.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   29.7s remaining:    0.0s


[CV] END ................................ score: (test=0.552) total time=   2.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   32.5s remaining:    0.0s


[CV] END ................................ score: (test=0.570) total time=   2.6s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   35.2s finished


In [22]:
np.mean(cv)

0.5330083345631517

##### 2.1.1b RandomForest tuned

In [23]:
param_grid = {
    'n_estimators': [100, 150, 200, 250],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [9, 11, 13],
}

In [24]:
random_search = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1),
                                   param_grid,
                                   scoring='f1',
                                   verbose=10)
random_search.fit(X_train_clean, y_train_clean)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200
[CV 1/5; 1/10] END max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200;, score=0.000 total time=   0.4s
[CV 2/5; 1/10] START max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200
[CV 2/5; 1/10] END max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200;, score=0.000 total time=   0.4s
[CV 3/5; 1/10] START max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200
[CV 3/5; 1/10] END max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200;, score=0.000 total time=   0.4s
[CV 4/5; 1/10] START max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200
[CV 4/5; 1/10] END max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200;, score=0.000 total time=   0.4s
[CV 5/5; 1/10] START max_depth=3, max_features=log2, max_leaf_nodes=11, n_estimators=200
[CV 5/5; 1/10]

In [25]:
print(random_search.best_estimator_)

RandomForestClassifier(max_depth=9, max_features=None, max_leaf_nodes=11,
                       n_estimators=150, n_jobs=-1)


In [26]:
rf_tuned = RandomForestClassifier(**random_search.best_params_)
rf_tuned.fit(X_train_clean, y_train_clean)

In [27]:
evaluate_model(rf_tuned, X_test_clean, y_test_clean, "none")
add_to_eval_df(rf_tuned, "RandomForestClassifier-tuned", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

none model accuracy for classification is = 94.48%
------------------------------------------------
Confusion Matrix:
      0    1
0  7714   38
1   420  123
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7752
           1       0.76      0.23      0.35       543

    accuracy                           0.94      8295
   macro avg       0.86      0.61      0.66      8295
weighted avg       0.94      0.94      0.93      8295



##### 2.1.2a BalancedRandomForest untuned

In [28]:
brf = BalancedRandomForestClassifier(n_jobs=-1)
brf.fit(X_train_clean, y_train_clean)



In [29]:
evaluate_model(brf, X_test_clean, y_test_clean, "none")
add_to_eval_df(brf, "BalancedRandomForestClassifier", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

none model accuracy for classification is = 83.09%
------------------------------------------------
Confusion Matrix:
      0     1
0  6462  1290
1   113   430
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.83      0.90      7752
           1       0.25      0.79      0.38       543

    accuracy                           0.83      8295
   macro avg       0.62      0.81      0.64      8295
weighted avg       0.93      0.83      0.87      8295



In [30]:
cv = cross_val_score(brf, X_train_clean, y_train_clean, cv=10, scoring='f1', verbose=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END ................................ score: (test=0.415) total time=   1.1s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] END ................................ score: (test=0.383) total time=   0.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.1s remaining:    0.0s


[CV] END ................................ score: (test=0.353) total time=   0.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.1s remaining:    0.0s


[CV] END ................................ score: (test=0.422) total time=   0.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.0s remaining:    0.0s


[CV] END ................................ score: (test=0.377) total time=   0.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.9s remaining:    0.0s


[CV] END ................................ score: (test=0.427) total time=   0.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.8s remaining:    0.0s


[CV] END ................................ score: (test=0.425) total time=   0.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.7s remaining:    0.0s


[CV] END ................................ score: (test=0.405) total time=   0.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    7.7s remaining:    0.0s


[CV] END ................................ score: (test=0.407) total time=   0.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.5s remaining:    0.0s


[CV] END ................................ score: (test=0.448) total time=   0.9s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.5s finished


In [31]:
np.mean(cv)

0.4062549059505797

##### 2.1.2b BalancedRandomForest tuned

In [32]:
param_grid = {
    'n_estimators': [100, 150, 200, 250],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [9, 11, 13],
}

In [33]:
random_search = RandomizedSearchCV(BalancedRandomForestClassifier(n_jobs=-1),
                                   param_grid,
                                   scoring='f1',
                                   verbose=1)
random_search.fit(X_train_clean, y_train_clean)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [34]:
brf_tuned = BalancedRandomForestClassifier(**random_search.best_params_)
brf_tuned.fit(X_train_clean, y_train_clean)



In [35]:
evaluate_model(brf_tuned, X_test_clean, y_test_clean, "none")
add_to_eval_df(brf, "BalancedRandomForestClassifier-tuned", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

none model accuracy for classification is = 86.02%
------------------------------------------------
Confusion Matrix:
      0     1
0  6742  1010
1   150   393
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.87      0.92      7752
           1       0.28      0.72      0.40       543

    accuracy                           0.86      8295
   macro avg       0.63      0.80      0.66      8295
weighted avg       0.93      0.86      0.89      8295



#### 2.2 Boosting

##### 2.2.1a XGBClassifier

In [36]:
xgb = XGBClassifier()
xgb.fit(X_train_clean, y_train_clean)

In [37]:
evaluate_model(xgb, X_test_clean, y_test_clean, "none")
add_to_eval_df(xgb, "XGBClassifier", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

none model accuracy for classification is = 95.03%
------------------------------------------------
Confusion Matrix:
      0    1
0  7705   47
1   365  178
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      7752
           1       0.79      0.33      0.46       543

    accuracy                           0.95      8295
   macro avg       0.87      0.66      0.72      8295
weighted avg       0.94      0.95      0.94      8295



##### 2.2.1b XGBClassifier tuning
https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

Tuning mit GridSearchCV dauert bei großer param-Liste sehr lange, daher einzelne Schritte mit jeweils nur 1 bis max. 2 params


In [38]:
param_test1 = {
    'max_depth': range(3, 10, 2),
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [39]:
gsearch1.fit(X_train_clean, y_train_clean)
gsearch1.best_score_, gsearch1.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


(0.8850753455086607, {'max_depth': 9})

In [40]:
param_test2 = {
    'min_child_weight': range(1, 6, 2),
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test2, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [41]:
gsearch2.fit(X_train_clean, y_train_clean)
gsearch2.best_score_, gsearch2.best_params_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


(0.8850753455086607, {'min_child_weight': 1})

In [42]:
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch3 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test3, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [43]:
gsearch3.fit(X_train_clean, y_train_clean)
gsearch3.best_score_, gsearch3.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


(0.8863399229125143, {'gamma': 0.4})

In [44]:
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=1,
    gamma=0.2,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test4, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [45]:
gsearch4.fit(X_train_clean, y_train_clean)
gsearch4.best_score_, gsearch4.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


(0.8885172223991376, {'subsample': 0.9})

In [46]:
param_test5 = {
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=1,
    gamma=0.2,
    subsample=0.9,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [47]:
gsearch5.fit(X_train_clean, y_train_clean)
gsearch5.best_score_, gsearch5.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


(0.889120325690512, {'colsample_bytree': 0.9})

In [48]:
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=1,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
), param_grid=param_test6, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)

In [49]:
gsearch6.fit(X_train_clean, y_train_clean)
gsearch6.best_score_, gsearch6.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


(0.8858064219089586, {'reg_alpha': 0.01})

In [50]:
xgb_tune = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=9,
    min_child_weight=1,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.01
)
xgb_tune.fit(X_train_clean, y_train_clean)

In [51]:
evaluate_model(xgb_tune, X_test_clean, y_test_clean, "none")
add_to_eval_df(xgb_tune, "XGBClassifier-tuned", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

none model accuracy for classification is = 94.95%
------------------------------------------------
Confusion Matrix:
      0    1
0  7714   38
1   381  162
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7752
           1       0.81      0.30      0.44       543

    accuracy                           0.95      8295
   macro avg       0.88      0.65      0.70      8295
weighted avg       0.94      0.95      0.94      8295



##### 2.2.1 CatBoostClassifier

In [52]:
cat = CatBoostClassifier(iterations=100,
                         depth=5,
                         learning_rate=0.1,
                         loss_function='Logloss',
                         verbose=True)
cat.fit(X_train_clean, y_train_clean)

0:	learn: 0.5857646	total: 163ms	remaining: 16.2s
1:	learn: 0.5047629	total: 183ms	remaining: 8.95s
2:	learn: 0.4403289	total: 206ms	remaining: 6.66s
3:	learn: 0.3911966	total: 226ms	remaining: 5.41s
4:	learn: 0.3540777	total: 246ms	remaining: 4.67s
5:	learn: 0.3224361	total: 266ms	remaining: 4.17s
6:	learn: 0.2998539	total: 284ms	remaining: 3.78s
7:	learn: 0.2794173	total: 303ms	remaining: 3.49s
8:	learn: 0.2652137	total: 324ms	remaining: 3.27s
9:	learn: 0.2532695	total: 344ms	remaining: 3.09s
10:	learn: 0.2440774	total: 365ms	remaining: 2.95s
11:	learn: 0.2376842	total: 384ms	remaining: 2.81s
12:	learn: 0.2307308	total: 405ms	remaining: 2.71s
13:	learn: 0.2255237	total: 426ms	remaining: 2.62s
14:	learn: 0.2212797	total: 449ms	remaining: 2.55s
15:	learn: 0.2180272	total: 470ms	remaining: 2.47s
16:	learn: 0.2151227	total: 490ms	remaining: 2.39s
17:	learn: 0.2122265	total: 512ms	remaining: 2.33s
18:	learn: 0.2100145	total: 532ms	remaining: 2.27s
19:	learn: 0.2077474	total: 554ms	remaini

<catboost.core.CatBoostClassifier at 0x1a28f16dcc0>

In [53]:
evaluate_model(cat, X_test_clean, y_test_clean, "none")
add_to_eval_df(cat, "CatBoostClassifier", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

none model accuracy for classification is = 94.60%
------------------------------------------------
Confusion Matrix:
      0    1
0  7732   20
1   428  115
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7752
           1       0.85      0.21      0.34       543

    accuracy                           0.95      8295
   macro avg       0.90      0.60      0.66      8295
weighted avg       0.94      0.95      0.93      8295



##### 2.2.1 LGBMClassifier

In [54]:
light = LGBMClassifier()
light.fit(X_train_clean, y_train_clean)

TypeError: Expected np.float32 or np.float64, met type(int64)

In [None]:
evaluate_model(light, X_test_clean, y_test_clean, "none")
add_to_eval_df(light, "LGBMClassifier", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

#### 2.3 Stacking

##### 2.3.1 Stacking: RandomForest + LinearSVC

In [None]:
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('svc', LinearSVC(random_state=42))
]
stack = StackingClassifier(estimators=base_models, n_jobs=-1)

In [None]:
stack.fit(X_train_base, y_train_base)

In [None]:
evaluate_model(stack, X_test_clean, y_test_clean, "none")
add_to_eval_df(stack, "StackingClassifier", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

#### Model comparision

In [55]:
evaluation.drop_duplicates(inplace=True)

In [56]:
evaluation.sort_values(by=["test_prec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
6,CatBoostClassifier,clean,0.948073,0.904615,0.231861,0.369115,0.945992,0.851852,0.211786,0.339233
5,XGBClassifier-tuned,clean,0.954893,0.89899,0.350946,0.504821,0.949488,0.81,0.298343,0.43607
4,XGBClassifier,clean,0.957942,0.912727,0.395899,0.552255,0.950332,0.791111,0.327808,0.463542
1,RandomForestClassifier-tuned,clean,0.943836,0.756374,0.210568,0.329426,0.944786,0.763975,0.226519,0.349432
0,RandomForestClassifier,clean,0.999793,1.0,0.996845,0.99842,0.952381,0.713873,0.45488,0.555681
2,BalancedRandomForestClassifier,clean,0.866901,0.329776,0.999211,0.49589,0.830862,0.25,0.791897,0.380027
3,BalancedRandomForestClassifier-tuned,clean,0.866901,0.329776,0.999211,0.49589,0.830862,0.25,0.791897,0.380027


In [57]:
evaluation.sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
2,BalancedRandomForestClassifier,clean,0.866901,0.329776,0.999211,0.49589,0.830862,0.25,0.791897,0.380027
3,BalancedRandomForestClassifier-tuned,clean,0.866901,0.329776,0.999211,0.49589,0.830862,0.25,0.791897,0.380027
0,RandomForestClassifier,clean,0.999793,1.0,0.996845,0.99842,0.952381,0.713873,0.45488,0.555681
4,XGBClassifier,clean,0.957942,0.912727,0.395899,0.552255,0.950332,0.791111,0.327808,0.463542
5,XGBClassifier-tuned,clean,0.954893,0.89899,0.350946,0.504821,0.949488,0.81,0.298343,0.43607
1,RandomForestClassifier-tuned,clean,0.943836,0.756374,0.210568,0.329426,0.944786,0.763975,0.226519,0.349432
6,CatBoostClassifier,clean,0.948073,0.904615,0.231861,0.369115,0.945992,0.851852,0.211786,0.339233


In [58]:
evaluation.sort_values(by=["test_f1"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
0,RandomForestClassifier,clean,0.999793,1.0,0.996845,0.99842,0.952381,0.713873,0.45488,0.555681
4,XGBClassifier,clean,0.957942,0.912727,0.395899,0.552255,0.950332,0.791111,0.327808,0.463542
5,XGBClassifier-tuned,clean,0.954893,0.89899,0.350946,0.504821,0.949488,0.81,0.298343,0.43607
2,BalancedRandomForestClassifier,clean,0.866901,0.329776,0.999211,0.49589,0.830862,0.25,0.791897,0.380027
3,BalancedRandomForestClassifier-tuned,clean,0.866901,0.329776,0.999211,0.49589,0.830862,0.25,0.791897,0.380027
1,RandomForestClassifier-tuned,clean,0.943836,0.756374,0.210568,0.329426,0.944786,0.763975,0.226519,0.349432
6,CatBoostClassifier,clean,0.948073,0.904615,0.231861,0.369115,0.945992,0.851852,0.211786,0.339233


#### Auswertung

- keine Unterschiede zu TF-IDF erkennbar
