In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

import pandas as pd

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [2]:
from src.functions.vectorize_functions import vectorize_tfidf, vectorize_glove, vectorize_w2v

#### Funktionen

In [3]:
def simple_evaluation(y_test, y_pred):
    """Print the accuracy and the classification report for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; passed together with ``y_test`` to
        ``accuracy_score`` and ``classification_report``.

    Returns
    -------
    None
        The results are only written to standard output.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    report = classification_report(y_test, y_pred)
    print("\nClassification Report:\n", report)

In [4]:
# Result table for the untuned models: add_to_eval_df appends one row per model.
evaluation = pd.DataFrame(
    columns=["model", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc",
             "test_prec",
             "test_rec", "test_f1"])


def add_to_eval_df(model, model_name, x_data_train, y_data_train, x_data_test, y_data_test):
    """Score a fitted model on the train and test split and append a row to ``evaluation``.

    Parameters
    ----------
    model : estimator
        Already fitted object exposing ``score(X, y)`` and ``predict(X)``.
    model_name : str
        Label written to the ``model`` column.
    x_data_train, y_data_train
        Training features and labels.
    x_data_test, y_data_test
        Test features and labels.

    Notes
    -----
    ``*_acc`` is whatever ``model.score`` returns (mean accuracy for
    scikit-learn classifiers). Precision, recall and F1 are computed with the
    scikit-learn metric functions called with their default arguments.
    The row is appended in place to the module-level ``evaluation`` frame, so
    calling the function twice for the same model produces two rows.
    """
    # Predict once per split and reuse the result for all three metrics;
    # inference on large feature matrices is the expensive part here.
    train_acc = model.score(x_data_train, y_data_train)
    y_pred_train = model.predict(x_data_train)
    train_precision = precision_score(y_data_train, y_pred_train)
    train_recall = recall_score(y_data_train, y_pred_train)
    train_f1 = f1_score(y_data_train, y_pred_train)

    test_acc = model.score(x_data_test, y_data_test)
    y_pred_test = model.predict(x_data_test)
    test_precision = precision_score(y_data_test, y_pred_test)
    test_recall = recall_score(y_data_test, y_pred_test)
    test_f1 = f1_score(y_data_test, y_pred_test)

    # len(index) is the next free integer label, i.e. this appends a new row.
    evaluation.loc[len(evaluation.index)] = [model_name, train_acc, train_precision, train_recall,
                                             train_f1,
                                             test_acc, test_precision, test_recall, test_f1]

In [22]:
# Result table for the tuned models: add_to_tuning_eval_df appends one row per run.
evaluation_tuning = pd.DataFrame(
    columns=["model", "tuning", "best_params", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc",
             "test_prec",
             "test_rec", "test_f1"])


def add_to_tuning_eval_df(model, model_name, tuning, best_params, x_data_train, y_data_train, x_data_test, y_data_test):
    """Score a tuned model on the train and test split and append a row to ``evaluation_tuning``.

    Parameters
    ----------
    model : estimator
        Already fitted object exposing ``score(X, y)`` and ``predict(X)``.
    model_name : str
        Label written to the ``model`` column.
    tuning
        Description of the tuning procedure; stored unchanged in ``tuning``.
    best_params
        Selected hyperparameters; stored unchanged in ``best_params``.
    x_data_train, y_data_train
        Training features and labels.
    x_data_test, y_data_test
        Test features and labels.

    Notes
    -----
    ``*_acc`` is whatever ``model.score`` returns (mean accuracy for
    scikit-learn classifiers). Precision, recall and F1 are computed with the
    scikit-learn metric functions called with their default arguments.
    The row is appended in place to the module-level ``evaluation_tuning`` frame.
    """
    # Predict once per split and reuse the result for all three metrics.
    train_acc = model.score(x_data_train, y_data_train)
    y_pred_train = model.predict(x_data_train)
    train_precision = precision_score(y_data_train, y_pred_train)
    train_recall = recall_score(y_data_train, y_pred_train)
    train_f1 = f1_score(y_data_train, y_pred_train)

    test_acc = model.score(x_data_test, y_data_test)
    y_pred_test = model.predict(x_data_test)
    test_precision = precision_score(y_data_test, y_pred_test)
    test_recall = recall_score(y_data_test, y_pred_test)
    test_f1 = f1_score(y_data_test, y_pred_test)

    # The row must use the ``best_params`` argument; the previous code referenced
    # an undefined name here and raised NameError on every call.
    evaluation_tuning.loc[len(evaluation_tuning.index)] = [model_name, tuning, best_params,
                                                           train_acc, train_precision, train_recall,
                                                           train_f1,
                                                           test_acc, test_precision, test_recall, test_f1]

#### Daten einlesen

In [5]:
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated form only resolves on Windows.
df_train = pd.read_csv("../../../data/mixed_dataset/train_cleaned.csv", index_col=0)
# Keep only rows that have both the raw and the cleaned tweet text.
df_train = df_train[df_train["tweet"].notna() & df_train["tweet_cleaned"].notna()]
df_train.head()

Unnamed: 0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
140232,0,"Nicki's ""that's not good"" pout bought to you b...","nickis "" good pout buy botox trade mark mkr",0,['#mkr'],__trade_mark__
6083,0,@user @user @user @user @user @user greater g...,great game ever fun possible,6,"['#fun', '#possible']",
62913,0,I love that you enjoy being trans.,love enjoy tran,0,[],
13525,0,yes good shapely eurotrash manbooty,yes good eurotrash,0,[],
6318,0,i can't wait for euro 2016 to sta on friday #e...,wait euro sta friday euro,0,['#euro2016'],


In [6]:
# Show column dtypes, non-null counts and memory usage of the filtered training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81904 entries, 140232 to 145236
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   label          81904 non-null  int64 
 1   tweet          81904 non-null  object
 2   tweet_cleaned  81904 non-null  object
 3   user_handle    81904 non-null  int64 
 4   hashtags       81904 non-null  object
 5   emojis         7605 non-null   object
dtypes: int64(2), object(4)
memory usage: 4.4+ MB


#### Vergleich von Vektorisierungsarten

##### TF-IDF

In [7]:
# TF-IDF representation of the cleaned tweets.
# NOTE(review): presumably the helper also performs the train/test split and
# returns the fitted vectorizer as the fifth value — confirm in vectorize_functions.
X_train_tf, X_test_tf, y_train_tf, y_test_tf, tfidf_vectorizer = vectorize_tfidf(
    df=df_train,
    text_column="tweet_cleaned",
    label_column="label",
)

##### Word2Vec

In [8]:
# Word2Vec representation of the cleaned tweets, using the same text and label columns.
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = vectorize_w2v(
    df=df_train,
    text_column="tweet_cleaned",
    label_column="label",
)

##### GloVe

In [9]:
# GloVe representation of the cleaned tweets, using the same text and label columns.
X_train_gl, X_test_gl, y_train_gl, y_test_gl = vectorize_glove(
    df=df_train,
    text_column="tweet_cleaned",
    label_column="label",
)

##### Vergleich der Vektorisierungsarten mit Baseline-Modell (RandomForest)

In [10]:
# Baseline for the TF-IDF features: RandomForest with default hyperparameters.
# NOTE(review): no random_state is passed, so the scores are not exactly reproducible.
rf_base_tf = RandomForestClassifier(n_jobs=-1).fit(X_train_tf, y_train_tf)
y_pred_base_tf = rf_base_tf.predict(X_test_tf)

In [11]:
# Report accuracy and per-class metrics for the TF-IDF baseline.
simple_evaluation(y_test=y_test_tf, y_pred=y_pred_base_tf)

Accuracy: 0.8488523522708774

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.98      0.91     20155
           1       0.71      0.27      0.39      4417

    accuracy                           0.85     24572
   macro avg       0.78      0.62      0.65     24572
weighted avg       0.83      0.85      0.82     24572



In [12]:
# Baseline for the Word2Vec features: RandomForest with default hyperparameters.
# NOTE(review): no random_state is passed, so the scores are not exactly reproducible.
rf_base_w2v = RandomForestClassifier(n_jobs=-1).fit(X_train_w2v, y_train_w2v)
y_pred_base_w2v = rf_base_w2v.predict(X_test_w2v)

In [13]:
# Report accuracy and per-class metrics for the Word2Vec baseline.
simple_evaluation(y_test=y_test_w2v, y_pred=y_pred_base_w2v)

Accuracy: 0.8338759563731076

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.99      0.91     20155
           1       0.71      0.13      0.22      4417

    accuracy                           0.83     24572
   macro avg       0.78      0.56      0.56     24572
weighted avg       0.82      0.83      0.78     24572



In [14]:
# Baseline for the GloVe features: RandomForest with default hyperparameters.
# NOTE(review): no random_state is passed, so the scores are not exactly reproducible.
rf_base_gl = RandomForestClassifier(n_jobs=-1).fit(X_train_gl, y_train_gl)
y_pred_base_gl = rf_base_gl.predict(X_test_gl)

In [15]:
# Report accuracy and per-class metrics for the GloVe baseline.
simple_evaluation(y_test=y_test_gl, y_pred=y_pred_base_gl)

Accuracy: 0.8291551359270715

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.99      0.90     20155
           1       0.68      0.09      0.16      4417

    accuracy                           0.83     24572
   macro avg       0.76      0.54      0.53     24572
weighted avg       0.81      0.83      0.77     24572



###### Entscheidung: TF-IDF
==> W2V und GLOVE sind beide schlechter als TF-IDF bei F1 und Recall des Targets

W2V und GLOVE werden nicht mehr betrachtet

In [19]:
# Continue with the TF-IDF split under shorter names (plain aliases, no copies).
X_train = X_train_tf
X_test = X_test_tf
y_train = y_train_tf
y_test = y_test_tf

In [20]:
# Sanity check: print the dimensions of every part of the selected split.
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(split_name, split_data.shape)

X_train (57332, 13198)
y_train (57332,)
X_test (24572, 13198)
y_test (24572,)


#### Vergleich von Modellen ohne Hyperparametertuning

##### RandomForestClassifier

In [21]:
# Untuned RandomForest on the selected TF-IDF split.
# NOTE(review): no random_state is passed, so the scores are not exactly reproducible.
rf_base = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)

add_to_eval_df(
    model=rf_base,
    model_name="RandomForestClassifier",
    x_data_train=X_train,
    y_data_train=y_train,
    x_data_test=X_test,
    y_data_test=y_test,
)

KeyboardInterrupt: 

##### BalancedRandomForestClassifier

In [None]:
# Untuned BalancedRandomForest (imbalanced-learn) on the selected TF-IDF split.
brf_base = BalancedRandomForestClassifier(n_jobs=-1).fit(X_train, y_train)

add_to_eval_df(
    model=brf_base,
    model_name="BalancedRandomForestClassifier",
    x_data_train=X_train,
    y_data_train=y_train,
    x_data_test=X_test,
    y_data_test=y_test,
)

##### XGBClassifier

In [None]:
# Untuned XGBoost classifier on the selected TF-IDF split.
xgb_base = XGBClassifier(n_jobs=-1)
xgb_base.fit(X=X_train, y=y_train)

add_to_eval_df(
    model=xgb_base,
    model_name="XGBClassifier",
    x_data_train=X_train,
    y_data_train=y_train,
    x_data_test=X_test,
    y_data_test=y_test,
)

##### CatBoostClassifier

In [None]:
# Untuned CatBoost classifier (library defaults) on the selected TF-IDF split.
cat_base = CatBoostClassifier()
cat_base.fit(X_train, y_train)

add_to_eval_df(
    model=cat_base,
    model_name="CatBoostClassifier",
    x_data_train=X_train,
    y_data_train=y_train,
    x_data_test=X_test,
    y_data_test=y_test,
)

##### LGBMClassifier

In [None]:
# Untuned LightGBM classifier on the selected TF-IDF split.
lgb_base = LGBMClassifier(n_jobs=-1)
lgb_base.fit(X_train, y_train)

add_to_eval_df(
    model=lgb_base,
    model_name="LGBMClassifier",
    x_data_train=X_train,
    y_data_train=y_train,
    x_data_test=X_test,
    y_data_test=y_test,
)

##### StackingClassifier

In [None]:
# Members of the stack: a random forest and a linear SVM, both with a fixed seed.
base_models_stack = [
    ("rf", RandomForestClassifier(random_state=42)),
    ("svc", LinearSVC(random_state=42)),
]
# No final_estimator is passed, so StackingClassifier uses its default meta-learner.
stack_base = StackingClassifier(estimators=base_models_stack, n_jobs=-1).fit(X_train, y_train)

add_to_eval_df(
    model=stack_base,
    model_name="StackingClassifier",
    x_data_train=X_train,
    y_data_train=y_train,
    x_data_test=X_test,
    y_data_test=y_test,
)

##### VotingClassifier

In [None]:
# Members of the majority-vote ensemble.
# NOTE(review): GaussianNB requires dense input. If vectorize_tfidf returns a
# scipy sparse matrix, this member needs a dense conversion or a replacement
# (e.g. MultinomialNB) — confirm against the vectorizer output.
base_models_vote = [
    ("lr", LogisticRegression(random_state=42)),
    ("rf", RandomForestClassifier(n_estimators=50, random_state=42)),
    ("gnb", GaussianNB()),
]

vote_base = VotingClassifier(estimators=base_models_vote, voting="hard", n_jobs=-1)
# The ensemble must be fitted before scoring; without this call
# add_to_eval_df fails with NotFittedError on the first score/predict.
vote_base.fit(X_train, y_train)

add_to_eval_df(
    model=vote_base,
    model_name="VotingClassifier",
    x_data_train=X_train,
    y_data_train=y_train,
    x_data_test=X_test,
    y_data_test=y_test,
)

##### Vergleich der Modelle ohne Tuning

In [None]:
# Show the collected metrics of the untuned models (at most ten rows).
evaluation.head(n=10)

In [None]:
# Rank the untuned models by their F1 score on the test split, best first.
evaluation.sort_values(by="test_f1", ascending=False)

###### Entscheidung

#### Hyperparametertuning ausgewählter Modelle

##### Modell 1: RandomForestClassifier

In [None]:
# Search space for the RandomForest tuning: 4 * 3 * 3 * 3 = 108 combinations.
# It is not yet passed to a search object in the visible part of the notebook.
param_grid = dict(
    n_estimators=[100, 150, 200, 250],
    max_features=["sqrt", "log2", None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[9, 11, 13],
)

##### Modell 2:

##### Modell 3: