In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    matthews_corrcoef, 
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import joblib

In [2]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("Accuracy:", accuracy)
    print("MCC:", mcc)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
%run ../../../functions/vectorize_functions.py

In [4]:
filepath_name = (('../../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [6]:
X_train, X_test, y_train, y_test, vectorizer_w2v = vectorize_w2v(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [7]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(type(X_train))

(14124, 300)
(6054, 300)
(14124,)
(6054,)
<class 'numpy.ndarray'>


In [8]:
# save vectorizer

joblib.dump(vectorizer_w2v, 'joblib_models/vectorizer/vectorizer_w2v.joblib')

['joblib_models/vectorizer/vectorizer_w2v.joblib']

## GaussianNB

In [9]:
clf = GaussianNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [10]:
evaluate(y_test, y_pred)

F1 Score: 0.2825896762904638
Recall: 0.8801089918256131
Precision: 0.16831683168316833
Accuracy: 0.7291047241493228
MCC: 0.30744404457305663


Confusion Matrix
      0     1
0  4091  1596
1    44   323


Classification Report
              precision    recall  f1-score   support

           0       0.99      0.72      0.83      5687
           1       0.17      0.88      0.28       367

    accuracy                           0.73      6054
   macro avg       0.58      0.80      0.56      6054
weighted avg       0.94      0.73      0.80      6054



In [11]:
results_list = []

param_grid = {
    'priors' : [None,[.1,.9], [.25,.75], [.5,.5], [.75,.25],[.9,.1]],
    'var_smoothing' : [1.0e-10,1e-9, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
}

nb = GaussianNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = GaussianNB(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)
    
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_mcc = matthews_corrcoef(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_mcc = matthews_corrcoef(y_test, y_test_pred)

    result_dict = {
        'model': 'W2V',
        'priors': params['priors'],
        'var_smoothing': params['var_smoothing'],
        'train_f1': train_f1,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_accuracy': train_accuracy,
        'train_mcc': train_mcc,
        'test_f1': test_f1,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_accuracy': test_accuracy,
        'test_mcc': test_mcc
    }

    results_list.append(result_dict)

    filename = f'joblib_models/nb_w2v/model_nb_w2v_{idx}.joblib'
    joblib.dump(model, filename)

results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_w2v.csv', index=False)

print(results_df)


Fitting 3 folds for each of 66 candidates, totalling 198 fits
   model      priors  var_smoothing  train_f1  train_recall  train_precision  \
0    W2V        None   1.000000e-10  0.294941      0.852697         0.178308   
1    W2V        None   1.000000e-09  0.294941      0.852697         0.178308   
2    W2V        None   1.000000e-02  0.290928      0.859959         0.175079   
3    W2V        None   5.000000e-02  0.276465      0.885892         0.163790   
4    W2V        None   1.000000e-01  0.264857      0.910788         0.154959   
..   ...         ...            ...       ...           ...              ...   
61   W2V  [0.9, 0.1]   4.000000e-01  0.222061      0.953320         0.125667   
62   W2V  [0.9, 0.1]   5.000000e-01  0.216501      0.959544         0.122016   
63   W2V  [0.9, 0.1]   6.000000e-01  0.211942      0.964730         0.119048   
64   W2V  [0.9, 0.1]   8.000000e-01  0.207746      0.970954         0.116317   
65   W2V  [0.9, 0.1]   1.000000e+00  0.204587      0.97614

In [13]:
results_df.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,priors,var_smoothing,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
0,W2V,,1.000000e-10,0.294941,0.852697,0.178308,0.721750,0.303789,0.282590,0.880109,0.168317,0.729105,0.307444
1,W2V,,1.000000e-09,0.294941,0.852697,0.178308,0.721750,0.303789,0.282590,0.880109,0.168317,0.729105,0.307444
56,W2V,"[0.9, 0.1]",1.000000e-09,0.294097,0.852697,0.177691,0.720617,0.302865,0.281359,0.880109,0.167444,0.727453,0.306117
55,W2V,"[0.9, 0.1]",1.000000e-10,0.294097,0.852697,0.177691,0.720617,0.302865,0.281359,0.880109,0.167444,0.727453,0.306117
45,W2V,"[0.75, 0.25]",1.000000e-09,0.291792,0.853734,0.175968,0.717148,0.300541,0.279170,0.880109,0.165896,0.724480,0.303749
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,W2V,"[0.25, 0.75]",8.000000e-01,0.194453,0.978216,0.107956,0.446828,0.200448,0.181266,0.975477,0.099916,0.465808,0.198287
43,W2V,"[0.5, 0.5]",1.000000e+00,0.194085,0.980290,0.107705,0.444350,0.200347,0.180535,0.975477,0.099472,0.463165,0.197122
20,W2V,"[0.1, 0.9]",8.000000e-01,0.191532,0.985477,0.106074,0.432172,0.197436,0.177800,0.975477,0.097814,0.453089,0.192712
32,W2V,"[0.25, 0.75]",1.000000e+00,0.190305,0.987552,0.105298,0.426437,0.195918,0.176355,0.975477,0.096940,0.447638,0.190346
