In [25]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    matthews_corrcoef, 
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import joblib

In [26]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("Accuracy:", accuracy)
    print("MCC:", mcc)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [27]:
%run ../../../functions/vectorize_functions.py

In [28]:
filepath_name = (('../../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [29]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, vectorizer_tfidf = vectorize_tfidf(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [30]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)
print(type(X_train_tfidf))

(14124, 15658)
(6054, 15658)
(14124,)
(6054,)
<class 'scipy.sparse._csr.csr_matrix'>


In [31]:
# save vectorizer

joblib.dump(vectorizer_tfidf, 'joblib_models/vectorizer/vectorizer_tfidf.joblib')

['joblib_models/vectorizer/vectorizer_tfidf.joblib']

## Multinomial NB

In [32]:
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf,y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [33]:
evaluate(y_test_tfidf, y_pred_tfidf)

F1 Score: 0.08854166666666667
Recall: 0.04632152588555858
Precision: 1.0
Accuracy: 0.9421869838123554
MCC: 0.20889231109144332


Confusion Matrix
      0   1
0  5687   0
1   350  17


Classification Report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5687
           1       1.00      0.05      0.09       367

    accuracy                           0.94      6054
   macro avg       0.97      0.52      0.53      6054
weighted avg       0.95      0.94      0.92      6054



In [34]:
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    'fit_prior' : [True, False],
    'class_prior' : [None,[.1,.9], [.25,.75], [.5,.5], [.75,.25],[.9,.1]]
}

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = MultinomialNB(**params)  
    model.fit(X_train_tfidf, y_train_tfidf)  

    y_train_pred_tfidf = model.predict(X_train_tfidf)

    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)
    train_mcc = matthews_corrcoef(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)
    test_mcc = matthews_corrcoef(y_test_tfidf, y_test_pred_tfidf)

    result_dict = {
        'model': 'TF-IDF (Multi)',
        'alpha': params['alpha'],
        'fit_prior': params['fit_prior'],
        'class_prior': params['class_prior'],
        'train_f1': train_f1,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_accuracy': train_accuracy,
        'train_mcc': train_mcc,
        'test_f1': test_f1,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_accuracy': test_accuracy,
        'test_mcc': test_mcc
    }

    results_list.append(result_dict)

    filename = f'joblib_models/nb_mn_tfidf/model_tfidf_mn_{idx}.joblib'
    joblib.dump(model, filename)

results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_mn_tfidf.csv', index=False)

print(results_df)


Fitting 3 folds for each of 120 candidates, totalling 360 fits
              model         alpha  fit_prior   class_prior  train_f1  \
0    TF-IDF (Multi)  1.000000e-10       True          None  0.929748   
1    TF-IDF (Multi)  1.000000e-10      False          None  0.863841   
2    TF-IDF (Multi)  1.000000e-10       True    [0.1, 0.9]  0.710652   
3    TF-IDF (Multi)  1.000000e-10      False    [0.1, 0.9]  0.710652   
4    TF-IDF (Multi)  1.000000e-10       True  [0.25, 0.75]  0.772563   
..              ...           ...        ...           ...       ...   
115  TF-IDF (Multi)  1.000000e+00      False    [0.5, 0.5]  0.654046   
116  TF-IDF (Multi)  1.000000e+00       True  [0.75, 0.25]  0.600000   
117  TF-IDF (Multi)  1.000000e+00      False  [0.75, 0.25]  0.600000   
118  TF-IDF (Multi)  1.000000e+00       True    [0.9, 0.1]  0.292294   
119  TF-IDF (Multi)  1.000000e+00      False    [0.9, 0.1]  0.292294   

     train_recall  train_precision  train_accuracy  train_mcc   test_f1 

In [35]:
results_df.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,alpha,fit_prior,class_prior,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
45,TF-IDF (Multi),0.10,False,"[0.75, 0.25]",0.857558,0.918050,0.804545,0.979184,0.848528,0.609043,0.623978,0.594805,0.951437,0.583359
44,TF-IDF (Multi),0.10,True,"[0.75, 0.25]",0.857558,0.918050,0.804545,0.979184,0.848528,0.609043,0.623978,0.594805,0.951437,0.583359
33,TF-IDF (Multi),0.05,False,"[0.75, 0.25]",0.877580,0.948133,0.816801,0.981946,0.870705,0.600780,0.629428,0.574627,0.949290,0.574444
32,TF-IDF (Multi),0.05,True,"[0.75, 0.25]",0.877580,0.948133,0.816801,0.981946,0.870705,0.600780,0.629428,0.574627,0.949290,0.574444
20,TF-IDF (Multi),0.01,True,"[0.75, 0.25]",0.910331,0.968880,0.858456,0.986973,0.905258,0.593707,0.591281,0.596154,0.950942,0.567608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,TF-IDF (Multi),0.60,True,,0.437247,0.280083,0.996310,0.950793,0.514729,0.209246,0.117166,0.977273,0.946316,0.328671
118,TF-IDF (Multi),1.00,True,"[0.9, 0.1]",0.292294,0.171162,1.000000,0.943430,0.401702,0.131980,0.070845,0.962963,0.943508,0.253087
119,TF-IDF (Multi),1.00,False,"[0.9, 0.1]",0.292294,0.171162,1.000000,0.943430,0.401702,0.131980,0.070845,0.962963,0.943508,0.253087
96,TF-IDF (Multi),0.80,True,,0.286222,0.167012,1.000000,0.943146,0.396746,0.122449,0.065395,0.960000,0.943178,0.242693


## Complement NB

In [36]:
clf_tfidf_comp = ComplementNB()
clf_tfidf_comp.fit(X_train_tfidf,y_train_tfidf)
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [37]:
evaluate(y_test_tfidf, y_pred_tfidf_comp)

F1 Score: 0.525730180806676
Recall: 0.5149863760217984
Precision: 0.5369318181818182
Accuracy: 0.9436736042286091
MCC: 0.495924850355595


Confusion Matrix
      0    1
0  5524  163
1   178  189


Classification Report
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5687
           1       0.54      0.51      0.53       367

    accuracy                           0.94      6054
   macro avg       0.75      0.74      0.75      6054
weighted avg       0.94      0.94      0.94      6054



In [38]:
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    'norm' : [True, False]
}

cnb = ComplementNB()

grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = ComplementNB(**params)  
    model.fit(X_train_tfidf, y_train_tfidf)  

    y_train_pred_tfidf = model.predict(X_train_tfidf)

    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)
    train_mcc = matthews_corrcoef(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)
    test_mcc = matthews_corrcoef(y_test_tfidf, y_test_pred_tfidf)

    result_dict = {
        'model': 'TF-IDF (Comp)',
        'alpha': params['alpha'],
        'norm': params['norm'],
        'train_f1': train_f1,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_accuracy': train_accuracy,
        'train_mcc': train_mcc,
        'test_f1': test_f1,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_accuracy': test_accuracy,
        'test_mcc': test_mcc
    }

    results_list.append(result_dict)

    filename = f'joblib_models/nb_comp_tfidf/model_tfidf_comp_{idx}.joblib'
    joblib.dump(model, filename)

results_df_comp = pd.DataFrame(results_list)

results_df_comp.to_csv('eval_data/nb_comp_tfidf.csv', index=False)

print(results_df_comp)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
            model         alpha   norm  train_f1  train_recall  \
0   TF-IDF (Comp)  1.000000e-10   True  0.212335      1.000000   
1   TF-IDF (Comp)  1.000000e-10  False  0.863841      0.993776   
2   TF-IDF (Comp)  1.000000e-02   True  0.517862      1.000000   
3   TF-IDF (Comp)  1.000000e-02  False  0.792718      0.993776   
4   TF-IDF (Comp)  5.000000e-02   True  0.760948      0.982365   
5   TF-IDF (Comp)  5.000000e-02  False  0.720393      0.987552   
6   TF-IDF (Comp)  1.000000e-01   True  0.844875      0.949170   
7   TF-IDF (Comp)  1.000000e-01  False  0.681901      0.975104   
8   TF-IDF (Comp)  2.000000e-01   True  0.851620      0.845436   
9   TF-IDF (Comp)  2.000000e-01  False  0.665948      0.961618   
10  TF-IDF (Comp)  4.000000e-01   True  0.791027      0.695021   
11  TF-IDF (Comp)  4.000000e-01  False  0.670043      0.892116   
12  TF-IDF (Comp)  5.000000e-01   True  0.745993      0.627593   
13  TF-IDF (Com

In [39]:
results_df_comp.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,alpha,norm,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
6,TF-IDF (Comp),0.1,True,0.844875,0.94917,0.761231,0.976211,0.838105,0.586248,0.662125,0.525974,0.943343,0.560516
8,TF-IDF (Comp),0.2,True,0.85162,0.845436,0.857895,0.979892,0.840862,0.57732,0.53406,0.628205,0.952593,0.554426
10,TF-IDF (Comp),0.4,True,0.791027,0.695021,0.917808,0.974936,0.786483,0.549565,0.430518,0.759615,0.957218,0.552514
11,TF-IDF (Comp),0.4,False,0.670043,0.892116,0.536494,0.940031,0.664367,0.542553,0.694823,0.445026,0.928973,0.52084
4,TF-IDF (Comp),0.05,True,0.760948,0.982365,0.620984,0.957873,0.762556,0.541267,0.768392,0.417778,0.921044,0.530185
9,TF-IDF (Comp),0.2,False,0.665948,0.961618,0.509341,0.934155,0.672712,0.537879,0.773842,0.412192,0.919392,0.527965
3,TF-IDF (Comp),0.01,False,0.792718,0.993776,0.659326,0.964528,0.793706,0.529762,0.72752,0.416537,0.921705,0.513245
19,TF-IDF (Comp),1.0,False,0.654046,0.716805,0.601393,0.948244,0.629128,0.52573,0.514986,0.536932,0.943674,0.495925
15,TF-IDF (Comp),0.6,False,0.668622,0.827801,0.560787,0.943996,0.653749,0.525565,0.60218,0.466245,0.934093,0.495408
13,TF-IDF (Comp),0.5,False,0.673171,0.858921,0.553476,0.943076,0.662264,0.525253,0.637602,0.446565,0.930129,0.497846
