In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import pickle
import joblib

In [12]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

## Daten Einladen und TFIDF Vektorisierung

In [3]:
%run ../../functions/vectorize_functions.py

In [4]:
filepath_name = (('../../../data/mixed_dataset/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [5]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, vectorizer_tfidf = vectorize_tfidf(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [6]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)
print(type(X_train_tfidf))

(57332, 13198)
(24572, 13198)
(57332,)
(24572,)
<class 'scipy.sparse._csr.csr_matrix'>


In [7]:
# save vectorizer

#with open('saved_models/vectorizer/vectorizer_nb_tfidf.pkl','wb') as f:
 #   pickle.dump(vectorizer_tfidf, f)

joblib.dump(vectorizer_tfidf, 'saved_models/vectorizer/vectorizer_tfidf.joblib')

['saved_models/vectorizer/vectorizer_tfidf.joblib']

## Multinomial NB

In [8]:
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf,y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [13]:
evaluate(y_test_tfidf, y_pred_tfidf)

Accuracy: 0.8317190297900049
F1 Score: 0.17809580600278274
Recall: 0.10142630744849446
Precision: 0.7296416938110749


Confusion Matrix
       0    1
0  19989  166
1   3969  448


Classification Report
              precision    recall  f1-score   support

           0       0.83      0.99      0.91     20155
           1       0.73      0.10      0.18      4417

    accuracy                           0.83     24572
   macro avg       0.78      0.55      0.54     24572
weighted avg       0.82      0.83      0.78     24572



In [18]:
# Model speichern
#with open('saved_models/model_nb_tfidf_mn.pkl','wb') as f:
 #   pickle.dump(clf_tfidf,f)
joblib.dump(clf_tfidf, 'saved_models/tfidf_mn/model_tfidf_nm.joblib')

['saved_models/tfidf_mn/model_tfidf_nmjoblib']

In [19]:
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'fit_prior' : [True, False],
    'class_prior' : [None, [.25,.75], [.5,.5]]
}

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = MultinomialNB(**params)  
    model.fit(X_train_tfidf, y_train_tfidf)  

    y_train_pred_tfidf = model.predict(X_train_tfidf)

    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)

    test_report = classification_report(y_test_tfidf, y_test_pred_tfidf, output_dict=True)

    result_dict = {
        'model': 'TF-IDF (Multi)',
        'alpha': params['alpha'],
        'fit_prior': params['fit_prior'],
        'class_prior': params['class_prior'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1,
        'test_recall_1': test_report['1']['recall'],
    }

    results_list.append(result_dict)

    filename = f'saved_models/tfidf_mn/model_tfidf_comp_{idx}.joblib'
    #with open(filename,'wb') as file:
     #   pickle.dump(model,file)
    joblib.dump(model, filename)



results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_grid_tfidf_mixed_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 42 candidates, totalling 126 fits
             model         alpha  fit_prior   class_prior  train_accuracy  \
0   TF-IDF (Multi)  1.000000e-10       True          None        0.867526   
1   TF-IDF (Multi)  1.000000e-10      False          None        0.783629   
2   TF-IDF (Multi)  1.000000e-10       True  [0.25, 0.75]        0.593787   
3   TF-IDF (Multi)  1.000000e-10      False  [0.25, 0.75]        0.593787   
4   TF-IDF (Multi)  1.000000e-10       True    [0.5, 0.5]        0.783629   
5   TF-IDF (Multi)  1.000000e-10      False    [0.5, 0.5]        0.783629   
6   TF-IDF (Multi)  1.000000e-02       True          None        0.866846   
7   TF-IDF (Multi)  1.000000e-02      False          None        0.780594   
8   TF-IDF (Multi)  1.000000e-02       True  [0.25, 0.75]        0.571688   
9   TF-IDF (Multi)  1.000000e-02      False  [0.25, 0.75]        0.571688   
10  TF-IDF (Multi)  1.000000e-02       True    [0.5, 0.5]        0.780594   
11  TF-IDF (Mu

In [20]:
results_df.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,alpha,fit_prior,class_prior,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1,test_recall_1
28,TF-IDF (Multi),0.5,True,"[0.5, 0.5]",0.778396,0.834334,0.439396,0.575637,0.728309,0.747113,0.372503,0.497138,0.747113
25,TF-IDF (Multi),0.5,False,,0.778396,0.834334,0.439396,0.575637,0.728309,0.747113,0.372503,0.497138,0.747113
29,TF-IDF (Multi),0.5,False,"[0.5, 0.5]",0.778396,0.834334,0.439396,0.575637,0.728309,0.747113,0.372503,0.497138,0.747113
23,TF-IDF (Multi),0.25,False,"[0.5, 0.5]",0.773512,0.865511,0.435306,0.57927,0.722692,0.759565,0.368398,0.496155,0.759565
22,TF-IDF (Multi),0.25,True,"[0.5, 0.5]",0.773512,0.865511,0.435306,0.57927,0.722692,0.759565,0.368398,0.496155,0.759565
19,TF-IDF (Multi),0.25,False,,0.773512,0.865511,0.435306,0.57927,0.722692,0.759565,0.368398,0.496155,0.759565
17,TF-IDF (Multi),0.1,False,"[0.5, 0.5]",0.774454,0.879454,0.437334,0.584172,0.725623,0.750509,0.370184,0.495812,0.750509
16,TF-IDF (Multi),0.1,True,"[0.5, 0.5]",0.774454,0.879454,0.437334,0.584172,0.725623,0.750509,0.370184,0.495812,0.750509
13,TF-IDF (Multi),0.1,False,,0.774454,0.879454,0.437334,0.584172,0.725623,0.750509,0.370184,0.495812,0.750509
31,TF-IDF (Multi),0.75,False,,0.785164,0.801123,0.446351,0.573289,0.735878,0.717908,0.376827,0.494233,0.717908


In [21]:
results_df.sort_values(by=['test_recall_1'],ascending=False)

Unnamed: 0,model,alpha,fit_prior,class_prior,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1,test_recall_1
27,TF-IDF (Multi),0.5,False,"[0.25, 0.75]",0.450359,0.987994,0.245336,0.393066,0.424792,0.97736,0.235246,0.379216,0.97736
26,TF-IDF (Multi),0.5,True,"[0.25, 0.75]",0.450359,0.987994,0.245336,0.393066,0.424792,0.97736,0.235246,0.379216,0.97736
32,TF-IDF (Multi),0.75,True,"[0.25, 0.75]",0.444098,0.98383,0.242709,0.389363,0.418566,0.975549,0.23307,0.37625,0.975549
33,TF-IDF (Multi),0.75,False,"[0.25, 0.75]",0.444098,0.98383,0.242709,0.389363,0.418566,0.975549,0.23307,0.37625,0.975549
20,TF-IDF (Multi),0.25,True,"[0.25, 0.75]",0.47797,0.990608,0.255373,0.406065,0.447949,0.970342,0.241874,0.387225,0.970342
21,TF-IDF (Multi),0.25,False,"[0.25, 0.75]",0.47797,0.990608,0.255373,0.406065,0.447949,0.970342,0.241874,0.387225,0.970342
38,TF-IDF (Multi),1.0,True,"[0.25, 0.75]",0.445284,0.980345,0.24266,0.389027,0.419624,0.96921,0.232587,0.375148,0.96921
39,TF-IDF (Multi),1.0,False,"[0.25, 0.75]",0.445284,0.980345,0.24266,0.389027,0.419624,0.96921,0.232587,0.375148,0.96921
14,TF-IDF (Multi),0.1,True,"[0.25, 0.75]",0.516919,0.991867,0.270604,0.425203,0.481564,0.947023,0.250659,0.396399,0.947023
15,TF-IDF (Multi),0.1,False,"[0.25, 0.75]",0.516919,0.991867,0.270604,0.425203,0.481564,0.947023,0.250659,0.396399,0.947023


## Complement NB

In [22]:
clf_tfidf_comp = ComplementNB()
clf_tfidf_comp.fit(X_train_tfidf,y_train_tfidf)
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [23]:
evaluate(y_test_tfidf, y_pred_tfidf_comp)

Accuracy: 0.7451163926420316
F1 Score: 0.4908544020811316
Recall: 0.6834955852388499
Precision: 0.38292744799594114


Confusion Matrix
       0     1
0  15290  4865
1   1398  3019


Classification Report
              precision    recall  f1-score   support

           0       0.92      0.76      0.83     20155
           1       0.38      0.68      0.49      4417

    accuracy                           0.75     24572
   macro avg       0.65      0.72      0.66     24572
weighted avg       0.82      0.75      0.77     24572



In [24]:
# Model speichern
#with open('saved_models/tfidf_comp/model_nb_tfidf_comp.pkl','wb') as f:
 #   pickle.dump(clf_tfidf_comp,f)
joblib.dump(clf_tfidf, 'saved_models/tfidf_comp/model_tfidf_comp.joblib')

['saved_models/tfidf_comp/model_tfidf_comp.joblib']

In [25]:
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'norm' : [True, False]
}

cnb = ComplementNB()

grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = ComplementNB(**params)  
    model.fit(X_train_tfidf, y_train_tfidf)  

    y_train_pred_tfidf = model.predict(X_train_tfidf)

    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)

    test_report = classification_report(y_test_tfidf, y_test_pred_tfidf, output_dict=True)

    result_dict = {
        'model': 'TF-IDF (Comp)',
        'alpha': params['alpha'],
        'norm': params['norm'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1,
        'test_recall_1': test_report['1']['recall'],
    }

    results_list.append(result_dict)

    filename = f'saved_models/tfidf_comp/model_nb_tfidf_comp_{idx}.joblib'
    #with open(filename,'wb') as file:
     #   pickle.dump(model,file)
    joblib.dump(model, filename)

results_df_comp = pd.DataFrame(results_list)

results_df_comp.to_csv('eval_data/cnb_grid_tfidf_mixed_dataset.csv', index=False)

print(results_df_comp)


Fitting 3 folds for each of 14 candidates, totalling 42 fits
            model         alpha   norm  train_accuracy  train_recall  \
0   TF-IDF (Comp)  1.000000e-10   True        0.305693      1.000000   
1   TF-IDF (Comp)  1.000000e-10  False        0.783629      0.886909   
2   TF-IDF (Comp)  1.000000e-02   True        0.440173      0.998354   
3   TF-IDF (Comp)  1.000000e-02  False        0.780594      0.886232   
4   TF-IDF (Comp)  1.000000e-01   True        0.645434      0.963981   
5   TF-IDF (Comp)  1.000000e-01  False        0.774454      0.879454   
6   TF-IDF (Comp)  2.500000e-01   True        0.775082      0.864349   
7   TF-IDF (Comp)  2.500000e-01  False        0.773512      0.865511   
8   TF-IDF (Comp)  5.000000e-01   True        0.842095      0.684353   
9   TF-IDF (Comp)  5.000000e-01  False        0.778396      0.834334   
10  TF-IDF (Comp)  7.500000e-01   True        0.855909      0.545217   
11  TF-IDF (Comp)  7.500000e-01  False        0.785164      0.801123   
12 

In [26]:
results_df_comp.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,alpha,norm,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1,test_recall_1
8,TF-IDF (Comp),0.5,True,0.842095,0.684353,0.549568,0.609599,0.796191,0.575957,0.447966,0.503962,0.575957
9,TF-IDF (Comp),0.5,False,0.778396,0.834334,0.439396,0.575637,0.728309,0.747113,0.372503,0.497138,0.747113
6,TF-IDF (Comp),0.25,True,0.775082,0.864349,0.437148,0.580637,0.724483,0.757528,0.369928,0.497103,0.757528
7,TF-IDF (Comp),0.25,False,0.773512,0.865511,0.435306,0.57927,0.722692,0.759565,0.368398,0.496155,0.759565
5,TF-IDF (Comp),0.1,False,0.774454,0.879454,0.437334,0.584172,0.725623,0.750509,0.370184,0.495812,0.750509
11,TF-IDF (Comp),0.75,False,0.785164,0.801123,0.446351,0.573289,0.735878,0.717908,0.376827,0.494233,0.717908
13,TF-IDF (Comp),1.0,False,0.791914,0.762006,0.453812,0.568847,0.745116,0.683496,0.382927,0.490854,0.683496
3,TF-IDF (Comp),0.01,False,0.780594,0.886232,0.44525,0.592715,0.726884,0.700249,0.364741,0.479646,0.700249
10,TF-IDF (Comp),0.75,True,0.855909,0.545217,0.612398,0.576858,0.818452,0.45755,0.494616,0.475362,0.45755
1,TF-IDF (Comp),1e-10,False,0.783629,0.886909,0.449086,0.596257,0.725094,0.673308,0.358919,0.468236,0.673308


In [27]:
results_df_comp.sort_values(by=['test_recall_1'],ascending=False)

Unnamed: 0,model,alpha,norm,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1,test_recall_1
0,TF-IDF (Comp),1e-10,True,0.305693,1.0,0.206008,0.341636,0.296435,0.966493,0.199402,0.330597,0.966493
2,TF-IDF (Comp),0.01,True,0.440173,0.998354,0.243241,0.391176,0.415717,0.957211,0.229833,0.370666,0.957211
4,TF-IDF (Comp),0.1,True,0.645434,0.963981,0.332843,0.494831,0.597143,0.887254,0.294219,0.441901,0.887254
7,TF-IDF (Comp),0.25,False,0.773512,0.865511,0.435306,0.57927,0.722692,0.759565,0.368398,0.496155,0.759565
6,TF-IDF (Comp),0.25,True,0.775082,0.864349,0.437148,0.580637,0.724483,0.757528,0.369928,0.497103,0.757528
5,TF-IDF (Comp),0.1,False,0.774454,0.879454,0.437334,0.584172,0.725623,0.750509,0.370184,0.495812,0.750509
9,TF-IDF (Comp),0.5,False,0.778396,0.834334,0.439396,0.575637,0.728309,0.747113,0.372503,0.497138,0.747113
11,TF-IDF (Comp),0.75,False,0.785164,0.801123,0.446351,0.573289,0.735878,0.717908,0.376827,0.494233,0.717908
3,TF-IDF (Comp),0.01,False,0.780594,0.886232,0.44525,0.592715,0.726884,0.700249,0.364741,0.479646,0.700249
13,TF-IDF (Comp),1.0,False,0.791914,0.762006,0.453812,0.568847,0.745116,0.683496,0.382927,0.490854,0.683496


## Test Dataset

In [28]:
filepath_name_test = (('../../../data/mixed_dataset/test_cleaned.csv'))
df_test = pd.read_csv(filepath_name_test, encoding ='utf-8')

In [29]:
df_test = df_test[df_test['tweet_cleaned'].notna()]

In [30]:
#with open('saved_models/vectorizer/vectorizer_nb_tfidf.pkl','rb') as f:
 #   vectorizer_tfidf_saved = pickle.load(f)
vec_tfidf_save = joblib.load('saved_models/vectorizer/vectorizer_tfidf.joblib')

In [31]:
x_test = vec_tfidf_save.transform(df_test['tweet_cleaned'])

In [32]:
print(x_test.shape)

(32466, 13198)


In [33]:
y_test = df_test['label']

In [34]:
y_test

0        0
1        0
2        0
3        0
4        0
        ..
32577    0
32578    0
32579    0
32580    0
32581    1
Name: label, Length: 32466, dtype: int64

# Test bestes F1-Model

In [36]:
# import Comp Model 8 
#with open('saved_models/tfidf_comp/model_nb_tfidf_comp_8.pkl','rb') as f:
 #   clf_comp_8 = pickle.load(f)
clf_comp_8 = joblib.load('saved_models/tfidf_comp/model_nb_tfidf_comp_8.joblib')

In [37]:
y_pred = clf_comp_8.predict(x_test)

In [38]:
evaluate(y_test, y_pred)

Accuracy: 0.7743793507053532
F1 Score: 0.514932785908218
Recall: 0.5685041672759176
Precision: 0.47058823529411764


Confusion Matrix
       0     1
0  21253  4374
1   2951  3888


Classification Report
              precision    recall  f1-score   support

           0       0.88      0.83      0.85     25627
           1       0.47      0.57      0.51      6839

    accuracy                           0.77     32466
   macro avg       0.67      0.70      0.68     32466
weighted avg       0.79      0.77      0.78     32466



In [39]:
# Save Confusionmatrix
confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))

In [40]:
confusion_matrix_df

Unnamed: 0,0,1
0,21253,4374
1,2951,3888


In [52]:
confusion_matrix_df.to_csv("eval_data/nb_comp_tfidf_confusion_matrix.csv", index=True)

In [42]:
# Save Classificationreport
classification_report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()

In [43]:
classification_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.878078,0.829321,0.853003,25627.0
1,0.470588,0.568504,0.514933,6839.0
accuracy,0.774379,0.774379,0.774379,0.774379
macro avg,0.674333,0.698912,0.683968,32466.0
weighted avg,0.79224,0.774379,0.781788,32466.0


In [44]:
classification_report_df.to_csv("eval_data/nb_comp_tfidf_classification_report.csv", index=True)

# Test bestes Recall(1)-Model

In [45]:
clf_comp_0 = joblib.load('saved_models/tfidf_comp/model_nb_tfidf_comp_0.joblib')

In [46]:
y_pred_rec = clf_comp_0.predict(x_test)

In [47]:
evaluate(y_test, y_pred_rec)

Accuracy: 0.2587630136142426
F1 Score: 0.35920649713753156
Recall: 0.9862553004825266
Precision: 0.21959239484307852


Confusion Matrix
      0      1
0  1656  23971
1    94   6745


Classification Report
              precision    recall  f1-score   support

           0       0.95      0.06      0.12     25627
           1       0.22      0.99      0.36      6839

    accuracy                           0.26     32466
   macro avg       0.58      0.53      0.24     32466
weighted avg       0.79      0.26      0.17     32466



In [48]:
# Save Confusionmatrix
confusion_matrix_df_rec = pd.DataFrame(confusion_matrix(y_test, y_pred))

In [49]:
confusion_matrix_df_rec

Unnamed: 0,0,1
0,21253,4374
1,2951,3888


In [51]:
confusion_matrix_df_rec.to_csv("eval_data/nb_comp_tfidf_rec_confusion_matrix.csv", index=True)

In [55]:
# Save Classificationreport
classification_report_df_rec = pd.DataFrame(classification_report(y_test, y_pred_rec, output_dict=True)).transpose()

In [56]:
classification_report_df_rec

Unnamed: 0,precision,recall,f1-score,support
0,0.946286,0.064619,0.120977,25627.0
1,0.219592,0.986255,0.359206,6839.0
accuracy,0.258763,0.258763,0.258763,0.258763
macro avg,0.582939,0.525437,0.240092,32466.0
weighted avg,0.793207,0.258763,0.171161,32466.0


In [57]:
classification_report_df_rec.to_csv("eval_data/nb_comp_tfidf_rec_classification_report.csv", index=True)