In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    matthews_corrcoef, 
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import pickle
import joblib

In [2]:
def evaluate(y_test, y_pred):
    """Print classification metrics, the confusion matrix and the classification report.

    All scores are computed with scikit-learn's default arguments.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Label/scorer pairs in the order in which they are reported.
    scorers = (
        ("F1 Score:", f1_score),
        ("Recall:", recall_score),
        ("Precision:", precision_score),
        ("Accuracy:", accuracy_score),
        ("MCC:", matthews_corrcoef),
    )
    # Compute every score before printing so a failing metric produces no partial output.
    scores = [(label, scorer(y_test, y_pred)) for label, scorer in scorers]
    for label, value in scores:
        print(label, value)

    print("\n")
    print("Confusion Matrix")
    matrix = confusion_matrix(y_test, y_pred)
    print(pd.DataFrame(matrix))

    print("\n")
    print("Classification Report")
    report = classification_report(y_test, y_pred)
    print(report)

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
# Execute the shared helper script inside this kernel. `vectorize_w2v` (called below) is not
# defined in this notebook, so it presumably comes from this script — TODO confirm.
%run ../../functions/vectorize_functions.py

In [4]:
# Read train_cleaned.csv of the mixed dataset into a DataFrame.
filepath_name = '../../../data/mixed_dataset/train_cleaned.csv'
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [5]:
# Vectorize the cleaned tweets; the result is unpacked into train/test features,
# train/test labels and the vectorizer object.
(
    X_train,
    X_test,
    y_train,
    y_test,
    vectorizer_w2v,
) = vectorize_w2v(
    df=df_cleaned,
    text_column='tweet_cleaned',
    label_column="label",
)

In [6]:
# Sanity check: dimensions of every split, then the container type of the features.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)
print(type(X_train))

(57332, 300)
(24572, 300)
(57332,)
(24572,)
<class 'numpy.ndarray'>


In [7]:
# Persist the vectorizer; the test-dataset section reloads it from this path.
vectorizer_path = 'saved_models/vectorizer/vectorizer_w2v.joblib'
joblib.dump(vectorizer_w2v, vectorizer_path)

['saved_models/vectorizer/vectorizer_w2v.joblib']

## GaussianNB

In [8]:
# Baseline: Gaussian Naive Bayes with default hyperparameters.
clf = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = clf.predict(X_test)

In [9]:
# Report the baseline's metrics on the test split returned by vectorize_w2v.
evaluate(y_test, y_pred)

F1 Score: 0.42211377937588296
Recall: 0.7441702513017886
Precision: 0.2946132472886977
Accuracy: 0.6337294481523685
MCC: 0.27278263117780793


Confusion Matrix
       0     1
0  12285  7870
1   1130  3287


Classification Report
              precision    recall  f1-score   support

           0       0.92      0.61      0.73     20155
           1       0.29      0.74      0.42      4417

    accuracy                           0.63     24572
   macro avg       0.61      0.68      0.58     24572
weighted avg       0.80      0.63      0.68     24572



In [10]:
# Save the baseline model

joblib.dump(clf, 'saved_models/nb_w2v/model_nb_w2v.joblib')

['saved_models/w2v/model_nb_w2v.joblib']

In [10]:
def _split_metrics(prefix, y_true, y_hat):
    """Return F1, recall, precision, accuracy and MCC for one split, keyed ``<prefix>_<metric>``."""
    return {
        f'{prefix}_f1': f1_score(y_true, y_hat),
        f'{prefix}_recall': recall_score(y_true, y_hat),
        f'{prefix}_precision': precision_score(y_true, y_hat),
        f'{prefix}_accuracy': accuracy_score(y_true, y_hat),
        f'{prefix}_mcc': matthews_corrcoef(y_true, y_hat),
    }


results_list = []

# Candidate class priors (None: priors are derived from the training data) and
# variance-smoothing values; 6 x 11 = 66 combinations.
param_grid = {
    'priors' : [None,[.1,.9], [.25,.75], [.5,.5], [.75,.25],[.9,.1]],
    'var_smoothing' : [1.0e-10,1e-9, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
}

nb = GaussianNB()

# 3-fold cross-validated search scored on F1.
# NOTE(review): the loop below only uses the search to enumerate the parameter combinations;
# its cross-validation scores (grid_search.cv_results_ / best_params_) are not consulted.
grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

# Refit every candidate on the full training split, score it on train and test,
# and persist the fitted model.
for idx, params in enumerate(grid_search.cv_results_['params']):
    model = GaussianNB(**params)
    model.fit(X_train, y_train)

    # Key order defines the column order of the CSV written below.
    result_dict = {
        'model': 'W2V',
        'priors': params['priors'],
        'var_smoothing': params['var_smoothing'],
        **_split_metrics('train', y_train, model.predict(X_train)),
        **_split_metrics('test', y_test, model.predict(X_test)),
    }

    results_list.append(result_dict)

    # One file per candidate; idx equals the candidate's row index in results_df.
    filename = f'saved_models/nb_w2v/model_nb_w2v_{idx}.joblib'
    joblib.dump(model, filename)

results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_w2v_mixed_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 66 candidates, totalling 198 fits
   model      priors  var_smoothing  train_f1  train_recall  train_precision  \
0    W2V        None   1.000000e-10  0.431948      0.744868         0.304167   
1    W2V        None   1.000000e-09  0.431948      0.744868         0.304167   
2    W2V        None   1.000000e-02  0.431779      0.750387         0.303090   
3    W2V        None   5.000000e-02  0.429237      0.766751         0.298043   
4    W2V        None   1.000000e-01  0.427459      0.783792         0.293861   
..   ...         ...            ...       ...           ...              ...   
61   W2V  [0.9, 0.1]   4.000000e-01  0.422285      0.798122         0.287093   
62   W2V  [0.9, 0.1]   5.000000e-01  0.421974      0.800639         0.286481   
63   W2V  [0.9, 0.1]   6.000000e-01  0.421112      0.798412         0.285972   
64   W2V  [0.9, 0.1]   8.000000e-01  0.421110      0.790376         0.287015   
65   W2V  [0.9, 0.1]   1.000000e+00  0.421624      0.77856

In [11]:
# Rank all candidates by F1 on the test split, best first.
results_df.sort_values(by='test_f1', ascending=False)

Unnamed: 0,model,priors,var_smoothing,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
56,W2V,"[0.9, 0.1]",1.000000e-09,0.433883,0.727053,0.309203,0.658219,0.287882,0.425462,0.727870,0.300580,0.646630,0.276255
55,W2V,"[0.9, 0.1]",1.000000e-10,0.433883,0.727053,0.309203,0.658219,0.287882,0.425462,0.727870,0.300580,0.646630,0.276255
57,W2V,"[0.9, 0.1]",1.000000e-02,0.433519,0.732088,0.307934,0.655341,0.287798,0.424525,0.731266,0.299074,0.643619,0.275169
58,W2V,"[0.9, 0.1]",5.000000e-02,0.432170,0.748451,0.303792,0.645695,0.287413,0.423691,0.748245,0.295511,0.634096,0.275526
59,W2V,"[0.9, 0.1]",1.000000e-01,0.430575,0.763846,0.299780,0.636050,0.286748,0.423385,0.765678,0.292586,0.625102,0.276933
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,W2V,"[0.1, 0.9]",5.000000e-01,0.379590,0.929899,0.238467,0.452417,0.233189,0.375839,0.931854,0.235388,0.443635,0.227586
32,W2V,"[0.25, 0.75]",1.000000e+00,0.376724,0.936483,0.235787,0.441778,0.229605,0.372770,0.936609,0.232690,0.433420,0.223015
19,W2V,"[0.1, 0.9]",6.000000e-01,0.376301,0.938420,0.235334,0.439615,0.229345,0.372726,0.941816,0.232337,0.430164,0.224402
20,W2V,"[0.1, 0.9]",8.000000e-01,0.370421,0.950136,0.230055,0.418178,0.221405,0.365760,0.949966,0.226480,0.407781,0.213019


In [12]:
# Rank all candidates by recall on the test split, best first.
results_df.sort_values(by='test_recall', ascending=False)

Unnamed: 0,model,priors,var_smoothing,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
21,W2V,"[0.1, 0.9]",1.000000e+00,0.365747,0.959431,0.225939,0.400562,0.214979,0.360819,0.956984,0.222322,0.390526,0.205001
20,W2V,"[0.1, 0.9]",8.000000e-01,0.370421,0.950136,0.230055,0.418178,0.221405,0.365760,0.949966,0.226480,0.407781,0.213019
19,W2V,"[0.1, 0.9]",6.000000e-01,0.376301,0.938420,0.235334,0.439615,0.229345,0.372726,0.941816,0.232337,0.430164,0.224402
32,W2V,"[0.25, 0.75]",1.000000e+00,0.376724,0.936483,0.235787,0.441778,0.229605,0.372770,0.936609,0.232690,0.433420,0.223015
18,W2V,"[0.1, 0.9]",5.000000e-01,0.379590,0.929899,0.238467,0.452417,0.233189,0.375839,0.931854,0.235388,0.443635,0.227586
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,W2V,,1.000000e-09,0.431948,0.744868,0.304167,0.647073,0.286729,0.422114,0.744170,0.294613,0.633729,0.272783
0,W2V,,1.000000e-10,0.431948,0.744868,0.304167,0.647073,0.286729,0.422114,0.744170,0.294613,0.633729,0.272783
57,W2V,"[0.9, 0.1]",1.000000e-02,0.433519,0.732088,0.307934,0.655341,0.287798,0.424525,0.731266,0.299074,0.643619,0.275169
55,W2V,"[0.9, 0.1]",1.000000e-10,0.433883,0.727053,0.309203,0.658219,0.287882,0.425462,0.727870,0.300580,0.646630,0.276255


## Test Dataset

In [16]:
# Read test_cleaned.csv of the mixed dataset into a DataFrame.
filepath_name_test = '../../../data/mixed_dataset/test_cleaned.csv'
df_test = pd.read_csv(filepath_name_test, encoding='utf-8')

In [17]:
# Keep only rows that actually have a cleaned tweet text.
df_test = df_test.dropna(subset=['tweet_cleaned'])

In [18]:
# Reload the vectorizer that was written with joblib.dump earlier in this notebook
# (this joblib call replaces an older pickle-based loader).

vectorizer_w2v_saved = joblib.load('saved_models/vectorizer/vectorizer_w2v.joblib')

In [40]:
# Tokenize every cleaned tweet. `word_tokenize` is not imported in this notebook;
# presumably it is provided by the %run of vectorize_functions.py — TODO confirm.
x_test_tokenized = df_test['tweet_cleaned'].map(word_tokenize)

In [27]:
def w2v_vector(tokenized_tweet, vector_size, model=None):
    """Average the word vectors of one tokenized tweet.

    Parameters
    ----------
    tokenized_tweet : iterable of str
        Tokens of a single tweet.
    vector_size : int
        Dimensionality of the word vectors.
    model : object, optional
        Object whose ``.wv[word]`` lookup returns a NumPy vector and raises ``KeyError``
        for unknown words. Defaults to the notebook-level ``vectorizer_w2v_saved``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, vector_size)`` holding the mean of the vectors of all known
        tokens; all zeros when no token is known (or the tweet is empty).
    """
    if model is None:
        # Backward-compatible default: the vectorizer loaded at notebook level.
        model = vectorizer_w2v_saved
    vec = np.zeros((1, vector_size))
    count = 0
    for word in tokenized_tweet:
        try:
            vec += model.wv[word].reshape((1, vector_size))
        except KeyError:
            # Token without a vector: it does not contribute to the mean.
            continue
        count += 1
    if count != 0:
        vec /= count
    return vec

In [23]:
import numpy as np

In [28]:
# Fill an (n_tweets, 300) matrix with one averaged word vector per tweet.
n_tweets = len(x_test_tokenized)
X_test_w2v = np.zeros((n_tweets, 300))
for row, tokens in enumerate(x_test_tokenized):
    X_test_w2v[row, :] = w2v_vector(tokens, 300)

In [58]:
#X_test, X_empty, y_test, y_empty, vectorizer_w2v  = vectorize_w2v(df=df_test, text_column='tweet_cleaned', 
 #                                                                                label_column="label", test_size=1)

In [29]:
# NOTE: rebinds `y_test`; from here on it holds the labels of test_cleaned.csv rather than
# the test split returned by vectorize_w2v.
y_test = df_test['label']

In [30]:
# Sanity check: features and labels must have the same number of rows.
for part in (X_test_w2v, y_test):
    print(part.shape)
print(type(X_test_w2v))

(32466, 300)
(32466,)
<class 'numpy.ndarray'>


# Test bestes F1-Model

In [31]:
# Load grid-search candidate 0 from 'saved_models/nb_w2v/', the directory the grid-search
# loop in this notebook writes its models to. The previous 'saved_models/w2v/' path is not
# written by the current code, so it could only contain models from an earlier run.
# NOTE(review): the section title announces the best-F1 model, while index 0 is the first
# grid candidate (priors=None, var_smoothing=1e-10) — confirm the intended index.
clf_w2v_0 = joblib.load('saved_models/nb_w2v/model_nb_w2v_0.joblib')

In [32]:
# Predict labels for the vectorized tweets of test_cleaned.csv (rebinds `y_pred`).
y_pred = clf_w2v_0.predict(X_test_w2v)

In [33]:
# Report the loaded model's metrics on the labels of test_cleaned.csv.
evaluate(y_test, y_pred)

Accuracy: 0.5861824678124807
F1 Score: 0.4465955431066442
Recall: 0.7926597455768387
Precision: 0.31087280651450855


Confusion Matrix
       0      1
0  13610  12017
1   1418   5421


Classification Report
              precision    recall  f1-score   support

           0       0.91      0.53      0.67     25627
           1       0.31      0.79      0.45      6839

    accuracy                           0.59     32466
   macro avg       0.61      0.66      0.56     32466
weighted avg       0.78      0.59      0.62     32466



In [34]:
# Wrap the confusion matrix in a DataFrame so it can be displayed and written to CSV.
cm = confusion_matrix(y_test, y_pred)
confusion_matrix_df = pd.DataFrame(cm)

In [35]:
# Display the confusion matrix (rows and columns are labelled 0 and 1 by default).
confusion_matrix_df

Unnamed: 0,0,1
0,13610,12017
1,1418,5421


In [36]:
# Write the confusion matrix to CSV; index=True keeps the row labels as the first column.
confusion_matrix_df.to_csv("eval_data/nb_w2v_confusion_matrix.csv", index=True)

In [37]:
# Turn the classification report into a DataFrame with one row per class/average.
report_dict = classification_report(y_test, y_pred, output_dict=True)
classification_report_df = pd.DataFrame(report_dict).transpose()

In [38]:
# Display the classification report table.
classification_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.905643,0.531081,0.669536,25627.0
1,0.310873,0.79266,0.446596,6839.0
accuracy,0.586182,0.586182,0.586182,0.586182
macro avg,0.608258,0.66187,0.558066,32466.0
weighted avg,0.780354,0.586182,0.622574,32466.0


In [39]:
# Write the classification report to CSV; index=True keeps the row labels as the first column.
classification_report_df.to_csv("eval_data/nb_w2v_classification_report.csv", index=True)