In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import pickle
import joblib

In [2]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
%run ../../functions/vectorize_functions.py

In [4]:
filepath_name = (('../../../data/mixed_dataset/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [5]:
X_train, X_test, y_train, y_test, vectorizer_w2v  = vectorize_w2v(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [6]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(type(X_train))

(57332, 300)
(24572, 300)
(57332,)
(24572,)
<class 'numpy.ndarray'>


In [7]:
# save vectorizer

#with open('saved_models/vectorizer/vectorizer_nb_w2v.pkl','wb') as f:
 #   pickle.dump(vectorizer_w2v, f)

joblib.dump(vectorizer_w2v, 'saved_models/vectorizer/vectorizer_w2v.joblib')

['saved_models/vectorizer/vectorizer_w2v.joblib']

## GaussianNB

In [8]:
clf = GaussianNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [9]:
evaluate(y_test, y_pred)

Accuracy: 0.636252645287319
F1 Score: 0.4250611089669367
Recall: 0.7480190174326465
Precision: 0.2968820199478839


Confusion Matrix
       0     1
0  12330  7825
1   1113  3304


Classification Report
              precision    recall  f1-score   support

           0       0.92      0.61      0.73     20155
           1       0.30      0.75      0.43      4417

    accuracy                           0.64     24572
   macro avg       0.61      0.68      0.58     24572
weighted avg       0.81      0.64      0.68     24572



In [10]:
# Model speichern
#with open('saved_models/model_nb_tfidf_mn.pkl','wb') as f:
 #   pickle.dump(clf_tfidf,f)
joblib.dump(clf, 'saved_models/w2v/model_nb_w2v.joblib')

['saved_models/w2v/model_nb_w2v.joblib']

In [13]:
results_list = []

param_grid = {
    'priors' : [None, [.25,.75], [.5,.5]],
    'var_smoothing' : [1.0e-10,1e-9, 0.01, 0.1, 0.25, 0.5, 0.75, 1]
}

nb = GaussianNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = GaussianNB(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    result_dict = {
        'model': 'W2V',
        'priors': params['priors'],
        'var_smoothing': params['var_smoothing'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1,
        'test_recall_1': test_report['1']['recall'],
    }

    results_list.append(result_dict)

    filename = f'saved_models/w2v/model_nb_w2v_{idx}.joblib'
    #with open(filename,'wb') as file:
     #   pickle.dump(model,file)
    joblib.dump(model, filename)

results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_grid_w2v_mixed_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
   model        priors  var_smoothing  train_accuracy  train_recall  \
0    W2V          None   1.000000e-10        0.648399      0.743997   
1    W2V          None   1.000000e-09        0.648399      0.743997   
2    W2V          None   1.000000e-02        0.645887      0.748451   
3    W2V          None   1.000000e-01        0.624608      0.781274   
4    W2V          None   2.500000e-01        0.602055      0.807417   
5    W2V          None   5.000000e-01        0.583671      0.822909   
6    W2V          None   7.500000e-01        0.577949      0.826201   
7    W2V          None   1.000000e+00        0.578944      0.823683   
8    W2V  [0.25, 0.75]   1.000000e-10        0.602805      0.806352   
9    W2V  [0.25, 0.75]   1.000000e-09        0.602805      0.806352   
10   W2V  [0.25, 0.75]   1.000000e-02        0.598357      0.810515   
11   W2V  [0.25, 0.75]   1.000000e-01        0.567519      0.843242   
12   W2V  [0.25,

In [14]:
results_df.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,priors,var_smoothing,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1,test_recall_1
0,W2V,,1e-10,0.648399,0.743997,0.304945,0.432585,0.636253,0.748019,0.296882,0.425061,0.748019
1,W2V,,1e-09,0.648399,0.743997,0.304945,0.432585,0.636253,0.748019,0.296882,0.425061,0.748019
2,W2V,,0.01,0.645887,0.748451,0.303924,0.432302,0.633078,0.753,0.295618,0.42456,0.753
3,W2V,,0.1,0.624608,0.781274,0.295222,0.428518,0.61391,0.783337,0.288574,0.421771,0.783337
17,W2V,"[0.5, 0.5]",1e-09,0.622637,0.78263,0.294216,0.427661,0.609067,0.784695,0.28595,0.419156,0.784695
16,W2V,"[0.5, 0.5]",1e-10,0.622637,0.78263,0.294216,0.427661,0.609067,0.784695,0.28595,0.419156,0.784695
18,W2V,"[0.5, 0.5]",0.01,0.619427,0.787858,0.293067,0.427218,0.606056,0.790355,0.285096,0.419037,0.790355
4,W2V,,0.25,0.602055,0.807417,0.285925,0.422303,0.591161,0.807788,0.279514,0.415318,0.807788
9,W2V,"[0.25, 0.75]",1e-09,0.602805,0.806352,0.286186,0.422441,0.589126,0.804845,0.277973,0.413228,0.804845
8,W2V,"[0.25, 0.75]",1e-10,0.602805,0.806352,0.286186,0.422441,0.589126,0.804845,0.277973,0.413228,0.804845


In [15]:
results_df.sort_values(by=['test_recall_1'],ascending=False)

Unnamed: 0,model,priors,var_smoothing,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1,test_recall_1
15,W2V,"[0.25, 0.75]",1.0,0.447289,0.931352,0.236933,0.377764,0.440217,0.933892,0.234535,0.374915,0.933892
14,W2V,"[0.25, 0.75]",0.75,0.466127,0.918474,0.241669,0.382654,0.457431,0.922345,0.238762,0.37933,0.922345
13,W2V,"[0.25, 0.75]",0.5,0.490442,0.900368,0.248079,0.388982,0.482541,0.906724,0.245585,0.38649,0.906724
23,W2V,"[0.5, 0.5]",1.0,0.498116,0.895043,0.250284,0.391181,0.491779,0.899026,0.247986,0.388742,0.899026
22,W2V,"[0.5, 0.5]",0.75,0.510326,0.887684,0.254088,0.395087,0.502605,0.889065,0.250782,0.391213,0.889065
12,W2V,"[0.25, 0.75]",0.25,0.529582,0.873451,0.260091,0.400826,0.520389,0.880915,0.256832,0.39771,0.880915
21,W2V,"[0.5, 0.5]",0.5,0.528501,0.874419,0.259765,0.400541,0.520674,0.875028,0.256113,0.396248,0.875028
20,W2V,"[0.5, 0.5]",0.25,0.560106,0.848567,0.270327,0.410031,0.548999,0.850351,0.264936,0.404001,0.850351
11,W2V,"[0.25, 0.75]",0.1,0.567519,0.843242,0.273138,0.412622,0.555307,0.846049,0.267234,0.406174,0.846049
6,W2V,,0.75,0.577949,0.826201,0.275836,0.413591,0.569998,0.827485,0.271565,0.408928,0.827485


## Test Dataset

In [16]:
filepath_name_test = (('../../../data/mixed_dataset/test_cleaned.csv'))
df_test = pd.read_csv(filepath_name_test, encoding ='utf-8')

In [17]:
df_test = df_test[df_test['tweet_cleaned'].notna()]

In [18]:
#with open('saved_models/vectorizer/vectorizer_nb_w2v.pkl','rb') as f:
 #   vectorizer_w2v_saved = pickle.load(f)

vectorizer_w2v_saved = joblib.load('saved_models/vectorizer/vectorizer_w2v.joblib')

In [40]:
x_test_tokenized = df_test['tweet_cleaned'].map(word_tokenize)

In [27]:
def w2v_vector(tokenized_tweet, vector_size):
        vec = np.zeros(vector_size).reshape((1, vector_size))
        count = 0
        for word in tokenized_tweet:
            try:
                vec += vectorizer_w2v_saved.wv[word].reshape((1, vector_size))
                count += 1
            except KeyError:

                continue
        if count != 0:
            vec /= count
        return vec

In [23]:
import numpy as np

In [28]:
X_test_w2v = np.zeros((len(x_test_tokenized), 300))
for i in range(len(x_test_tokenized)):
    X_test_w2v[i, :] = w2v_vector(x_test_tokenized.iloc[i], 300)

In [58]:
#X_test, X_empty, y_test, y_empty, vectorizer_w2v  = vectorize_w2v(df=df_test, text_column='tweet_cleaned', 
 #                                                                                label_column="label", test_size=1)

In [29]:
y_test = df_test['label']

In [30]:
print(X_test_w2v.shape)
print(y_test.shape)
print(type(X_test_w2v))

(32466, 300)
(32466,)
<class 'numpy.ndarray'>


# Test bestes F1-Model

In [31]:
# import Model 0
#with open('saved_models/w2v/model_nb_w2v_0.pkl','rb') as f:
 #   clf_w2v_0 = pickle.load(f)
clf_w2v_0 = joblib.load('saved_models/w2v/model_nb_w2v_0.joblib')

In [32]:
y_pred = clf_w2v_0.predict(X_test_w2v)

In [33]:
evaluate(y_test, y_pred)

Accuracy: 0.5861824678124807
F1 Score: 0.4465955431066442
Recall: 0.7926597455768387
Precision: 0.31087280651450855


Confusion Matrix
       0      1
0  13610  12017
1   1418   5421


Classification Report
              precision    recall  f1-score   support

           0       0.91      0.53      0.67     25627
           1       0.31      0.79      0.45      6839

    accuracy                           0.59     32466
   macro avg       0.61      0.66      0.56     32466
weighted avg       0.78      0.59      0.62     32466



In [34]:
# Save Confusionmatrix
confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))

In [35]:
confusion_matrix_df

Unnamed: 0,0,1
0,13610,12017
1,1418,5421


In [36]:
confusion_matrix_df.to_csv("eval_data/nb_w2v_confusion_matrix.csv", index=True)

In [37]:
# Save Classificationreport
classification_report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()

In [38]:
classification_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.905643,0.531081,0.669536,25627.0
1,0.310873,0.79266,0.446596,6839.0
accuracy,0.586182,0.586182,0.586182,0.586182
macro avg,0.608258,0.66187,0.558066,32466.0
weighted avg,0.780354,0.586182,0.622574,32466.0


In [39]:
classification_report_df.to_csv("eval_data/nb_w2v_classification_report.csv", index=True)