In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

In [2]:
def evaluate(y_test, y_pred):
    """Print headline classification metrics for one set of predictions.

    Accuracy, F1, recall and precision are computed with scikit-learn's
    metric functions using their default arguments (for F1/recall/precision
    that means binary averaging, so binary labels are expected). All four
    scores are computed first and then printed one per line, followed by
    the confusion matrix wrapped in a pandas DataFrame for readable output.

    Parameters
    ----------
    y_test : array-like
        Reference labels, passed as the first argument to each metric.
    y_pred : array-like
        Predicted labels, passed as the second argument to each metric.

    Returns
    -------
    None
        Results are only printed, nothing is returned.
    """
    scorers = (
        ("Accuracy:", accuracy_score),
        ("F1 Score:", f1_score),
        ("Recall:", recall_score),
        ("Precision:", precision_score),
    )
    # Compute every score before printing so output only appears once all
    # metrics succeeded.
    scores = [(label, scorer(y_test, y_pred)) for label, scorer in scorers]
    for label, value in scores:
        print(label, value)

    matrix_frame = pd.DataFrame(confusion_matrix(y_test, y_pred))
    print(matrix_frame)

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
# Execute the project's vectorization helper script inside this notebook so
# that the names it defines become available to the cells below
# (vectorize_w2v is presumably defined there - verify against the script).
%run ../../functions/vectorize_functions.py

In [4]:
# Read the cleaned training CSV into a DataFrame. The path is relative, so it
# resolves against the directory the notebook kernel was started in.
filepath_name = "../../../data/new_datasets/train_cleaned.csv"
df_cleaned = pd.read_csv(filepath_name, encoding="utf-8")

In [5]:
# Vectorize the cleaned tweet text and obtain the train/test features and
# labels from the helper (presumably Word2Vec embeddings, judging by the
# helper's name - confirm in vectorize_functions.py).
X_train, X_test, y_train, y_test = vectorize_w2v(
    df=df_cleaned,
    text_column="tweet_cleaned",
    label_column="label",
)

In [6]:
# Sanity check: show the shape of every split, then the container type of
# the training features.
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)
print(type(X_train))

(42879, 300)
(18378, 300)
(42879,)
(18378,)
<class 'numpy.ndarray'>


## GaussianNB

In [7]:
# Baseline: Gaussian Naive Bayes with default hyperparameters.
# fit() returns the fitted estimator itself, so construction and training
# can be chained into a single expression.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [8]:
# Print accuracy, F1, recall, precision and the confusion matrix for the
# predictions in y_pred against the labels in y_test.
evaluate(y_test, y_pred)

Accuracy: 0.7317988899771466
F1 Score: 0.47669603991931203
Recall: 0.5728502168920643
Precision: 0.4081818181818182
       0     1
0  11204  3255
1   1674  2245


In [9]:
# Hyperparameter study for GaussianNB.
#
# GridSearchCV scores every (priors, var_smoothing) combination with 3-fold
# cross-validated F1 on the training data. Each combination is then refit on
# the full training split to record train and test metrics. The
# cross-validated score is stored alongside them (cv_mean_f1) so that
# hyperparameters can be chosen without looking at the test split; the test_*
# columns are meant for reporting only.


def _split_metrics(prefix, y_true, y_hat):
    """Return accuracy, recall, precision and F1 keyed as '<prefix>_<metric>'."""
    return {
        f'{prefix}_accuracy': accuracy_score(y_true, y_hat),
        f'{prefix}_recall': recall_score(y_true, y_hat),
        f'{prefix}_precision': precision_score(y_true, y_hat),
        f'{prefix}_f1': f1_score(y_true, y_hat),
    }


results_list = []

param_grid = {
    'priors' : [None, [.25,.75], [.5,.5]],
    'var_smoothing' : [1.0e-10,1e-9, 0.01, 0.1, 0.25, 0.5, 0.75, 1]
}

nb = GaussianNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

# cv_results_['params'] and cv_results_['mean_test_score'] are parallel
# sequences with one entry per candidate, so zipping them pairs every
# parameter setting with its mean cross-validated F1.
cv_results = grid_search.cv_results_

for params, cv_mean_f1 in zip(cv_results['params'], cv_results['mean_test_score']):
    # Refit on the whole training split to measure train vs. test behaviour.
    model = GaussianNB(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    result_dict = {
        'model': 'W2V',
        'priors': params['priors'],
        'var_smoothing': params['var_smoothing'],
        **_split_metrics('train', y_train, y_train_pred),
        **_split_metrics('test', y_test, y_test_pred),
        # Appended last so the existing column order is unchanged.
        'cv_mean_f1': cv_mean_f1,
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('nb_grid_w2v_new_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
   model        priors  var_smoothing  train_accuracy  train_recall  \
0    W2V          None   1.000000e-10        0.731244      0.584930   
1    W2V          None   1.000000e-09        0.731244      0.584930   
2    W2V          None   1.000000e-02        0.730964      0.583079   
3    W2V          None   1.000000e-01        0.730941      0.575675   
4    W2V          None   2.500000e-01        0.733972      0.556402   
5    W2V          None   5.000000e-01        0.739360      0.520035   
6    W2V          None   7.500000e-01        0.746916      0.485845   
7    W2V          None   1.000000e+00        0.753073      0.446320   
8    W2V  [0.25, 0.75]   1.000000e-10        0.677954      0.697735   
9    W2V  [0.25, 0.75]   1.000000e-09        0.677954      0.697735   
10   W2V  [0.25, 0.75]   1.000000e-02        0.676158      0.699260   
11   W2V  [0.25, 0.75]   1.000000e-01        0.664241      0.715483   
12   W2V  [0.25,