In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

In [2]:
def evaluate(y_test,y_pred):
    """Print standard classification metrics and the confusion matrix.

    Accuracy, F1, recall and precision are computed with the scikit-learn
    metric functions called with their default arguments, then printed one
    per line, followed by the confusion matrix wrapped in a DataFrame.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The function only prints.
    """
    # Compute every metric before printing anything, so a failing metric
    # does not leave a half-printed report behind.
    report = [
        ("Accuracy:", accuracy_score(y_test, y_pred)),
        ("F1 Score:", f1_score(y_test, y_pred)),
        ("Recall:", recall_score(y_test, y_pred)),
        ("Precision:", precision_score(y_test, y_pred)),
    ]
    for label, value in report:
        print(label, value)

    matrix = confusion_matrix(y_test, y_pred)
    print(pd.DataFrame(matrix))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
# Run the shared vectorization script so its definitions are available in this kernel.
# NOTE(review): vectorize_tfidf, used in a later cell, is presumably defined by this script — confirm.
%run ../../functions/vectorize_functions.py

In [4]:
# Location of the cleaned training data of the mixed dataset, relative to this notebook.
filepath_name = '../../../data/mixed_dataset/train_cleaned.csv'

# Load the cleaned tweets; the file is read as UTF-8.
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [5]:
# Turn the cleaned tweet text into TF-IDF features and obtain train/test partitions.
# NOTE(review): the split ratio and any random seed are decided inside vectorize_tfidf
# (not visible in this notebook) — confirm there before comparing runs.
(
    X_train_tfidf,
    X_test_tfidf,
    y_train_tfidf,
    y_test_tfidf,
) = vectorize_tfidf(
    df=df_cleaned,
    text_column='tweet_cleaned',
    label_column="label",
)

In [6]:
# Sanity check: one shape per line for the four partitions, then the feature container type.
print(
    *(part.shape for part in (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)),
    sep="\n",
)
print(type(X_train_tfidf))

(57332, 13198)
(24572, 13198)
(57332,)
(24572,)
<class 'scipy.sparse._csr.csr_matrix'>


## Multinomial NB

In [7]:
# Baseline: MultinomialNB with default hyper-parameters on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [8]:
# Print test-split metrics and the confusion matrix for the default MultinomialNB baseline.
evaluate(y_test_tfidf, y_pred_tfidf)

Accuracy: 0.8317190297900049
F1 Score: 0.17809580600278274
Recall: 0.10142630744849446
Precision: 0.7296416938110749
       0    1
0  19989  166
1   3969  448


In [9]:
# Hyper-parameter sweep for MultinomialNB on the TF-IDF features.
#
# For every candidate the table records:
#   * train_* / test_* : metrics of a model refit on the full training split,
#   * cv_*             : the 3-fold cross-validated F1 computed by GridSearchCV.
# The cv_* columns are the ones to use for choosing hyper-parameters; the test_*
# columns are for reporting only, otherwise the test split leaks into selection.
results_list = []

# NOTE(review): per the scikit-learn docs, fit_prior is ignored once class_prior is
# given, and fit_prior=False is a uniform prior, so several grid points coincide
# (the printed results show identical rows). Grid kept as-is so the CSV keeps its rows.
param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'fit_prior' : [True, False],
    'class_prior' : [None, [.25,.75], [.5,.5]]
}

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

# cv_results_ stores one entry per candidate; 'params' and the score arrays share
# the same index, so the CV scores can be attached to each row below.
cv_results = grid_search.cv_results_

for candidate_idx, params in enumerate(cv_results['params']):
    # Refit on the full training split to measure train vs. test behaviour.
    model = MultinomialNB(**params)
    model.fit(X_train_tfidf, y_train_tfidf)

    y_train_pred_tfidf = model.predict(X_train_tfidf)
    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)

    result_dict = {
        'model': 'TF-IDF (Multi)',
        'alpha': params['alpha'],
        'fit_prior': params['fit_prior'],
        'class_prior': params['class_prior'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1,
        # Cross-validation scores (scoring='f1'), appended after the existing
        # columns so the original column order is unchanged.
        'cv_mean_f1': float(cv_results['mean_test_score'][candidate_idx]),
        'cv_std_f1': float(cv_results['std_test_score'][candidate_idx]),
        'cv_rank': int(cv_results['rank_test_score'][candidate_idx]),
    }

    results_list.append(result_dict)

# Build the frame once from the list of records.
results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_grid_tfidf_mixed_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 42 candidates, totalling 126 fits
             model         alpha  fit_prior   class_prior  train_accuracy  \
0   TF-IDF (Multi)  1.000000e-10       True          None        0.867526   
1   TF-IDF (Multi)  1.000000e-10      False          None        0.783629   
2   TF-IDF (Multi)  1.000000e-10       True  [0.25, 0.75]        0.593787   
3   TF-IDF (Multi)  1.000000e-10      False  [0.25, 0.75]        0.593787   
4   TF-IDF (Multi)  1.000000e-10       True    [0.5, 0.5]        0.783629   
5   TF-IDF (Multi)  1.000000e-10      False    [0.5, 0.5]        0.783629   
6   TF-IDF (Multi)  1.000000e-02       True          None        0.866846   
7   TF-IDF (Multi)  1.000000e-02      False          None        0.780594   
8   TF-IDF (Multi)  1.000000e-02       True  [0.25, 0.75]        0.571688   
9   TF-IDF (Multi)  1.000000e-02      False  [0.25, 0.75]        0.571688   
10  TF-IDF (Multi)  1.000000e-02       True    [0.5, 0.5]        0.780594   
11  TF-IDF (Mu

## Complement NB

In [10]:
# Baseline: ComplementNB with default hyper-parameters on the same TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_tfidf_comp = ComplementNB().fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [11]:
# Print test-split metrics and the confusion matrix for the default ComplementNB baseline.
evaluate(y_test_tfidf, y_pred_tfidf_comp)

Accuracy: 0.7451163926420316
F1 Score: 0.4908544020811316
Recall: 0.6834955852388499
Precision: 0.38292744799594114
       0     1
0  15290  4865
1   1398  3019


In [12]:
# Hyper-parameter sweep for ComplementNB on the TF-IDF features.
#
# For every candidate the table records:
#   * train_* / test_* : metrics of a model refit on the full training split,
#   * cv_*             : the 3-fold cross-validated F1 computed by GridSearchCV.
# The cv_* columns are the ones to use for choosing hyper-parameters; the test_*
# columns are for reporting only, otherwise the test split leaks into selection.
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'norm' : [True, False]
}

cnb = ComplementNB()

grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

# cv_results_ stores one entry per candidate; 'params' and the score arrays share
# the same index, so the CV scores can be attached to each row below.
cv_results = grid_search.cv_results_

for candidate_idx, params in enumerate(cv_results['params']):
    # Refit on the full training split to measure train vs. test behaviour.
    model = ComplementNB(**params)
    model.fit(X_train_tfidf, y_train_tfidf)

    y_train_pred_tfidf = model.predict(X_train_tfidf)
    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)

    result_dict = {
        'model': 'TF-IDF (Comp)',
        'alpha': params['alpha'],
        'norm': params['norm'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1,
        # Cross-validation scores (scoring='f1'), appended after the existing
        # columns so the original column order is unchanged.
        'cv_mean_f1': float(cv_results['mean_test_score'][candidate_idx]),
        'cv_std_f1': float(cv_results['std_test_score'][candidate_idx]),
        'cv_rank': int(cv_results['rank_test_score'][candidate_idx]),
    }

    results_list.append(result_dict)

# Build the frame once from the list of records.
results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/cnb_grid_tfidf_mixed_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 14 candidates, totalling 42 fits
            model         alpha   norm  train_accuracy  train_recall  \
0   TF-IDF (Comp)  1.000000e-10   True        0.305693      1.000000   
1   TF-IDF (Comp)  1.000000e-10  False        0.783629      0.886909   
2   TF-IDF (Comp)  1.000000e-02   True        0.440173      0.998354   
3   TF-IDF (Comp)  1.000000e-02  False        0.780594      0.886232   
4   TF-IDF (Comp)  1.000000e-01   True        0.645434      0.963981   
5   TF-IDF (Comp)  1.000000e-01  False        0.774454      0.879454   
6   TF-IDF (Comp)  2.500000e-01   True        0.775082      0.864349   
7   TF-IDF (Comp)  2.500000e-01  False        0.773512      0.865511   
8   TF-IDF (Comp)  5.000000e-01   True        0.842095      0.684353   
9   TF-IDF (Comp)  5.000000e-01  False        0.778396      0.834334   
10  TF-IDF (Comp)  7.500000e-01   True        0.855909      0.545217   
11  TF-IDF (Comp)  7.500000e-01  False        0.785164      0.801123   
12 