In [1]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen

In [2]:
def evaluate(y_test, y_pred):
    """Print a short classification report for one set of predictions.

    Prints accuracy, F1, recall and precision (each computed with the metric
    function's default arguments), followed by the confusion matrix wrapped
    in an unlabeled DataFrame.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples.

    Returns:
        None. The report is only written to stdout.
    """
    scorers = (
        ("Accuracy:", accuracy_score),
        ("F1 Score:", f1_score),
        ("Recall:", recall_score),
        ("Precision:", precision_score),
    )
    # Score everything first so a failing metric leaves no partial report behind.
    scores = [(label, scorer(y_test, y_pred)) for label, scorer in scorers]
    for label, value in scores:
        print(label, value)
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
# Execute the shared helper script so the names it defines become available in this notebook.
# NOTE(review): `vectorize_tfidf`, used further down, is presumably defined there — confirm.
%run ../../functions/vectorize_functions.py

In [4]:
# Read the cleaned training CSV into a DataFrame (relative path, UTF-8 encoded).
filepath_name = '../../../data/twitter_hate-speech/train_cleaned.csv'
df_cleaned = pd.read_csv(
    filepath_name,
    encoding='utf-8',
)

In [5]:
# Build the TF-IDF features and unpack them into train/test parts.
# NOTE(review): `vectorize_tfidf` is not defined in the visible cells; it presumably comes
# from the helper script executed via %run — confirm its split ratio and seed there.
(X_train_tfidf, X_test_tfidf,
 y_train_tfidf, y_test_tfidf) = vectorize_tfidf(
    df=df_cleaned,
    text_column='tweet_cleaned',
    label_column="label",
)

In [6]:
# Sanity check: shapes of the four split parts, then the container type of the features.
for split_part in (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf):
    print(split_part.shape)
print(type(X_train_tfidf))

(13737, 4925)
(5888, 4925)
(13737,)
(5888,)
<class 'scipy.sparse._csr.csr_matrix'>


## Multinomial NB

In [7]:
# Baseline: MultinomialNB with default hyperparameters on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [8]:
# Report the baseline MultinomialNB predictions against the held-out labels.
evaluate(y_test=y_test_tfidf, y_pred=y_pred_tfidf)

Accuracy: 0.9458220108695652
F1 Score: 0.27334851936218685
Recall: 0.16042780748663102
Precision: 0.9230769230769231
      0   1
0  5509   5
1   314  60


In [14]:
# Hyperparameter sweep for MultinomialNB on the TF-IDF features.
# Each candidate produces one row holding
#   * the cross-validated F1 that GridSearchCV computed on the training split only, and
#   * train/test metrics of a model refit on the full training split.
# Choose hyperparameters by the cv_* columns; the test_* columns are for reporting only,
# otherwise the test split leaks into model selection.
results_list = []

# NOTE(review): when `class_prior` is given, scikit-learn uses it as the prior and `fit_prior`
# no longer affects the model, so the True/False candidates for a fixed `class_prior` are
# duplicates (the recorded output shows identical rows for those pairs). The grid is kept
# unchanged so the rows of the exported CSV stay stable.
param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'fit_prior' : [True, False],
    'class_prior' : [None, [.25,.75], [.5,.5]]
}

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

cv_results = grid_search.cv_results_

# Insertion order defines the column order within each split.
metric_functions = {
    'accuracy': accuracy_score,
    'recall': recall_score,
    'precision': precision_score,
    'f1': f1_score,
}

for candidate_index, params in enumerate(cv_results['params']):
    # Refit on the whole training split to measure train/test metrics for this candidate.
    model = MultinomialNB(**params)
    model.fit(X_train_tfidf, y_train_tfidf)

    y_train_pred_tfidf = model.predict(X_train_tfidf)
    y_test_pred_tfidf = model.predict(X_test_tfidf)

    result_dict = {
        'model': 'TF-IDF (Multi)',
        'alpha': params['alpha'],
        'fit_prior': params['fit_prior'],
        'class_prior': params['class_prior'],
    }

    for split_name, y_true, y_hat in (
        ('train', y_train_tfidf, y_train_pred_tfidf),
        ('test', y_test_tfidf, y_test_pred_tfidf),
    ):
        for metric_name, metric_function in metric_functions.items():
            result_dict[f'{split_name}_{metric_name}'] = metric_function(y_true, y_hat)

    # The cv_results_ arrays are aligned with cv_results_['params'], so the same index
    # addresses this candidate's cross-validation scores.
    result_dict['cv_mean_f1'] = cv_results['mean_test_score'][candidate_index]
    result_dict['cv_std_f1'] = cv_results['std_test_score'][candidate_index]
    result_dict['cv_rank'] = cv_results['rank_test_score'][candidate_index]

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

# Create the target folder first so a missing directory cannot discard the finished sweep.
output_path = Path('eval_data/nb_grid_tfidf.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)

print(results_df)


Fitting 3 folds for each of 42 candidates, totalling 126 fits
             model         alpha  fit_prior   class_prior  train_accuracy  \
0   TF-IDF (Multi)  1.000000e-10       True          None        0.975177   
1   TF-IDF (Multi)  1.000000e-10      False          None        0.949989   
2   TF-IDF (Multi)  1.000000e-10       True  [0.25, 0.75]        0.911334   
3   TF-IDF (Multi)  1.000000e-10      False  [0.25, 0.75]        0.911334   
4   TF-IDF (Multi)  1.000000e-10       True    [0.5, 0.5]        0.949989   
5   TF-IDF (Multi)  1.000000e-10      False    [0.5, 0.5]        0.949989   
6   TF-IDF (Multi)  1.000000e-02       True          None        0.974303   
7   TF-IDF (Multi)  1.000000e-02      False          None        0.936667   
8   TF-IDF (Multi)  1.000000e-02       True  [0.25, 0.75]        0.868166   
9   TF-IDF (Multi)  1.000000e-02      False  [0.25, 0.75]        0.868166   
10  TF-IDF (Multi)  1.000000e-02       True    [0.5, 0.5]        0.936667   
11  TF-IDF (Mu

## Complement NB

In [15]:
# Baseline: ComplementNB with default hyperparameters on the same TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
clf_tfidf_comp = ComplementNB().fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [16]:
# Report the baseline ComplementNB predictions against the held-out labels.
evaluate(y_test=y_test_tfidf, y_pred=y_pred_tfidf_comp)

Accuracy: 0.8566576086956522
F1 Score: 0.4089635854341736
Recall: 0.7807486631016043
Precision: 0.27703984819734345
      0    1
0  4752  762
1    82  292


In [17]:
# Hyperparameter sweep for ComplementNB on the TF-IDF features.
# Each candidate produces one row holding
#   * the cross-validated F1 that GridSearchCV computed on the training split only, and
#   * train/test metrics of a model refit on the full training split.
# Choose hyperparameters by the cv_* columns; the test_* columns are for reporting only,
# otherwise the test split leaks into model selection.
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'norm' : [True, False]
}

cnb = ComplementNB()

grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

cnb_cv_results = grid_search.cv_results_

# Insertion order defines the column order within each split.
cnb_metric_functions = {
    'accuracy': accuracy_score,
    'recall': recall_score,
    'precision': precision_score,
    'f1': f1_score,
}

for candidate_index, params in enumerate(cnb_cv_results['params']):
    # Refit on the whole training split to measure train/test metrics for this candidate.
    model = ComplementNB(**params)
    model.fit(X_train_tfidf, y_train_tfidf)

    y_train_pred_tfidf = model.predict(X_train_tfidf)
    y_test_pred_tfidf = model.predict(X_test_tfidf)

    result_dict = {
        'model': 'TF-IDF (Comp)',
        'alpha': params['alpha'],
        'norm': params['norm'],
    }

    for split_name, y_true, y_hat in (
        ('train', y_train_tfidf, y_train_pred_tfidf),
        ('test', y_test_tfidf, y_test_pred_tfidf),
    ):
        for metric_name, metric_function in cnb_metric_functions.items():
            result_dict[f'{split_name}_{metric_name}'] = metric_function(y_true, y_hat)

    # The cv_results_ arrays are aligned with cv_results_['params'], so the same index
    # addresses this candidate's cross-validation scores.
    result_dict['cv_mean_f1'] = cnb_cv_results['mean_test_score'][candidate_index]
    result_dict['cv_std_f1'] = cnb_cv_results['std_test_score'][candidate_index]
    result_dict['cv_rank'] = cnb_cv_results['rank_test_score'][candidate_index]

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

# Create the target folder first so a missing directory cannot discard the finished sweep.
output_path = Path('eval_data/cnb_grid_tfidf.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)

print(results_df)


Fitting 3 folds for each of 14 candidates, totalling 42 fits
            model         alpha   norm  train_accuracy  train_recall  \
0   TF-IDF (Comp)  1.000000e-10   True        0.487151      1.000000   
1   TF-IDF (Comp)  1.000000e-10  False        0.949989      0.979570   
2   TF-IDF (Comp)  1.000000e-02   True        0.735459      1.000000   
3   TF-IDF (Comp)  1.000000e-02  False        0.936667      0.979570   
4   TF-IDF (Comp)  1.000000e-01   True        0.888112      0.979570   
5   TF-IDF (Comp)  1.000000e-01  False        0.911116      0.972043   
6   TF-IDF (Comp)  2.500000e-01   True        0.931644      0.913978   
7   TF-IDF (Comp)  2.500000e-01  False        0.894591      0.949462   
8   TF-IDF (Comp)  5.000000e-01   True        0.950571      0.830108   
9   TF-IDF (Comp)  5.000000e-01  False        0.883308      0.916129   
10  TF-IDF (Comp)  7.500000e-01   True        0.955012      0.759140   
11  TF-IDF (Comp)  7.500000e-01  False        0.882725      0.893548   
12 