In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.naive_bayes import ComplementNB # similar to Multinomial, but reportedly better suited to imbalanced data
from sklearn.naive_bayes import GaussianNB # suited to features in decimal form
from sklearn.naive_bayes import MultinomialNB # ideal for count features such as BoW or TF-IDF https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf

In [2]:
def evaluate(y_test,y_pred):
    """Print the standard classification scores for one set of predictions.

    Accuracy, F1, recall and precision are computed with the scikit-learn
    defaults and printed one per line, followed by the confusion matrix
    wrapped in a DataFrame.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The results are only printed.
    """
    # All scores are computed before anything is printed, so a failing metric
    # leaves no partial output behind.
    scores = {
        "Accuracy:": accuracy_score(y_test, y_pred),
        "F1 Score:": f1_score(y_test, y_pred),
        "Recall:": recall_score(y_test, y_pred),
        "Precision:": precision_score(y_test, y_pred),
    }
    for label, value in scores.items():
        print(label, value)

    matrix_frame = pd.DataFrame(confusion_matrix(y_test, y_pred))
    print(matrix_frame)

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
# Execute the shared helper script inside this kernel so that the names it
# defines become available here. The later call to vectorize_tfidf presumably
# comes from this script -- confirm against vectorize_functions.py.
%run ../../functions/vectorize_functions.py

In [4]:
# Read the cleaned training set (path is relative to this notebook's folder).
filepath_name = '../../../data/new_datasets/train_cleaned.csv'
df_cleaned = pd.read_csv(
    filepath_name,
    encoding='utf-8',
)

In [5]:
# Turn the cleaned text column into TF-IDF features; the helper hands back the
# train/test feature matrices and label vectors unpacked below.
(
    X_train_tfidf,
    X_test_tfidf,
    y_train_tfidf,
    y_test_tfidf,
) = vectorize_tfidf(
    df=df_cleaned,
    text_column='tweet_cleaned',
    label_column="label",
)

In [6]:
# Sanity check: show the shape of every split, then the container type of the
# training feature matrix.
for split_part in (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf):
    print(split_part.shape)
print(type(X_train_tfidf))

(42879, 10992)
(18378, 10992)
(42879,)
(18378,)
<class 'scipy.sparse._csr.csr_matrix'>


## Multinomial NB

In [7]:
# Baseline: MultinomialNB with default hyper-parameters on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [8]:
# Report test-set scores and the confusion matrix for the default MultinomialNB.
evaluate(y_test_tfidf, y_pred_tfidf)

Accuracy: 0.8038959625639351
F1 Score: 0.19804183355585223
Recall: 0.11354937484052054
Precision: 0.7739130434782608
       0    1
0  14329  130
1   3474  445


In [9]:
# Hyper-parameter sweep for MultinomialNB on the TF-IDF features.
#
# Every candidate of the grid is (1) scored with 3-fold cross-validation on the
# training split by GridSearchCV and (2) refitted on the full training split
# and scored on both train and test. The cross-validated F1 is stored next to
# the train/test metrics so that candidates can be ranked without peeking at
# the test set; the test columns should only be used for reporting.
results_list = []

# NOTE(review): when class_prior is given, fit_prior appears to have no effect
# (the paired rows in the results come out identical), so part of this grid is
# duplicated work. The grid is kept as-is so the result table keeps its rows.
param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'fit_prior' : [True, False],
    'class_prior' : [None, [.25,.75], [.5,.5]]
}

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

# Metric name -> scorer; insertion order fixes the column order of the table.
metric_functions = {
    'accuracy': accuracy_score,
    'recall': recall_score,
    'precision': precision_score,
    'f1': f1_score,
}

# Split name -> (features, labels); train columns come before test columns.
eval_splits = {
    'train': (X_train_tfidf, y_train_tfidf),
    'test': (X_test_tfidf, y_test_tfidf),
}

# These arrays are index-aligned with cv_results_['params'].
cv_mean_f1 = grid_search.cv_results_['mean_test_score']
cv_rank = grid_search.cv_results_['rank_test_score']

for candidate_idx, params in enumerate(grid_search.cv_results_['params']):
    model = MultinomialNB(**params)
    model.fit(X_train_tfidf, y_train_tfidf)

    result_dict = {
        'model': 'TF-IDF (Multi)',
        'alpha': params['alpha'],
        'fit_prior': params['fit_prior'],
        'class_prior': params['class_prior'],
    }

    for split_name, (X_split, y_split) in eval_splits.items():
        y_split_pred = model.predict(X_split)
        for metric_name, metric_fn in metric_functions.items():
            result_dict[f'{split_name}_{metric_name}'] = metric_fn(y_split, y_split_pred)

    # Appended last so the positions of the existing columns do not change.
    result_dict['cv_mean_f1'] = cv_mean_f1[candidate_idx]
    result_dict['cv_rank'] = cv_rank[candidate_idx]

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

# Create the output folder on a fresh checkout instead of failing in to_csv.
Path('eval_data').mkdir(parents=True, exist_ok=True)
results_df.to_csv('eval_data/nb_grid_tfidf_new_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 42 candidates, totalling 126 fits
             model         alpha  fit_prior   class_prior  train_accuracy  \
0   TF-IDF (Multi)  1.000000e-10       True          None        0.842044   
1   TF-IDF (Multi)  1.000000e-10      False          None        0.788824   
2   TF-IDF (Multi)  1.000000e-10       True  [0.25, 0.75]        0.551902   
3   TF-IDF (Multi)  1.000000e-10      False  [0.25, 0.75]        0.551902   
4   TF-IDF (Multi)  1.000000e-10       True    [0.5, 0.5]        0.788824   
5   TF-IDF (Multi)  1.000000e-10      False    [0.5, 0.5]        0.788824   
6   TF-IDF (Multi)  1.000000e-02       True          None        0.840994   
7   TF-IDF (Multi)  1.000000e-02      False          None        0.787985   
8   TF-IDF (Multi)  1.000000e-02       True  [0.25, 0.75]        0.535600   
9   TF-IDF (Multi)  1.000000e-02      False  [0.25, 0.75]        0.535600   
10  TF-IDF (Multi)  1.000000e-02       True    [0.5, 0.5]        0.787985   
11  TF-IDF (Mu

## Complement NB

In [10]:
# Baseline: ComplementNB with default hyper-parameters on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_tfidf_comp = ComplementNB().fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [11]:
# Report test-set scores and the confusion matrix for the default ComplementNB.
evaluate(y_test_tfidf, y_pred_tfidf_comp)

Accuracy: 0.7346827728806181
F1 Score: 0.5010233319688908
Recall: 0.6246491451900995
Precision: 0.41824705279343927
       0     1
0  11054  3405
1   1471  2448


In [12]:
# Hyper-parameter sweep for ComplementNB on the TF-IDF features.
#
# Every candidate of the grid is (1) scored with 3-fold cross-validation on the
# training split by GridSearchCV and (2) refitted on the full training split
# and scored on both train and test. The cross-validated F1 is stored next to
# the train/test metrics so that candidates can be ranked without peeking at
# the test set; the test columns should only be used for reporting.
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'norm' : [True, False]
}

cnb = ComplementNB()

grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

# Metric name -> scorer; insertion order fixes the column order of the table.
metric_functions = {
    'accuracy': accuracy_score,
    'recall': recall_score,
    'precision': precision_score,
    'f1': f1_score,
}

# Split name -> (features, labels); train columns come before test columns.
eval_splits = {
    'train': (X_train_tfidf, y_train_tfidf),
    'test': (X_test_tfidf, y_test_tfidf),
}

# These arrays are index-aligned with cv_results_['params'].
cv_mean_f1 = grid_search.cv_results_['mean_test_score']
cv_rank = grid_search.cv_results_['rank_test_score']

for candidate_idx, params in enumerate(grid_search.cv_results_['params']):
    model = ComplementNB(**params)
    model.fit(X_train_tfidf, y_train_tfidf)

    result_dict = {
        'model': 'TF-IDF (Comp)',
        'alpha': params['alpha'],
        'norm': params['norm'],
    }

    for split_name, (X_split, y_split) in eval_splits.items():
        y_split_pred = model.predict(X_split)
        for metric_name, metric_fn in metric_functions.items():
            result_dict[f'{split_name}_{metric_name}'] = metric_fn(y_split, y_split_pred)

    # Appended last so the positions of the existing columns do not change.
    result_dict['cv_mean_f1'] = cv_mean_f1[candidate_idx]
    result_dict['cv_rank'] = cv_rank[candidate_idx]

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

# Create the output folder on a fresh checkout instead of failing in to_csv.
Path('eval_data').mkdir(parents=True, exist_ok=True)
results_df.to_csv('eval_data/cnb_grid_tfidf_new_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 14 candidates, totalling 42 fits
            model         alpha   norm  train_accuracy  train_recall  \
0   TF-IDF (Comp)  1.000000e-10   True        0.320507      0.999891   
1   TF-IDF (Comp)  1.000000e-10  False        0.788824      0.852896   
2   TF-IDF (Comp)  1.000000e-02   True        0.421278      0.997931   
3   TF-IDF (Comp)  1.000000e-02  False        0.787985      0.852134   
4   TF-IDF (Comp)  1.000000e-01   True        0.652650      0.957099   
5   TF-IDF (Comp)  1.000000e-01  False        0.785699      0.844512   
6   TF-IDF (Comp)  2.500000e-01   True        0.778213      0.840592   
7   TF-IDF (Comp)  2.500000e-01  False        0.785256      0.826546   
8   TF-IDF (Comp)  5.000000e-01   True        0.829497      0.663001   
9   TF-IDF (Comp)  5.000000e-01  False        0.787075      0.794425   
10  TF-IDF (Comp)  7.500000e-01   True        0.836330      0.538981   
11  TF-IDF (Comp)  7.500000e-01  False        0.790830      0.757622   
12 

In [26]:
# Save NB TFIDF Comp
#
# Persist the ComplementNB classifier held in clf_tfidf_comp so it can be
# reloaded without retraining. pickle and Path come from the notebook's import
# cell rather than being imported mid-notebook.

model_path = Path('saved_models/model_nb_tfidf_comp.pkl')

# Create the target folder on a fresh checkout instead of failing in open().
model_path.parent.mkdir(parents=True, exist_ok=True)

# save
with open(model_path, 'wb') as f:
    pickle.dump(clf_tfidf_comp, f)

In [29]:
# Test load
# Round-trip check: read the pickled ComplementNB file back into a new object.
# NOTE(review): pickle.load can execute arbitrary code while unpickling -- only
# load files that were produced by this project.
with open('saved_models/model_nb_tfidf_comp.pkl', 'rb') as f:
    clf_test = pickle.load(f)

In [30]:
# Smoke test: confirm the reloaded classifier can predict on the test TF-IDF matrix.
clf_test.predict(X_test_tfidf)

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)