In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    matthews_corrcoef, 
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import joblib

In [2]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("Accuracy:", accuracy)
    print("MCC:", mcc)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

## Daten Einladen und TFIDF Vektorisierung

In [3]:
%run ../../functions/vectorize_functions.py

In [4]:
filepath_name = (('../../../data/mixed_dataset/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [5]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, vectorizer_tfidf = vectorize_tfidf(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [6]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)
print(type(X_train_tfidf))

(57332, 13198)
(24572, 13198)
(57332,)
(24572,)
<class 'scipy.sparse._csr.csr_matrix'>


In [7]:
# save vectorizer

joblib.dump(vectorizer_tfidf, 'saved_models/vectorizer/vectorizer_tfidf.joblib')

['saved_models/vectorizer/vectorizer_tfidf.joblib']

## Multinomial NB

In [8]:
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf,y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [9]:
evaluate(y_test_tfidf, y_pred_tfidf)

F1 Score: 0.17809580600278274
Recall: 0.10142630744849446
Precision: 0.7296416938110749
Accuracy: 0.8317190297900049
MCC: 0.22925329625422403


Confusion Matrix
       0    1
0  19989  166
1   3969  448


Classification Report
              precision    recall  f1-score   support

           0       0.83      0.99      0.91     20155
           1       0.73      0.10      0.18      4417

    accuracy                           0.83     24572
   macro avg       0.78      0.55      0.54     24572
weighted avg       0.82      0.83      0.78     24572



In [18]:
# Model speichern
#with open('saved_models/model_nb_tfidf_mn.pkl','wb') as f:
 #   pickle.dump(clf_tfidf,f)
joblib.dump(clf_tfidf, 'saved_models/tfidf_mn/model_tfidf_nm.joblib')

['saved_models/tfidf_mn/model_tfidf_nmjoblib']

In [10]:
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    'fit_prior' : [True, False],
    'class_prior' : [None,[.1,.9], [.25,.75], [.5,.5], [.75,.25],[.9,.1]]
}

nb = MultinomialNB()

grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = MultinomialNB(**params)  
    model.fit(X_train_tfidf, y_train_tfidf)  

    y_train_pred_tfidf = model.predict(X_train_tfidf)

    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)
    train_mcc = matthews_corrcoef(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)
    test_mcc = matthews_corrcoef(y_test_tfidf, y_test_pred_tfidf)

    test_report = classification_report(y_test_tfidf, y_test_pred_tfidf, output_dict=True)

    result_dict = {
        'model': 'TF-IDF (Multi)',
        'alpha': params['alpha'],
        'fit_prior': params['fit_prior'],
        'class_prior': params['class_prior'],
        'train_f1': train_f1,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_accuracy': train_accuracy,
        'train_mcc': train_mcc,
        'test_f1': test_f1,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_accuracy': test_accuracy,
        'test_mcc': test_mcc
    }

    results_list.append(result_dict)

    filename = f'saved_models/nb_mn_tfidf/model_tfidf_mn_{idx}.joblib'
    joblib.dump(model, filename)



results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_mn_tfidf_mixed_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 120 candidates, totalling 360 fits
              model         alpha  fit_prior   class_prior  train_f1  \
0    TF-IDF (Multi)  1.000000e-10       True          None  0.474140   
1    TF-IDF (Multi)  1.000000e-10      False          None  0.596257   
2    TF-IDF (Multi)  1.000000e-10       True    [0.1, 0.9]  0.424579   
3    TF-IDF (Multi)  1.000000e-10      False    [0.1, 0.9]  0.424579   
4    TF-IDF (Multi)  1.000000e-10       True  [0.25, 0.75]  0.467960   
..              ...           ...        ...           ...       ...   
115  TF-IDF (Multi)  1.000000e+00      False    [0.5, 0.5]  0.568847   
116  TF-IDF (Multi)  1.000000e+00       True  [0.75, 0.25]  0.417216   
117  TF-IDF (Multi)  1.000000e+00      False  [0.75, 0.25]  0.417216   
118  TF-IDF (Multi)  1.000000e+00       True    [0.9, 0.1]  0.089803   
119  TF-IDF (Multi)  1.000000e+00      False    [0.9, 0.1]  0.089803   

     train_recall  train_precision  train_accuracy  train_mcc   test_f1 

In [11]:
results_df.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,alpha,fit_prior,class_prior,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
67,TF-IDF (Multi),0.4,False,"[0.5, 0.5]",0.577609,0.849245,0.437631,0.776251,0.491192,0.498135,0.755943,0.371454,0.726192,0.379166
66,TF-IDF (Multi),0.4,True,"[0.5, 0.5]",0.577609,0.849245,0.437631,0.776251,0.491192,0.498135,0.755943,0.371454,0.726192,0.379166
61,TF-IDF (Multi),0.4,False,,0.577609,0.849245,0.437631,0.776251,0.491192,0.498135,0.755943,0.371454,0.726192,0.379166
79,TF-IDF (Multi),0.5,False,"[0.5, 0.5]",0.575637,0.834334,0.439396,0.778396,0.486410,0.497138,0.747113,0.372503,0.728309,0.376905
78,TF-IDF (Multi),0.5,True,"[0.5, 0.5]",0.575637,0.834334,0.439396,0.778396,0.486410,0.497138,0.747113,0.372503,0.728309,0.376905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,TF-IDF (Multi),0.6,True,"[0.9, 0.1]",0.139747,0.075523,0.934132,0.832502,0.238515,0.089917,0.047544,0.826772,0.826998,0.172208
107,TF-IDF (Multi),0.8,False,"[0.9, 0.1]",0.109754,0.058288,0.937695,0.829659,0.209772,0.070716,0.036903,0.844560,0.825655,0.154045
106,TF-IDF (Multi),0.8,True,"[0.9, 0.1]",0.109754,0.058288,0.937695,0.829659,0.209772,0.070716,0.036903,0.844560,0.825655,0.154045
118,TF-IDF (Multi),1.0,True,"[0.9, 0.1]",0.089803,0.047153,0.940154,0.827810,0.188833,0.055629,0.028753,0.852349,0.824516,0.136814


In [12]:
results_df.sort_values(by=['test_recall'],ascending=False)

Unnamed: 0,model,alpha,fit_prior,class_prior,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
98,TF-IDF (Multi),0.8,True,"[0.1, 0.9]",0.327546,0.999225,0.195877,0.260901,0.137867,0.324466,0.998189,0.193717,0.252849,0.128844
99,TF-IDF (Multi),0.8,False,"[0.1, 0.9]",0.327546,0.999225,0.195877,0.260901,0.137867,0.324466,0.998189,0.193717,0.252849,0.128844
63,TF-IDF (Multi),0.4,False,"[0.1, 0.9]",0.332464,0.999710,0.199386,0.276809,0.152970,0.328613,0.998189,0.196681,0.266808,0.142149
62,TF-IDF (Multi),0.4,True,"[0.1, 0.9]",0.332464,0.999710,0.199386,0.276809,0.152970,0.328613,0.998189,0.196681,0.266808,0.142149
111,TF-IDF (Multi),1.0,False,"[0.1, 0.9]",0.326809,0.999032,0.195357,0.258564,0.135383,0.323535,0.997962,0.193062,0.249837,0.125515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,TF-IDF (Multi),0.6,True,"[0.9, 0.1]",0.139747,0.075523,0.934132,0.832502,0.238515,0.089917,0.047544,0.826772,0.826998,0.172208
107,TF-IDF (Multi),0.8,False,"[0.9, 0.1]",0.109754,0.058288,0.937695,0.829659,0.209772,0.070716,0.036903,0.844560,0.825655,0.154045
106,TF-IDF (Multi),0.8,True,"[0.9, 0.1]",0.109754,0.058288,0.937695,0.829659,0.209772,0.070716,0.036903,0.844560,0.825655,0.154045
118,TF-IDF (Multi),1.0,True,"[0.9, 0.1]",0.089803,0.047153,0.940154,0.827810,0.188833,0.055629,0.028753,0.852349,0.824516,0.136814


## Complement NB

In [13]:
clf_tfidf_comp = ComplementNB()
clf_tfidf_comp.fit(X_train_tfidf,y_train_tfidf)
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [14]:
evaluate(y_test_tfidf, y_pred_tfidf_comp)

F1 Score: 0.4908544020811316
Recall: 0.6834955852388499
Precision: 0.38292744799594114
Accuracy: 0.7451163926420316
MCC: 0.36367726279687185


Confusion Matrix
       0     1
0  15290  4865
1   1398  3019


Classification Report
              precision    recall  f1-score   support

           0       0.92      0.76      0.83     20155
           1       0.38      0.68      0.49      4417

    accuracy                           0.75     24572
   macro avg       0.65      0.72      0.66     24572
weighted avg       0.82      0.75      0.77     24572



In [24]:
# Model speichern
#with open('saved_models/tfidf_comp/model_nb_tfidf_comp.pkl','wb') as f:
 #   pickle.dump(clf_tfidf_comp,f)
joblib.dump(clf_tfidf, 'saved_models/nb_comp_tfidf/model_comp_tfidf.joblib')

['saved_models/tfidf_comp/model_tfidf_comp.joblib']

In [15]:
results_list = []

param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    'norm' : [True, False]
}

cnb = ComplementNB()

grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

for idx, params in enumerate(grid_search.cv_results_['params']):
    model = ComplementNB(**params)  
    model.fit(X_train_tfidf, y_train_tfidf)  

    y_train_pred_tfidf = model.predict(X_train_tfidf)

    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)
    train_mcc = matthews_corrcoef(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)
    test_mcc = matthews_corrcoef(y_test_tfidf, y_test_pred_tfidf)

    test_report = classification_report(y_test_tfidf, y_test_pred_tfidf, output_dict=True)

    result_dict = {
        'model': 'TF-IDF (Comp)',
        'alpha': params['alpha'],
        'norm': params['norm'],
        'train_f1': train_f1,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_accuracy': train_accuracy,
        'train_mcc': train_mcc,
        'test_f1': test_f1,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_accuracy': test_accuracy,
        'test_mcc': test_mcc
    }

    results_list.append(result_dict)

    filename = f'saved_models/nb_comp_tfidf/model_nb_tfidf_comp_{idx}.joblib'
    joblib.dump(model, filename)

results_df_comp = pd.DataFrame(results_list)

results_df_comp.to_csv('eval_data/cnb_grid_tfidf_mixed_dataset.csv', index=False)

print(results_df_comp)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
            model         alpha   norm  train_f1  train_recall  \
0   TF-IDF (Comp)  1.000000e-10   True  0.341636      1.000000   
1   TF-IDF (Comp)  1.000000e-10  False  0.596257      0.886909   
2   TF-IDF (Comp)  1.000000e-02   True  0.391176      0.998354   
3   TF-IDF (Comp)  1.000000e-02  False  0.592715      0.886232   
4   TF-IDF (Comp)  5.000000e-02   True  0.446302      0.987607   
5   TF-IDF (Comp)  5.000000e-02  False  0.588338      0.883133   
6   TF-IDF (Comp)  1.000000e-01   True  0.494831      0.963981   
7   TF-IDF (Comp)  1.000000e-01  False  0.584172      0.879454   
8   TF-IDF (Comp)  2.000000e-01   True  0.558761      0.897463   
9   TF-IDF (Comp)  2.000000e-01  False  0.580468      0.870449   
10  TF-IDF (Comp)  4.000000e-01   True  0.610112      0.754163   
11  TF-IDF (Comp)  4.000000e-01  False  0.577609      0.849245   
12  TF-IDF (Comp)  5.000000e-01   True  0.609599      0.684353   
13  TF-IDF (Com

In [16]:
results_df_comp.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,alpha,norm,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
10,TF-IDF (Comp),0.4,True,0.610112,0.754163,0.512266,0.826362,0.519197,0.507594,0.639348,0.420864,0.777023,0.38485
12,TF-IDF (Comp),0.5,True,0.609599,0.684353,0.549568,0.842095,0.516948,0.503962,0.575957,0.447966,0.796191,0.382952
11,TF-IDF (Comp),0.4,False,0.577609,0.849245,0.437631,0.776251,0.491192,0.498135,0.755943,0.371454,0.726192,0.379166
13,TF-IDF (Comp),0.5,False,0.575637,0.834334,0.439396,0.778396,0.48641,0.497138,0.747113,0.372503,0.728309,0.376905
9,TF-IDF (Comp),0.2,False,0.580468,0.870449,0.435414,0.773338,0.498318,0.49641,0.759113,0.368786,0.72314,0.377274
7,TF-IDF (Comp),0.1,False,0.584172,0.879454,0.437334,0.774454,0.504387,0.495812,0.750509,0.370184,0.725623,0.375532
14,TF-IDF (Comp),0.6,True,0.597572,0.619675,0.576992,0.849648,0.505758,0.495452,0.524111,0.469765,0.808115,0.378281
15,TF-IDF (Comp),0.6,False,0.575133,0.822521,0.442149,0.781082,0.484014,0.494466,0.733303,0.372985,0.730466,0.372031
17,TF-IDF (Comp),0.8,False,0.571658,0.792312,0.447134,0.786105,0.475718,0.493843,0.712701,0.37782,0.737384,0.369463
5,TF-IDF (Comp),0.05,False,0.588338,0.883133,0.441097,0.777367,0.510009,0.492462,0.735794,0.370075,0.727373,0.369633


In [17]:
results_df_comp.sort_values(by=['test_recall'],ascending=False)

Unnamed: 0,model,alpha,norm,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
0,TF-IDF (Comp),1e-10,True,0.341636,1.0,0.206008,0.305693,0.177615,0.330597,0.966493,0.199402,0.296435,0.1331
2,TF-IDF (Comp),0.01,True,0.391176,0.998354,0.243241,0.440173,0.276542,0.370666,0.957211,0.229833,0.415717,0.225069
4,TF-IDF (Comp),0.05,True,0.446302,0.987607,0.28829,0.558554,0.357267,0.410251,0.927779,0.263351,0.520511,0.286081
6,TF-IDF (Comp),0.1,True,0.494831,0.963981,0.332843,0.645434,0.414999,0.441901,0.887254,0.294219,0.597143,0.324327
8,TF-IDF (Comp),0.2,True,0.558761,0.897463,0.405663,0.744663,0.47768,0.488026,0.805071,0.350138,0.696362,0.372429
9,TF-IDF (Comp),0.2,False,0.580468,0.870449,0.435414,0.773338,0.498318,0.49641,0.759113,0.368786,0.72314,0.377274
11,TF-IDF (Comp),0.4,False,0.577609,0.849245,0.437631,0.776251,0.491192,0.498135,0.755943,0.371454,0.726192,0.379166
7,TF-IDF (Comp),0.1,False,0.584172,0.879454,0.437334,0.774454,0.504387,0.495812,0.750509,0.370184,0.725623,0.375532
13,TF-IDF (Comp),0.5,False,0.575637,0.834334,0.439396,0.778396,0.48641,0.497138,0.747113,0.372503,0.728309,0.376905
5,TF-IDF (Comp),0.05,False,0.588338,0.883133,0.441097,0.777367,0.510009,0.492462,0.735794,0.370075,0.727373,0.369633


## Test Dataset

In [28]:
filepath_name_test = (('../../../data/mixed_dataset/test_cleaned.csv'))
df_test = pd.read_csv(filepath_name_test, encoding ='utf-8')

In [29]:
df_test = df_test[df_test['tweet_cleaned'].notna()]

In [30]:
#with open('saved_models/vectorizer/vectorizer_nb_tfidf.pkl','rb') as f:
 #   vectorizer_tfidf_saved = pickle.load(f)
vec_tfidf_save = joblib.load('saved_models/vectorizer/vectorizer_tfidf.joblib')

In [31]:
x_test = vec_tfidf_save.transform(df_test['tweet_cleaned'])

In [32]:
print(x_test.shape)

(32466, 13198)


In [33]:
y_test = df_test['label']

In [34]:
y_test

0        0
1        0
2        0
3        0
4        0
        ..
32577    0
32578    0
32579    0
32580    0
32581    1
Name: label, Length: 32466, dtype: int64

# Test bestes F1-Model

In [36]:
# import Comp Model 8 
#with open('saved_models/tfidf_comp/model_nb_tfidf_comp_8.pkl','rb') as f:
 #   clf_comp_8 = pickle.load(f)
clf_comp_8 = joblib.load('saved_models/tfidf_comp/model_nb_tfidf_comp_8.joblib')

In [37]:
y_pred = clf_comp_8.predict(x_test)

In [38]:
evaluate(y_test, y_pred)

Accuracy: 0.7743793507053532
F1 Score: 0.514932785908218
Recall: 0.5685041672759176
Precision: 0.47058823529411764


Confusion Matrix
       0     1
0  21253  4374
1   2951  3888


Classification Report
              precision    recall  f1-score   support

           0       0.88      0.83      0.85     25627
           1       0.47      0.57      0.51      6839

    accuracy                           0.77     32466
   macro avg       0.67      0.70      0.68     32466
weighted avg       0.79      0.77      0.78     32466



In [39]:
# Save Confusionmatrix
confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))

In [40]:
confusion_matrix_df

Unnamed: 0,0,1
0,21253,4374
1,2951,3888


In [52]:
confusion_matrix_df.to_csv("eval_data/nb_comp_tfidf_confusion_matrix.csv", index=True)

In [42]:
# Save Classificationreport
classification_report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()

In [43]:
classification_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.878078,0.829321,0.853003,25627.0
1,0.470588,0.568504,0.514933,6839.0
accuracy,0.774379,0.774379,0.774379,0.774379
macro avg,0.674333,0.698912,0.683968,32466.0
weighted avg,0.79224,0.774379,0.781788,32466.0


In [44]:
classification_report_df.to_csv("eval_data/nb_comp_tfidf_classification_report.csv", index=True)

# Test bestes Recall(1)-Model

In [45]:
clf_comp_0 = joblib.load('saved_models/tfidf_comp/model_nb_tfidf_comp_0.joblib')

In [46]:
y_pred_rec = clf_comp_0.predict(x_test)

In [47]:
evaluate(y_test, y_pred_rec)

Accuracy: 0.2587630136142426
F1 Score: 0.35920649713753156
Recall: 0.9862553004825266
Precision: 0.21959239484307852


Confusion Matrix
      0      1
0  1656  23971
1    94   6745


Classification Report
              precision    recall  f1-score   support

           0       0.95      0.06      0.12     25627
           1       0.22      0.99      0.36      6839

    accuracy                           0.26     32466
   macro avg       0.58      0.53      0.24     32466
weighted avg       0.79      0.26      0.17     32466



In [48]:
# Save Confusionmatrix
confusion_matrix_df_rec = pd.DataFrame(confusion_matrix(y_test, y_pred))

In [49]:
confusion_matrix_df_rec

Unnamed: 0,0,1
0,21253,4374
1,2951,3888


In [51]:
confusion_matrix_df_rec.to_csv("eval_data/nb_comp_tfidf_rec_confusion_matrix.csv", index=True)

In [55]:
# Save Classificationreport
classification_report_df_rec = pd.DataFrame(classification_report(y_test, y_pred_rec, output_dict=True)).transpose()

In [56]:
classification_report_df_rec

Unnamed: 0,precision,recall,f1-score,support
0,0.946286,0.064619,0.120977,25627.0
1,0.219592,0.986255,0.359206,6839.0
accuracy,0.258763,0.258763,0.258763,0.258763
macro avg,0.582939,0.525437,0.240092,32466.0
weighted avg,0.793207,0.258763,0.171161,32466.0


In [57]:
classification_report_df_rec.to_csv("eval_data/nb_comp_tfidf_rec_classification_report.csv", index=True)