In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    matthews_corrcoef, 
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import joblib

In [2]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("Accuracy:", accuracy)
    print("MCC:", mcc)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
%run ../../functions/vectorize_functions.py

In [4]:
filepath_name = (('../../../data/mixed_dataset/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

## GaussianNB

In [5]:
#df_param = [df_cleaned]
#text_column_param = ['tweet_cleaned']
#label_column_param = ['label']
vector_size_param = [5, 10, 50, 100, 200, 300]
window_param = [1, 3, 5, 10, 30]
#min_count_param = [1, 2, 5, 10]
#test_size_param = [0.3]
#random_state_param = [42]

In [6]:
from itertools import product
#all_combinations = product(df_param, text_column_param, label_column_param, vector_size_param, window_param, min_count_param, test_size_param, 
 #                          random_state_param)
all_combinations = product(vector_size_param, window_param)

In [7]:
def used_parameters(vector_size=300, window=5):
    vector_size_res = vector_size
    window_res = window

    return  vector_size_res, window_res

In [8]:
results_list = []

In [10]:
model_nr = 0
for combination in all_combinations:
    #X_train, X_test, y_train, y_test = vectorize_w2v(*combination)
    vector_size_used, window_used = used_parameters(*combination)
    X_train, X_test, y_train, y_test, vectorizer_w2v_param = vectorize_w2v(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label", vector_size=vector_size_used,
                                                    window=window_used)

    param_grid = {
    'priors' : [None,[.1,.9], [.25,.75], [.5,.5], [.75,.25],[.9,.1]],
    'var_smoothing' : [1.0e-10, 1e-9, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
    }
    
    nb = GaussianNB()  
    
    grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    for idx, params in enumerate(grid_search.cv_results_['params']):
        model = GaussianNB(**params)  
        model.fit(X_train, y_train)  

        y_train_pred = model.predict(X_train)

        y_test_pred = model.predict(X_test)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_recall = recall_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred)
        train_mcc = matthews_corrcoef(y_train, y_train_pred)

        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_recall = recall_score(y_test, y_test_pred)
        test_precision = precision_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred)
        test_mcc = matthews_corrcoef(y_test, y_test_pred)

        result_dict = {
            'model': 'W2V Param',
            'vectorizer': model_nr,
            'model_nr': idx,
            'vector_size': vector_size_used, 
            'window': window_used,
            'priors': params['priors'],
            'var_smoothing': params['var_smoothing'],
            'train_f1': train_f1,
            'train_recall': train_recall,
            'train_precision': train_precision,
            'train_accuracy': train_accuracy,
            'train_mcc': train_mcc,
            'test_f1': test_f1,
            'test_recall': test_recall,
            'test_precision': test_precision,
            'test_accuracy': test_accuracy,
            'test_mcc': test_mcc
        }

        results_list.append(result_dict)

        filename = f'saved_models/nb_w2v_param/model_nb_w2v_param_{model_nr}_{idx}.joblib'
        joblib.dump(model, filename)

    filename = f'saved_models/vectorizer/vectorizer_w2v_param_{model_nr}.joblib'
    joblib.dump(vectorizer_w2v_param, filename)

    model_nr = model_nr + 1

results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_w2v_param_mixed_dataset.csv', index=False)

print(results_df)


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 

In [11]:
results_df.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,vectorizer,model_nr,vector_size,window,priors,var_smoothing,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
1520,W2V Param,23,2,200,30,,1.000000e-02,0.452793,0.666344,0.342900,0.709865,0.310813,0.449742,0.670591,0.338321,0.705030,0.307224
1519,W2V Param,23,1,200,30,,1.000000e-09,0.452799,0.662955,0.343811,0.711348,0.310699,0.449527,0.667421,0.338890,0.706170,0.306795
1518,W2V Param,23,0,200,30,,1.000000e-10,0.452799,0.662955,0.343811,0.711348,0.310699,0.449527,0.667421,0.338890,0.706170,0.306795
1575,W2V Param,23,57,200,30,"[0.9, 0.1]",1.000000e-02,0.453875,0.619771,0.358038,0.731319,0.311600,0.449123,0.620557,0.351907,0.726355,0.305419
1521,W2V Param,23,3,200,30,,5.000000e-02,0.451293,0.674187,0.339162,0.704668,0.309049,0.448472,0.677836,0.335087,0.700309,0.305780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,W2V Param,0,61,5,3,"[0.9, 0.1]",4.000000e-01,0.000194,0.000097,0.333333,0.819839,0.002884,0.000000,0.000000,0.000000,0.820120,-0.005173
262,W2V Param,3,64,5,30,"[0.9, 0.1]",8.000000e-01,0.000580,0.000290,0.230769,0.819734,0.001984,0.000000,0.000000,0.000000,0.819917,-0.008448
263,W2V Param,3,65,5,30,"[0.9, 0.1]",1.000000e+00,0.000194,0.000097,0.166667,0.819787,-0.000359,0.000000,0.000000,0.000000,0.820120,-0.005173
394,W2V Param,5,64,10,3,"[0.9, 0.1]",8.000000e-01,0.000000,0.000000,0.000000,0.819856,0.000000,0.000000,0.000000,0.000000,0.820243,0.000000
