In [1]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    matthews_corrcoef, 
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import joblib

In [2]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("Accuracy:", accuracy)
    print("MCC:", mcc)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
%run ../../../functions/vectorize_functions.py

In [4]:
filepath_name = (('../../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

## GaussianNB

In [51]:
#df_param = [df_cleaned]
#text_column_param = ['tweet_cleaned']
#label_column_param = ['label']
vector_size_param = [5, 10, 50, 100, 200, 300]
window_param = [1, 3, 5, 10, 30]
#min_count_param = [1, 2, 5, 10]
#test_size_param = [0.3]
#random_state_param = [42]

In [52]:
from itertools import product
#all_combinations = product(df_param, text_column_param, label_column_param, vector_size_param, window_param, min_count_param, test_size_param, 
 #                          random_state_param)
all_combinations = product(vector_size_param, window_param)

In [53]:
def used_parameters(vector_size=300, window=5):
    vector_size_res = vector_size
    window_res = window

    return  vector_size_res, window_res

In [54]:
results_list = []

In [55]:
model_nr = 0
for combination in all_combinations:
    #X_train, X_test, y_train, y_test = vectorize_w2v(*combination)
    vector_size_used, window_used = used_parameters(*combination)
    X_train, X_test, y_train, y_test, vectorizer_w2v_param = vectorize_w2v(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label", vector_size=vector_size_used,
                                                    window=window_used)

    param_grid = {
    'priors' : [None,[.1,.9], [.25,.75], [.5,.5], [.75,.25],[.9,.1]],
    'var_smoothing' : [1.0e-10, 1e-9, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 1]
    }
    
    nb = GaussianNB()  
    
    grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    for idx, params in enumerate(grid_search.cv_results_['params']):
        model = GaussianNB(**params)  
        model.fit(X_train, y_train)  

        y_train_pred = model.predict(X_train)

        y_test_pred = model.predict(X_test)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_recall = recall_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred)
        train_mcc = matthews_corrcoef(y_train, y_train_pred)

        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_recall = recall_score(y_test, y_test_pred)
        test_precision = precision_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred)
        test_mcc = matthews_corrcoef(y_test, y_test_pred)

        result_dict = {
            'model': 'W2V Param',
            'vectorizer': model_nr,
            'model_nr': idx,
            'vector_size': vector_size_used, 
            'window': window_used,
            'priors': params['priors'],
            'var_smoothing': params['var_smoothing'],
            'train_f1': train_f1,
            'train_recall': train_recall,
            'train_precision': train_precision,
            'train_accuracy': train_accuracy,
            'train_mcc': train_mcc,
            'test_f1': test_f1,
            'test_recall': test_recall,
            'test_precision': test_precision,
            'test_accuracy': test_accuracy,
            'test_mcc': test_mcc
        }

        results_list.append(result_dict)

        filename = f'joblib_models/nb_w2v_param/model_nb_w2v_param_{model_nr}_{idx}.joblib'
        joblib.dump(model, filename)

    filename = f'joblib_models/vectorizer/vectorizer_w2v_param_{model_nr}.joblib'
    joblib.dump(vectorizer_w2v_param, filename)

    model_nr = model_nr + 1

results_df = pd.DataFrame(results_list)

results_df.to_csv('eval_data/nb_w2v_param.csv', index=False)

print(results_df)


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 3 folds for each of 66 candidates, totalling 198 fits
Fitting 

In [56]:
results_df.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,vectorizer,model_nr,vector_size,window,priors,var_smoothing,train_f1,train_recall,train_precision,train_accuracy,train_mcc,test_f1,test_recall,test_precision,test_accuracy,test_mcc
468,W2V Param,7,6,10,5,,0.4,0.441524,0.522822,0.382108,0.909728,0.399441,0.464088,0.572207,0.390335,0.919888,0.431503
525,W2V Param,7,63,10,5,"[0.9, 0.1]",0.6,0.445738,0.556017,0.371964,0.905622,0.405951,0.458811,0.599455,0.371622,0.914272,0.429056
459,W2V Param,6,63,10,3,"[0.9, 0.1]",0.6,0.434151,0.495851,0.386107,0.911781,0.390679,0.456100,0.544959,0.392157,0.921209,0.421378
458,W2V Param,6,62,10,3,"[0.9, 0.1]",0.5,0.435754,0.566390,0.354086,0.899887,0.396799,0.454638,0.607629,0.363192,0.911629,0.425971
402,W2V Param,6,6,10,3,,0.4,0.438722,0.477178,0.406002,0.916667,0.395530,0.449761,0.512262,0.400853,0.924017,0.413159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,W2V Param,4,8,5,30,,0.6,0.000000,0.000000,0.000000,0.931747,0.000000,0.000000,0.000000,0.000000,0.939379,0.000000
273,W2V Param,4,9,5,30,,0.8,0.000000,0.000000,0.000000,0.931747,0.000000,0.000000,0.000000,0.000000,0.939379,0.000000
274,W2V Param,4,10,5,30,,1.0,0.000000,0.000000,0.000000,0.931747,0.000000,0.000000,0.000000,0.000000,0.939379,0.000000
327,W2V Param,4,63,5,30,"[0.9, 0.1]",0.6,0.000000,0.000000,0.000000,0.931747,0.000000,0.000000,0.000000,0.000000,0.939379,0.000000
