In [1]:
from itertools import product
from pathlib import Path

import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen

In [2]:
def evaluate(y_test,y_pred):
    """Print accuracy, F1, recall, precision and the confusion matrix of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.

    The scores are computed with scikit-learn's default arguments. Nothing is
    returned; the report is only printed.
    """
    scorers = (
        ("Accuracy:", accuracy_score),
        ("F1 Score:", f1_score),
        ("Recall:", recall_score),
        ("Precision:", precision_score),
    )
    # Compute every score before printing, so a failing metric leaves no partial report.
    report = [(caption, scorer(y_test, y_pred)) for caption, scorer in scorers]
    for caption, value in report:
        print(caption, value)
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))

## Evaluation neue Vectorize-Funktionen (08.12.)

In [3]:
# Execute the shared helper script so that its definitions become available in this notebook.
# NOTE(review): vectorize_w2v, which is called further down, is presumably defined there — confirm.
%run ../../functions/vectorize_functions.py

In [4]:
# Read train_cleaned.csv of the mixed dataset into a DataFrame (path is relative to this notebook).
filepath_name = '../../../data/mixed_dataset/train_cleaned.csv'
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

## GaussianNB

In [5]:
# Candidate values for the vector_size, window and min_count arguments of vectorize_w2v.
# Together they span 5 * 5 * 4 = 100 combinations; no other argument is varied in this grid.
vector_size_param = [10, 50, 100, 200, 300]
window_param = [3, 5, 10, 20, 35]
min_count_param = [1, 3, 5, 10]

In [6]:
# Every (vector_size, window, min_count) combination of the grid.
# itertools.product returns a one-shot iterator; materialising it as a list keeps the
# grid intact when the evaluation cell is run more than once and allows len()/indexing.
all_combinations = list(product(vector_size_param, window_param, min_count_param))

In [7]:
def used_parameters(vector_size=300, window=5, min_count=1):
    """Return the given settings unchanged as a ``(vector_size, window, min_count)`` tuple.

    Any argument that is omitted falls back to its default (300, 5 and 1 respectively).
    """
    return vector_size, window, min_count

In [8]:
# Collects one dict of parameters and metrics per evaluated combination;
# it is converted into a DataFrame once the grid loop has finished.
results_list = []

In [9]:

def _split_scores(prefix, y_true, y_hat):
    """Return accuracy, recall, precision and F1 for one split, keyed '<prefix>_<metric>'."""
    return {
        f'{prefix}_accuracy': accuracy_score(y_true, y_hat),
        f'{prefix}_recall': recall_score(y_true, y_hat),
        f'{prefix}_precision': precision_score(y_true, y_hat),
        f'{prefix}_f1': f1_score(y_true, y_hat),
    }


results_path = Path('eval_data/nb_grid_w2v_parameters_mixed_dataset.csv')
# Create the output folder before the long grid run: to_csv fails on a missing
# directory, which would otherwise only surface after every model has been trained.
results_path.parent.mkdir(parents=True, exist_ok=True)

# Snapshot the grid; an empty grid usually means a one-shot iterator was already consumed
# by a previous run of this cell. Fail loudly instead of writing an empty results file.
param_grid = list(all_combinations)
if not param_grid:
    raise RuntimeError(
        "all_combinations is empty or already exhausted; "
        "re-run the cell that builds it before running this evaluation."
    )

# Start from a fresh list so that re-running this cell does not duplicate rows.
results_list = []

for vector_size_used, window_used, min_count_used in param_grid:
    # Vectorise with the current settings and obtain the train/test split from the helper.
    X_train, X_test, y_train, y_test = vectorize_w2v(
        df=df_cleaned,
        text_column='tweet_cleaned',
        label_column="label",
        vector_size=vector_size_used,
        window=window_used,
        min_count=min_count_used,
    )

    model = GaussianNB()
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Key order defines the column order of the resulting CSV.
    results_list.append({
        'model': 'W2V Param',
        'vector_size': vector_size_used,
        'window': window_used,
        'min_count': min_count_used,
        **_split_scores('train', y_train, y_train_pred),
        **_split_scores('test', y_test, y_test_pred),
    })

results_df = pd.DataFrame(results_list)

results_df.to_csv(results_path, index=False)

print(results_df)


        model  vector_size  window  min_count  train_accuracy  train_recall  \
0   W2V Param           10       3          1        0.809792      0.227343   
1   W2V Param           10       3          3        0.813176      0.158404   
2   W2V Param           10       3          5        0.813263      0.225988   
3   W2V Param           10       3         10        0.808711      0.223567   
4   W2V Param           10       5          1        0.810472      0.226375   
..        ...          ...     ...        ...             ...           ...   
95  W2V Param          300      20         10        0.671039      0.718145   
96  W2V Param          300      35          1        0.701319      0.686193   
97  W2V Param          300      35          3        0.692981      0.688807   
98  W2V Param          300      35          5        0.689249      0.694617   
99  W2V Param          300      35         10        0.681696      0.700813   

    train_precision  train_f1  test_accuracy  test_