# Naive Bayes Complement TF-IDF

In [1]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import joblib
from itertools import product
from sklearn.feature_extraction.text import TfidfVectorizer

#### Evaluations-Funktion definieren

In [2]:
def evaluate(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Precision:", precision)
    print("\n")
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
    print("\n")
    print("Classification Report")
    print(classification_report(y_test, y_pred))

#### Vektorisierungs Funktionen laden

In [24]:
%run ../../functions/vectorize_functions.py

#### Dataset in DF laden (Mixed Datset)

In [3]:
filepath_name = (('../../../data/mixed_dataset/train_cleaned.csv'))
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

#### Vektorisierung vorbereiten: Parameter erstellen und kombinieren

In [13]:
def vectorize_tfidf_param(df, text_column, label_column, analyzer, ngram_range, max_features, norm, max_df=1.0, min_df=1, test_size=0.3, random_state=42):

    df = df[df[text_column].notna()]

    X_base = df[text_column]
    y_base = df[label_column]

    X_train_base, X_test_base, y_train_tfidf, y_test_tfidf = train_test_split(X_base, y_base, test_size=test_size,
                                                                              random_state=random_state)

    tfidf_vectorizer = TfidfVectorizer(analyzer = analyzer, ngram_range=ngram_range, max_df=max_df, min_df=min_df, max_features=max_features, norm = norm)

    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_base)
    X_test_tfidf = tfidf_vectorizer.transform(X_test_base)

    return X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, tfidf_vectorizer

In [14]:
# Parameter zum Testen festlegen
analyzer_param = ['char', 'word']
ngram_range_param = [(1,4), (1,3), (1,1),(2,3),(1,15)]
#max_df_param = [0.51, 0.75, 1.0]
#min_df_param = [0.0, 0.1, 0.25, 0.4]
max_features_param = [None, 10, 50, 100]#, 100]
norm_param = ['l1', 'l2']

In [15]:
#all_combinations = product(analyzer_param, ngram_range_param, max_df_param, min_df_param, max_features_param, norm_param)
all_combinations = product(analyzer_param, ngram_range_param, max_features_param, norm_param)

In [16]:
def used_parameters(analyzer_param, ngram_range_param, max_features_param, norm_param):#max_df_param, min_df_param,
    analyzer_param = analyzer_param
    ngram_range_param = ngram_range_param
    #max_df_param = max_df_param
    #min_df_param = min_df_param
    max_features_param = max_features_param
    norm_param = norm_param

    return  analyzer_param, ngram_range_param, max_features_param, norm_param #max_df_param, min_df_param,

In [17]:
results_list = []

In [18]:
for combination in all_combinations:
    #max_df_used, min_df_used,max_df=max_df_used, min_df=min_df_used, 
    analyzer_used, ngram_range_used, max_features_used, norm_used = used_parameters(*combination)
    X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, vectorizer_tfidf = vectorize_tfidf_param(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label", analyzer=analyzer_used, ngram_range=ngram_range_used, 
                                                                                                      max_features=max_features_used, norm=norm_used)
    
    param_grid = {
    'alpha' : [1.0e-10, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'norm' : [True, False]
    }

    cnb = ComplementNB()

    grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)
    grid_search.fit(X_train_tfidf, y_train_tfidf)

    for idx, params in enumerate(grid_search.cv_results_['params']):
        model = ComplementNB(**params)  
        model.fit(X_train_tfidf, y_train_tfidf)  

        y_train_pred_tfidf = model.predict(X_train_tfidf)

        y_test_pred_tfidf = model.predict(X_test_tfidf)

        train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
        train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
        train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
        train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)

        test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
        test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
        test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
        test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)

        #test_report = classification_report(y_test_tfidf, y_test_pred_tfidf, output_dict=True)

        result_dict = {
            'model': 'TF-IDF (Comp)',
            'alpha': params['alpha'],
            'norm': params['norm'],
            'analyzer': analyzer_used, 
            'ngram_range': ngram_range_used, 
            #'max_df':max_df_used, 
            #'min_df':min_df_used, 
            'max_features':max_features_used, 
            'norm_vectorizer':norm_used,
            'train_accuracy': train_accuracy,
            'train_recall': train_recall,
            'train_precision': train_precision,
            'train_f1': train_f1,
            'test_accuracy': test_accuracy,
            'test_recall': test_recall,
            'test_precision': test_precision,
            'test_f1': test_f1,
            #'test_recall_1': test_report['1']['recall'],
        }
        results_list.append(result_dict)


results_df_comp = pd.DataFrame(results_list)
results_df_comp.to_csv('eval_data/cnb_grid_tfidf_mixed_dataset_final_test.csv', index=False)

print(results_df_comp)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Fitting 3 folds for each

In [19]:
results_df_comp

Unnamed: 0,model,alpha,norm,analyzer,ngram_range,max_features,norm_vectorizer,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1
0,TF-IDF (Comp),1.000000e-10,True,char,"(1, 4)",,l1,0.187958,1.000000,0.181562,0.307326,0.186025,0.999774,0.180865,0.306316
1,TF-IDF (Comp),1.000000e-10,False,char,"(1, 4)",,l1,0.821164,0.836948,0.502179,0.627719,0.756145,0.604709,0.386150,0.471325
2,TF-IDF (Comp),1.000000e-02,True,char,"(1, 4)",,l1,0.851950,0.429802,0.630719,0.511229,0.832736,0.363822,0.552804,0.438831
3,TF-IDF (Comp),1.000000e-02,False,char,"(1, 4)",,l1,0.788390,0.791344,0.450303,0.573987,0.753703,0.707041,0.396270,0.507887
4,TF-IDF (Comp),1.000000e-01,True,char,"(1, 4)",,l1,0.822734,0.021785,0.789474,0.042401,0.822318,0.016980,0.757576,0.033215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,TF-IDF (Comp),5.000000e-01,False,word,"(1, 15)",100.0,l2,0.655062,0.592758,0.282224,0.382386,0.645206,0.581164,0.272072,0.370632
1116,TF-IDF (Comp),7.500000e-01,True,word,"(1, 15)",100.0,l2,0.583950,0.692196,0.256946,0.374774,0.574068,0.680326,0.249191,0.364773
1117,TF-IDF (Comp),7.500000e-01,False,word,"(1, 15)",100.0,l2,0.655166,0.592564,0.282262,0.382381,0.645206,0.581164,0.272072,0.370632
1118,TF-IDF (Comp),1.000000e+00,True,word,"(1, 15)",100.0,l2,0.584630,0.691615,0.257201,0.374961,0.574638,0.679647,0.249356,0.364852


In [21]:
results_df_comp.sort_values(by=['test_f1'],ascending=False)

Unnamed: 0,model,alpha,norm,analyzer,ngram_range,max_features,norm_vectorizer,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1
21,TF-IDF (Comp),0.25,False,char,"(1, 4)",,l2,0.795524,0.745933,0.458490,0.567911,0.765709,0.675345,0.408295,0.508914
3,TF-IDF (Comp),0.01,False,char,"(1, 4)",,l1,0.788390,0.791344,0.450303,0.573987,0.753703,0.707041,0.396270,0.507887
19,TF-IDF (Comp),0.10,False,char,"(1, 4)",,l2,0.782739,0.811290,0.443662,0.573629,0.747558,0.723794,0.390831,0.507581
806,TF-IDF (Comp),0.50,True,word,"(1, 1)",,l2,0.842095,0.684353,0.549568,0.609599,0.796191,0.575957,0.447966,0.503962
17,TF-IDF (Comp),0.01,False,char,"(1, 4)",,l2,0.789001,0.842854,0.453882,0.590029,0.745483,0.716550,0.387535,0.503020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,TF-IDF (Comp),0.75,True,char,"(1, 1)",,l2,0.819664,0.000484,0.238095,0.000966,0.819713,0.000453,0.117647,0.000902
236,TF-IDF (Comp),1.00,True,char,"(1, 1)",,l1,0.819682,0.000097,0.083333,0.000193,0.819998,0.000226,0.125000,0.000452
320,TF-IDF (Comp),1.00,True,char,"(1, 1)",100.0,l1,0.819682,0.000097,0.083333,0.000193,0.819998,0.000226,0.125000,0.000452
318,TF-IDF (Comp),0.75,True,char,"(1, 1)",100.0,l1,0.819717,0.000194,0.166667,0.000387,0.819958,0.000226,0.111111,0.000452


In [22]:
results_df_comp.sort_values(by=['test_recall'],ascending=False)

Unnamed: 0,model,alpha,norm,analyzer,ngram_range,max_features,norm_vectorizer,train_accuracy,train_recall,train_precision,train_f1,test_accuracy,test_recall,test_precision,test_f1
280,TF-IDF (Comp),1.000000e-10,True,char,"(1, 1)",50.0,l1,0.180894,1.000000,0.180279,0.305485,0.180246,1.000000,0.179845,0.304862
294,TF-IDF (Comp),1.000000e-10,True,char,"(1, 1)",50.0,l2,0.180894,1.000000,0.180279,0.305485,0.180246,1.000000,0.179845,0.304862
0,TF-IDF (Comp),1.000000e-10,True,char,"(1, 4)",,l1,0.187958,1.000000,0.181562,0.307326,0.186025,0.999774,0.180865,0.306316
14,TF-IDF (Comp),1.000000e-10,True,char,"(1, 4)",,l2,0.186737,1.000000,0.181339,0.307006,0.185252,0.999774,0.180724,0.306114
224,TF-IDF (Comp),1.000000e-10,True,char,"(1, 1)",,l1,0.181400,1.000000,0.180370,0.305616,0.180490,0.999774,0.179863,0.304878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,TF-IDF (Comp),2.500000e-01,True,char,"(1, 1)",,l1,0.819752,0.000678,0.350000,0.001353,0.819713,0.000453,0.117647,0.000902
318,TF-IDF (Comp),7.500000e-01,True,char,"(1, 1)",100.0,l1,0.819717,0.000194,0.166667,0.000387,0.819958,0.000226,0.111111,0.000452
320,TF-IDF (Comp),1.000000e+00,True,char,"(1, 1)",100.0,l1,0.819682,0.000097,0.083333,0.000193,0.819998,0.000226,0.125000,0.000452
236,TF-IDF (Comp),1.000000e+00,True,char,"(1, 1)",,l1,0.819682,0.000097,0.083333,0.000193,0.819998,0.000226,0.125000,0.000452


# Tests

In [None]:
tfidf_vectorizer_c = TfidfVectorizer(analyzer = 'char', ngram_range=(1,5), max_df=1, min_df=0.0, max_features=100, norm = 'l2') #analyzer = word, norm= 'l1' oder None (l2 Standard)

X_train_c = tfidf_vectorizer_c.fit_transform(X_train_base)
X_test_c = tfidf_vectorizer_c.transform(X_test_base)