In [26]:
import pandas as pd
import joblib
import numpy as np
from nltk import word_tokenize
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    matthews_corrcoef, 
)

In [16]:
def evaluate(y_test,y_pred):
    """Print a set of classification metrics for the given predictions.

    Writes F1, recall, precision, accuracy and the Matthews correlation
    coefficient to stdout (each computed with scikit-learn's default
    arguments), followed by the confusion matrix rendered as a DataFrame and
    the text classification report. Nothing is returned.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    """
    # All scalar scores are computed before anything is printed.
    scores = {
        "Accuracy:": accuracy_score(y_test, y_pred),
        "F1 Score:": f1_score(y_test, y_pred),
        "Recall:": recall_score(y_test, y_pred),
        "Precision:": precision_score(y_test, y_pred),
        "MCC:": matthews_corrcoef(y_test, y_pred),
    }
    # Display order differs from computation order on purpose.
    for heading in ("F1 Score:", "Recall:", "Precision:", "Accuracy:", "MCC:"):
        print(heading, scores[heading])

    # Each report section: blank separator, title, then the lazily built body.
    sections = (
        ("Confusion Matrix", lambda: pd.DataFrame(confusion_matrix(y_test, y_pred))),
        ("Classification Report", lambda: classification_report(y_test, y_pred)),
    )
    for title, build_body in sections:
        print("\n")
        print(title)
        print(build_body())

In [2]:
# Relative path of the cleaned test split.
filepath_name_test = '../../../data/mixed_dataset/test_cleaned.csv'
# Read the CSV as UTF-8 into the test frame.
df_test = pd.read_csv(filepath_name_test, encoding='utf-8')

In [3]:
# Keep only the rows whose cleaned tweet is not missing.
df_test = df_test.dropna(subset=['tweet_cleaned'])

# TF-IDF

In [6]:
# Load the persisted TF-IDF vectorizer from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
vectorizer_tfidf = joblib.load('used_models_test_dataset/first_dataset/vectorizer_tfidf.joblib')

In [7]:
# Turn the cleaned tweets into features with the loaded vectorizer (transform only, no fitting).
x_test_tfidf = vectorizer_tfidf.transform(df_test['tweet_cleaned'])

In [9]:
# Show the shape of the TF-IDF feature matrix.
print(x_test_tfidf.shape)

(32466, 15658)


In [10]:
# Ground-truth labels, taken from the 'label' column of df_test.
y_test = df_test['label']

In [12]:
# Load the persisted classifier that is applied to the TF-IDF features.
# NOTE(review): the "mn" in the file name presumably stands for multinomial Naive Bayes -- confirm.
clf_tfidf = joblib.load('used_models_test_dataset/first_dataset/model_tfidf_mn.joblib')

In [13]:
# Predict labels for the TF-IDF test features.
y_pred_tfidf = clf_tfidf.predict(x_test_tfidf)

In [17]:
# Print the metrics, confusion matrix and classification report for the TF-IDF predictions.
evaluate(y_test, y_pred_tfidf)

F1 Score: 0.3245067116555417
Recall: 0.323439099283521
Precision: 0.32558139534883723
Accuracy: 0.7163494116922319
MCC: 0.14499428292891944


Confusion Matrix
       0     1
0  21045  4582
1   4627  2212


Classification Report
              precision    recall  f1-score   support

           0       0.82      0.82      0.82     25627
           1       0.33      0.32      0.32      6839

    accuracy                           0.72     32466
   macro avg       0.57      0.57      0.57     32466
weighted avg       0.72      0.72      0.72     32466



In [18]:
# Build the TF-IDF confusion matrix as a DataFrame (it is written to CSV in a later cell).
confusion_matrix_tfidf_df = pd.DataFrame(confusion_matrix(y_test, y_pred_tfidf))

In [19]:
# Display the TF-IDF confusion matrix as the cell output.
confusion_matrix_tfidf_df

Unnamed: 0,0,1
0,21045,4582
1,4627,2212


In [21]:
# Write the TF-IDF confusion matrix to CSV, including the index column.
confusion_matrix_tfidf_df.to_csv("evaluation_test_dataset/nb_mn_tfidf_confusion_matrix.csv", index=True)

In [22]:
# Build the TF-IDF classification report as a DataFrame; after transposing, each
# class / average is one row and the metrics are the columns.
classification_report_tfidf_df = pd.DataFrame(classification_report(y_test, y_pred_tfidf, output_dict=True)).transpose()

In [24]:
# Display the TF-IDF classification report as the cell output.
classification_report_tfidf_df

Unnamed: 0,precision,recall,f1-score,support
0,0.819765,0.821204,0.820484,25627.0
1,0.325581,0.323439,0.324507,6839.0
accuracy,0.716349,0.716349,0.716349,0.716349
macro avg,0.572673,0.572322,0.572495,32466.0
weighted avg,0.715664,0.716349,0.716006,32466.0


In [25]:
# Write the TF-IDF classification report to CSV, including the index column.
classification_report_tfidf_df.to_csv("evaluation_test_dataset/nb_mn_tfidf_classification_report.csv", index=True)

# Word2Vec (W2V)

In [27]:
# Load the persisted Word2Vec vectorizer from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
vectorizer_w2v = joblib.load('used_models_test_dataset/first_dataset/vectorizer_w2v_param.joblib')

In [32]:
def vectorize_w2v(tweets, loaded_vectorizer, vector_size=10):
    """Embed each tweet as the mean of the word vectors of its known tokens.

    Parameters
    ----------
    tweets : pandas Series (must support ``.map`` and ``.iloc``)
        Text of each tweet; every entry is tokenized with ``word_tokenize``.
    loaded_vectorizer
        Object whose ``wv[word]`` lookup returns the vector of ``word``. A
        ``KeyError`` raised by the lookup marks the word as unknown.
    vector_size : int, default 10
        Dimensionality of the word vectors. It has to match the size of the
        vectors returned by ``loaded_vectorizer``; otherwise the reshape
        below raises ``ValueError``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tweets), vector_size)``. A tweet without any
        known token yields an all-zero row.
    """
    x_tokenized = tweets.map(word_tokenize)

    def w2v_vector(tokens):
        # Sum the vectors of all known tokens, then divide by how many were found.
        vec = np.zeros((1, vector_size))
        count = 0
        for word in tokens:
            try:
                vec += loaded_vectorizer.wv[word].reshape((1, vector_size))
                count += 1
            except KeyError:
                # Unknown token: leave it out of the average.
                continue
        if count != 0:
            vec /= count
        return vec

    # The width follows vector_size; it used to be hard-coded to 10, which made
    # the parameter ineffective for any other embedding size.
    tweets_w2v = np.zeros((len(x_tokenized), vector_size))
    for i in range(len(x_tokenized)):
        tweets_w2v[i, :] = w2v_vector(x_tokenized.iloc[i])

    return tweets_w2v

In [33]:
# Averaged Word2Vec features for the cleaned test tweets (vector_size left at its default of 10).
x_test_w2v = vectorize_w2v(df_test['tweet_cleaned'], vectorizer_w2v)

In [29]:
# Load the persisted classifier that is applied to the Word2Vec features.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
clf_w2v = joblib.load('used_models_test_dataset/first_dataset/model_nb_w2v_param.joblib')

In [34]:
# Predict labels for the Word2Vec test features.
y_pred_w2v = clf_w2v.predict(x_test_w2v)

In [35]:
# Print the metrics, confusion matrix and classification report for the Word2Vec predictions.
evaluate(y_test, y_pred_w2v)

F1 Score: 0.29746464185622107
Recall: 0.3336745138178096
Precision: 0.2683443085606773
Accuracy: 0.6679911291812973
MCC: 0.08428650826837117


Confusion Matrix
       0     1
0  19405  6222
1   4557  2282


Classification Report
              precision    recall  f1-score   support

           0       0.81      0.76      0.78     25627
           1       0.27      0.33      0.30      6839

    accuracy                           0.67     32466
   macro avg       0.54      0.55      0.54     32466
weighted avg       0.70      0.67      0.68     32466



In [36]:
# Build the Word2Vec confusion matrix as a DataFrame (it is written to CSV in a later cell).
confusion_matrix_w2v_df = pd.DataFrame(confusion_matrix(y_test, y_pred_w2v))

In [37]:
# Display the Word2Vec confusion matrix as the cell output.
confusion_matrix_w2v_df

Unnamed: 0,0,1
0,19405,6222
1,4557,2282


In [38]:
# Write the Word2Vec confusion matrix to CSV, including the index column.
confusion_matrix_w2v_df.to_csv("evaluation_test_dataset/nb_w2v_confusion_matrix.csv", index=True)

In [39]:
# Build the Word2Vec classification report as a DataFrame; after transposing, each
# class / average is one row and the metrics are the columns.
classification_report_w2v_df = pd.DataFrame(classification_report(y_test, y_pred_w2v, output_dict=True)).transpose()

In [40]:
# Display the Word2Vec classification report as the cell output.
classification_report_w2v_df

Unnamed: 0,precision,recall,f1-score,support
0,0.809824,0.757209,0.782633,25627.0
1,0.268344,0.333675,0.297465,6839.0
accuracy,0.667991,0.667991,0.667991,0.667991
macro avg,0.539084,0.545442,0.540049,32466.0
weighted avg,0.695761,0.667991,0.680432,32466.0


In [41]:
# Write the Word2Vec classification report to CSV, including the index column.
classification_report_w2v_df.to_csv("evaluation_test_dataset/nb_w2v_classification_report.csv", index=True)

# GloVe

In [42]:
# Execute the shared helper script inside this notebook's namespace.
# NOTE(review): vectorize_glove, used in the next cell, is presumably defined by this script -- confirm.
%run ../../functions/vectorize_functions.py

In [57]:
# Vectorize the test tweets with GloVe through the external vectorize_glove helper.
# x_leer / y_leer are not used by any other visible cell ("leer" is German for "empty").
# NOTE(review): the recorded evaluate output for GloVe covers 32465 samples, one fewer
# than the 32466 scored for TF-IDF and Word2Vec -- presumably test_size=1 makes the
# helper hold out a single row, so the three models are not scored on identical data;
# confirm in vectorize_functions.py.
x_test_glv, x_leer, y_test_glv, y_leer, vectorizer_glv = vectorize_glove(df=df_test, test_size=1, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [52]:
clf_w2v = joblib.load('used_models_test_dataset/first_dataset/model_nb_glv.joblib')

In [53]:
y_pred_glv = clf_w2v.predict(x_test_glv)

In [58]:
# Print the metrics, confusion matrix and classification report for the GloVe predictions.
# Uses y_test_glv (returned by vectorize_glove) rather than y_test.
evaluate(y_test_glv, y_pred_glv)

F1 Score: 0.3367551622418879
Recall: 0.41731247258371107
Precision: 0.28226683809712194
Accuracy: 0.6537193901124287
MCC: 0.11810464798154796


Confusion Matrix
       0     1
0  18369  7257
1   3985  2854


Classification Report
              precision    recall  f1-score   support

           0       0.82      0.72      0.77     25626
           1       0.28      0.42      0.34      6839

    accuracy                           0.65     32465
   macro avg       0.55      0.57      0.55     32465
weighted avg       0.71      0.65      0.68     32465



In [None]:
# Build the GloVe confusion matrix as a DataFrame (it is written to CSV in a later cell).
confusion_matrix_glv_df = pd.DataFrame(confusion_matrix(y_test_glv, y_pred_glv))

In [None]:
# Display the GloVe confusion matrix as the cell output.
confusion_matrix_glv_df

In [None]:
# Write the GloVe confusion matrix to CSV, including the index column.
confusion_matrix_glv_df.to_csv("evaluation_test_dataset/nb_glv_confusion_matrix.csv", index=True)

In [None]:
# Build the GloVe classification report as a DataFrame; transposing puts each
# top-level entry of the report dict on its own row.
classification_report_glv_df = pd.DataFrame(classification_report(y_test_glv, y_pred_glv, output_dict=True)).transpose()

In [None]:
# Display the GloVe classification report as the cell output.
classification_report_glv_df

In [None]:
# Write the GloVe classification report to CSV, including the index column.
classification_report_glv_df.to_csv("evaluation_test_dataset/nb_glv_classification_report.csv", index=True)