In [18]:
from sklearn.naive_bayes import MultinomialNB # ideal für counting features wie bow oder tfidf https://towardsdatascience.com/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf
from sklearn.naive_bayes import GaussianNB # für Features in Decimal Form geeignet
from sklearn.naive_bayes import ComplementNB # ähnlich wie Multinomial, soll sich aber besser für imbalanced data eignen
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
)
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

In [None]:
import vectorize_functions

In [7]:
def evaluate(y_test,y_pred):
    """Print accuracy, F1, recall, precision and the confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same samples as ``y_test``.

    Every metric is called with scikit-learn's default arguments. Nothing is
    returned; the results are only printed.
    """
    # All scores are computed before anything is printed, so a failing metric
    # leaves no partial output behind.
    report = [
        ("Accuracy:", accuracy_score(y_test, y_pred)),
        ("F1 Score:", f1_score(y_test, y_pred)),
        ("Recall:", recall_score(y_test, y_pred)),
        ("Precision:", precision_score(y_test, y_pred)),
    ]
    for caption, value in report:
        print(caption, value)
    # Rows/columns follow confusion_matrix's (true, predicted) layout.
    print(pd.DataFrame(confusion_matrix(y_test, y_pred)))

In [3]:
# Load train/test features and labels for each of the four text
# representations used below (BoW, TF-IDF, Word2Vec, FastText).
# NOTE(review): the helpers are called without arguments here, while the later
# "%run ../../functions/vectorize_functions.py" section calls them with
# df/text_column/label_column — confirm which signature is current.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = vectorize_functions.vectorize_bow()
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = vectorize_functions.vectorize_tfidf()
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = vectorize_functions.vectorize_w2v()
X_train_ft, X_test_ft, y_train_ft, y_test_ft = vectorize_functions.vectorize_ft()

In [4]:
# Sanity check: shapes of the BoW splits, then the feature container type.
print(*(part.shape for part in (X_train_bow, X_test_bow, y_train_bow, y_test_bow)), sep="\n")
print(type(X_train_bow))

(13737, 4925)
(5888, 4925)
(13737,)
(5888,)
<class 'scipy.sparse._csr.csr_matrix'>


In [5]:
# Sanity check: shapes of the TF-IDF splits, then the feature container type.
print(*(part.shape for part in (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)), sep="\n")
print(type(X_train_tfidf))

(19354, 6293)
(8295, 6293)
(19354,)
(8295,)
<class 'scipy.sparse._csr.csr_matrix'>


In [6]:
# Sanity check: shapes of the Word2Vec splits, then the feature container type.
print(*(part.shape for part in (X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v)), sep="\n")
print(type(X_train_w2v))

(19354, 300)
(8295, 300)
(19354,)
(8295,)
<class 'numpy.ndarray'>


In [7]:
# Sanity check: shapes of the FastText splits, then the feature container type.
print(*(part.shape for part in (X_train_ft, X_test_ft, y_train_ft, y_test_ft)), sep="\n")
print(type(X_train_ft))

(19354, 300)
(8295, 300)
(19354,)
(8295,)
<class 'numpy.ndarray'>


 #### BoW

In [42]:
# Multinomial naive Bayes on the bag-of-words features.
clf_bow = MultinomialNB().fit(X_train_bow, y_train_bow)  # fit() returns the estimator
y_pred_bow = clf_bow.predict(X_test_bow)

In [43]:
evaluate(y_test_bow, y_pred_bow)

Accuracy: 0.9410488245931284
F1 Score: 0.5852417302798982
Recall: 0.6377079482439926
Precision: 0.5407523510971787
      0    1
0  7461  293
1   196  345


In [49]:
# Complement naive Bayes on the same bag-of-words features.
clf_bow_comp = ComplementNB().fit(X_train_bow, y_train_bow)  # fit() returns the estimator
y_pred_bow_comp = clf_bow_comp.predict(X_test_bow)

In [50]:
evaluate(y_test_bow, y_pred_bow_comp)

Accuracy: 0.8546112115732369
F1 Score: 0.4300567107750472
Recall: 0.8410351201478743
Precision: 0.28888888888888886
      0     1
0  6634  1120
1    86   455


#### TF-IDF

In [46]:
# Multinomial naive Bayes on the TF-IDF features.
clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)  # fit() returns the estimator
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [47]:
evaluate(y_test_tfidf, y_pred_tfidf)

Accuracy: 0.9482820976491863
F1 Score: 0.3529411764705882
Recall: 0.21626617375231053
Precision: 0.9590163934426229
      0    1
0  7749    5
1   424  117


In [51]:
# Complement naive Bayes on the same TF-IDF features.
clf_tfidf_comp = ComplementNB().fit(X_train_tfidf, y_train_tfidf)  # fit() returns the estimator
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [52]:
# Score the TF-IDF predictions against the TF-IDF test labels; the BoW labels
# come from a separate vectorize_bow() call and must not be mixed in here.
evaluate(y_test_tfidf, y_pred_tfidf_comp)

Accuracy: 0.8603978300180832
F1 Score: 0.42157842157842157
Recall: 0.7800369685767098
Precision: 0.2888432580424367
      0     1
0  6715  1039
1   119   422


#### W2V

In [55]:
# Gaussian naive Bayes on the Word2Vec features.
clf_w2v = GaussianNB().fit(X_train_w2v, y_train_w2v)  # fit() returns the estimator
y_pred_w2v = clf_w2v.predict(X_test_w2v)

In [24]:
# NOTE(review): the recorded output of this cell is a NameError from an
# out-of-order run (this cell is In [24]; the cell defining y_pred_w2v is
# In [55]). Re-run it after the fit cell to record the real W2V metrics.
evaluate(y_test_w2v, y_pred_w2v)

NameError: name 'y_pred_w2v' is not defined

#### FastText

In [57]:
# Gaussian naive Bayes on the FastText features.
clf_ft = GaussianNB().fit(X_train_ft, y_train_ft)  # fit() returns the estimator
y_pred_ft = clf_ft.predict(X_test_ft)

In [58]:
evaluate(y_test_ft, y_pred_ft)

Accuracy: 0.747799879445449
F1 Score: 0.29704301075268813
Recall: 0.8170055452865065
Precision: 0.1815195071868583
      0     1
0  5761  1993
1    99   442


## Tests mit modifizierter Vektorisierung

### Word2Vec

#### Vektoren summieren anstelle des Mittelwerts

In [78]:
from nltk import word_tokenize
from gensim.models import Word2Vec
import numpy as np
from gensim.models import FastText
from sklearn.model_selection import train_test_split

def vectorize_w2v_sum(filepath='../../../data/twitter_hate-speech/train_cleaned.csv',
                      size=300, test_size=0.3, random_state=42):
    """Build train/test tweet vectors by summing Word2Vec word vectors.

    Reads the CSV at ``filepath``, drops rows whose ``tweet_cleaned`` is
    missing, splits ``tweet_cleaned``/``label`` with ``train_test_split``,
    tokenizes the tweets with ``word_tokenize`` and trains a Word2Vec model on
    the training tweets only. Each tweet is represented by the sum (not the
    mean) of the vectors of its tokens.

    Args:
        filepath: CSV file with the columns ``tweet_cleaned`` and ``label``.
        size: Dimension of the word vectors and therefore of each tweet vector.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` parts are float
        arrays of shape ``(number of tweets, size)``; the ``y`` parts are the
        label splits returned by ``train_test_split``.
    """
    df = pd.read_csv(filepath, index_col=0)
    df = df[df['tweet_cleaned'].notna()]

    X_train_base, X_test_base, y_train_base_sum, y_test_base_sum = train_test_split(
        df.tweet_cleaned, df.label, test_size=test_size, random_state=random_state)

    X_train_base_tokenized = X_train_base.map(word_tokenize)
    X_test_base_tokenized = X_test_base.map(word_tokenize)

    # vector_size is tied to `size` so the model and the output arrays cannot
    # drift apart.
    w2v = Word2Vec(min_count=1, window=35, vector_size=size, sg=0)
    w2v.build_vocab(X_train_base_tokenized)
    w2v.train(X_train_base_tokenized, total_examples=len(X_train_base_tokenized), epochs=30)

    def w2v_vector(tokenized_tweet):
        """Sum the vectors of all tokens known to the model (zeros if none)."""
        vec = np.zeros(size)
        for word in tokenized_tweet:
            try:
                vec += w2v.wv[word]
            except KeyError:
                # Token is not in the vocabulary built from the training
                # tweets (possible for test tweets): it contributes nothing.
                continue
        return vec

    def embed(tokenized_tweets):
        """Stack one summed vector per tweet, keeping the input order."""
        matrix = np.zeros((len(tokenized_tweets), size))
        for row, tokens in enumerate(tokenized_tweets):
            matrix[row, :] = w2v_vector(tokens)
        return matrix

    X_train_w2v_sum = embed(X_train_base_tokenized)
    X_test_w2v_sum = embed(X_test_base_tokenized)

    return X_train_w2v_sum, X_test_w2v_sum, y_train_base_sum, y_test_base_sum


In [79]:
X_train_w2v_sum, X_test_w2v_sum, y_train_w2v_sum, y_test_w2v_sum = vectorize_w2v_sum()

In [81]:
# Gaussian naive Bayes on the summed Word2Vec tweet vectors.
clf_w2v_sum = GaussianNB().fit(X_train_w2v_sum, y_train_w2v_sum)  # fit() returns the estimator
y_pred_w2v_sum = clf_w2v_sum.predict(X_test_w2v_sum)

In [82]:
evaluate(y_test_w2v_sum, y_pred_w2v_sum)

Accuracy: 0.4412296564195298
F1 Score: 0.17658553917214426
Recall: 0.9186691312384473
Precision: 0.09768081761006289
      0     1
0  3163  4591
1    44   497


Recall höher als in der Durchschnittsvariante, alle anderen Metriken schlechter.

#### Vorhandenes Twitter Wörterbuch verwenden

In [None]:
# Glove 200: https://nlp.stanford.edu/projects/glove/

In [5]:
from nltk import word_tokenize
from gensim.models import Word2Vec
import numpy as np
from gensim.models import FastText
from sklearn.model_selection import train_test_split
import gensim.downloader as api

def vectorize_glv(filepath='../../../data/twitter_hate-speech/train_cleaned.csv',
                  model_name="glove-twitter-200", test_size=0.3, random_state=42):
    """Build train/test tweet vectors by averaging pretrained word vectors.

    Reads the CSV at ``filepath``, drops rows whose ``tweet_cleaned`` is
    missing, splits ``tweet_cleaned``/``label`` with ``train_test_split`` and
    tokenizes the tweets with ``word_tokenize``. Word vectors are loaded
    through ``gensim.downloader`` (nothing is trained here). Each tweet is
    represented by the mean of the vectors of its known tokens.

    Args:
        filepath: CSV file with the columns ``tweet_cleaned`` and ``label``.
        model_name: Name passed to ``gensim.downloader.load``.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` parts are float
        arrays with one row per tweet and as many columns as the loaded
        vectors have dimensions; the ``y`` parts are the label splits returned
        by ``train_test_split``.
    """
    df = pd.read_csv(filepath, index_col=0)
    df = df[df['tweet_cleaned'].notna()]

    X_train_base, X_test_base, y_train_base_glv, y_test_base_glv = train_test_split(
        df.tweet_cleaned, df.label, test_size=test_size, random_state=random_state)

    X_train_base_tokenized = X_train_base.map(word_tokenize)
    X_test_base_tokenized = X_test_base.map(word_tokenize)

    glove = api.load(model_name)
    # Take the dimension from the loaded vectors instead of hard-coding it, so
    # a different model_name cannot silently mismatch the output arrays.
    size = glove.vector_size

    def glove_vector(tokenized_tweet):
        """Average the vectors of all known tokens (zeros if none is known)."""
        vec = np.zeros(size)
        count = 0
        for word in tokenized_tweet:
            try:
                vec += glove[word]
                count += 1
            except KeyError:
                # Token has no pretrained vector: leave it out of the mean.
                continue
        if count != 0:
            vec /= count
        return vec

    def embed(tokenized_tweets):
        """Stack one averaged vector per tweet, keeping the input order."""
        matrix = np.zeros((len(tokenized_tweets), size))
        for row, tokens in enumerate(tokenized_tweets):
            matrix[row, :] = glove_vector(tokens)
        return matrix

    X_train_w2v_glv = embed(X_train_base_tokenized)
    X_test_w2v_glv = embed(X_test_base_tokenized)

    return X_train_w2v_glv, X_test_w2v_glv, y_train_base_glv, y_test_base_glv


In [None]:
X_train_glv, X_test_glv, y_train_glv, y_test_glv = vectorize_glv()

In [7]:
# Gaussian naive Bayes on the averaged GloVe tweet vectors.
clf_glv = GaussianNB().fit(X_train_glv, y_train_glv)  # fit() returns the estimator
y_pred_glv = clf_glv.predict(X_test_glv)

In [8]:
evaluate(y_test_glv, y_pred_glv)

Accuracy: 0.8738111413043478
F1 Score: 0.4117181314330958
Recall: 0.6951871657754011
Precision: 0.2924634420697413
      0    1
0  4885  629
1   114  260


Schlechterer Wert für Recall, alle anderen besser als oben in W2V

### FastText

In [110]:
def vectorize_ft_sum(filepath='../../../data/twitter_hate-speech/train_cleaned.csv',
                     size=300, test_size=0.3, random_state=42):
    """Build train/test tweet vectors by summing FastText word vectors.

    Reads the CSV at ``filepath``, drops rows whose ``tweet_cleaned`` is
    missing, splits ``tweet_cleaned``/``label`` with ``train_test_split``,
    tokenizes the tweets with ``word_tokenize`` and trains a FastText model on
    the training tweets only. Each tweet is represented by the sum (not the
    mean) of the vectors of its tokens.

    Args:
        filepath: CSV file with the columns ``tweet_cleaned`` and ``label``.
        size: Dimension of the word vectors and therefore of each tweet vector.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` parts are float
        arrays of shape ``(number of tweets, size)``; the ``y`` parts are the
        label splits returned by ``train_test_split``.
    """
    df = pd.read_csv(filepath, index_col=0)
    df = df[df['tweet_cleaned'].notna()]

    X_train_base, X_test_base, y_train_base_ft_sum, y_test_base_ft_sum = train_test_split(
        df.tweet_cleaned, df.label, test_size=test_size, random_state=random_state)

    X_train_base_tokenized = X_train_base.map(word_tokenize)
    X_test_base_tokenized = X_test_base.map(word_tokenize)

    # vector_size is tied to `size` so the model and the output arrays cannot
    # drift apart.
    ft = FastText(window=35, min_count=1, vector_size=size)
    ft.build_vocab(corpus_iterable=X_train_base_tokenized)
    ft.train(corpus_iterable=X_train_base_tokenized, total_examples=len(X_train_base_tokenized), epochs=30)

    def ft_vector(tokenized_tweet):
        """Sum the vectors of all tokens the model can embed (zeros if none)."""
        vec = np.zeros(size)
        for word in tokenized_tweet:
            try:
                vec += ft.wv[word]
            except KeyError:
                # NOTE(review): FastText can usually embed unseen words from
                # character n-grams, so this branch may rarely trigger — confirm.
                continue
        return vec

    def embed(tokenized_tweets):
        """Stack one summed vector per tweet, keeping the input order."""
        matrix = np.zeros((len(tokenized_tweets), size))
        for row, tokens in enumerate(tokenized_tweets):
            matrix[row, :] = ft_vector(tokens)
        return matrix

    X_train_ft_sum = embed(X_train_base_tokenized)
    X_test_ft_sum = embed(X_test_base_tokenized)

    return X_train_ft_sum, X_test_ft_sum, y_train_base_ft_sum, y_test_base_ft_sum


In [111]:
X_train_ft_sum, X_test_ft_sum, y_train_ft_sum, y_test_ft_sum = vectorize_ft_sum()

In [112]:
# Gaussian naive Bayes on the summed FastText tweet vectors.
clf_ft_sum = GaussianNB().fit(X_train_ft_sum, y_train_ft_sum)  # fit() returns the estimator
y_pred_ft_sum = clf_ft_sum.predict(X_test_ft_sum)

In [113]:
evaluate(y_test_ft_sum, y_pred_ft_sum)

Accuracy: 0.5057263411693791
F1 Score: 0.19513152728700434
Recall: 0.9186691312384473
Precision: 0.10915879639797936
      0     1
0  3698  4056
1    44   497


Recall besser (0,92 statt 0,82), alle anderen Metriken schlechter als bei der FastText-Variante mit Mittelwert oben.

## Evaluation neue Vectorize-Funktionen (08.12.)

In [2]:
%run ../../functions/vectorize_functions.py

In [3]:
# Cleaned tweet data handed to the vectorize_* helpers in the cells that follow.
filepath_name = '../../../data/twitter_hate-speech/train_cleaned.csv'
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [4]:
# Rebuild the TF-IDF, Word2Vec and GloVe splits with the helpers loaded via
# %run, all from the same df_cleaned frame.
# NOTE: these assignments overwrite the X_*/y_* variables of the same names
# created earlier in the notebook, so every later cell sees the new splits.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = vectorize_tfidf(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = vectorize_w2v(df=df_cleaned, text_column ="tweet_cleaned", label_column="label")
X_train_glv, X_test_glv, y_train_glv, y_test_glv = vectorize_glove(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [5]:
# Sanity check: shapes of the rebuilt TF-IDF splits, then the container type.
print(*(part.shape for part in (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)), sep="\n")
print(type(X_train_tfidf))

(13737, 4925)
(5888, 4925)
(13737,)
(5888,)
<class 'scipy.sparse._csr.csr_matrix'>


In [6]:
# Multinomial naive Bayes on the rebuilt TF-IDF features.
clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)  # fit() returns the estimator
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [9]:
evaluate(y_test_tfidf, y_pred_tfidf)

Accuracy: 0.9458220108695652
F1 Score: 0.27334851936218685
Recall: 0.16042780748663102
Precision: 0.9230769230769231
      0   1
0  5509   5
1   314  60


In [12]:
# Complement naive Bayes on the rebuilt TF-IDF features.
clf_tfidf_comp = ComplementNB().fit(X_train_tfidf, y_train_tfidf)  # fit() returns the estimator
y_pred_tfidf_comp = clf_tfidf_comp.predict(X_test_tfidf)

In [13]:
evaluate(y_test_tfidf, y_pred_tfidf_comp)

Accuracy: 0.8566576086956522
F1 Score: 0.4089635854341736
Recall: 0.7807486631016043
Precision: 0.27703984819734345
      0    1
0  4752  762
1    82  292


In [23]:
# Grid search over ComplementNB hyper-parameters on the TF-IDF features.
#
# GridSearchCV scores every candidate by 3-fold cross-validated F1 on the
# training data. Each candidate is additionally refitted on the full training
# set and scored on the train and test splits. The cross-validation score is
# stored next to those numbers so a candidate can be chosen without looking at
# the test set.
results_list = []

# NOTE(review): alpha=0 means no smoothing; how scikit-learn treats it depends
# on the installed version — confirm this candidate is intended.
param_grid = {
    'alpha' : [0, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
    'norm' : [True, False]
}

cnb = ComplementNB()

grid_search = GridSearchCV(estimator=cnb, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train_tfidf)

# cv_results_['params'] and cv_results_['mean_test_score'] are index-aligned:
# entry i of both describes candidate i.
for params, cv_mean_f1 in zip(grid_search.cv_results_['params'],
                              grid_search.cv_results_['mean_test_score']):
    model = ComplementNB(**params)
    model.fit(X_train_tfidf, y_train_tfidf)

    y_train_pred_tfidf = model.predict(X_train_tfidf)

    y_test_pred_tfidf = model.predict(X_test_tfidf)

    train_accuracy = accuracy_score(y_train_tfidf, y_train_pred_tfidf)
    train_recall = recall_score(y_train_tfidf, y_train_pred_tfidf)
    train_precision = precision_score(y_train_tfidf, y_train_pred_tfidf)
    train_f1 = f1_score(y_train_tfidf, y_train_pred_tfidf)

    test_accuracy = accuracy_score(y_test_tfidf, y_test_pred_tfidf)
    test_recall = recall_score(y_test_tfidf, y_test_pred_tfidf)
    test_precision = precision_score(y_test_tfidf, y_test_pred_tfidf)
    test_f1 = f1_score(y_test_tfidf, y_test_pred_tfidf)

    result_dict = {
        'alpha': params['alpha'],
        'norm': params['norm'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1,
        # Appended last so the existing column order stays unchanged.
        'cv_mean_f1': cv_mean_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('cnb_grid_tfidf.csv', index=False)

print(results_df)

# Candidate with the best cross-validated F1 (chosen on training data only).
print("Best CV params:", grid_search.best_params_, "CV F1:", grid_search.best_score_)


Fitting 3 folds for each of 14 candidates, totalling 42 fits




    alpha   norm  train_accuracy  train_recall  train_precision  train_f1  \
0    0.00   True        0.487151      1.000000         0.116614  0.208871   
1    0.00  False        0.949989      0.979570         0.576947  0.726186   
2    0.01   True        0.735459      1.000000         0.203769  0.338551   
3    0.01  False        0.936667      0.979570         0.517026  0.676820   
4    0.10   True        0.888112      0.979570         0.375051  0.542423   
5    0.10  False        0.911116      0.972043         0.430681  0.596897   
6    0.25   True        0.931644      0.913978         0.497367  0.644183   
7    0.25  False        0.894591      0.949462         0.386602  0.549471   
8    0.50   True        0.950571      0.830108         0.597061  0.694557   
9    0.50  False        0.883308      0.916129         0.358435  0.515271   
10   0.75   True        0.955012      0.759140         0.641818  0.695567   
11   0.75  False        0.882725      0.893548         0.354673  0.507791   