# Test Klassifizierung gespeicherte Modell

In [1]:
import joblib

In [2]:
import numpy as np
import pandas as pd
from nltk import word_tokenize

## TF-IDF NB

###### Model und Vectorizer laden

In [59]:
vectorizer_tfidf = joblib.load('joblib_models/vectorizer_tfidf.joblib')

In [60]:
clf_nb_tfidf = joblib.load('joblib_models/model_nb_tfidf_comp.joblib')

In [61]:
clf_nb_tfidf.get_params()

{'alpha': 0.4,
 'class_prior': None,
 'fit_prior': True,
 'force_alpha': 'warn',
 'norm': False}

## Word2Vec NB

###### Model und Vectorizer laden

In [62]:
vectorizer_w2v = joblib.load('joblib_models/vectorizer_w2v_param.joblib')

In [63]:
clf_nb_w2v = joblib.load('joblib_models/model_nb_w2v_param.joblib')

In [64]:
clf_nb_w2v.get_params()

{'priors': None, 'var_smoothing': 0.01}

In [65]:
# tweets = Spalte aus pd Dataframe
# loaded_vectorizer = joblib w2v vectorizer

def vectorize_w2v(tweets, vectorizer_w2v, vector_size=200):
    
    x_tokenized = tweets.map(word_tokenize)    

    def w2v_vector(x_tokenized, vector_size):
            vec = np.zeros(vector_size).reshape((1, vector_size))
            count = 0
            for word in x_tokenized:
                try:
                    vec += vectorizer_w2v.wv[word].reshape((1, vector_size))
                    count += 1
                except KeyError:

                    continue
            if count != 0:
                vec /= count
            return vec
        
    tweets_w2v = np.zeros((len(x_tokenized), 200))
    for i in range(len(x_tokenized)):
        tweets_w2v[i, :] = w2v_vector(x_tokenized.iloc[i], 200)

    return tweets_w2v

## Test mit Beispiel Tweets

In [66]:
tweet_hs = ['i hate jews so much i wish the holocaust actually happened']
tweet_no_hs = ['today i start breaking in a new pair of docs have mercy on my soles']

In [67]:
df_hs = pd.DataFrame(tweet_hs)
df_no_hs = pd.DataFrame(tweet_no_hs)

In [68]:
print(clf_nb_tfidf.predict(vectorizer_tfidf.transform(tweet_hs)))

[1]


In [76]:
print(clf_nb_tfidf.predict_proba(vectorizer_tfidf.transform(tweet_hs)))

[[0.23827261 0.76172739]]


In [71]:
print(clf_nb_tfidf.predict(vectorizer_tfidf.transform(tweet_no_hs)))

[0]


In [75]:
print(clf_nb_tfidf.predict_proba(vectorizer_tfidf.transform(tweet_no_hs)))

[[0.67355329 0.32644671]]


In [72]:
print(clf_nb_w2v.predict(vectorize_w2v(df_hs[0],vectorizer_w2v )))

[1]


In [77]:
print(clf_nb_w2v.predict_proba(vectorize_w2v(df_hs[0],vectorizer_w2v )))

[[0.0040413 0.9959587]]


In [73]:
print(clf_nb_w2v.predict(vectorize_w2v(df_no_hs[0],vectorizer_w2v )))

[0]


In [74]:
print(clf_nb_w2v.predict_proba(vectorize_w2v(df_no_hs[0],vectorizer_w2v )))

[[9.99920836e-01 7.91636976e-05]]


## Test Datensatz

In [78]:
filepath_name_test = (('../../../../data/mixed_dataset/test_cleaned.csv'))
df_test = pd.read_csv(filepath_name_test, encoding ='utf-8')

In [79]:
df_test = df_test[df_test['tweet_cleaned'].notna()]

In [80]:
df_test['clf_tfidf'] = clf_nb_tfidf.predict(vectorizer_tfidf.transform(df_test['tweet_cleaned']))

In [81]:
df_test['prob_tfidf_0'], df_test['prob_tfidf_1'] = clf_nb_tfidf.predict_proba(vectorizer_tfidf.transform(df_test['tweet_cleaned']))[:,0],clf_nb_tfidf.predict_proba(vectorizer_tfidf.transform(df_test['tweet_cleaned']))[:,1]

In [82]:
df_test['clf_w2v'] = clf_nb_w2v.predict(vectorize_w2v(df_test['tweet_cleaned'],vectorizer_w2v ))

In [83]:
df_test['prob_w2v_0'], df_test['prob_w2v_1'] = clf_nb_w2v.predict_proba(vectorize_w2v(df_test['tweet_cleaned'],vectorizer_w2v ))[:,0], clf_nb_w2v.predict_proba(vectorize_w2v(df_test['tweet_cleaned'],vectorizer_w2v ))[:,1]

In [85]:
df_test = df_test.drop(['user_handle', 'hashtags', 'emojis'], axis=1)

In [86]:
df_test

Unnamed: 0.1,Unnamed: 0,label,tweet,tweet_cleaned,clf_tfidf,prob_tfidf_0,prob_tfidf_1,clf_w2v,prob_w2v_0,prob_w2v_1
0,8886,0,@user #cinemaaawards final rehearsals!! geari...,final gear evening butterfly stage hope like,0,0.853726,0.146274,0,0.999974,2.554390e-05
1,27613,0,this was amazing. the weather was not. #musica...,amazing weather musical london weekend …,0,0.989774,0.010226,0,1.000000,1.162569e-17
2,18952,0,child attacked by alligator at walt disney wor...,child attack alligator disney world,0,0.868091,0.131909,0,0.997782,2.218165e-03
3,12485,0,me rn cause school is over soon,right cause school soon,0,0.595383,0.404617,0,0.999382,6.181024e-04
4,5879,0,is it #good to #sleep when ? #good #sleep,good sleep good sleep,0,0.637034,0.362966,0,1.000000,6.932741e-11
...,...,...,...,...,...,...,...,...,...,...
32577,176360,0,Even liberals are humiliated by Charlie Rangel...,even liberal humiliate charlie claim gop back ...,0,0.507514,0.492486,1,0.207755,7.922446e-01
32578,176366,0,@user photoshop my nig,photoshop nig,1,0.358460,0.641540,0,0.987519,1.248107e-02
32579,176384,0,And mad hoes you can ask Beavis I get nothing ...,mad hoe ask beavis get nothing butthead,1,0.342576,0.657424,1,0.452294,5.477059e-01
32580,176385,0,Because child concentration camps were not bad...,child concentration camp bad enough trump admi...,0,0.617181,0.382819,0,0.683339,3.166607e-01


In [56]:
abovepointfive=df_test[df_test["prob_tfidf_11_1"].between(0.1, 0.9)]

In [57]:
abovepointfive

Unnamed: 0.1,Unnamed: 0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis,clf_tfidf,prob_tfidf_0,prob_tfidf_1,clf_w2v,prob_w2v_0,prob_w2v_1,clf_tfidf_11,prob_tfidf_11_0,prob_tfidf_11_1
0,8886,0,@user #cinemaaawards final rehearsals!! geari...,final gear evening butterfly stage hope like,1,"['#cinemaaawards', '#butterflies', '#stage']",,0,0.500004,0.499996,0,0.999974,2.554390e-05,0,0.853726,0.146274
2,18952,0,child attacked by alligator at walt disney wor...,child attack alligator disney world,0,['#waltdisneyworld'],,0,0.500004,0.499996,0,0.997782,2.218165e-03,0,0.868091,0.131909
3,12485,0,me rn cause school is over soon,right cause school soon,0,[],,0,0.500001,0.499999,0,0.999382,6.181024e-04,0,0.595383,0.404617
4,5879,0,is it #good to #sleep when ? #good #sleep,good sleep good sleep,0,"['#good', '#sleep', '#good', '#sleep']",,0,0.500001,0.499999,0,1.000000,6.932741e-11,0,0.637034,0.362966
5,31817,0,hi @user we hear you're coming to mk . where ...,hi hear come,1,[],,0,0.500003,0.499997,0,1.000000,6.548846e-09,0,0.806315,0.193685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32577,176360,0,Even liberals are humiliated by Charlie Rangel...,even liberal humiliate charlie claim gop back ...,0,['#8217'],,0,0.500001,0.499999,1,0.207755,7.922446e-01,0,0.507514,0.492486
32578,176366,0,@user photoshop my nig,photoshop nig,1,[],,1,0.499999,0.500001,0,0.987519,1.248107e-02,1,0.358460,0.641540
32579,176384,0,And mad hoes you can ask Beavis I get nothing ...,mad hoe ask beavis get nothing butthead,0,[],,1,0.499999,0.500001,1,0.452294,5.477059e-01,1,0.342576,0.657424
32580,176385,0,Because child concentration camps were not bad...,child concentration camp bad enough trump admi...,0,[],,0,0.500002,0.499998,0,0.683339,3.166607e-01,0,0.617181,0.382819
