# Test Klassifizierung gespeicherte Modelle

In [6]:
import joblib
import numpy as np
import pandas as pd
from nltk import word_tokenize

### Modelle und Vectorizer importieren

### First Dataset: TF-IDF NB

In [11]:
# Load the persisted vectorizer for the first dataset (later used via .transform on raw text).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code - only load trusted artifacts.
# NOTE(review): the recorded output links to scikit-learn's model-persistence limitations page,
# presumably from a version-mismatch warning - confirm the artifact matches the installed scikit-learn.
vectorizer_tfidf = joblib.load('used_models_test_dataset/first_dataset/vectorizer_tfidf.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [12]:
# Load the persisted classifier for the first dataset; it is applied to vectorizer_tfidf output below.
# NOTE(review): joblib.load unpickles the file - only load artifacts from a trusted source.
clf_nb_tfidf = joblib.load('used_models_test_dataset/first_dataset/model_tfidf_mn.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
# Display the hyperparameters stored in the loaded first-dataset classifier.
clf_nb_tfidf.get_params()

{'alpha': 0.1,
 'class_prior': [0.75, 0.25],
 'fit_prior': False,
 'force_alpha': 'warn'}

### Mixed Dataset: TF-IDF NB

In [14]:
# Load the persisted vectorizer for the mixed dataset (later used via .transform on raw text).
# NOTE(review): joblib.load unpickles the file - only load artifacts from a trusted source.
vectorizer_tfidf_mixed = joblib.load('used_models_test_dataset/mixed_dataset/vectorizer_tfidf.joblib')

In [15]:
# Load the persisted classifier for the mixed dataset; it is applied to vectorizer_tfidf_mixed output below.
# NOTE(review): joblib.load unpickles the file - only load artifacts from a trusted source.
clf_nb_tfidf_mixed = joblib.load('used_models_test_dataset/mixed_dataset/model_nb_tfidf_comp.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [16]:
# Display the hyperparameters stored in the loaded mixed-dataset classifier.
# NOTE(review): the recorded output includes `norm`, which looks like a ComplementNB parameter -
# confirm the model type differs from the first-dataset classifier.
clf_nb_tfidf_mixed.get_params()

{'alpha': 0.4,
 'class_prior': None,
 'fit_prior': True,
 'force_alpha': 'warn',
 'norm': False}

## Test mit Beispiel Tweets

In [17]:
def tweet_check(tweet):
    """Print predicted class and class probabilities from both saved models.

    Args:
        tweet: An iterable of raw text documents (e.g. a one-element list), or a
            single string. A single string is wrapped in a list, because
            scikit-learn text vectorizers expect an iterable of documents and
            reject a bare string.

    Returns:
        None. For each model a header line is printed, followed by one line with
        the result of ``predict`` and the result of ``predict_proba``.
    """
    # Materialise the input once so one-shot iterables can feed both models.
    docs = [tweet] if isinstance(tweet, str) else list(tweet)

    print("First Model TF-IDF:")
    # Vectorise once and reuse the matrix for predict and predict_proba.
    features_first = vectorizer_tfidf.transform(docs)
    print(clf_nb_tfidf.predict(features_first), clf_nb_tfidf.predict_proba(features_first))

    print("Mixed Model TF-IDF:")
    features_mixed = vectorizer_tfidf_mixed.transform(docs)
    print(clf_nb_tfidf_mixed.predict(features_mixed), clf_nb_tfidf_mixed.predict_proba(features_mixed))

In [18]:
tweet_hs = ['i hate jews so much i wish the holocaust actually happened']
tweet_no_hs = ['today i start breaking in a new pair of docs have mercy on my soles']

In [19]:
# Example input with a capitalised name.
# NOTE(review): no later visible cell uses tweet_test; the following call passes the
# lowercase literal ['trump just said'] instead - confirm whether this variable is still needed.
tweet_test = ['Trump just said']

In [20]:
tweet_check(tweet_hs)

First Model TF-IDF:
[1] [[0.24003358 0.75996642]]
Mixed Model TF-IDF:
[1] [[0.27304127 0.72695873]]


In [21]:
tweet_check(tweet_no_hs)

First Model TF-IDF:
[0] [[0.92781014 0.07218986]]
Mixed Model TF-IDF:
[0] [[0.72215419 0.27784581]]


In [22]:
tweet_check(['trump just said'])

First Model TF-IDF:
[1] [[0.28601363 0.71398637]]
Mixed Model TF-IDF:
[1] [[0.31681255 0.68318745]]


In [23]:
tweet_check(['obama just said'])

First Model TF-IDF:
[1] [[0.23512964 0.76487036]]
Mixed Model TF-IDF:
[1] [[0.36809884 0.63190116]]


In [24]:
tweet_check(['merkel just said'])

First Model TF-IDF:
[1] [[0.09087912 0.90912088]]
Mixed Model TF-IDF:
[1] [[0.39250015 0.60749985]]


In [25]:
tweet_check(['you are stupid'])

First Model TF-IDF:
[0] [[0.54075847 0.45924153]]
Mixed Model TF-IDF:
[1] [[0.31127967 0.68872033]]


In [26]:
tweet_check(['you are dumb'])

First Model TF-IDF:
[1] [[0.31607922 0.68392078]]
Mixed Model TF-IDF:
[1] [[0.28330677 0.71669323]]


## Test Datensatz

In [27]:
# Relative path to the cleaned test split of the mixed dataset.
filepath_name_test = '../../../data/mixed_dataset/test_cleaned.csv'
# Read the CSV as UTF-8 into the frame that all later cells extend and filter.
df_test = pd.read_csv(filepath_name_test, encoding='utf-8')

In [28]:
# Keep only rows that have cleaned text; the vectorizers and .str.contains below need non-null strings.
df_test = df_test.dropna(subset=['tweet_cleaned'])

In [29]:
# Store the first-dataset model's predicted label for every cleaned test tweet in a new column.
df_test['clf_tfidf'] = clf_nb_tfidf.predict(vectorizer_tfidf.transform(df_test['tweet_cleaned']))

In [30]:
# Class probabilities from the first-dataset model.
# The original vectorised the whole test set and called predict_proba twice (once per
# column); compute the probability matrix once and split it into its two columns.
# assumes the classifier's classes are ordered [0, 1], so column 0 belongs to label 0 - TODO confirm
proba_tfidf = clf_nb_tfidf.predict_proba(vectorizer_tfidf.transform(df_test['tweet_cleaned']))
df_test['prob_tfidf_0'], df_test['prob_tfidf_1'] = proba_tfidf[:, 0], proba_tfidf[:, 1]

In [31]:
# Store the mixed-dataset model's predicted label for every cleaned test tweet in a new column.
df_test['clf_tfidf_mixed'] = clf_nb_tfidf_mixed.predict(vectorizer_tfidf_mixed.transform(df_test['tweet_cleaned']))

In [32]:
# Class probabilities from the mixed-dataset model.
# The original vectorised the whole test set and called predict_proba twice (once per
# column); compute the probability matrix once and split it into its two columns.
# assumes the classifier's classes are ordered [0, 1], so column 0 belongs to label 0 - TODO confirm
proba_tfidf_mixed = clf_nb_tfidf_mixed.predict_proba(vectorizer_tfidf_mixed.transform(df_test['tweet_cleaned']))
df_test['prob_tfidf_mixed_0'], df_test['prob_tfidf_mixed_1'] = proba_tfidf_mixed[:, 0], proba_tfidf_mixed[:, 1]

In [33]:
# Remove columns that are not needed for comparing the two classifiers.
# Note: re-running this cell without reloading df_test raises KeyError, since the columns are already gone.
df_test = df_test.drop(columns=['user_handle', 'hashtags', 'emojis'])

In [34]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,label,tweet,tweet_cleaned,clf_tfidf,prob_tfidf_0,prob_tfidf_1,clf_tfidf_mixed,prob_tfidf_mixed_0,prob_tfidf_mixed_1
0,8886,0,@user #cinemaaawards final rehearsals!! geari...,final gear evening butterfly stage hope like,0,0.986428,0.013572,0,0.83611,0.16389
1,27613,0,this was amazing. the weather was not. #musica...,amazing weather musical london weekend …,0,0.999683,0.000317,0,0.99143,0.00857
2,18952,0,child attacked by alligator at walt disney wor...,child attack alligator disney world,0,0.967843,0.032157,0,0.825607,0.174393
3,12485,0,me rn cause school is over soon,right cause school soon,0,0.880998,0.119002,0,0.59216,0.40784
4,5879,0,is it #good to #sleep when ? #good #sleep,good sleep good sleep,0,0.974112,0.025888,0,0.649481,0.350519


In [35]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

In [36]:
# Widen text columns so whole tweets are shown instead of being cut off with an ellipsis.
pd.set_option('display.max_colwidth', 320)

## Vergleich Trump vs Obama

In [57]:
# Rows whose cleaned text contains 'trump'. str.contains matches the pattern anywhere in the
# string (pandas treats it as a regex by default), so tokens such as 'trumps' are included too.
df_trump = df_test[df_test["tweet_cleaned"].str.contains('trump')]

In [55]:
df_trump.head(100)

Unnamed: 0.1,Unnamed: 0,label,tweet,tweet_cleaned,clf_tfidf,prob_tfidf_0,prob_tfidf_1,clf_tfidf_mixed,prob_tfidf_mixed_0,prob_tfidf_mixed_1
98,23365,1,@user @user #trump = #biher non stop election,trump = non stop election,1,0.165814,0.834186,1,0.354326,0.645674
196,28295,1,"@user #newyear 'wish list' of cretin #carlpaladino,election co-chairman of #presidentelect #trump on mr&amp;mrs…",wish list co - trump …,1,0.418244,0.581756,1,0.399596,0.600404
225,29700,0,"trump: if elected, i'll ban immigration from areas with terrorism ties.... i guess only people from antaica will be permitted.",trump elect ill ban immigration area terrorism tie guess people permit,1,0.449035,0.550965,1,0.458372,0.541628
270,9895,1,"#buffalo, #newyork : #trump ally carl #paladino slammed for wish list for #obamas. what a cockwomble #usa",buffalo trump ally paladino slam wish list obama usa,1,0.01455,0.98545,1,0.224583,0.775417
351,31652,0,"@user snookums, don't be mad that bernie-chan isn't as good as trump at campaigning. #pathetic",mad bernie - chan good trump campaign pathetic,0,0.820068,0.179932,0,0.551387,0.448613
383,8804,1,"carl paladino, trump ally, wishes obama dead of mad cow disease in ’17",paladino trump ally wish obama dead mad cow disease ',1,0.011652,0.988348,1,0.072151,0.927849
513,12586,1,trumps associates are a classy bunch.,trumps associate classy bunch,1,0.203483,0.796517,1,0.317543,0.682457
541,15633,1,#trump's allies r unstoppable! #obama via @user #christmaseve,trump ally r unstoppable obama via,1,0.096012,0.903988,1,0.256092,0.743908
598,25515,1,@user #gop #cons who voted for #nazi #fascist #dictator #trump are just scared little bullies who…,gop con vote nazi fascist dictator trump scared little bully …,1,0.01868,0.98132,1,0.20079,0.79921
647,1393,1,@user buffalo school districts boots trump ally over racist comments. #birdsofafeather #resist :,buffalo school district boot trump ally racist comment resist,1,0.033777,0.966223,1,0.300199,0.699801


In [69]:
# Count predicted labels (1 = hate speech, 0 = none) per classifier among the 'trump' tweets.
# value_counts() is computed once per prediction column; .get(label, 0) yields 0 if a label never occurs.
counts_trump_first = df_trump["clf_tfidf"].value_counts()
counts_trump_mixed = df_trump["clf_tfidf_mixed"].value_counts()

count_trump_hs = counts_trump_first.get(1, 0)
print("First Classifier detected Hate Speech:", count_trump_hs)
count_trump_no_hs = counts_trump_first.get(0, 0)
print("First Classifier detected None Hate Speech:", count_trump_no_hs)

# Label fix: these two lines report the mixed-dataset classifier and were mislabelled "First Classifier".
count_trump_hs_mixed = counts_trump_mixed.get(1, 0)
print("Mixed Classifier detected Hate Speech:", count_trump_hs_mixed)
count_trump_no_hs_mixed = counts_trump_mixed.get(0, 0)
print("Mixed Classifier detected None Hate Speech:", count_trump_no_hs_mixed)

First Classifier detected Hate Speech: 196
First Classifier detected None Hate Speech: 193
First Classifier detected Hate Speech: 218
First Classifier detected None Hate Speech: 171


In [61]:
# Rows whose cleaned text contains 'obama'. str.contains matches the pattern anywhere in the
# string (pandas treats it as a regex by default), so longer tokens containing it are included too.
df_obama = df_test[df_test["tweet_cleaned"].str.contains('obama')]

In [62]:
df_obama

Unnamed: 0.1,Unnamed: 0,label,tweet,tweet_cleaned,clf_tfidf,prob_tfidf_0,prob_tfidf_1,clf_tfidf_mixed,prob_tfidf_mixed_0,prob_tfidf_mixed_1
170,3479,1,@user #obama suppoing #communism+#islam which is another as deadly ideology isn't 'bout #race-only suppoing #nazi=…,obama another deadly ideology bout race - …,1,0.03604,0.96396,1,0.294207,0.705793
270,9895,1,"#buffalo, #newyork : #trump ally carl #paladino slammed for wish list for #obamas. what a cockwomble #usa",buffalo trump ally paladino slam wish list obama usa,1,0.01455,0.98545,1,0.224583,0.775417
383,8804,1,"carl paladino, trump ally, wishes obama dead of mad cow disease in ’17",paladino trump ally wish obama dead mad cow disease ',1,0.011652,0.988348,1,0.072151,0.927849
541,15633,1,#trump's allies r unstoppable! #obama via @user #christmaseve,trump ally r unstoppable obama via,1,0.096012,0.903988,1,0.256092,0.743908
856,10756,1,@user barack #obama gets real about the he #faced in office via @user,barack obama get real face office via,1,0.355402,0.644598,1,0.477182,0.522818
866,14145,0,"@user @user make #smokejerky of these 53% sodom&amp;gomorrah obama governmentassistance lazy azzes. not all, but there's work",make obama lazy work,0,0.882071,0.117929,0,0.524647,0.475353
1294,115,1,@user why not @user mocked obama for being black. @user @user @user @user #brexit,mock obama black brexit,1,0.069381,0.930619,0,0.60016,0.39984
1453,10868,0,@user obama is utilizing a terrorist attack that killed 50 americans to push his own agenda.,obama terrorist attack kill americans push agenda,1,0.235361,0.764639,1,0.325465,0.674535
1493,27976,0,thanks to some random software download going on in the set top box i am missing the tonight show starring jimmy fallon!#why #obama,thank random software download go set top box miss tonight show star jimmy obama,0,0.987181,0.012819,0,0.908916,0.091084
1986,26016,1,"""new"" paladino loving eggs like this shit head are out bitching about obama. block early, block often. #spam",""" new paladino love egg like shit head bitch obama block early block often spam",0,0.780869,0.219131,0,0.538181,0.461819


In [71]:
# Count predicted labels (1 = hate speech, 0 = none) per classifier among the 'obama' tweets.
# value_counts() is computed once per prediction column; .get(label, 0) yields 0 if a label never occurs.
counts_obama_first = df_obama["clf_tfidf"].value_counts()
counts_obama_mixed = df_obama["clf_tfidf_mixed"].value_counts()

count_obama_hs = counts_obama_first.get(1, 0)
print("First Classifier detected Hate Speech:", count_obama_hs)
count_obama_no_hs = counts_obama_first.get(0, 0)
print("First Classifier detected None Hate Speech:", count_obama_no_hs)

# Label fix: these two lines report the mixed-dataset classifier and were mislabelled "First Classifier".
count_obama_hs_mixed = counts_obama_mixed.get(1, 0)
print("Mixed Classifier detected Hate Speech:", count_obama_hs_mixed)
count_obama_no_hs_mixed = counts_obama_mixed.get(0, 0)
print("Mixed Classifier detected None Hate Speech:", count_obama_no_hs_mixed)

First Classifier detected Hate Speech: 53
First Classifier detected None Hate Speech: 34
First Classifier detected Hate Speech: 46
First Classifier detected None Hate Speech: 41


## Prüfung einzelner Wörter

In [37]:
def check_word(word, regex=True):
    """Print how both classifiers labelled the test tweets that contain ``word``.

    Args:
        word: Pattern searched for in ``df_test["tweet_cleaned"]`` with
            ``Series.str.contains``. It matches anywhere in the text, so
            "test" also selects tweets containing e.g. "latest".
        regex: Forwarded to ``str.contains``. The default ``True`` keeps the
            original behaviour (``word`` is a regular expression); pass
            ``False`` to match ``word`` literally, e.g. when it contains
            regex metacharacters.

    Returns:
        None. Four lines are printed: the number of matching tweets predicted
        as hate speech (1) and as non hate speech (0), first for the
        first-dataset classifier and then for the mixed-dataset classifier.
    """
    df_word = df_test[df_test["tweet_cleaned"].str.contains(word, regex=regex)]

    # One value_counts() per prediction column; .get(label, 0) yields 0 if a label never occurs.
    counts_first = df_word["clf_tfidf"].value_counts()
    counts_mixed = df_word["clf_tfidf_mixed"].value_counts()

    print("First Classifier detected Hate Speech:", counts_first.get(1, 0))
    print("First Classifier detected None Hate Speech:", counts_first.get(0, 0))
    # Label fix: the mixed-dataset results were previously also printed as "First Classifier".
    print("Mixed Classifier detected Hate Speech:", counts_mixed.get(1, 0))
    print("Mixed Classifier detected None Hate Speech:", counts_mixed.get(0, 0))

In [38]:
word = "test"

In [39]:
check_word(word)

First Classifier detected Hate Speech: 44
First Classifier detected None Hate Speech: 136
First Classifier detected Hate Speech: 56
First Classifier detected None Hate Speech: 124


In [40]:
check_word("shit")

First Classifier detected Hate Speech: 254
First Classifier detected None Hate Speech: 1379
First Classifier detected Hate Speech: 971
First Classifier detected None Hate Speech: 662


In [43]:
check_word("fuck")

First Classifier detected Hate Speech: 719
First Classifier detected None Hate Speech: 2852
First Classifier detected Hate Speech: 2569
First Classifier detected None Hate Speech: 1002


In [44]:
check_word("hate")

First Classifier detected Hate Speech: 316
First Classifier detected None Hate Speech: 780
First Classifier detected Hate Speech: 638
First Classifier detected None Hate Speech: 458
