In [1]:
import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import urllib.request
from collections import Counter
import re
import pickle
import html
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

from nltk.tokenize import TweetTokenizer
from IPython.core.display import display, HTML

# Check overlap among top attention words

In [2]:
# Load sexism-sexism data
sexism_data = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/output/sexism_zeerak_naacl_zeerak_sexism_all/test_attn_top_weights.csv')
racism_data = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/output/zeerak_racism_all/test_attn_top_weights.csv')
racism_data.columns

Index(['word', 'average_weight', 'count', 'total_weight',
       'average_hate_weight', 'hate_count', 'hate_weight',
       'average_none_weight', 'none_count', 'average_none_weight.1'],
      dtype='object')

In [14]:
top_sexism = set(sexism_data.loc[:100, 'word'].tolist())
top_racism = set(racism_data.loc[:100, 'word'].tolist())
overlap = top_sexism.intersection(top_racism)
print(len(overlap))
overlap

6


{'</SENT>', 'according', 'also', 'changing', 'comes', 'something'}

# Check UNK postagging

In [2]:
old_data = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/data/zeerak_naacl/train_old_slur.csv')
old_data.columns

Index(['tweet', 'label', 'mentions', 'hashtags', 'slurs', 'original_tweet',
       'racism', 'sexism', 'none', 'tweet_id', 'user_screen_name',
       'tweet_unk_slur', 'tweet_no_slur'],
      dtype='object')

In [13]:
new_data = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/data/zeerak_naacl/train_POS.csv')
# new_data = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/data/davidson/train_POS.csv')
new_data.columns

Index(['tweet', 'label', 'mentions', 'hashtags', 'slurs', 'original_tweet',
       'racism', 'sexism', 'none', 'tweet_id', 'user_screen_name',
       'tweet_unk_slur', 'tweet_no_slur'],
      dtype='object')

In [5]:
all(new_data['slurs'] == old_data['slurs'])

True

In [6]:
new_data['slurs']

0               []
1               []
2               []
3               []
4               []
5        ['trash']
6               []
7               []
8               []
9               []
10              []
11              []
12              []
13              []
14              []
15              []
16              []
17              []
18              []
19              []
20              []
21              []
22              []
23              []
24              []
25              []
26              []
27              []
28              []
29              []
           ...    
12642           []
12643           []
12644           []
12645           []
12646           []
12647           []
12648           []
12649           []
12650           []
12651           []
12652           []
12653           []
12654           []
12655           []
12656           []
12657           []
12658           []
12659           []
12660           []
12661           []
12662           []
12663    ['b

In [7]:
old_data['slurs']

0               []
1               []
2               []
3               []
4               []
5        ['trash']
6               []
7               []
8               []
9               []
10              []
11              []
12              []
13              []
14              []
15              []
16              []
17              []
18              []
19              []
20              []
21              []
22              []
23              []
24              []
25              []
26              []
27              []
28              []
29              []
           ...    
12642           []
12643           []
12644           []
12645           []
12646           []
12647           []
12648           []
12649           []
12650           []
12651           []
12652           []
12653           []
12654           []
12655           []
12656           []
12657           []
12658           []
12659           []
12660           []
12661           []
12662           []
12663    ['b

In [14]:
new_data['tweet']

0        <MENTION> what high level of customer service ...
1        <MENTION> talk to your kids about content mana...
2        with a bit of luck nikki can do some promo wor...
3        <MENTION> <MENTION> horseshit VBD led 17 major...
4        <MENTION> <MENTION> people reply and mentions ...
5        wow there s a lot of NN in my mentions this mo...
6                         <MENTION> have you tried pravana
7        <MENTION> i m not JJ but i do believe that NNS...
8        <MENTION> i think you might enjoy it too much ...
9        how dare you give me free food i don t like as...
10       <MENTION> i m not JJ but i want a NN that s go...
11       those kisses on the mirror look like cats arse...
12       <MENTION> <MENTION> ever hear of minding your ...
13                        nobody likes a dry sausage # mkr
14       so the quds general leading the assault on isi...
15       # mkr pretty sure that s how all the chicks wa...
16       the amount of mean NNS mentality coming from g.

In [15]:
new_data['tweet'] == old_data['tweet']

0         True
1         True
2         True
3        False
4         True
5        False
6         True
7        False
8         True
9         True
10       False
11        True
12        True
13        True
14        True
15        True
16       False
17        True
18        True
19        True
20        True
21        True
22        True
23        True
24        True
25        True
26        True
27        True
28        True
29       False
         ...  
12642     True
12643     True
12644     True
12645     True
12646    False
12647     True
12648     True
12649     True
12650     True
12651     True
12652     True
12653     True
12654     True
12655    False
12656     True
12657     True
12658    False
12659     True
12660     True
12661    False
12662     True
12663    False
12664    False
12665     True
12666     True
12667     True
12668     True
12669    False
12670     True
12671     True
Name: tweet, Length: 12672, dtype: bool

In [12]:
old_data['tweet']

0        <MENTION> what high level of customer service ...
1        <MENTION> talk to your kids about content mana...
2        with a bit of luck nikki can do some promo wor...
3        <MENTION> <MENTION> horseshit mohammed led 17 ...
4        <MENTION> <MENTION> people reply and mentions ...
5        wow there s a lot of trash in my mentions this...
6                         <MENTION> have you tried pravana
7        <MENTION> i m not sexist but i do believe that...
8        <MENTION> i think you might enjoy it too much ...
9        how dare you give me free food i don t like as...
10       <MENTION> i m not sexist but i want a girl tha...
11       those kisses on the mirror look like cats arse...
12       <MENTION> <MENTION> ever hear of minding your ...
13                        nobody likes a dry sausage # mkr
14       so the quds general leading the assault on isi...
15       # mkr pretty sure that s how all the chicks wa...
16       the amount of mean girls mentality coming from.

# Hate label on Zeerak data

In [4]:
# Load data
folds = ['train', 'dev', 'test']
data = {}
for f in folds:
    data[f] = pd.read_csv(f'/usr0/home/mamille2/11-830-Final-Project/data/zeerak_naacl/{f}_utf8.csv')
    print(len(data[f]))
    
print(data['train'].columns)

12676
1568
1568
Index(['tweet', 'label', 'mentions', 'hashtags', 'original_tweet', 'racism',
       'sexism', 'none', 'tweet_id', 'user_screen_name'],
      dtype='object')


In [17]:
for f in folds:
    data[f]['hate'] = data[f]['racism'] | data[f]['sexism']
    data[f].to_csv(f'/usr0/home/mamille2/11-830-Final-Project/data/zeerak_naacl/{f}_utf8.csv', index=False)

# Precision, recall, accuracy on dev

In [2]:
# Load predictions
# with open('/usr0/home/mamille2/11-830-Final-Project/output/davidson_tweet_unk_slur_dev_unked_preds.pkl', 'rb') as f:
with open('/usr0/home/mamille2/11-830-Final-Project/output/davidson_tweet_no_slur_dev_unked_preds.pkl', 'rb') as f:
    preds = pickle.load(f)
    
print(len(preds))
print(sum(preds))

2472
106


In [3]:
# Load gold standard
dev = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/data/davidson/dev.csv')

label_id = {'neither': 0, 'offensive_language': 0, 'hate_speech': 1}
gold = [label_id[l] for l in dev['label'].tolist()]
print(len(gold))
print(sum(gold))

2473
165


In [4]:
# Precision, recall, f1 and accuracy
correct_hs = 0
correct_athg = 0
pred_hs = sum(preds)
actual_hs = sum(gold)
total = len(preds)

for tl, pl in zip(preds, gold):
    if tl == pl == 1:
        correct_hs += 1
        correct_athg += 1
    elif tl == pl == 0:
        correct_athg += 1
        
prec = correct_hs/pred_hs
rec = correct_hs/actual_hs
f1 = 2 * prec * rec / (prec + rec)
acc = correct_athg/total

print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1: {f1}')
print(f'Accuracy: {acc}')

Precision: 0.1792452830188679
Recall: 0.11515151515151516
F1: 0.14022140221402213
Accuracy: 0.9057443365695793


## Logistic regression baseline

In [2]:
# Load data
folds = ['train', 'dev', 'test']
data = {}
for f in folds:
    data[f] = pd.read_csv(f'/usr0/home/mamille2/11-830-Final-Project/data/davidson/{f}.csv')
    print(len(data[f]))
    
print(data['train'].columns)

19836
2473
2474
Index(['text', 'hate_speech', 'offensive_language', 'neither', 'mentions',
       'hashtags', 'original_tweet', 'label'],
      dtype='object')


In [4]:
# Use tweet tokenizer
tokenizer = TweetTokenizer()
text_data = {}
for f in folds:
    text_data[f] = data[f]['text'].map(lambda x: ' '.join(tokenizer.tokenize(x.lower()))).tolist()
    print(len(text_data[f]))

19836
2473
2474


In [9]:
# vec = TfidfVectorizer(min_df=2)
# vec = CountVectorizer(min_df=2)
# vec = CountVectorizer()
vec = CountVectorizer(ngram_range=(1,2))
vec.fit(text_data['train'])

bow = {}
bow['train'] = vec.transform(text_data['train'])
print(bow['train'].shape)
bow['dev'] = vec.transform(text_data['dev'])
print(bow['dev'].shape)

labels = {}
for f in folds:
    labels[f] = data[f]['hate_speech'].tolist()

clf = LogisticRegression()
clf.fit(bow['train'], labels['train'])
preds_lr = clf.predict(bow['dev'])
print(preds_lr.shape)

# Precision, recall, f1 and accuracy
preds = preds_lr
gold = labels['dev']

correct_hs = 0
correct_athg = 0
pred_hs = sum(preds)
actual_hs = sum(gold)
total = len(preds)

for tl, pl in zip(preds, gold):
    if tl == pl == 1:
        correct_hs += 1
        correct_athg += 1
    elif tl == pl == 0:
        correct_athg += 1
        
prec = correct_hs/pred_hs
rec = correct_hs/actual_hs
f1 = 2 * prec * rec / (prec + rec)
acc = correct_athg/total

print()
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1: {f1}')
print(f'Accuracy: {acc}')

(19836, 135114)
(2473, 135114)
(2473,)

Precision: 0.578125
Recall: 0.22424242424242424
F1: 0.3231441048034934
Accuracy: 0.9373230893651435


# Attention weight visualization

In [7]:
# Load weights
with open('/usr0/home/mamille2/11-830-Final-Project/output/davidson_tweet_unk_slur_dev_unked_attn.pkl', 'rb') as f:
    wts = pickle.load(f)
    
wts = [w[0][0] for w in wts]
len(wts)

2473

In [8]:
# Load predictions
with open('/usr0/home/mamille2/11-830-Final-Project/output/davidson_tweet_unk_slur_dev_unked_preds.pkl', 'rb') as f:
    preds = pickle.load(f)
    
len(preds)

2473

In [14]:
# Load text data
# data = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/data/davidson/dev.csv')
data = pd.read_csv('/usr0/home/mamille2/11-830-Final-Project/data/davidson/dev_unked.csv')
print(len(data))
print(data.columns)

2473
Index(['tweet', 'hate_speech', 'offensive_language', 'neither', 'mentions',
       'hashtags', 'original_tweet', 'tweet_unk_slur', 'tweet_no_slur',
       'label'],
      dtype='object')


In [10]:
label_id = {'neither': 0, 'offensive_language': 0, 'hate_speech': 1}
gold = [label_id[l] for l in data['label'].tolist()]
len(gold)

2473

In [15]:
# Use tweet tokenizer
tokenizer = TweetTokenizer()
# text_data = data['text'].map(lambda x: tokenizer.tokenize(x.lower())).tolist()
text_data = data['tweet_unk_slur'].map(lambda x: tokenizer.tokenize(x.lower())).tolist()
len(text_data)

2473

In [16]:
# Check lengths
for i, (w,t) in enumerate(zip(wts, text_data)):
    if len(w) != len(t):
        print(f'{i}: {len(w) - len(t)}')

In [46]:
def red_blue(val, mean, total_max, total_min):
    """ Returns red/blue 256 spectrum for highlighting """
    
    rscale = 1/(total_max - mean)
    bscale = 1/(mean - total_min)
    
    rval = max(0, val-mean) * (255 * rscale)
    bval = max(0, mean-val) * (255 * bscale)
    
    return (rval, bval)

In [17]:
def color_attn(val, total_max, total_min):
    """ Returns 0-1 for highlighting """
    
    scale = 1/total_max
    val = (val-total_min) * scale
    return val

In [22]:
total_max = max(d for wt in wts for d in wt)
total_min = min(d for wt in wts for d in wt)

# wts_viz = ""
wts_viz = []
for i, (wt, sent) in enumerate(tqdm(zip(wts, text_data))):
#     sent = ['<sent>'] + sent + ['</sent>']
    vals = [color_attn(d, total_max, total_min) for d in wt]
#     display(HTML(''.join([f"<span style='background-color: rgba(255,0,0,{val})'>{w}</span>&nbsp" for val,w in zip(vals, sent)])))
#     wts_viz += f"Sample {i}:<br>" + \
#             ''.join([f"<span style='background-color: rgba(255,0,0,{val})'>{w}</span>&nbsp" for val,w in zip(vals, sent)]) + \
#             "<br><br>"
    wts_viz.append(''.join([f"<span style='background-color: rgba(255,0,0,{val})'>{html.escape(w)}</span>&nbsp" for val,w in zip(vals, sent)]))
    
# display(HTML(wts_viz))
len(wts_viz)




2473

In [23]:
# Match attention weights with predictions
out = pd.DataFrame(list(zip(wts_viz, preds, gold)), columns=['attention_weights', 'predicted_label', 'gold_label'])
out

Unnamed: 0,attention_weights,predicted_label,gold_label
0,"<span style='background-color: rgba(255,0,0,3.784116984689903e-09)'>&lt;mention&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,4.9622634539616265e-09)'>the</span>&nbsp<span style='background-color: rgba(255,0,0,5.944945389785475e-09)'>shit</span>&nbsp<span style='background-color: rgba(255,0,0,2.2115962044324115e-09)'>you</span>&nbsp<span style='background-color: rgba(255,0,0,1.907671763262855e-09)'>hear</span>&nbsp<span style='background-color: rgba(255,0,0,5.289820453171082e-10)'>about</span>&nbsp<span style='background-color: rgba(255,0,0,1.402533641274313e-08)'>me</span>&nbsp<span style='background-color: rgba(255,0,0,1.1532475734767481e-09)'>might</span>&nbsp<span style='background-color: rgba(255,0,0,2.3570143135515267e-08)'>be</span>&nbsp<span style='background-color: rgba(255,0,0,8.940418148512166e-09)'>true</span>&nbsp<span style='background-color: rgba(255,0,0,8.894917656249348e-09)'>or</span>&nbsp<span style='background-color: rgba(255,0,0,3.2531854508505334e-08)'>it</span>&nbsp<span style='background-color: rgba(255,0,0,1.19099770982696e-09)'>might</span>&nbsp<span style='background-color: rgba(255,0,0,2.380530791205611e-08)'>be</span>&nbsp<span style='background-color: rgba(255,0,0,7.774373110208857e-09)'>faker</span>&nbsp<span style='background-color: rgba(255,0,0,3.2141793320618753e-09)'>than</span>&nbsp<span style='background-color: rgba(255,0,0,5.545996014481598e-08)'>the</span>&nbsp<span style='background-color: rgba(255,0,0,2.906868342526532e-08)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,5.3688776056532064e-08)'>who</span>&nbsp<span style='background-color: rgba(255,0,0,4.776597392824158e-07)'>told</span>&nbsp<span style='background-color: rgba(255,0,0,7.57108921299249e-07)'>it</span>&nbsp<span style='background-color: rgba(255,0,0,7.007718522796628e-05)'>to</span>&nbsp<span style='background-color: rgba(255,0,0,0.99992835521698)'>ya</span>&nbsp",0,0
1,"<span style='background-color: rgba(255,0,0,1.756440500648178e-08)'>murda</span>&nbsp<span style='background-color: rgba(255,0,0,3.115486037690686e-07)'>gang</span>&nbsp<span style='background-color: rgba(255,0,0,2.0152368627662967e-08)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,5.360025170025153e-07)'>its</span>&nbsp<span style='background-color: rgba(255,0,0,2.9014097435369593e-06)'>gang</span>&nbsp<span style='background-color: rgba(255,0,0,0.9999961853027344)'>land</span>&nbsp",0,0
2,"<span style='background-color: rgba(255,0,0,0.004332508891820873)'>i</span>&nbsp<span style='background-color: rgba(255,0,0,0.006676460616290535)'>met</span>&nbsp<span style='background-color: rgba(255,0,0,0.008156002499163116)'>that</span>&nbsp<span style='background-color: rgba(255,0,0,0.0033866483718156463)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.002467946382239426)'>on</span>&nbsp<span style='background-color: rgba(255,0,0,0.009203165769576992)'>ocean</span>&nbsp<span style='background-color: rgba(255,0,0,0.014399860054254497)'>dr</span>&nbsp<span style='background-color: rgba(255,0,0,0.00427472591400143)'>i</span>&nbsp<span style='background-color: rgba(255,0,0,0.012767068110406364)'>gave</span>&nbsp<span style='background-color: rgba(255,0,0,0.012990499846637214)'>that</span>&nbsp<span style='background-color: rgba(255,0,0,0.01770006492733952)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.021724732592701877)'>a</span>&nbsp<span style='background-color: rgba(255,0,0,0.04527720063924786)'>pill</span>&nbsp<span style='background-color: rgba(255,0,0,0.8366430997848511)'>?</span>&nbsp",0,0
3,"<span style='background-color: rgba(255,0,0,2.948205519679931e-06)'>it</span>&nbsp<span style='background-color: rgba(255,0,0,5.592819433890952e-07)'>aint</span>&nbsp<span style='background-color: rgba(255,0,0,9.015036539144319e-07)'>nothing</span>&nbsp<span style='background-color: rgba(255,0,0,7.314903541520351e-06)'>to</span>&nbsp<span style='background-color: rgba(255,0,0,4.878909294312933e-06)'>cut</span>&nbsp<span style='background-color: rgba(255,0,0,9.474759281112228e-06)'>a</span>&nbsp<span style='background-color: rgba(255,0,0,3.307739916600866e-06)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.999970555305481)'>off</span>&nbsp",0,0
4,"<span style='background-color: rgba(255,0,0,2.7655741515579273e-07)'>lames</span>&nbsp<span style='background-color: rgba(255,0,0,4.787702323132805e-07)'>crying</span>&nbsp<span style='background-color: rgba(255,0,0,4.022292614501083e-06)'>over</span>&nbsp<span style='background-color: rgba(255,0,0,2.3801824507833654e-07)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,1.369003683398375e-07)'>thats</span>&nbsp<span style='background-color: rgba(255,0,0,5.383988536742862e-06)'>tears</span>&nbsp<span style='background-color: rgba(255,0,0,0.00018650837591845763)'>of</span>&nbsp<span style='background-color: rgba(255,0,0,3.479555380177905e-05)'>a</span>&nbsp<span style='background-color: rgba(255,0,0,0.9997681975364685)'>clown</span>&nbsp",0,0
5,"<span style='background-color: rgba(255,0,0,0.001249017659574712)'>we</span>&nbsp<span style='background-color: rgba(255,0,0,0.0014857113128527648)'>dont</span>&nbsp<span style='background-color: rgba(255,0,0,0.0051610996015369545)'>trust</span>&nbsp<span style='background-color: rgba(255,0,0,0.004626103676855529)'>these</span>&nbsp<span style='background-color: rgba(255,0,0,0.283374011516571)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.006122347433119978)'>all</span>&nbsp<span style='background-color: rgba(255,0,0,0.0033801044337451107)'>these</span>&nbsp<span style='background-color: rgba(255,0,0,0.6946015954017639)'>&lt;unk&gt;</span>&nbsp",0,0
6,"<span style='background-color: rgba(255,0,0,3.3927757613093004e-05)'>yall</span>&nbsp<span style='background-color: rgba(255,0,0,0.00010880161426026595)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,5.737621177100797e-05)'>b</span>&nbsp<span style='background-color: rgba(255,0,0,8.863324910631624e-05)'>cuffing</span>&nbsp<span style='background-color: rgba(255,0,0,0.0003328519233036436)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.021411104127764667)'>cause</span>&nbsp<span style='background-color: rgba(255,0,0,0.00033606900251467004)'>yall</span>&nbsp<span style='background-color: rgba(255,0,0,0.0008592387312091537)'>aint</span>&nbsp<span style='background-color: rgba(255,0,0,0.037506926804780925)'>never</span>&nbsp<span style='background-color: rgba(255,0,0,0.005880065262317623)'>have</span>&nbsp<span style='background-color: rgba(255,0,0,0.9333849549293518)'>&lt;unk&gt;</span>&nbsp",0,0
7,"<span style='background-color: rgba(255,0,0,0.0019239482935517675)'>&lt;mention&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.0026779298204928285)'>hennessey</span>&nbsp<span style='background-color: rgba(255,0,0,0.005062090232968296)'>venom</span>&nbsp<span style='background-color: rgba(255,0,0,0.0007714909734204061)'>gt</span>&nbsp<span style='background-color: rgba(255,0,0,0.0012770876055583007)'>?</span>&nbsp<span style='background-color: rgba(255,0,0,0.0018083356553688294)'>&lt;url&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.001723330235108698)'>that</span>&nbsp<span style='background-color: rgba(255,0,0,0.0035071547608822233)'>s</span>&nbsp<span style='background-color: rgba(255,0,0,0.013070731423795189)'>one</span>&nbsp<span style='background-color: rgba(255,0,0,0.05050255730748173)'>sexy</span>&nbsp<span style='background-color: rgba(255,0,0,0.9176753759384155)'>&lt;unk&gt;</span>&nbsp",0,0
8,"<span style='background-color: rgba(255,0,0,3.692699465315078e-05)'>&lt;mention&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,3.5565350117359654e-05)'>&lt;mention&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,7.634889334436727e-05)'>&lt;url&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.0003955404390580602)'>0</span>&nbsp<span style='background-color: rgba(255,0,0,0.0008370773284695692)'>rings</span>&nbsp<span style='background-color: rgba(255,0,0,0.0019355237018316633)'>0</span>&nbsp<span style='background-color: rgba(255,0,0,0.00024151869001794827)'>mvps</span>&nbsp<span style='background-color: rgba(255,0,0,0.004391333088278736)'>0</span>&nbsp<span style='background-color: rgba(255,0,0,0.00044893924496133104)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.9916011691093445)'>lol</span>&nbsp",0,0
9,"<span style='background-color: rgba(255,0,0,0.007101420313119854)'>&lt;mention&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.008207976818084682)'>?</span>&nbsp<span style='background-color: rgba(255,0,0,0.008239524438977207)'>?</span>&nbsp<span style='background-color: rgba(255,0,0,0.008245368488132919)'>?</span>&nbsp<span style='background-color: rgba(255,0,0,0.007105095777660574)'>&lt;mention&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.009676988236606086)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.02400279045104977)'>be</span>&nbsp<span style='background-color: rgba(255,0,0,0.0033682528883218414)'>wanting</span>&nbsp<span style='background-color: rgba(255,0,0,0.00811316072940823)'>to</span>&nbsp<span style='background-color: rgba(255,0,0,0.012174951843917335)'>act</span>&nbsp<span style='background-color: rgba(255,0,0,0.021397422999143566)'>like</span>&nbsp<span style='background-color: rgba(255,0,0,0.1581720560789108)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.02912659756839272)'>so</span>&nbsp<span style='background-color: rgba(255,0,0,0.007011454552411998)'>bad</span>&nbsp<span style='background-color: rgba(255,0,0,0.008787589147686924)'>?</span>&nbsp<span style='background-color: rgba(255,0,0,0.008773056790232624)'>?</span>&nbsp<span style='background-color: rgba(255,0,0,0.00876795500516888)'>?</span>&nbsp<span style='background-color: rgba(255,0,0,0.011220341548323597)'>that</span>&nbsp<span style='background-color: rgba(255,0,0,0.015457030385732616)'>shit</span>&nbsp<span style='background-color: rgba(255,0,0,0.009766723029315437)'>aint</span>&nbsp<span style='background-color: rgba(255,0,0,0.006499952170997823)'>cuteeeee</span>&nbsp<span style='background-color: rgba(255,0,0,0.010455349460244144)'>but</span>&nbsp<span style='background-color: rgba(255,0,0,0.026385210454463924)'>&lt;unk&gt;</span>&nbsp<span style='background-color: rgba(255,0,0,0.017431380227208103)'>act</span>&nbsp<span style='background-color: rgba(255,0,0,0.06148060783743855)'>like</span>&nbsp<span style='background-color: rgba(255,0,0,0.5030317306518555)'>&lt;unk&gt;</span>&nbsp",0,0


In [20]:
pd.set_option('display.max_colwidth', -1)

In [24]:
out.to_html('/usr0/home/mamille2/11-830-Final-Project/output/davidson_dev_unked_attn.html', escape=False)

# Unk/remove slurs from datasets

In [40]:
slurs_r = r'|'.join([r'\b{}(?:s|es)?\b'.format(w) for w in slurs])

# UNK
slurs_p = re.compile(slurs_r)

for f in folds:
    data[f]['tweet_unk_slur'] = data[f]['tweet'].map(lambda x: re.sub(slurs_p, '<UNK>', x))
    data[f]['tweet_unk_slur']

In [41]:
# Remove
for f in folds:
    data[f]['tweet_no_slur'] = data[f]['tweet'].map(lambda x: re.sub(r'\s+', ' ', re.sub(slurs_p, '', x)))
    data[f]['tweet_no_slur']

In [35]:
all_wds = [w for t in data['train']['tweet_unk_slur'].tolist() for w in t.split()]
print('bitch' in all_wds)
print('bitches' in all_wds)
print('hoes' in all_wds)

False
False
False


In [43]:
# Save out
for f in folds:
    data[f].to_csv(f'/usr0/home/mamille2/11-830_data/project/davidson/{f}_unked.csv', index=False)

In [45]:
data['train']['tweet_unk_slur'].head()

0    <MENTION> as a woman you shouldn t complain ab...
1    <MENTION> boy dats cold tyga dwn bad for cuffi...
2    <MENTION> dawg <MENTION> you ever fuck a <UNK>...
3            <MENTION> <MENTION> she look like a <UNK>
4    <MENTION> the shit just blows me claim you so ...
Name: tweet_unk_slur, dtype: object

# Add labels column

In [12]:
# Load data
folds = ['train', 'dev', 'test']
data = {}
for f in folds:
#     data[f] = pd.read_csv(f'/usr0/home/mamille2/11-830-Final-Project/data/davidson/{f}.csv')
    data[f] = pd.read_csv(f'/usr0/home/mamille2/11-830_data/project/davidson/{f}_unked.csv')
    print(len(data[f]))
    
print(data['train'].columns)

19836
2473
2474
Index(['tweet', 'hate_speech', 'offensive_language', 'neither', 'mentions',
       'hashtags', 'original_tweet', 'tweet_unk_slur', 'tweet_no_slur'],
      dtype='object')


In [8]:
for f in folds:
    label_lines = []
    for h_val, o_val, n_val in zip(*[data[f][c].tolist() for c in ['hate_speech', 'offensive_language', 'neither']]):
        if h_val == 1:
            label_lines.append('hate_speech')
        elif o_val == 1:
            label_lines.append('offensive_language')
        else:
            label_lines.append('neither')
            
    data[f]['label'] = label_lines
            
#     for c in ['hate_speech', 'offensive_language', 'neither']:
#         data[f]['label'] = data[f][c].map(lambda x: c if 1 else '')

data['train'].loc[:, ['hate_speech', 'offensive_language', 'neither', 'label']].head(10)

Unnamed: 0,hate_speech,offensive_language,neither,label
0,0,0,1,neither
1,0,1,0,offensive_language
2,0,1,0,offensive_language
3,0,1,0,offensive_language
4,0,1,0,offensive_language
5,0,1,0,offensive_language
6,0,1,0,offensive_language
7,0,1,0,offensive_language
8,0,1,0,offensive_language
9,0,1,0,offensive_language


In [9]:
# Save out
for f in folds:
    data[f].to_csv(f'/usr0/home/mamille2/11-830-Final-Project/data/davidson/{f}_unked.csv', index=False)

In [10]:
data['train'].columns

Index(['tweet', 'hate_speech', 'offensive_language', 'neither', 'mentions',
       'hashtags', 'original_tweet', 'tweet_unk_slur', 'tweet_no_slur',
       'label'],
      dtype='object')

# Check slur overlap with datasets

In [7]:
# Load data
folds = ['train', 'dev', 'test']
data = {}
for f in folds:
    data[f] = pd.read_csv(f'/usr0/home/acoda/11-830/Project/data/davidson/{f}.csv', encoding='ISO-8859-1')
    print(len(data[f]))

19836
2473
2474


In [8]:
data['train'].columns

Index(['tweet', 'hate_speech', 'offensive_language', 'neither', 'mentions',
       'hashtags', 'original_tweet'],
      dtype='object')

In [9]:
data['train'].head()

Unnamed: 0,tweet,hate_speech,offensive_language,neither,mentions,hashtags,original_tweet
0,<MENTION> as a woman you shouldn t complain ab...,0,0,1,['@mayasolovely'],[],!!! RT @mayasolovely: As a woman you shouldn't...
1,<MENTION> boy dats cold tyga dwn bad for cuffi...,0,1,0,['@mleew17'],[],!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,<MENTION> dawg <MENTION> you ever fuck a bitch...,0,1,0,"['@UrKindOfBrand', '@80sbaby4life']",[],!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,<MENTION> <MENTION> she look like a tranny,0,1,0,"['@C_G_Anderson', '@viva_based']",[],!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,<MENTION> the shit just blows me claim you so ...,0,1,0,['@T_Madison_x'],[],"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."


In [15]:
all_tweets = [w for t in data['train']['tweet'].tolist() for w in t.split()]
len(all_tweets)

283656

In [14]:
slurs_in_tweets = [w for w in all_tweets if w in slurs]
len(slurs_in_tweets)

16916

In [20]:
slur_ctr = Counter(slurs_in_tweets)
len(slur_ctr)

212

In [21]:
slur_ctr.most_common(n=200)

[('bitch', 6674),
 ('pussy', 1734),
 ('hoe', 1548),
 ('nigga', 965),
 ('trash', 910),
 ('faggot', 352),
 ('bird', 318),
 ('charlie', 221),
 ('niggah', 219),
 ('retarded', 207),
 ('yellow', 206),
 ('ghetto', 205),
 ('cunt', 202),
 ('nigger', 191),
 ('fag', 184),
 ('ho', 164),
 ('monkey', 141),
 ('colored', 134),
 ('nicca', 128),
 ('retard', 90),
 ('nig', 87),
 ('nigguh', 72),
 ('queer', 68),
 ('redneck', 68),
 ('af', 67),
 ('dyke', 67),
 ('mock', 59),
 ('oreo', 52),
 ('cracker', 52),
 ('yankee', 52),
 ('jihadi', 48),
 ('coon', 45),
 ('tranny', 40),
 ('skinny', 38),
 ('teabagger', 38),
 ('twat', 36),
 ('sole', 35),
 ('slope', 33),
 ('brownie', 33),
 ('hillbilly', 30),
 ('whitey', 30),
 ('fuzzy', 29),
 ('chug', 27),
 ('shy', 26),
 ('slit', 26),
 ('fairy', 22),
 ('crow', 22),
 ('clam', 22),
 ('nip', 21),
 ('jig', 21),
 ('boo', 21),
 ('abo', 21),
 ('cripple', 19),
 ('wigga', 19),
 ('apple', 18),
 ('beaner', 17),
 ('albino', 17),
 ('scally', 16),
 ('zebra', 16),
 ('ape', 16),
 ('wop', 15),
 

# Examine Hatebase list of slurs

In [1]:
# Load slurs
with open('/usr0/home/mamille2/11-830_data/project/hatebase_slurs.txt') as f:
    slurs = [w.lower() for w in f.read().splitlines()]
    
len(slurs)

584

In [2]:
# Examine
slurs

['abbo',
 'abc',
 'abcd',
 'abo',
 'af',
 'african catfish',
 'africoon',
 'afro-saxon',
 'albino',
 'alligator bait',
 'americoon',
 'amo',
 'anchor baby',
 'angie',
 'anglo',
 'ann',
 'ape',
 'apple',
 'argie',
 'armo',
 'aunt jane',
 'aunt jemima',
 'aunt mary',
 'aunt sally',
 'azn',
 'bamboo coon',
 'banana',
 'banana bender',
 'banana lander',
 'banjo lips',
 'bans and cans',
 'beach nigger',
 'bean dipper',
 'beaner',
 'beaner shnitzel',
 'beaney',
 'bengali',
 'bhrempti',
 'bint',
 'bird',
 'bitch',
 'bitter clinger',
 'bix nood',
 'black barbie',
 'black dago',
 'blaxican',
 'blockhead',
 'bludger',
 'bluegum',
 'bog hopper',
 'bog irish',
 'bog jumper',
 'bog trotter',
 'bogan',
 'bong',
 'boo',
 'boojie',
 'book book',
 'boon',
 'booner',
 'boong',
 'boonga',
 'boonie',
 'border bunny',
 'border hopper',
 'border jumper',
 'border nigger',
 'bounty bar',
 'boxhead',
 'brass ankle',
 'brownie',
 'bubble',
 'buck',
 'buckethead',
 'buckra',
 'buckwheat',
 'buddhahead',
 'buffi

In [7]:
outlist = [
    'anglo',
    'queer',
    'tommy',
    'trash',
    'wasp'
]

In [8]:
slurs = [w for w in slurs if not w in outlist]
len(slurs)

579

In [3]:
# Save slurs
with open('/usr0/home/mamille2/11-830_data/project/slurs.txt', 'w') as f:
    for s in slurs:
        f.write(f'{s}\n')

# Get Hatebase list

In [5]:
# Import key
with open('/usr0/home/mamille2/11-830_data/project/hatebase_api_key.txt') as f:
    key = f.read()[:-1]

In [12]:
results = {}
for i in range(6):
    url = f'http://api.hatebase.org/v3-0/{key}/vocabulary/json/language%3Deng%7Cpage%3D{i+1}'
    results[i] = urllib.request.urlopen(url).read()

len(results)

6

In [13]:
results[5]



In [22]:
obj = json.loads(results[5])

In [23]:
len(obj['data']['datapoint'])

84

In [25]:
len([el['vocabulary'] for el in obj['data']['datapoint']])

84

## Assemble terms

In [26]:
slurs = []

for _, bstr in results.items():
    obj = json.loads(bstr)
    slurs.extend([el['vocabulary'] for el in obj['data']['datapoint']])
    
len(slurs)

584

In [27]:
# Save slurs
with open('/usr0/home/mamille2/11-830_data/project/hatebase_slurs.txt', 'w') as f:
    for s in slurs:
        f.write(f'{s}\n')

# Examine Zeerak NAACL tweets

In [58]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 999)

In [62]:
datapath = '/usr0/home/mamille2/11-830_data/project/zeerak_naacl/zeerak_naacl_tweets.pkl'
data = pd.read_pickle(datapath)
data.loc[:, ['text', 'label']]

Unnamed: 0,text,label
0,"@ummayman90 @wood_brwood333 For example, in Medina Mohammed said, kindly cut off the heads of these 600 Jewish men and enslave their women.",racism
1,"@ummayman90 Again, your entire concept of god corresponds to a tyrannical earthly egomaniac because you are simple and stupid. #Islam",racism
2,"@anjemchoudary Your prophet was a rapist, murderer, pedophile, caravan robber, slave trader, bigot and sexist. God would never use the scum.",racism
3,"RT @DilanaKurdi: Yazidi children who are taken from their parents, forcibly converted to Islam! #Sinjar #Shengal http://t.co/wwzplHld7f",racism
4,@anjemchoudary Accepting the idea that Allah obsesses over how we take a dump shows just how utterly insane this inbred religion is.,racism
5,"These girls are the equivalent of the irritating Asian girls a couple years ago. Well done, 7. #MKR",racism
6,@ummayman90 Who knows if they criticized him? He killed them. And the things against him would never make it into the Sunnah.,racism
7,@lauracdean I love how the Islamofascists recruit 14 and 15 year old jihadis and then talk about minors in reference to 17 year olds.,racism
8,RT @PeterTownsend7: #ISIS #Islam PC Puzzle: Converting to a religion of peace leading to violence? http://t.co/TBJUsaeMUH http://t.co/G4xOh…,racism
9,@yousufpoosuf @RKingLive2Dance @izrinhariri And most Muslims are actually better behaved than their religion leads them to be.,racism


In [63]:
len(data[data['label']=='none'])

10853

In [60]:
data.loc[:, 'text', 'label', 'tags']

IndexingError: Too many indexers

# Assemble tweepy-downloaded tweets

In [53]:
data_dirpath = '/usr0/home/mamille2/11-830_data/project/zeerak_naacl/'
out_pklpath = '/usr0/home/mamille2/11-830_data/project/zeerak_naacl/zeerak_naacl_tweets.pkl'
out_csvpath = '/usr0/home/mamille2/11-830_data/project/zeerak_naacl/zeerak_naacl_tweets.csv'

file_lens = []
outlines = []

for fname in tqdm(sorted(os.listdir(data_dirpath))):
    with open(os.path.join(data_dirpath, fname)) as f:
        tweets = json.load(f)
        file_lens.append(len(tweets))

        for t in tweets:

            # Extract info from tweet
            info = [t['id'], t['created_at'], t['in_reply_to_status_id'], t['lang'], t['retweet_count'], 
                    t['user']['id'], t['user']['screen_name'], t['user']['name'],
                    t['text'], t['entities']['hashtags']]

            outlines.append(info)

print(np.mean(file_lens))

out = pd.DataFrame(outlines, columns=['tweet_id', 'created_at', 'in_reply_to_status_id', 'lang',
                                 'retweet_count', 'user_id', 'user_screen_name', 'user_name',
                                 'text', 'tags'])

print(len(out))
out.to_pickle(out_pklpath)
out.to_csv(out_csvpath, index=False)

In [16]:
# Get tweet labels
labeled_data = pd.read_csv('/usr0/home/mamille2/11-830_data/project/zeerak_naacl/NAACL_SRW_2016.csv', header=None,
                          names=['tweet_id', 'label'])
len(labeled_data)

16907

In [42]:
merged = pd.merge(out, labeled_data, how='right')
print(merged.columns)
print(len(merged))

Index(['tweet_id', 'created_at', 'in_reply_to_status_id', 'lang',
       'retweet_count', 'user_id', 'user_screen_name', 'user_name', 'text',
       'tags', 'label'],
      dtype='object')
17007


In [23]:
len(out)

15913

In [24]:
len(merged['tweet_id'].unique())

15863

In [43]:
# Drop duplicates
merged.drop_duplicates(subset=['tweet_id'], inplace=True, keep=False)
len(merged)

16791

In [49]:
# Drop label and text NaN rows
merged.dropna(axis=0, subset=['label', 'text'], inplace=True)
len(merged)

15813

In [61]:
merged.to_pickle(out_pklpath)
merged.to_csv(out_csvpath, index=False)

In [34]:
merged['text']

0        @ummayman90 @wood_brwood333 For example, in Me...
1        @ummayman90 Again, your entire concept of god ...
2        @anjemchoudary Your prophet was a rapist, murd...
3        RT @DilanaKurdi: Yazidi children who are taken...
4        @anjemchoudary Accepting the idea that Allah o...
5        These girls are the equivalent of the irritati...
6        @ummayman90 Who knows if they criticized him? ...
7        @lauracdean I love how the Islamofascists recr...
8        RT @PeterTownsend7: #ISIS #Islam PC Puzzle: Co...
9        @yousufpoosuf @RKingLive2Dance @izrinhariri An...
14       @HillaryGuess @riwired @izrinhariri @AtharHKha...
15       @ummayman90 Because they breed like rats, kill...
16       @ummayman90 Again, you miss the point that god...
17       RT @dsyndergaard: Israeli spat upon, beaten by...
18       @TII99 That is a total lie.  Minorities in Mus...
19       It seems that Allah sits around all day obsess...
20       @yousufpoosuf @RKingLive2Dance @izrinhariri Th.

In [33]:
gped = out.groupby('tweet_id').size()
dups = gped[gped > 1]
dups

tweet_id
552487055553757187    2
552923665952546816    2
572330982029709312    2
572331686156873728    2
572335493586685953    2
572336047775870976    2
572336241871478786    2
572336280186429440    2
572336293725642752    2
572336406053322753    2
572336427867885568    2
572336451221757953    2
572336657384407040    2
572336799340630016    2
572336970979885056    2
572337067436351489    2
572337337318817792    2
572337509159452672    2
572337676482838528    2
572337676713512960    2
572338147700293632    2
572338419851911168    2
572338617772740608    2
572338619282694144    2
572338780671107072    2
572339081780326400    2
572339216178270208    2
572339607829794816    2
572339639240929281    2
572340250715930624    2
572340476503724032    2
572340883489615872    2
572341118118936577    2
572341307584073728    2
572341498827522049    2
572342652907044864    2
572342944256032768    2
572342978255048705    2
572343206479712256    2
572343828968943616    2
572344263389794304    2
5723449

In [29]:
merged[merged['tweet_id'].isin(dups.index)]

Unnamed: 0,tweet_id,created_at,in_reply_to_status_id,lang,retweet_count,user_id,user_screen_name,user_name,text,tags,label
10,572341498827522049,Mon Mar 02 10:23:41 +0000 2015,,en,0,110114783,trish2295,patricia hilder,Drasko they didn't cook half a bird you idiot ...,"[{'text': 'mkr', 'indices': [46, 50]}]",racism
11,572341498827522049,Mon Mar 02 10:23:41 +0000 2015,,en,0,110114783,trish2295,patricia hilder,Drasko they didn't cook half a bird you idiot ...,"[{'text': 'mkr', 'indices': [46, 50]}]",none
12,572341498827522049,Mon Mar 02 10:23:41 +0000 2015,,en,0,110114783,trish2295,patricia hilder,Drasko they didn't cook half a bird you idiot ...,"[{'text': 'mkr', 'indices': [46, 50]}]",racism
13,572341498827522049,Mon Mar 02 10:23:41 +0000 2015,,en,0,110114783,trish2295,patricia hilder,Drasko they didn't cook half a bird you idiot ...,"[{'text': 'mkr', 'indices': [46, 50]}]",none
22,572340476503724032,Mon Mar 02 10:19:37 +0000 2015,,en,0,38650214,foodbling,"food bling, Brisbane",Hopefully someone cooks Drasko in the next ep ...,"[{'text': 'MKR', 'indices': [49, 53]}]",racism
23,572340476503724032,Mon Mar 02 10:19:37 +0000 2015,,en,0,38650214,foodbling,"food bling, Brisbane",Hopefully someone cooks Drasko in the next ep ...,"[{'text': 'MKR', 'indices': [49, 53]}]",none
24,572340476503724032,Mon Mar 02 10:19:37 +0000 2015,,en,0,38650214,foodbling,"food bling, Brisbane",Hopefully someone cooks Drasko in the next ep ...,"[{'text': 'MKR', 'indices': [49, 53]}]",racism
25,572340476503724032,Mon Mar 02 10:19:37 +0000 2015,,en,0,38650214,foodbling,"food bling, Brisbane",Hopefully someone cooks Drasko in the next ep ...,"[{'text': 'MKR', 'indices': [49, 53]}]",none
43,552487055553757187,Tue Jan 06 15:29:13 +0000 2015,,en,2,2941145694,VileIslam,Levi Stein,RT @Benfrancisallen: It hasn't been a good few...,"[{'text': 'ISIS', 'indices': [57, 62]}, {'text...",racism
44,552487055553757187,Tue Jan 06 15:29:13 +0000 2015,,en,2,2941145694,VileIslam,Levi Stein,RT @Benfrancisallen: It hasn't been a good few...,"[{'text': 'ISIS', 'indices': [57, 62]}, {'text...",none


In [30]:
len(labeled_data['tweet_id'].unique())

16849