In [89]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, accuracy_score
from scipy.sparse import hstack, csr_matrix
import re
from collections import defaultdict, Counter
from nltk import TweetTokenizer, pos_tag
from IPython.display import display
import os
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv1D, MaxPooling1D, Embedding, Flatten, Dropout
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import optimizers

from IPython.core.debugger import set_trace

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [17]:
# Load data
data = {}
# data['train'] = pd.read_csv('../../hw3/train_mod.csv')
data['train'] = pd.read_csv('/usr0/home/mamille2/11-830_data/hw3/train_mod.csv')
data['train']

Unnamed: 0.1,Unnamed: 0,tweet,hate,offensive,neither,tweet_processed
0,0,"@HarveyLevinTMZ @sly309 they want what I got, ...",1,0,0,"<mention> <mention> they want what i got , i w..."
1,1,@HeauxmerSimpson I'm jus tryna vaca away from ...,1,0,0,<mention> i'm jus tryna vaca away from the nig...
2,2,@HeilSidious Sion is a faggot. Very meh ganks ...,1,0,0,<mention> sion is a faggot . very meh ganks pr...
3,3,@Herfarm @SteveWorks4You @TheDemocrats your lo...,1,0,0,<mention> <mention> <mention> your low info re...
4,4,@Herfarm Move out of our country teabaggers!,1,0,0,<mention> move out of our country teabaggers !
5,5,@HollowDaDonLOM those god damn chinks. http://...,1,0,0,<mention> those god damn chinks . http://t.co/...
6,6,@Hovaa_ #niggerfood #niggerperson #you #are #a...,1,0,0,<mention> #niggerfood #niggerperson #you #are ...
7,7,@Hovaa_ ok wat ever u say whitey. u prolly nev...,1,0,0,<mention> ok wat ever u say whitey . u prolly ...
8,8,@Hovaa_ shut up lizard faggot nigger cunt,1,0,0,<mention> shut up lizard faggot nigger cunt
9,9,@Hovaa_ ya I know all the slang I'm racist I h...,1,0,0,<mention> ya i know all the slang i'm racist i...


## Preprocess

In [3]:
def preprocess_tweet(text):
    
    # Remove @mentions
    p = re.sub(r'@\w+', '<mention>', text)
    
    # tokenize
    tokenizer = TweetTokenizer()
    p = ' '.join(tokenizer.tokenize(p.lower()))
    
    return p

In [4]:
data['train']['tweet_processed'] = data['train']['tweet'].map(lambda x: preprocess_tweet(x))

In [41]:
data['train']['tweet_processed'].head()

0    <mention> <mention> they want what i got , i want what they got , america.they got the word nigger , i want it to                           
1    <mention> i'm jus tryna vaca away from the niggers bro .                                                                                    
2    <mention> sion is a faggot . very meh ganks pre - 6 and post - 6 his ganks are still pretty meh he's just a tank , nothing special imo mao +
3    <mention> <mention> <mention> your low info redneck dumbasses vote for these corporate shills that idiots like you believe #dummies         
4    <mention> move out of our country teabaggers !                                                                                              
Name: tweet_processed, dtype: object

In [42]:
data['train'].to_csv('../../hw3/train_mod.csv')

# Logistic regression baseline

In [10]:
vec = CountVectorizer(min_df=2)
vec.fit(data['train'].tweet_processed.tolist())

bow = {}
bow['train'] = vec.transform(data['train'].tweet_processed.tolist())
bow['train'].shape

(21083, 8534)

In [13]:
# Hate speech classification
x = {}
y = {}
x['train'] = bow['train']
y['train'] = data['train'].hate

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  93.8%
Precision:  48.5%
Recall:  21.3%
F1:  25.7%


In [14]:
# Offensive language classification
x = {}
y = {}
x['train'] = bow['train']
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  89.2%
Precision:  92.8%
Recall:  93.3%
F1:  93.0%


## Error analysis (hate speech)

In [15]:
# Train on full dataset (hate speech) and see errors
clf = LogisticRegression()
y['train'] = data['train'].hate
clf.fit(x['train'], y['train'])
preds = clf.predict(x['train'])
preds.shape

(21083,)

In [16]:
false_pos = []
false_neg = []

for i, (gold, pred) in enumerate(zip(data['train'].hate.tolist(), preds)):
    if pred != gold:
        if gold == 0:
            false_pos.append(i)
        elif gold == 1:
            false_neg.append(i)
        else:
            "What???"
            
print(len(false_pos))
print(len(false_neg))

63
695


In [26]:
pd.set_option('display.max_colwidth', -1)

In [25]:
print("False negatives:")
display(data['train'].loc[false_neg, ['tweet_processed', 'hate', 'offensive', 'neither']].sample(30))

False negatives:


Unnamed: 0,tweet_processed,hate,offensive,neither
19933,"bitch kill yoself , go on to the bathroom and eat the pills bitch , all of em",1,0,0
10717,rt <mention> : <mention> <mention> <mention> not about to shoot this pussy ! ima beat his face into the asphalt t ...,1,0,0
12006,rt <mention> : the fox says you're a cunt .,1,0,0
738,<mention> fucking coon,1,0,0
3340,<mention> <mention> <mention> <mention> yeah ok nigger .,1,0,0
13655,rt <mention> : <mention> happy birthday my little cotton picker,1,0,0
14387,rt <mention> : i saw a great chimpout 2 summers ago . a middle aged salvadoran and his 15 yr old son got wasted by 6 beaners at the metlife …,1,0,0
2811,<mention> faggot,1,0,0
3255,<mention> i'm a faggot :/ http://t.co/gr627ldfaa,1,0,0
4865,curse avi for coke because he's a fag and hasn't put it on yet http://t.co/jhqpwop4wk,1,0,0


## Error analysis (offensive language)

In [28]:
# Train on full dataset and see errors
clf = LogisticRegression()
y['train'] = data['train'].offensive
clf.fit(x['train'], y['train'])
preds = clf.predict(x['train'])
preds.shape

(21083,)

In [30]:
false_pos = []
false_neg = []

for i, (gold, pred) in enumerate(zip(data['train'].offensive.tolist(), preds)):
    if pred != gold:
        if gold == 0:
            false_pos.append(i)
        elif gold == 1:
            false_neg.append(i)
        else:
            "What???"
            
print(len(false_pos))
print(len(false_neg))

633
477


In [31]:
print("False negatives:")
display(data['train'].loc[false_neg, ['tweet_processed', 'hate', 'offensive', 'neither']].sample(30))

False negatives:


Unnamed: 0,tweet_processed,hate,offensive,neither
16690,rt <mention> : <mention> blaxican,0,1,0
15079,rt <mention> : 😂 😂 rt <mention> : lol rt <mention> : 😂 😭 rt <mention> : just so u faggots know ! http://t.co/bghtsjipne,0,1,0
15728,rt <mention> : bruh you trash take yo ass back to college <mention>,0,1,0
6141,"hmm .. ass all chunky , brain is insanity .",0,1,0
12632,rt <mention> : i don't understand how so many americans can hate on obama for saluting with a latte . nobody tells these swagfags anything abo …,0,1,0
12728,rt <mention> : one man's trash is another man's girlfriend .,0,1,0
6373,"i been way to consistent on niggas , i'm have to drop some trash to get appreciated",0,1,0
15955,rt <mention> : i believe the time for uncouth / uncivilised people to be sentenced to death for racial slurs / attacks against ne citizens ...,0,1,0
11039,rt <mention> : if you don't make her go to sleep after y'all fucked then your pipe game trash,0,1,0
17480,stop spoiling u fags,0,1,0


# CNN

In [11]:
MAX_VOCAB_SIZE = 30000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE,
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”')
texts = data['train']['tweet_processed'].tolist()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique words')

Found 21870 unique words


In [54]:
# labels = data['train']['hate'].data
labels = to_categorical(data['train']['hate'].tolist())
labels.shape

(21083, 2)

In [26]:
MAX_SEQUENCE_LENGTH = 100
seqs = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
seqs.shape

(21083, 100)

In [35]:
vocab = list(word_index.keys())[:MAX_VOCAB_SIZE] # lower indices are words kept
len(vocab)

21870

In [55]:
# Shuffle, split into train/dev
test_size = int(0.1 * len(seqs))
x_train, x_dev, y_train, y_dev = train_test_split(seqs, np.asarray(labels), test_size=test_size)
print(x_train.shape)
print(y_train.shape)
print(x_dev.shape)
print(y_dev.shape)

(18975, 100)
(18975, 2)
(2108, 100)
(2108, 2)


In [61]:
p = np.array([[0.2, 0.3], [0.7, 0.3]])
np.argmax(p, axis=1)

array([1, 0])

In [71]:
metrics(p, np.array([1,0]))

Precision: 1.0
Recall: 1.0
F1: 1.0
Accuracy: 1.0


In [79]:
def metrics(preds, actual):
    tp = 0
    fp = 0
    fn = 0
    matches = 0
    
    binary_preds = np.argmax(preds, axis=1)
    binary_actual = np.argmax(actual, axis=1)
    
    for pred, act in zip(binary_preds, binary_actual):
        
        if pred == act == 1:
            tp += 1
            matches += 1
        elif pred == 1 and act == 0:
            fp += 1
        elif pred == 0 and act == 1:
            fn += 1
        elif pred == act == 0:
            matches += 1
        
        
    if tp == fp == 0:
        prec = 0
    else:
        prec = tp/(tp+fp)
        
    rec = tp/(tp+fn)
    
    if prec == rec == 0.0:
        f1 = 0.0
        
    else:
        f1 = 2 * prec * rec / (prec + rec)
        
    acc = matches/len(preds)
    
    print(f"Precision: {prec}")
    print(f"Recall: {rec}")
    print(f"F1: {f1}")
    print(f"Accuracy: {acc}")

In [86]:
# Prepare model

EMBEDDING_DIM = 50
embedding_layer = Embedding(len(vocab),
                            EMBEDDING_DIM,
                            input_length = MAX_SEQUENCE_LENGTH,
                            trainable=True
                           )

model = Sequential()

model.add(embedding_layer)
model.add(Conv1D(32, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Dropout(0.1))
model.add(Conv1D(16, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.1))
model.add(Flatten())
model.add(Dense(4, activation='relu')) # final classification layer
model.add(Dense(2, activation='softmax')) # final classification layer

# adam = optimizers.adam(lr=0.1)
sgd = optimizers.SGD(lr=0.05)
model.compile(loss='binary_crossentropy', optimizer=sgd)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, verbose=0),
]
model.fit(x_train, y_train,
#          batch_size=16, epochs=20, validation_data=(x_dev, y_dev))
         batch_size=16, epochs=50, validation_data=(x_dev, y_dev), callbacks=callbacks)

preds = model.predict(x_dev, batch_size=16)

print()
metrics(preds, y_dev)

Train on 18975 samples, validate on 2108 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50

Precision: 0
Recall: 0.0
F1: 0.0
Accuracy: 0.9354838709677419


# Hatebase lexicon attempts

In [35]:
# Load Hatebase lexicon
# with open('/usr0/home/mamille2/11-830_data/project/hatebase_slurs.txt', 'r') as f:
with open('../../project/hatebase_slurs.txt', 'r') as f:
    slurs = set([w.lower() for w in f.read().splitlines()])
    
len(slurs)

584

## Add as feature

In [38]:
# Extract Hatebase features

data['train']['num_slurs'] = data['train']['tweet_processed'].map(lambda x: sum([1 for wd in x.split() if wd in slurs]))
data['train']['num_slurs'].head()

0    1
1    0
2    1
3    1
4    0
Name: num_slurs, dtype: int64

In [42]:
data['train']['num_slurs'].data.shape

(21083,)

In [40]:
vec = CountVectorizer(min_df=2)
vec.fit(data['train'].tweet_processed.tolist())

bow = {}
bow['train'] = vec.transform(data['train'].tweet_processed.tolist())
bow['train'].shape

(21083, 8534)

In [44]:
# Hate speech classification
x = {}
y = {}
x['train'] = hstack([bow['train'], csr_matrix(data['train']['num_slurs'].data).T])
y['train'] = data['train'].hate

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  93.8%
Precision:  48.0%
Recall:  21.4%
F1:  25.8%


In [46]:
# Offensive language classification
x = {}
y = {}
x['train'] = hstack([bow['train'], csr_matrix(data['train']['num_slurs'].data).T])
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  89.1%
Precision:  92.7%
Recall:  93.2%
F1:  93.0%


## Bigrams (with hate speech lexicon pairing and not)

In [47]:
# Normal bigrams

vec = CountVectorizer(ngram_range=(1,2), min_df=2)
vec.fit(data['train'].tweet_processed.tolist())

bow = {}
bow['train'] = vec.transform(data['train'].tweet_processed.tolist())
bow['train'].shape

(21083, 34274)

In [48]:
# Hate speech classification
x = {}
y = {}
x['train'] = bow['train']
y['train'] = data['train'].hate

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  93.7%
Precision:  52.9%
Recall:  22.7%
F1:  25.1%


In [49]:
# Offensive language classification
x = {}
y = {}
x['train'] = bow['train']
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  89.4%
Precision:  93.3%
Recall:  93.1%
F1:  93.1%


In [54]:
def get_hate_pairs(tweet):
    toks = tweet.split()
    hate_dict = defaultdict(int)
    
    hate_terms = [w for w in toks if w in slurs]
    for h in hate_terms:
        for t in toks:
            hate_dict[f'{h}_{t}'] += 1
        
    return hate_dict

In [55]:
# Extract dict features
hate_pairs = data['train']['tweet_processed'].map(get_hate_pairs)
hate_pairs[:10]

0    {'nigger_<mention>': 2, 'nigger_they': 2, 'nigger_want': 3, 'nigger_what': 2, 'nigger_i': 3, 'nigger_got': 3, 'nigger_,': 3, 'nigger_america.they': 1, 'nigger_the': 1, 'nigger_word': 1, 'nigger_nigger': 1, 'nigger_it': 1, 'nigger_to': 1}                                                                                                                                                                                                                                               
1    {}                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
2    {'faggot_<mention>': 1, 'faggot

In [56]:
# Build special pairings with hatebase terms
dict_vec = DictVectorizer()

hate_feats = dict_vec.fit_transform(hate_pairs)
hate_feats.shape

(21083, 57450)

In [57]:
vec = CountVectorizer(min_df=2)
vec.fit(data['train'].tweet_processed.tolist())

bow = {}
bow['train'] = vec.transform(data['train'].tweet_processed.tolist())
bow['train'].shape

(21083, 8534)

In [83]:
x['train'].shape

(21083, 65984)

In [88]:
# Hate speech classification
x = {}
y = {}
x['train'] = hstack([bow['train'], hate_feats])
y['train'] = data['train'].hate

# clf = make_pipeline(SelectKBest(chi2, k=30000), LogisticRegression())
clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  93.5%
Precision:  47.1%
Recall:  22.2%
F1:  25.2%


In [60]:
# Offensive language classification
x = {}
y = {}
x['train'] = hstack([bow['train'], hate_feats])
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  90.0%
Precision:  93.7%
Recall:  93.4%
F1:  93.5%


## Violence words (doesn't help)

In [114]:
viol_wds = ['kill', 'murder', 'stab', 'death', 'bullets', 'blood', 'knife', 'gun', 'shoot', 'life', 'beat', 'hate',
           'fight']

In [115]:
def get_viol_pairs(tweet):
    toks = tweet.split()
    hate_dict = defaultdict(int)
    
    hate_terms = [w for w in toks if w in viol_wds]
    for h in hate_terms:
        for t in toks:
            hate_dict[f'{h}_{t}'] += 1
        
    return hate_dict

In [116]:
# Extract dict features
hate_pairs = data['train']['tweet_processed'].map(get_viol_pairs)
hate_pairs[:10]

0    {}
1    {}
2    {}
3    {}
4    {}
5    {}
6    {}
7    {}
8    {}
9    {}
Name: tweet_processed, dtype: object

In [117]:
len([h for h in hate_pairs.tolist() if len(h) > 0])

1094

In [118]:
# Build special pairings with hatebase terms
dict_vec = DictVectorizer()

hate_feats = dict_vec.fit_transform(hate_pairs)
hate_feats.shape

(21083, 6339)

In [119]:
vec = CountVectorizer(min_df=2)
vec.fit(data['train'].tweet_processed.tolist())

bow = {}
bow['train'] = vec.transform(data['train'].tweet_processed.tolist())
bow['train'].shape

(21083, 8534)

In [123]:
# Hate speech classification
x = {}
y = {}
x['train'] = hstack([bow['train'], hate_feats])
# x['train'] = bow['train']
y['train'] = data['train'].hate

clf = make_pipeline(SelectKBest(chi2, k=10000), LogisticRegression())
clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  93.7%
Precision:  46.6%
Recall:  21.0%
F1:  25.3%


In [113]:
# Offensive language classification
x = {}
y = {}
x['train'] = hstack([bow['train'], hate_feats])
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  89.1%
Precision:  92.8%
Recall:  93.2%
F1:  93.0%


## POS templates

In [87]:
data['train']['pos'] = data['train']['tweet_processed'].map(lambda x: ' '.join([pos for wd, pos in pos_tag(x.split())]))
data['train']['pos'].head()

0    NNS VBP PRP VBP WP NN VBD , VB VBP WP PRP VBD ...
1                    JJ NN NN NN FW RB IN DT NNS VBP .
2    JJ NN VBZ DT NN . RB JJ NNS VBP : CD CC VB : C...
3    JJ NNP NNP PRP$ JJ NN NN NNS VBP IN DT JJ NNS ...
4                            RB VB IN IN PRP$ NN NNS .
Name: pos, dtype: object

In [98]:
def pos_contexts(text, postags, window_size):
    toks = text.split()
    tags = postags.split()
    feats = []
    
    assert len(toks) == len(tags)
    
    for i in range(len(toks)):
        if i == 0:
            feats.append('_'.join([toks[i]] + tags[i+1:i+1+window_size]))
        elif i == len(toks) - 1:
            feats.append('_'.join(tags[i-window_size:i] + [tags[i]]))
        else:
            feats.append('_'.join(tags[i-window_size:i] + [toks[i]] + tags[i+1:i+1+window_size]))
    
    return Counter(feats)

In [100]:
pos_templates = [pos_contexts(t, postags, 1) for (t, postags) in list(zip(data['train']['tweet_processed'], data['train']['pos']))]
len(pos_templates)

21083

In [101]:
# Build special pairings with hatebase terms
dict_vec = DictVectorizer()

hate_feats = dict_vec.fit_transform(pos_templates)
hate_feats.shape

(21083, 106507)

In [102]:
vec = CountVectorizer(min_df=2)
vec.fit(data['train'].tweet_processed.tolist())

bow = {}
bow['train'] = vec.transform(data['train'].tweet_processed.tolist())
bow['train'].shape

(21083, 8534)

In [105]:
# Hate speech classification
x = {}
y = {}
x['train'] = hstack([bow['train'], hate_feats])
y['train'] = data['train'].hate

clf = make_pipeline(SelectKBest(chi2, k=10000), LogisticRegression())
# clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  89.9%
Precision:  43.8%
Recall:  19.0%
F1:  13.4%


In [106]:
# Offensive language classification
x = {}
y = {}
x['train'] = hstack([bow['train'], hate_feats])
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  88.3%
Precision:  93.3%
Recall:  91.6%
F1:  92.3%


## POS ngrams (doesn't improve)

In [96]:
data['train']['pos'] = data['train']['tweet_processed'].map(lambda x: ' '.join([pos for wd, pos in pos_tag(x.split())]))
data['train']['pos'].head()

0    NNS VBP PRP VBP WP NN VBD , VB VBP WP PRP VBD , NN VBD DT NN NN , NN VBP PRP TO                   
1    JJ NN NN NN FW RB IN DT NNS VBP .                                                                 
2    JJ NN VBZ DT NN . RB JJ NNS VBP : CD CC VB : CD PRP$ NNS VBP RB RB JJ NN RB DT NN , NN JJ NN NN NN
3    JJ NNP NNP PRP$ JJ NN NN NNS VBP IN DT JJ NNS IN NNS IN PRP VBP NNS                               
4    RB VB IN IN PRP$ NN NNS .                                                                         
Name: pos, dtype: object

In [97]:
# Extract ngrams
pos_vec = CountVectorizer(ngram_range=(1,3))
pos_feats = pos_vec.fit_transform(data['train']['pos'])
pos_feats.shape

(21083, 8972)

In [101]:
# Hate speech classification
x = {}
y = {}
x['train'] = hstack([bow['train'], pos_feats])
y['train'] = data['train'].hate

# clf = make_pipeline(SelectKBest(chi2, k=10000), LogisticRegression())
clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  93.0%
Precision:  41.9%
Recall:  22.5%
F1:  24.5%


In [99]:
# Offensive language classification
x = {}
y = {}
x['train'] = hstack([bow['train'], pos_feats])
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  87.9%
Precision:  92.0%
Recall:  92.4%
F1:  92.2%


## Hate density (doesn't improve)

In [71]:
def slurs_norm(num_slurs, text):
    return num_slurs/len(text.split())

In [73]:
data['train']['slurs_normalized'] = [slurs_norm(ns, t) for (ns, t) in list(zip(data['train']['num_slurs'], data['train']['tweet_processed']))]
data['train']['slurs_normalized'].head()

0    0.041667
1    0.000000
2    0.031250
3    0.052632
4    0.000000
Name: slurs_normalized, dtype: float64

In [74]:
# Hate speech classification
x = {}
y = {}
x['train'] = hstack([bow['train'], csr_matrix(data['train']['slurs_normalized'].data).T])
y['train'] = data['train'].hate

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  93.8%
Precision:  48.1%
Recall:  21.1%
F1:  25.6%


In [75]:
# Offensive language classification
x = {}
y = {}
x['train'] = hstack([bow['train'], csr_matrix(data['train']['slurs_normalized'].data).T])
y['train'] = data['train'].offensive

clf = LogisticRegression()

scoring = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score),
          'f1': make_scorer(f1_score)}

scores = cross_validate(clf, x['train'], y['train'], cv=5, scoring=scoring)
print(f'Accuracy: {scores["test_accuracy"].mean(): .1%}')
print(f'Precision: {scores["test_precision"].mean(): .1%}')
print(f'Recall: {scores["test_recall"].mean(): .1%}')
print(f'F1: {scores["test_f1"].mean(): .1%}')

Accuracy:  89.2%
Precision:  92.8%
Recall:  93.3%
F1:  93.0%
