Dictionary-based white supremacist classifier from Siegel+2021  
Match tweets against the dictionary, then filter out false positives with a Naive Bayes (NB) classifier

# Dictionary

In [4]:
# Load the full hate speech dictionary (all categories), then drop excluded terms
import pandas as pd

words = pd.read_csv('../data/siegel2021/qjps_hatespeech_dictionary.csv')
print(len(words))
# Keep every row whose `exclude` flag is not exactly 'yes' (missing flags are kept)
words = words.loc[~words['exclude'].eq('yes')]
words

4476


Unnamed: 0,term,meaning,type,source,exclude
0,book book,"in hawaii, denotes a filipino, from the allege...",anti_asian,hatebase,
1,book_book,"in hawaii, denotes a filipino, from the allege...",anti_asian,hatebase,
2,book-book,"in hawaii, denotes a filipino, from the allege...",anti_asian,hatebase,
3,bookbook,"in hawaii, denotes a filipino, from the allege...",anti_asian,hatebase,
4,bug eater,an person of asian descent.,anti_asian,hatebase,
...,...,...,...,...,...
4471,racemixing,,white nationalist,,
4472,sub human,a black person.,white_nationalist,hatebase,
4473,sub_human,they are said to be the missing link between a...,white_nationalist,rsd,
4474,sub-human,they are said to be the missing link between a...,white_nationalist,rsd,


In [5]:
# Inspect the distinct category labels to see which ones mark white nationalist terms
words['type'].unique()

array(['anti_asian', 'anti_black', 'anti_black_misogynistic',
       'anti_immigrant', 'anti_latino', 'anti_muslim_anti_arab',
       'anti_semitic', 'anti_semitic_white nationalist', 'anti-semitic',
       'homophobic_anti_lbgt', 'homophobic_anti_lgbt', 'misogynistic',
       'multiple', 'white nationalist', 'white nationalist ',
       'white_nationalist'], dtype=object)

In [6]:
# Every spelling of the white nationalist label present in the `type` column,
# including the variant with a trailing space and the combined anti-semitic label
wn_types = [
    'anti_semitic_white nationalist',
    'white nationalist',
    'white nationalist ',
    'white_nationalist',
]
wn_words = words.loc[words['type'].isin(wn_types)]
wn_words

Unnamed: 0,term,meaning,type,source,exclude
4013,shoaed,,anti_semitic_white nationalist,reddit,
4128,14 words,,white nationalist,adl,
4129,14_words,,white nationalist,adl,
4130,14-words,,white nationalist,adl,
4131,5 words,,white nationalist,adl,
...,...,...,...,...,...
4471,racemixing,,white nationalist,,
4472,sub human,a black person.,white_nationalist,hatebase,
4473,sub_human,they are said to be the missing link between a...,white_nationalist,rsd,
4474,sub-human,they are said to be the missing link between a...,white_nationalist,rsd,


In [10]:
# Unique white nationalist dictionary terms
wn_terms = wn_words['term'].unique()
# Print the count explicitly: a bare `len(...)` that is not the last
# expression of a cell is evaluated and silently discarded
print(len(wn_terms))
wn_terms

array(['shoaed', '14 words', '14_words', '14-words', '5 words', 'a c a b',
       'a_c_ab', 'a-c-a-b', 'acab', 'akia', 'ayak', 'blood and honor',
       'blood and soil', 'blood_and_honor', 'blood-and-honor',
       'bloodandhonor', 'blut', 'blut und boden', 'blut und ehre',
       'blut_und_boden', 'blut_und_ehre', 'blut-und-boden',
       'blut-und-ehre', 'blutundboden', 'blutundehre', 'fgrn',
       'five words', 'five_words', 'five-words', 'fivewords',
       'for god, race and nation', 'fourteen words', 'fourteen_words',
       'fourteen-words', 'fourteenwords', 'h s n', 'h_s_n', 'h-s-n',
       'h.s.n.', 'hammer skins', 'hammer_skins', 'hammer-skins',
       'hammerskins', 'hffh', 'hsn', 'itsub', 'kabark', 'kigy', 'klasp',
       'kraft', 'krieg', 'landser', 'lotie', 'love your race',
       'meine ehre heisst treue', 'mut', 'my honor is called loyalty',
       'my honor is loyalty', 'my_honor_is_called_loyalty',
       'my_honor_is_loyalty', 'my-honor-is-called-loyalty',
       

# Train NB classifier to filter out false positives

In [13]:
# Load training data: labelled tweets (`text`, `white_nationalism_total`) for the false-positive filter

trainpath = '../data/siegel2021/white_nationalist_training_data.csv'
# index_col=0: the first CSV column becomes the row index instead of a data column
train = pd.read_csv(trainpath, index_col=0)
train

Unnamed: 0,text,white_nationalism_total
1,RT @chipcamel: #Trump2016 fuck this gook ass t...,no
2,As long as you are going to be thinking anyway...,no
3,Psure Trump's hair was made by the CHOINAISE #...,no
4,"@realDonaldTrump Trump, you're a ching chong d...",no
5,Trump finds the chink in democracy's armour. ...,no
...,...,...
5447,Yes but the sonofabitch still has me blocked. ...,no
5448,@Bakari_Sellers chink in the armor...it's star...,no
5449,"RT @xeni: Overheard at Trump rallies: ""F--- th...",no
5450,RT @RaniaKhalek: Trump rallies getting worse: ...,no


In [18]:
# Preprocess
# From appendix: "we pre-processed the data by stemming and lowercasing words, 
# and removing punctuation except for Twitter relevant symbols (@ and #)"

from nltk import word_tokenize
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation

class StemTokenizer:
    """Callable tokenizer: split a document with `word_tokenize`, then stem each token."""

    def __init__(self):
        # One Porter stemmer instance, reused for every document
        self.ps = PorterStemmer()

    def __call__(self, doc):
        """Return the list of lowercased, stemmed tokens of `doc`, in order."""
        stemmed = []
        for token in word_tokenize(doc):
            stemmed.append(self.ps.stem(token.lower()))
        return stemmed
    
# Single-character punctuation marks to drop as stop words; '@' and '#' are
# kept because they are Twitter-relevant symbols (mentions and hashtags).
# NOTE(review): stop words are compared with whole tokens, so multi-character
# punctuation tokens produced by word_tokenize (e.g. '...' or the `` / ''
# forms it may emit for double quotes) would presumably survive this filter;
# confirm by inspecting vec.vocabulary_ after fitting.
stops = [mark for mark in punctuation if mark not in '@#']
# token_pattern=None: the custom tokenizer replaces the default regex, so
# state that explicitly instead of letting scikit-learn warn that
# token_pattern is ignored.
vec = CountVectorizer(tokenizer=StemTokenizer(), stop_words=stops, token_pattern=None)

In [19]:
# Learn the vocabulary from the training tweets and build their token-count matrix
bow = vec.fit_transform(train['text'])



In [20]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial Naive Bayes on the token counts, using the
# `white_nationalism_total` column as the labels
nb = MultinomialNB()
nb.fit(bow, train['white_nationalism_total'])

In [24]:
# Distinct whitespace-delimited tokens in the raw training text.
# A set comprehension visits each token once; `sum(lists, [])` would re-copy
# the growing list for every tweet (quadratic in corpus size).
{token for tokens in train['text'].str.split() for token in tokens}

{'personally',
 'Bomb',
 'https://t.co/kK�ۡ��_',
 'Nu',
 '@NancyNvandyke86:',
 '@spookydyke',
 'https://t.co/uqkvPwo0yi',
 'speaks',
 '@racebear11:',
 'mereka',
 "'shoot",
 '@2nigger4u:',
 'grass',
 'fukk#n',
 '#Politics:',
 '"absurd"',
 "RT'd",
 'yelled',
 'NITA',
 'https://t.co/6zuKwZuIKo',
 'leader.',
 'holds',
 'https://t.co/JfvKo208nK',
 'outta',
 'berencana',
 '@AKekelik',
 'Seven',
 'mfkn',
 'send',
 'trump?!?!?!!!!!!!',
 'kikes,',
 '@jhuereca2',
 '@mjsmith23atl',
 '@IvoryDove',
 'route',
 'billions',
 'teacher',
 'https://t.co/vsU3AISy05',
 '!�ۡ��_',
 'empty',
 'troubling',
 '#TrumpProtest',
 "here's",
 'won:',
 'avi',
 'DNC',
 'admires',
 '@davemeItzerW0N',
 '#Nomorerapefugees',
 '@jadecameron_:',
 'technical',
 'tourists',
 'involved',
 'moms',
 'Donal�ۡ��_',
 '6$',
 'messican',
 'Fallon,',
 '"lied"',
 '"[so]',
 'BS�ۡ��_',
 'nuts',
 'Nigga',
 'lousy',
 'Democratic',
 'obvious.',
 'sabe',
 'Nope.',
 "'all",
 'https://t.co/WTme0JFdVW',
 '@bigdaddyken___',
 'https://t.co/lG2abG6