# Load, construct dataset
Examples of language from accounts and discourse spaces known for white supremacist extremism

## Qian+2018
Try to match ideologies with tweets

In [4]:
# Load scraped data (Qian+2018)
import os
import json
import pandas as pd
from nltk.tokenize import TweetTokenizer

dirpath = '/storage2/mamille3/white_supremacist_lang/data/qian2018'

# Load tweet data
path = os.path.join(dirpath, 'data.jsonl')
with open(path, 'r') as f:
    tweets = [json.loads(tweet) for tweet in f.read().splitlines()]
    print(len(tweets))

# Build df
qian2018 = pd.json_normalize(tweets)
# print(qian2018.columns)

# Anonymize texts
from tqdm.notebook import tqdm
import pdb
import re
tokenizer = TweetTokenizer(strip_handles=True)

def remove_mentions(text, user_mentions):
    """ Remove mentions from a text (not technically needed with NLTK tokenizer, but still helps) """
    new_text = text
    usernames = [mention['username'] for mention in user_mentions]
    for username in usernames:
        new_text = re.sub(r'@+'+username, '@USER', new_text, flags=re.IGNORECASE)
    return new_text

def remove_urls(text, urls):
    new_text = text
    urls = [entity['url'] for entity in urls]
    for url in urls:
        new_text = new_text.replace(url, '<URL>')
    return new_text

def process_text(text, user_mentions, urls):
    new_text = text
    if isinstance(user_mentions, list):
        new_text = remove_mentions(new_text, user_mentions)
    if isinstance(urls, list):
        new_text = remove_urls(new_text, urls)
    new_text = ' '.join(tokenizer.tokenize(new_text))
    return new_text.lower()

qian2018['processed_text'] = [process_text(text, user_mentions, urls) for text, user_mentions, urls in tqdm(zip(
    qian2018['text'], qian2018['entities.mentions'], qian2018['entities.urls']), total=len(qian2018))]

qian2018.rename(columns={'id': 'tweet_id'}, inplace=True)

qian2018['id'] = 'qian2018_' + qian2018.index.astype(str)
qian2018.set_index('id', inplace=True)
qian2018.head()

qian2018['dataset'] = 'qian2018'
print(qian2018.columns)
qian2018.head()

qian2018['timestamp'] = pd.to_datetime(qian2018['created_at'])
# qian2018.timestamp
qian2018['source'] = 'twitter'

# Format data
# Need columns of 'id', 'text', 'dataset'
data = qian2018[['processed_text', 'timestamp', 'dataset']].copy().rename(columns={'processed_text': 'text'})
print(data.timestamp.dtype)
data.head()

88069


  0%|          | 0/88069 [00:00<?, ?it/s]

Index(['lang', 'created_at', 'text', 'conversation_id', 'tweet_id',
       'author_id', 'public_metrics.retweet_count',
       'public_metrics.reply_count', 'public_metrics.like_count',
       'public_metrics.quote_count', 'entities.annotations',
       'entities.mentions', 'entities.urls', 'referenced_tweets',
       'entities.hashtags', 'geo.place_id', 'withheld.copyright',
       'withheld.country_codes', 'geo.coordinates.type',
       'geo.coordinates.coordinates', 'entities.cashtags', 'processed_text',
       'dataset'],
      dtype='object')
datetime64[ns, UTC]


Unnamed: 0_level_0,text,timestamp,dataset
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
qian2018_0,we had a great time playing tonight for a pack...,2012-12-05 04:12:09+00:00,qian2018
qian2018_1,thanks for following our wkly radio show & pod...,2013-06-30 09:58:20+00:00,qian2018
qian2018_2,"<url> via wow . so my question is , "" what wou...",2016-03-28 19:39:56+00:00,qian2018
qian2018_3,racial killing in virginia <url> via,2015-08-27 01:23:54+00:00,qian2018
qian2018_4,#awesome #follow,2012-11-30 22:24:29+00:00,qian2018


## ElSherief+2021

### Load scraped tweets

In [2]:
# Load saved out results
import os
import json

dataset = 'elsherief2021'
dirpath = os.path.join('/storage2/mamille3/white_supremacist_lang/data', dataset)

# Load tweet data
path = os.path.join(dirpath, 'data.jsonl')
with open(path, 'r') as f:
    tweets = [json.loads(tweet) for tweet in f.read().splitlines()]
    print(len(tweets))

# Load users
path = os.path.join(dirpath, 'users.jsonl')
with open(path, 'r') as f:
    users = [json.loads(user) for user in f.read().splitlines()]
    print(len(users))

# Load the tweets into a tweet_dataframe
import pandas as pd

tweet_data = pd.json_normalize(tweets)
print(tweet_data.columns)
print(len(tweet_data))
tweet_data.id = tweet_data.id.astype('int64')
tweet_data.id.dtype

tweet_data.head()

# Merge in user info
user_data = pd.json_normalize(users)
user_data.drop_duplicates(subset='id', inplace=True)
user_data.set_index('id', drop=True, inplace=True)

# print(len(user_data))
# print(user_data.columns)
user_data.head()

elsherief2021_hydrated = tweet_data.join(user_data, on='author_id', rsuffix='_user')
print(len(elsherief2021_hydrated))
print(elsherief2021_hydrated.columns)
elsherief2021_hydrated.head()

7647
12924
Index(['conversation_id', 'author_id', 'lang', 'created_at', 'text', 'id',
       'public_metrics.retweet_count', 'public_metrics.reply_count',
       'public_metrics.like_count', 'public_metrics.quote_count',
       'entities.mentions', 'entities.urls', 'entities.annotations',
       'referenced_tweets', 'entities.hashtags', 'geo.place_id',
       'geo.coordinates.type', 'geo.coordinates.coordinates',
       'withheld.copyright', 'withheld.country_codes', 'entities.cashtags'],
      dtype='object')
7647
7647
Index(['conversation_id', 'author_id', 'lang', 'created_at', 'text', 'id',
       'public_metrics.retweet_count', 'public_metrics.reply_count',
       'public_metrics.like_count', 'public_metrics.quote_count',
       'entities.mentions', 'entities.urls', 'entities.annotations',
       'referenced_tweets', 'entities.hashtags', 'geo.place_id',
       'geo.coordinates.type', 'geo.coordinates.coordinates',
       'withheld.copyright', 'withheld.country_codes', 'entities.cas

Unnamed: 0,conversation_id,author_id,lang,created_at,text,id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,...,geo.place_id,geo.coordinates.type,geo.coordinates.coordinates,withheld.copyright,withheld.country_codes,entities.cashtags,description,username,name,withheld.country_codes_user
0,625688620444180481,794531960,en,2015-07-27T15:26:06.000Z,Is Yasir Qadhi a hate preacher for calling Chr...,625688620444180481,0,1,0,0,...,,,,,,,"Huge fan of Dr. Shabir Ally, Helped exposing E...",_MCDialogue_,🕋🕌Muslim by Choice🕌🕋,
1,901458785973051392,343086403,en,2017-08-26T15:02:00.000Z,"@occdissent When you said ""we"" are you talking...",901459720954941440,0,1,0,0,...,,,,,,,,eencarnacion1,Eugenio Encarnacion,
2,939612494796132355,2767516817,en,2017-12-09T21:47:30.000Z,RT @mum2kidz2011: @ElderLansing @GrammerSherry...,939612494796132355,1,0,0,0,...,,,,,,,"Retired, Proud Grandma of 3 boys. Proud Trump ...",GrammerSherry,Sherry Tekulve,
3,548214253497249792,27413161,en,2014-12-25T20:30:38.000Z,@WhiteResister: Hunt for White Women: Black Ra...,548214253497249792,0,0,0,0,...,,,,,,,BITCHY WITCH HERE!!! I AM AN AUSSIE PATRIOT!!!...,swtimogenation,MysticEyesMarie☽✪☾☠💀👻👁🦇🕸🇦🇺🇭🇺,
4,184109665108504579,52837002,en,2012-03-26T02:48:59.000Z,RT @jihadwatchRS: @ZNovetsky @rezaaslan @Atlas...,184109665108504579,1,0,0,0,...,,,,,,,,armarosnews,Armaros,


### Use Qian+2018 labels to specify ideologies of tweets

In [3]:
# Check for overlap with the ~4k tweets that are overlaps with Qian+2018
# Load Qian+2018 tweet IDs

fpath = '/storage2/mamille3/data/hate_speech/qian2018/white_supremacist_tweets.csv'
qian2018_tweet_ids = pd.read_csv(fpath)['tweet id']
print(len(qian2018_tweet_ids))
qian2018_tweet_ids.dtype

# Check for overlap with usernames in Qian+2018 set from selected ideologies
# Load Qian+2018 scraped users
import os
import json

dirpath = '/storage2/mamille3/white_supremacist_lang/data/qian2018'

# Load users
path = os.path.join(dirpath, 'users.jsonl')
with open(path, 'r') as f:
    users = [json.loads(user) for user in f.read().splitlines()]
    # print(len(users))

user_data = pd.json_normalize(users)
# user_data.id = user_data.id.astype('int64')
user_data.drop_duplicates(subset='id', inplace=True)
user_data.set_index('id', drop=True, inplace=True)
# print(len(user_data))
print(user_data.columns)
user_data.head()

# White grievance tweets

# Load stage 2 annotations, implicit categories which include white grievance
stg2 = pd.read_csv('/storage2/mamille3/data/hate_speech/elsherief2021/implicit_hate_v1_stg2_posts.tsv', sep='\t')
white_grievance = stg2.query('implicit_class=="white_grievance" or extra_implicit_class=="white_grievance"').rename(columns={'post': 'text'})
print(len(white_grievance))
white_grievance.head()

# Hydrated overlap with Qian+2018 ideologies
id_matches = elsherief2021_hydrated[elsherief2021_hydrated['id'].isin(qian2018_tweet_ids)]
# print(len(id_matches))

# Check overlap of usernames with author IDs
user_matches = elsherief2021_hydrated[elsherief2021_hydrated['author_id'].isin(user_data.index)]
# print(len(user_matches))

# Add white grievance to hydrated ElSherief+2021 tweets
# May have duplicates (and could remove them if I match them with tweet ids)
elsherief2021 = pd.concat([white_grievance, id_matches, user_matches]).drop_duplicates(subset='id').reset_index(drop=True)
elsherief2021

elsherief2021.rename(columns={'id': 'tweet_id'}, inplace=True)

elsherief2021['id'] = 'elsherief2021_' + elsherief2021.index.astype(str)
elsherief2021.set_index('id', inplace=True)
elsherief2021.head()

elsherief2021['dataset'] = 'elsherief2021'

# Format data
# Need index of id, 'text', 'dataset'
elsherief2021['processed_text'] = [process_text(text, user_mentions, urls) for text, user_mentions, urls in tqdm(zip(
    elsherief2021['text'], elsherief2021['entities.mentions'], elsherief2021['entities.urls']), total=len(elsherief2021))]
elsherief2021.columns

elsherief2021['timestamp'] = pd.to_datetime(elsherief2021['created_at'])
elsherief2021['source'] = 'twitter'
elsherief2021 = elsherief2021[['processed_text', 'timestamp', 'dataset', 'source']].rename(columns={'processed_text': 'text'})
elsherief2021.head()

196277
Index(['description', 'name', 'username', 'withheld.country_codes'], dtype='object')
1572


  0%|          | 0/5378 [00:00<?, ?it/s]

Unnamed: 0_level_0,text,timestamp,dataset,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
elsherief2021_0,""" : jewish harvard professor noel ignatiev wan...",NaT,elsherief2021,twitter
elsherief2021_1,rt : ahmad javed appointed as the next ambassa...,2015-12-11 15:05:17+00:00,elsherief2021,twitter
elsherief2021_2,i think white people are waking up in unpreced...,2017-08-13 19:05:27+00:00,elsherief2021,twitter
elsherief2021_3,it's no secret that white people are native to...,2017-04-15 16:26:00+00:00,elsherief2021,twitter
elsherief2021_4,this is bs . i agree on all getting along . i ...,2017-11-02 22:59:16+00:00,elsherief2021,twitter


In [4]:
data = pd.concat([data, elsherief2021])
print(len(data))

print(data.timestamp.dtype)
data.dataset.value_counts()

93447
datetime64[ns, UTC]


qian2018         88069
elsherief2021     5378
Name: dataset, dtype: int64

## De Gibert+2019 (Stormfront) corpus

In [5]:
import os
import pandas as pd

stormfront_dirpath = '/storage2/mamille3/data/hate_speech/degibert2019/'
stormfront_data = pd.read_csv(os.path.join(stormfront_dirpath, 'combined_data.csv'))
# print(len(stormfront_data))
stormfront_data.head()

# Group by comment
stormfront_data['sentence_id'] = stormfront_data['sentence_id'].astype(int)
degibert2019 = stormfront_data.sort_values(['comment_id', 'sentence_id']).groupby('comment_id').agg(
    {
     'sentence_id': lambda x: list(x.astype(str)),
        'user_id': 'first',
        'text': ' '.join, 
    })
degibert2019['dataset'] = 'degibert2019'
degibert2019['id'] = 'degibert2019_' + degibert2019.index.astype(str)
degibert2019.set_index('id', inplace=True)

degibert2019 = degibert2019[['text', 'dataset']] # no timestamp
print(len(degibert2019))
degibert2019.head()

degibert2019['source'] = 'stormfront'

data = pd.concat([data, degibert2019])
print(len(data))
print(data.timestamp.dtype)
data.dataset.value_counts()

5000
98447
datetime64[ns, UTC]


qian2018         88069
elsherief2021     5378
degibert2019      5000
Name: dataset, dtype: int64

## Patriot Front dump (from Unicorn Riot)

In [6]:
import os
import json

base_dirpath = '/storage2/mamille3/data/patriotfront/'

### 2017 Vanguard America-Patriot Front Discord dump

In [7]:
# Load dump from xz file
# Join channels with messages to be able to select general channel

dirpath = os.path.join(base_dirpath, '2017-Vanguard_America-Patriot_Front/Discord/dump')
channels = pd.read_csv(os.path.join(dirpath, 'channels.csv'), index_col=0)

messages = pd.read_csv(os.path.join(dirpath, 'messages.csv'))
messages.dropna(subset='message', inplace=True)
# print(len(messages))
messages = messages.join(channels, on='channel_id', rsuffix='_channel')
# print(len(messages))
messages2017 = messages.query('name == "general"')
print(len(messages2017))

25133


### 2018

In [8]:
messages2018 = []
for dump in ['Front_And_Center', 'MI_Goy_Scouts_Official']:
    print(dump)
    dirpath = os.path.join(base_dirpath, '2018/Discord', f'dump_{dump}')
    messages = pd.read_csv(os.path.join(dirpath, 'messages.csv'))
    channels = pd.read_csv(os.path.join(dirpath, 'channels.csv')).set_index('id')
    # Remove messages that are just images
    messages.dropna(subset='message', inplace=True)
    messages = messages.join(channels, on='channel_id', rsuffix='_channel')
    messages2018.append(messages.query('name == "general"'))
    
messages2018 = pd.concat(messages2018)
print(len(messages2018))
messages2018.columns

pf_messages = pd.concat([messages2017, messages2018])
len(pf_messages)

Front_And_Center
MI_Goy_Scouts_Official
20997


46130

### Process data

In [9]:
# nltk.download('punkt')

In [10]:
# Load common first names
fpath = '../resources/us_first_names_1990.csv'
names = pd.read_csv(fpath, skiprows=[0])
names = names.query('Rank <= 300')
len(names)

common_names = set(names['Name'].str.lower()).union(names['Name.1'].str.lower())
print(len(common_names))
list(common_names)[:20]

# Remove spencer, guy
common_names -= {'spencer', 'guy'}

len(common_names)

# Tokenize, remove common first names (except Spencer, for Richard Spencer)
import nltk

pf_messages['processed'] = pf_messages['message'].map(lambda x: ' '.join([wd for wd in nltk.word_tokenize(str(x)) if wd not in common_names]).lower())
pf_messages['processed']

pf_messages['timestamp'] = pd.to_datetime(pf_messages.timestamp, utc=True)

patriotfront = pf_messages[['processed', 'timestamp']].reset_index(drop=True).rename(columns={'processed': 'text'})
patriotfront['dataset'] = 'patriotfront'
patriotfront['source'] = 'discord'
patriotfront['id'] = 'patriotfront_' + patriotfront.index.astype(str)
patriotfront.set_index('id', inplace=True)
patriotfront.head()

592


Unnamed: 0_level_0,text,timestamp,dataset,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
patriotfront_0,i 'd love to see a highway littered with cruci...,2017-08-05 03:17:46.275000+00:00,patriotfront,discord
patriotfront_1,i 'd keep it civil and in keeping with vanam '...,2017-08-08 01:48:41.372000+00:00,patriotfront,discord
patriotfront_2,im not purposely avoiding you guys,2017-07-21 03:07:12.169000+00:00,patriotfront,discord
patriotfront_3,i do n't even hate anyone except sjw and mudsl...,2017-06-24 04:04:24.633000+00:00,patriotfront,discord
patriotfront_4,im trying man do we have any memebers here,2017-07-30 18:24:15.171000+00:00,patriotfront,discord


In [11]:
data = pd.concat([data, patriotfront])
print(len(data))
print(data.timestamp.dtype)
data.dataset.value_counts()

144577
datetime64[ns, UTC]


qian2018         88069
patriotfront     46130
elsherief2021     5378
degibert2019      5000
Name: dataset, dtype: int64

## Alatawi+2022 annotated data

In [12]:
# Load data
import pandas as pd
fpath = '/storage2/mamille3/white_supremacist_lang/data/alatawi2021_white_supremacist_annotated_tweets.csv'
tweets = pd.read_csv(fpath)

# Count various annotation thresholds for white supremacy
agreed_tweets = tweets.query('`Voting and Final Labels` == 1')
len(agreed_tweets)

alatawi2021 = agreed_tweets[['input.text']].rename(columns={'input.text': 'text'}).reset_index(drop=True) # no timestamp
alatawi2021['dataset'] = 'alatawi2021'
alatawi2021['source'] = 'twitter'
alatawi2021.index = 'alatawi2021_' + alatawi2021.index.astype(str)
alatawi2021

alatawi2021['text'] = alatawi2021['text'].str.lower()
alatawi2021

data = pd.concat([data, alatawi2021])
print(len(data))
data.dataset.value_counts()

145677


qian2018         88069
patriotfront     46130
elsherief2021     5378
degibert2019      5000
alatawi2021       1100
Name: dataset, dtype: int64

## ADL HEATMap extracted propaganda quotes

In [13]:
# Load data (quotes extracted by Ahmad Diab)
import pandas as pd

fpath = '../data/adl_quotes.csv'
quotes = pd.read_csv(fpath)
quotes.columns

quotes['timestamp'] = pd.to_datetime(quotes.date, format='%m/%d/%y', errors='coerce', utc=True).fillna(
    pd.to_datetime(quotes.date, format='%y-%b', errors='coerce', utc=True))

adl = quotes[['quote', 'timestamp']].drop_duplicates(subset='quote').reset_index(drop=True).rename(columns={'quote': 'text'})
adl['dataset'] = 'adl_heatmap'
adl['source'] = 'offline_flyers_banners'
adl.index = 'adl_' + adl.index.astype(str)
adl.head()

# data = data.query('dataset != "adl_heatmap"')
# data.dataset.value_counts()

Unnamed: 0,text,timestamp,dataset,source
adl_0,it's okay to be white,2022-03-31 00:00:00+00:00,adl_heatmap,offline_flyers_banners
adl_1,europa - the last battle,2022-03-31 00:00:00+00:00,adl_heatmap,offline_flyers_banners
adl_2,free ram,2022-03-30 00:00:00+00:00,adl_heatmap,offline_flyers_banners
adl_3,for the nation against the state,2022-03-30 00:00:00+00:00,adl_heatmap,offline_flyers_banners
adl_4,reclaim america,2022-03-30 00:00:00+00:00,adl_heatmap,offline_flyers_banners


In [14]:
# Tokenize
import nltk

adl['text'] = adl['text'].map(lambda x: ' '.join(nltk.word_tokenize(x)))

NameError: name 'ad' is not defined

In [15]:
adl[['text']]

Unnamed: 0,text
adl_0,it 's okay to be white
adl_1,europa - the last battle
adl_2,free ram
adl_3,for the nation against the state
adl_4,reclaim america
...,...
adl_1024,our patience has its limits one day we will sh...
adl_1025,for ourselves and our posterity
adl_1026,save our land join the klan
adl_1027,join the kkk and fight for race and nation


In [16]:
data = pd.concat([data, adl])
print(len(data))
print(data.timestamp.dtype)
data.dataset.value_counts()

146706
datetime64[ns, UTC]


qian2018         88069
patriotfront     46130
elsherief2021     5378
degibert2019      5000
alatawi2021       1100
adl_heatmap       1029
Name: dataset, dtype: int64

## Iron March data dump

In [17]:
import pandas as pd
import nltk
from tqdm.notebook import tqdm
from multiprocessing import Pool

fpath = '/storage2/mamille3/white_supremacist_lang/data/iron_march_201911/csv/core_search_index.csv'
posts = pd.read_csv(fpath)

def preprocess(inp):
    return ' '.join(nltk.word_tokenize(str(inp))).lower()

# Tokenize, lowercase
# posts['processed'] = [' '.join(nltk.word_tokenize(str(x))).lower() for x in tqdm(posts['index_content'])]
with Pool(15) as p:
    posts['processed'] = list(tqdm(p.imap(preprocess, posts['index_content']), total=len(posts)))
posts['processed']
posts.columns

posts['timestamp'] = pd.to_datetime(posts.index_date_created, unit='s', utc=True)

ironmarch = posts[['processed', 'timestamp']].reset_index(drop=True).rename(columns={'processed': 'text'})
ironmarch['dataset'] = 'ironmarch'
ironmarch['source'] = 'ironmarch'
ironmarch.index = 'ironmarch_' + ironmarch.index.astype(str)
print(ironmarch.timestamp.dtype)
ironmarch

  0%|          | 0/196042 [00:00<?, ?it/s]

datetime64[ns, UTC]


Unnamed: 0,text,timestamp,dataset,source
ironmarch_0,"congrats on 1,488",2017-07-28 16:40:17+00:00,ironmarch,ironmarch
ironmarch_1,i approve of this avatar .,2017-06-28 06:40:14+00:00,ironmarch,ironmarch
ironmarch_2,"i have more reputation than u , fag",2017-06-22 06:49:22+00:00,ironmarch,ironmarch
ironmarch_3,"hi rostislav , danke für den willkommensgruss ...",2017-06-15 18:10:15+00:00,ironmarch,ironmarch
ironmarch_4,how to kill time at work ?,2017-06-14 11:27:41+00:00,ironmarch,ironmarch
...,...,...,...,...
ironmarch_196037,based barg,2017-11-20 22:32:14+00:00,ironmarch,ironmarch
ironmarch_196038,& gt ; tfw you notice that helicopters are fly...,2017-11-20 22:52:23+00:00,ironmarch,ironmarch
ironmarch_196039,thoughts on anglos ?,2017-11-21 00:58:53+00:00,ironmarch,ironmarch
ironmarch_196040,i think its hilarious you guys tried to intent...,2017-11-21 02:58:31+00:00,ironmarch,ironmarch


In [18]:
data = pd.concat([data, ironmarch])
print(data.timestamp.dtype)
data.dataset.value_counts()

datetime64[ns, UTC]


ironmarch        196042
qian2018          88069
patriotfront      46130
elsherief2021      5378
degibert2019       5000
alatawi2021        1100
adl_heatmap        1029
Name: dataset, dtype: int64

## 4chan datasets (Jokubausaite+2020, Papasavva+2020)

In [19]:
# Load Jokubausaite+2020 data
import os
import pandas as pd

dirpath = '/storage2/mamille3/data/4chan/jokubausaite2020/'

selected = [
    # 'president trump', # too focused just on Trump
    # 'trump', # too focused just on Trump
    'kraut/pol/ and afd',
    'national socialism',
    # 'islam', # Super Islamophobic and antisemitic but not necessarily white supremacist ideology
    'fascism',
    'dixie',
    # 'hinduism', # Not super white supremacist, though some antisemitism
    # 'black nationalism', # super racist and white nationalist, but some actual Black nationalism
    'kraut/pol/', # yep, German nationalists. Some German, but lots of white supremacy
    'ethnostate',
    'white',
    'chimpout',
    'feminist apocalypse',
    '(((krautgate)))',
]

dfs = []
for general in selected:
    fpath = os.path.join(dirpath, f'{general.replace("/", " ")} general.csv')
    dfs.append(pd.read_csv(fpath))

jokubausaite2020_posts = pd.concat(dfs)
jokubausaite2020_posts.reset_index(drop=True)
print(len(jokubausaite2020_posts))
print(jokubausaite2020_posts.columns)

jokubausaite2020_posts['timestamp'] = pd.to_datetime(jokubausaite2020_posts.timestamp, utc=True)
jokubausaite2020_posts['timestamp']

# Load Papasavva+2020 data with selected flags
import pandas as pd

fpath = '/storage2/mamille3/white_supremacist_lang/data/papasavva2020_white_supremacist_flag_posts.csv'
papasavva2020_posts = pd.read_csv(fpath,index_col=0).reset_index(drop=True)
print(len(papasavva2020_posts))
print(papasavva2020_posts.columns)

papasavva2020_posts['timestamp'] = pd.to_datetime(papasavva2020_posts.time, unit='s', utc=True)

# Merge, remove duplicates, get text
j = jokubausaite2020_posts[['id', 'body', 'timestamp']]
p = papasavva2020_posts.drop(columns='id').rename(columns={'no': 'id', 'com': 'body'})[['id', 'body', 'timestamp']]
fourchan = pd.concat([j, p]).drop_duplicates('id').dropna(subset='body')
len(fourchan)

# Process text (remove HTML, tokenize)
from html.parser import HTMLParser
from tqdm.notebook import tqdm
from multiprocessing import Pool
import re
import numpy as np
import nltk

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ' '.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def remove_special(text):
    text = text.replace('>', '')
    text = re.sub(r'\d{7,}', '', text)
    text = re.sub(r'\S+(?:\.com|\.org|\.edu)\S*|https?:\/\/\S*', '', text) # Remove URLs
    return text

def process_text(text):
    # Remove HTML
    text = strip_tags(str(text))
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters
    text = remove_special(text)
    # Tokenize
    text = ' '.join(nltk.word_tokenize(str(text))).lower()
    return text

def process_chunk(texts):
    return [process_text(text) for text in texts]

n_processes = 20
p = Pool(n_processes)
chunks = np.array_split(fourchan.body, n_processes)
res = list(tqdm(p.imap(process_chunk, chunks), total=len(chunks)))
# res = list(map(process_chunk, chunks))
    
# fourchan['processed'] = [process_text(text) for text in tqdm(fourchan.body)]
fourchan['processed'] = [processed for processed_texts in res for processed in processed_texts]

chan = fourchan[['processed', 'timestamp']].reset_index(drop=True).rename(columns={'processed': 'text'})
chan['dataset'] = '4chan'
chan['source'] = '4chan'
chan.index = '4chan_' + chan.index.astype(str)
print(chan.timestamp.dtype)
chan.head()

  dfs.append(pd.read_csv(fpath))
  dfs.append(pd.read_csv(fpath))


684703
Index(['thread_id', 'id', 'timestamp', 'body', 'subject', 'author',
       'image_file', 'image_md5', 'country_code', 'country_name',
       'unix_timestamp'],
      dtype='object')


  papasavva2020_posts = pd.read_csv(fpath,index_col=0).reset_index(drop=True)


3068516
Index(['archived_on', 'replies', 'images', 'archived', 'no', 'fsize',
       'filename', 'tim', 'troll_country', 'closed', 'country_name',
       'bumplimit', 'extracted_poster_id', 'time', 'imagelimit',
       'semantic_url', 'now', 'md5', 'name', 'tn_w', 'h', 'ext', 'resto', 'w',
       'tn_h', 'com', 'entities', 'perspectives.TOXICITY',
       'perspectives.SEVERE_TOXICITY', 'perspectives.INFLAMMATORY',
       'perspectives.PROFANITY', 'perspectives.INSULT', 'perspectives.OBSCENE',
       'perspectives.SPAM', 'entitites', 'perspectives', 'm_img', 'tail_size',
       'sub', 'trip', 'filedeleted', 'since4pass', 'id', 'unique_ips', 'xa18',
       'xa19l', 'xa19s'],
      dtype='object')


  0%|          | 0/20 [00:00<?, ?it/s]

datetime64[ns, UTC]


Unnamed: 0,text,timestamp,dataset,source
4chan_0,germany 's situation is that of total despair ...,2017-03-09 14:11:59+00:00,4chan,4chan
4chan_1,haut die glatzen bis sie platzen,2017-03-09 14:13:53+00:00,4chan,4chan
4chan_2,how the politics works in krautland ? there is...,2017-03-09 14:19:38+00:00,4chan,4chan
4chan_3,godspeed krautbros,2017-03-09 14:22:26+00:00,4chan,4chan
4chan_4,the system is terribly rigged pro political pa...,2017-03-09 14:23:36+00:00,4chan,4chan


In [20]:
data = pd.concat([data, chan])
print(data.timestamp.dtype)
data.dataset.value_counts()

datetime64[ns, UTC]


4chan            3675508
ironmarch         196042
qian2018           88069
patriotfront       46130
elsherief2021       5378
degibert2019        5000
alatawi2021         1100
adl_heatmap         1029
Name: dataset, dtype: int64

## Stormfront archive

In [21]:
import pandas as pd
import os
from tqdm.notebook import tqdm

dirpath = '/storage2/mamille3/white_supremacist_lang/data/stormfront_archive/processed/'
dfs = []
for fname in tqdm(os.listdir(dirpath)):
    fpath = os.path.join(dirpath, fname)
    dfs.append(pd.read_csv(fpath))
posts = pd.concat(dfs).reset_index(drop=True) 

# Split up breadcrumb
posts[[f'breadcrumb{i}' for i in range(5)]] = posts.thread_breadcrumb.str.split(' > ', expand=True)

# Try to remove non-English posts
exclude = ['Nederland & Vlaanderen', 
            'Srbija',
            'en Español y Portugués',
            'Italia',
            'Croatia',
            'South Africa', # some Boer/Dutch
            'en Français',
            'Russia',
            'Baltic / Scandinavia', # but contains lots of English
            'Hungary', # but contains lots of English
            'Opposing Views Forum',
           'Computer Talks'
           ]

formatted = [f'Stormfront {el}' for el in exclude]
posts = posts.query('breadcrumb2!=@formatted').dropna(subset='text')
print(len(posts))
posts.columns

# Tokenize and prepare
import nltk
import re
from multiprocessing import Pool

def preprocess(inp):
    text = re.sub(r'Quote:\n\n\n\n\nOriginally Posted by .*\n\n\n', '', inp) # Remove quote tag
    text = re.sub(r'\S+(?:\.com|\.org|\.edu)\S*|https?:\/\/\S*', '', text) # Remove URLs
    text = ' '.join(nltk.word_tokenize(str(text))).lower()
    return text

# posts['processed'] = [' '.join(nltk.word_tokenize(str(x))).lower() for x in tqdm(posts['text'])]
with Pool(20) as p:
    posts['processed'] = list(tqdm(p.imap(preprocess, posts['text']), total=len(posts)))

posts['timestamp'] = pd.to_datetime(posts.timestamp, errors='coerce', utc=True)
posts.timestamp.dtype

  0%|          | 0/51 [00:00<?, ?it/s]

762585


  0%|          | 0/762585 [00:00<?, ?it/s]

datetime64[ns, UTC]

In [22]:
stormfront = posts[['processed', 'timestamp']].reset_index(drop=True).rename(columns={'processed': 'text'})
stormfront['dataset'] = 'stormfront'
stormfront['source'] = 'stormfront'
stormfront.index = 'stormfront_' + stormfront.index.astype(str)
stormfront.head()

data = pd.concat([data, stormfront])
print(data.timestamp.dtype)
data.dataset.value_counts()

datetime64[ns, UTC]


4chan            3675508
stormfront        762585
ironmarch         196042
qian2018           88069
patriotfront       46130
elsherief2021       5378
degibert2019        5000
alatawi2021         1100
adl_heatmap         1029
Name: dataset, dtype: int64

## Calderón+2021 DailyStormer and American Renaissance

In [23]:
# Load data
import json
import os
import pandas as pd

dirpath = '/storage2/mamille3/white_supremacist_lang/data/calderon2021/'
dailystormer_fpath = os.path.join(dirpath, 'd_stormer_documents.json')

with open(dailystormer_fpath) as f:
    ds_docs = json.load(f)
len(ds_docs)

dstormer = pd.json_normalize(ds_docs)
dstormer['source'] = 'daily_stormer'

# Load American Renaissance
amren_fpath = os.path.join(dirpath, 'amran_documents.json')

with open(amren_fpath) as f:
    amren_json = json.load(f)
len(amren_json)

amren = pd.json_normalize(amren_json)
amren['source'] = 'american_renaissance'

# articles = pd.concat([dstormer[['date', 'title', 'author_wording']], amren[['date', 'title', 'author_wording']]], 
#                      keys=['daily_stormer', 'american_renaissance'], 
#                      names=['dataset', 'old_index']).reset_index(level='dataset').reset_index(drop=True)
articles = pd.concat([dstormer[['date', 'title', 'author_wording', 'source']], amren[['date', 'title', 'author_wording', 'source']]]).reset_index(drop=True)
print(len(articles))
articles.columns

# Tokenize and prepare
import nltk
from tqdm.notebook import tqdm
# import re
from multiprocessing import Pool

def preprocess(inp):
    # text = re.sub(r'Quote:\n\n\n\n\nOriginally Posted by .*\n\n\n', '', inp) # Remove quote tag
    # text = re.sub(r'\S+(?:\.com|\.org|\.edu)\S*|https?:\/\/\S*', '', text) # Remove URLs
    # text = re.sub(r'[A-Za-z]\.[A-Za-z]', 
    text = ' '.join(nltk.word_tokenize(str(inp.replace('.', '. ')))).lower()
    return text

# articles['text'] = [' '.join(nltk.word_tokenize(str(x))).lower() for x in tqdm(articles['author_wording'])]
with Pool(20) as p:
    articles['text'] = list(tqdm(p.imap(preprocess, articles['title'] + ' ' + articles['author_wording']), total=len(articles)))

26250


  0%|          | 0/26250 [00:00<?, ?it/s]

In [24]:
articles['date'].str.slice(0,4).value_counts()

2014    7564
2016    7448
2015    5694
2013    2783
2017    2490
1912     243
2012      13
2104       4
2105       3
2011       2
2103       2
2010       2
2005       1
2915       1
Name: date, dtype: int64

In [25]:
articles.loc[~articles.date.str.startswith('20'), 'date'] = '' # remove date errors. Could extract real date by parsing text

In [26]:
calderon2021 = articles.drop(columns=['author_wording', 'title']).rename(columns={'date': 'timestamp'})
calderon2021['timestamp'] = pd.to_datetime(calderon2021.timestamp, errors='coerce', utc=True)
# calderon2021['dataset'] =  'calderon2021_' + calderon2021['dataset']
calderon2021['dataset'] =  'calderon2021'
calderon2021.index = 'calderon2021_' + calderon2021.index.astype(str)
print(calderon2021.timestamp.dtype)
calderon2021.head()

datetime64[ns, UTC]


Unnamed: 0,timestamp,source,text,dataset
calderon2021_0,2016-07-20 00:00:00+00:00,daily_stormer,kansas city : blacks kill another white cop it...,calderon2021
calderon2021_1,2016-08-10 00:00:00+00:00,daily_stormer,blacks murdering and raping more old women why...,calderon2021
calderon2021_2,2014-12-18 00:00:00+00:00,daily_stormer,cuban government releases jew convicted of inc...,calderon2021
calderon2021_3,2014-11-21 00:00:00+00:00,daily_stormer,mestizo subhumans on the white house facebook ...,calderon2021
calderon2021_4,2013-07-21 00:00:00+00:00,daily_stormer,ny food stamp recipients are shipping welfare-...,calderon2021


In [27]:
# data = data[data.dataset != 'calderon2021']
# data.dataset.value_counts()

In [28]:
data = pd.concat([data, calderon2021])
print(data.timestamp.dtype)
data.dataset.value_counts()

datetime64[ns, UTC]


4chan            3675508
stormfront        762585
ironmarch         196042
qian2018           88069
patriotfront       46130
calderon2021       26250
elsherief2021       5378
degibert2019        5000
alatawi2021         1100
adl_heatmap         1029
Name: dataset, dtype: int64

## Pruden+2022

In [29]:
# Load data
import os
import pandas as pd
import re

source_info = {
    'AndersBehringBreivikManifesto': {'year': 2011, 'source': 'breivik_manifesto'},
    'Enoch-Powells-Rivers-of-Blood-new-analysis-pdf': {'year': 1968, 'source': 'enoch_powell_rivers_of_blood_speech'},
    'Jean-Raspail-Camp-of-the-Saints': {'year': 1973, 'source': 'raspail_camp_of_the_saints_book'},
    'Lane_White Genocide Manifesto': {'year': 1988, 'source': 'lane_white_genocide_manifesto'},
    'Renaud Camus - The Great Replacement - Part I-RWTS (2012)': {'year': 2012, 'source': 'camus_the_great_replacement_book'},
    'Turner - The Turner Diaries': {'year': 1978, 'source': 'pierce_the_turner_diaries_book'},
}
    
dfs = []
dirpath = '/storage2/mamille3/white_supremacist_lang/data/pruden2022/'
for fname in sorted(os.listdir(dirpath)):
    print(fname)
    fpath = os.path.join(dirpath, fname)
    if fname == 'AndersBehringBreivikManifesto.txt':
        with open(fpath, encoding='latin-1') as f:
            text = [line.strip() for line in re.split(r'\n\s+', f.read()) if len(line.strip()) > 0]
        print(len(text))
    else:
        with open(fpath, encoding='latin-1') as f:
            text = [line.strip() for line in f.read().splitlines() if len(line.strip()) > 0]
        print(len(text))
    df = pd.DataFrame({'title': fname.split('.txt')[0], 'text': text})
    df['year'] = df.title.map(lambda x: source_info[x]['year'])
    df['source'] = df.title.map(lambda x: source_info[x]['source'])
    df['timestamp'] = pd.to_datetime(df.year, format='%Y', utc=True)
    dfs.append(df)
texts =  pd.concat(dfs)

# Preprocess
from tqdm.notebook import tqdm
import nltk

texts['processed'] = [' '.join(nltk.word_tokenize(str(x))).lower() for x in tqdm(texts['text'])]

pruden2022 = texts[['processed', 'timestamp', 'source']].reset_index(drop=True).rename(columns={'processed': 'text'})
pruden2022['dataset'] = 'pruden2022'
pruden2022.index = 'pruden2022_' + pruden2022.index.astype(str)
print(pruden2022.timestamp.dtype)
pruden2022.head()

AndersBehringBreivikManifesto.txt
18884
Enoch-Powells-Rivers-of-Blood-new-analysis-pdf.txt
47
Jean-Raspail-Camp-of-the-Saints.txt
1231
Lane_White Genocide Manifesto.txt
33
Renaud Camus - The Great Replacement - Part I-RWTS (2012).txt
162
Turner - The Turner Diaries.txt
1585


  0%|          | 0/21942 [00:00<?, ?it/s]

datetime64[ns, UTC]


Unnamed: 0,text,timestamp,source,dataset
pruden2022_0,"by andrew berwick , london ð 2011",2011-01-01 00:00:00+00:00,breivik_manifesto,pruden2022
pruden2022_1,contents ( active hyperlinks : ctrl + click to...,2011-01-01 00:00:00+00:00,breivik_manifesto,pruden2022
pruden2022_2,introduction,2011-01-01 00:00:00+00:00,breivik_manifesto,pruden2022
pruden2022_3,about the compendium - 2083 .....................,2011-01-01 00:00:00+00:00,breivik_manifesto,pruden2022
pruden2022_4,2.2 the eurabia code ð 2008 updates .............,2011-01-01 00:00:00+00:00,breivik_manifesto,pruden2022


In [30]:
data = pd.concat([data, pruden2022])
print(data.timestamp.dtype)
data.dataset.value_counts()

datetime64[ns, UTC]


4chan            3675508
stormfront        762585
ironmarch         196042
qian2018           88069
patriotfront       46130
calderon2021       26250
pruden2022         21942
elsherief2021       5378
degibert2019        5000
alatawi2021         1100
adl_heatmap         1029
Name: dataset, dtype: int64

# Save out/load from tmp (just for speed of stats)

In [31]:
# Join domain info, save out
dataset_info = pd.DataFrame([
    {'dataset': 'qian2018', 'domain': 'tweet/short propaganda'},
    {'dataset': 'elsherief2021', 'domain': 'tweet/short propaganda'},
    {'dataset': 'degibert2019', 'domain': 'forum'},
    {'dataset': 'patriotfront', 'domain': 'chat'},
    {'dataset': 'alatawi2021', 'domain': 'tweet/short propaganda'},
    {'dataset': 'adl_heatmap', 'domain': 'tweet/short propaganda'},
    {'dataset': 'ironmarch', 'domain': 'forum'},
    {'dataset': '4chan', 'domain': 'forum'},
    {'dataset': 'stormfront', 'domain': 'forum'},
    {'dataset': 'calderon2021', 'domain': 'long-form'},
    {'dataset': 'pruden2022', 'domain': 'long-form'},
]).set_index('dataset')
dataset_info
data = data.join(dataset_info, on='dataset')

outpath = '../tmp/white_supremacist_corpus.pkl'
data.to_pickle(outpath)