Gather, process examples of text from similar domains as the white supremacist dataset that isn't white supremacist

# Reddit data (forum data)
Scraped by get_reddit.py

In [1]:
# Load data
import os
import pandas as pd

dirpath = '../tmp/'
fpaths = sorted([fname for fname in os.listdir(dirpath) if 'subreddit_comments' in fname])
dfs = []
for fname in fpaths:
    print(fname)
    fpath = os.path.join(dirpath, fname)
    dfs.append(pd.read_csv(fpath, index_col=0, engine='python').assign(year=fname[:4]))
    
data = pd.concat(dfs)
data['year'] = data.year.astype(int)
data

2011_politics_subreddit_comments.csv
2012_politics_subreddit_comments.csv
2013_politics_subreddit_comments.csv
2014_politics_subreddit_comments.csv
2015_politics_subreddit_comments.csv
2016_politics_subreddit_comments.csv
2017_politics_subreddit_comments.csv
2018_politics_subreddit_comments.csv
2019_politics_subreddit_comments.csv


Unnamed: 0,author,body,created_utc,id,parent_id,score,created,text,year,permalink
48865,[deleted],If I have 3 kids it's not likely I'd be missin...,1325043053,c3bpt8q,t1_c3bohzb,1,1.325057e+09,If I have 3 kids it's not likely I'd be missin...,2011,
6229,zorbathustra,What if individual executives here held more d...,1325342043,c3cpubb,t1_c3coi5e,-2,1.325356e+09,What if individual executives here held more d...,2011,
62876,burningpineapples,You could always go the Visual-Basic way\n&lt;...,1324970410,c3bhbfi,t1_c3bdw69,1,1.324985e+09,You could always go the Visual-Basic way\n&lt;...,2011,
20181,sheasie,That's not true. There is no impunity against...,1325237238,c3ce40j,t1_c3c9v1j,2,1.325252e+09,That's not true. There is no impunity against...,2011,
53348,[deleted],Wow a clever Romney quip! Now only if I were 3...,1325019046,c3bmaa9,t3_nsibn,-2,1.325033e+09,Wow a clever Romney quip! Now only if I were 3...,2011,
...,...,...,...,...,...,...,...,...,...,...
100884,shrek_daddy79,"Was this before or after he told Medvedev, ""af...",1577639809,fcggtay,t1_fcgcmap,1,1.577654e+09,"Was this before or after he told Medvedev, ""af...",2019,/r/politics/comments/eh4hfp/russias_state_tv_c...
19047,marsglow,"Because that black guy got it passed, and they...",1577812555,fcniuz2,t1_fcni4e2,1,1.577827e+09,"Because that black guy got it passed, and they...",2019,/r/politics/comments/ei1hbx/three_years_into_t...
3775,ElitistPunter,"The EPA still has a science board? Well, I gu...",1577830357,fcoelld,t3_ei9m5j,1,1.577845e+09,"The EPA still has a science board? Well, I gu...",2019,/r/politics/comments/ei9m5j/epa_science_board_...
34206,NormieSpecialist,Wasn't Lincoln a republican at the time? How d...,1577775692,fcmhp8u,t1_fcldycm,1,1.577790e+09,Wasn't Lincoln a republican at the time? How d...,2019,/r/politics/comments/ehp5b1/presidentsanders_t...


In [4]:
# Compare with white supremacist dataset
# Load white supremacist dataset to count posts over time
import datetime

path = '../tmp/white_supremacist_corpus.pkl'
ws_data = pd.read_pickle(path)

# Select forum data, Group by year
yearly = ws_data.query('domain=="forum"').groupby(by=ws_data.timestamp.dt.year)['text'].count()
lookup = pd.DataFrame(yearly)
lookup['begin'] = pd.to_datetime(yearly.index.astype(int).astype(str), format='%Y')
lookup['end'] = lookup.begin.shift(-1, fill_value = datetime.datetime(2020,1,1))
lookup.index.name = 'year'
lookup.index = lookup.index.astype(int)
lookup.rename(columns={'text': 'post_count'}, inplace=True)
lookup

Unnamed: 0_level_0,post_count,begin,end
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,1439,2001-01-01,2002-01-01
2002,13302,2002-01-01,2003-01-01
2003,27470,2003-01-01,2004-01-01
2004,50783,2004-01-01,2005-01-01
2005,54377,2005-01-01,2006-01-01
2006,80554,2006-01-01,2007-01-01
2007,63544,2007-01-01,2008-01-01
2008,74203,2008-01-01,2009-01-01
2009,94895,2009-01-01,2010-01-01
2010,62649,2010-01-01,2011-01-01


In [3]:
# Tokenize, lowercase Reddit data
import nltk
from tqdm.notebook import tqdm
from multiprocessing import Pool

def preprocess(inp):
    return ' '.join(nltk.word_tokenize(str(inp))).lower()

with Pool(20) as p:
    data['processed'] = list(tqdm(p.imap(preprocess, data.body), total=len(data)))
data['word_count'] = data.processed.str.split().str.len()

In [13]:
# reddit_gped = data.groupby('year').word_count.sum()
reddit_wc = data.groupby('year').agg({'word_count': ['sum', 'mean']})
reddit_wc

Unnamed: 0_level_0,word_count,word_count
Unnamed: 0_level_1,sum,mean
year,Unnamed: 1_level_2,Unnamed: 2_level_2
2011,3142965,55.205596
2012,4859704,55.834279
2013,4328347,60.20624
2014,3477817,54.849102
2015,3908496,54.295978
2016,5553748,50.44047
2017,66516956,40.923639
2018,55322325,40.456473
2019,25969851,39.538463


In [3]:
# Compare number of words between white supremacist dataset forum data and Reddit data
selected_ws = ws_data.loc[(ws_data.timestamp.dt.year.isin(data.year.unique())) & (ws_data.domain == 'forum'),]
selected_ws['word_count'] = selected_ws.text.str.split().str.len()

In [8]:
ws_wc = selected_ws.groupby(ws_data.timestamp.dt.year).agg({'word_count': ['sum', 'mean']})
ws_wc

Unnamed: 0_level_0,word_count,word_count
Unnamed: 0_level_1,sum,mean
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2
2011.0,6446972,113.239865
2012.0,8424657,96.80732
2013.0,6869854,95.577918
2014.0,6054223,95.567845
2015.0,5538318,76.94887
2016.0,6191536,56.24323
2017.0,47959049,29.506252
2018.0,36979076,27.04238
2019.0,17675839,26.911033


In [16]:
comparison = pd.concat([ws_wc, reddit_wc], axis=1, keys=['white_supremacist', 'reddit'])
# comparison = pd.concat([ws_wc, reddit_wc], axis=1)
comparison['reddit_percentage'] = comparison['reddit_word_count']/comparison['ws_word_count']
comparison

Unnamed: 0_level_0,white_supremacist,white_supremacist,reddit,reddit
Unnamed: 0_level_1,word_count,word_count,word_count,word_count
Unnamed: 0_level_2,sum,mean,sum,mean
2011.0,6446972,113.239865,3142965,55.205596
2012.0,8424657,96.80732,4859704,55.834279
2013.0,6869854,95.577918,4328347,60.20624
2014.0,6054223,95.567845,3477817,54.849102
2015.0,5538318,76.94887,3908496,54.295978
2016.0,6191536,56.24323,5553748,50.44047
2017.0,47959049,29.506252,66516956,40.923639
2018.0,36979076,27.04238,55322325,40.456473
2019.0,17675839,26.911033,25969851,39.538463


In [19]:
selected_ws.word_count.mean()

34.5762291155982

In [20]:
data.word_count.mean()

42.10143226914721

In [25]:
# Sample comments vs submissions
from IPython.display import display
pd.set_option('display.max_colwidth', None)

for post_type in ['comment', 'submission']:
    print(post_type)
    display(data.query('post_type == @post_type')[['text']].sample(20))

comment


Unnamed: 0,text
55771,The same can be said for fascist regimes.
13284,Ignorance.
20569,"To be honest, I'm not particularly interesting in debating the interpretation of the passage. All I'm trying to tell you is that you need to provide something more than some lawyer's interpretation of the passage and your opinion in the matter in order to claim that other people are misinterpreting the passage since they are not evidences or facts. Or else you are just making a baseless claim and it would not be surprising that people don't agree with you."
48625,Wrong about what? (I covered a lot of ground on the last post)
8311,"4 easier states, AK, AZ, Montana I believe, and Vermont, don't require a ccw. But I believe all still have one so you can carry to another state that recognizes said ccw."
11618,"Oh wow, this account has to be the most impressive Markov text generator I've ever seen. Incredible."
40066,"This is a really nice post. See, its helpful, kind, and offers alternatives. Its an example on how not to be a dick, unlike your first post."
2905,"I just have to say that I went to a very bike-friendly university and I worked at a bike shop for a few months (not claiming to be an expert) and I've never been in as bike-friendly a place as Manhattan. \n\nWalking was amazing too, much safer and friendlier than crossing the six-lane roads near where I grew up."
46724,none of which have ever won an election
55711,"To be fair the GTA series of games (which I played and liked) do tend to have a pretty jacked up and sociopathic narrative. While they don't focus on terrorism and there has always a veneer of good vs evil and redemption, the redemptions comes via morally ambiguous mass killings. I'm not arguing that GTA is breeding serial killers or anything like it, just that the subject matter of these games is as perverse and anti social as the subject matter on Fox News."


submission


Unnamed: 0,text
42918,Gen. David Petraeus arrived at a private lawn party in 2010 with a 28-car motorcade.
1531,Conservative Columnist Charles Krauthammer: Obama Caused GOP Civil War
2574,Letter Allegedly Penned by Chinese Labor Camp Prisoner Found Inside Box of Halloween Decorations from Kmart
18216,"George Will: ""Quite Literally, Opposition To Gay Marriage Is Dying"""
6388,"NRA advocates for armed guards in every school. In fact, Columbine High had an armed guard. He was out monitoring the Smoker's Corner while the shooters did their work inside."
14886,Petition to make illegal for news to lie to their audience in the US
49111,"Romney ""shellshocked"" by loss"
39349,"War: Israel Launches Gaza Invasion, Assassinates Hamas Military Wing Chief"
24511,"Gay Marriage Should be Addressed by SCOTUS, and Championed by Republicans"
42579,UPS ends charitable giving to boy scouts of America over Anti-gay discrimination


# Discord data (chat) 

In [29]:
# Compare with white supremacist dataset
# Load white supremacist dataset to count posts over time
import datetime

path = '../tmp/white_supremacist_corpus.pkl'
ws_data = pd.read_pickle(path)

# Select forum data, Group by year
chat_yearly = ws_data.query('domain=="chat"').groupby(by=ws_data.timestamp.dt.year)['text'].count()
chat_lookup = pd.DataFrame(chat_yearly)
chat_lookup['begin'] = pd.to_datetime(chat_yearly.index.astype(int).astype(str), format='%Y')
chat_lookup['end'] = chat_lookup.begin.shift(-1, fill_value = datetime.datetime(2019,1,1))
chat_lookup.index.name = 'year'
chat_lookup.index = chat_lookup.index.astype(int)
chat_lookup.rename(columns={'text': 'post_count'}, inplace=True)
print(chat_lookup.post_count.sum())
chat_lookup

46130


Unnamed: 0_level_0,post_count,begin,end
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,33790,2017-01-01,2018-01-01
2018,12340,2018-01-01,2019-01-01


In [14]:
# Load random Discord data
import os
import json
import pandas as pd
from tqdm.notebook import tqdm
import re
from nltk.tokenize import TweetTokenizer
from multiprocessing import Pool

base_dirpath = '/storage2/mamille3/data/discord'
dirpath = os.path.join(base_dirpath, 'v3', 'content/drive/Shareddrives/Datasets/cleaned-v4/discord-v3')
# with open(os.path.join(dirpath, 'stats.json')) as f:
#     stats = json.load(f)
# stats
tokenizer = TweetTokenizer(strip_handles=True)

def process_dump(fname):
    if not fname.endswith('.txt'):
        return
    fpath = os.path.join(dirpath, fname)
    with open(fpath) as f:
        messages = [process_chat(message) for line in f.read().splitlines() for message in line.split('\t')]
    return messages

def process_chat(text):
    if ': ' in text:
        res =  text.split(': ')[1]
    else:
        res = text
    # # Remove usernames
    # res = re.sub(r'@.*\b', '', res)
    # Tokenize, lowercase
    return ' '.join(tokenizer.tokenize(res)).lower()

fpaths = [os.path.join(dirpath, fname) for fname in os.listdir(dirpath)]
with Pool(20) as p:
    chats = list(tqdm(p.imap(process_dump, fpaths), total=len(fpaths)))

In [26]:
# Create a df
df = pd.DataFrame({'text': [chat for chat_list in [el for el in chats if el is not None] for chat in chat_list]})
df

Unnamed: 0,text
0,hugs luci
1,snugglehugs
2,aite im gon na shower\npce
3,i feel big depresso but idk what i 'm sad abou...
4,yeah
...,...
132481515,stupid roaches i 'll be back in a few days
132481516,.
132481517,i ate his willy
132481518,lmfao


In [31]:
# Compare with white supremacist data
selected_ws = ws_data.query('domain=="chat"').copy()
selected_ws['word_count'] = selected_ws.text.str.split().str.len()
print(len(selected_ws))
print(selected_ws.word_count.sum())

46130
551967


In [32]:
selected_ws.word_count.mean()

11.96546715803165

In [38]:
sample = df.sample(int(selected_ws.word_count.sum()/4.2)) # 4.2 is the average words/post, so trying to match the number of words
print(len(sample))
sample['word_count'] = sample.text.str.split().str.len()
print(sample.word_count.sum())
print(sample.word_count.mean())

131420
542813
4.130368284888145
