# QAnon Reddit NLP

Name: Mateusz Kolodziejczyk
Student Number: 20084190

## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

DEBUG = False
SEED = 666

import os
for d in ['orig','data','output']: os.makedirs(d, exist_ok=True)


## Setup, load and prepare datasets

#### Authors

In [2]:
basename = "authors"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_authors = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_authors = pd.read_csv(f"orig/{basename}.csv.gz")
    # cleaning
    print(df_authors.shape, end=" ... ")
    df_authors.rename(columns={"QAuthor":"author"}, inplace=True)
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_authors.to_pickle(f"data/{basename}.pickle")

print(df_authors.shape)
df_authors.head(5)

Reading pickle file ... (13182, 3)


Unnamed: 0,author,isUQ,status
0,aa65b7dd5d5fa660d058e094669f884bf7d52299,0,Active
1,2b1505f289338751829dfa129c0b52d145c9eceb,1,Active
2,4eeddb9abeb3c4889f1b037016bf2aeb834bb66d,0,Active
3,08a6fae5a56fcdb495b2de8a02625ea2b4abe32f,1,Active
4,01301652214982c57efe894efe7e2c7d57df2801,0,Active


#### Comments

In [3]:
basename = "comments"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_comments = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_comments = pd.read_csv(f"orig/{basename}.csv.gz", dtype=str, parse_dates=["date_created"])
    # cleaning
    print(df_comments.shape, "dropna", end=" ... ")
    df_comments.dropna(inplace=True)
    print(df_comments.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_comments.to_pickle(f"data/{basename}.pickle")

print(df_comments.shape)
df_comments.head(5)

Reading pickle file ... (10831841, 7)


Unnamed: 0,id,link_id,parent_id,author,subreddit,body,date_created
0,e0mztbn,t3_8qy7gp,t3_8qy7gp,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,My account is new because i lost my password t...,2018-06-14 02:17:37
1,e0n0e9q,t3_8qy9wy,t3_8qy9wy,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,new account only because i lost the password t...,2018-06-14 02:28:21
2,e0n11j4,t3_8qy9wy,t3_8qy9wy,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,i appreciate all the comments i'll read thru t...,2018-06-14 02:40:09
3,e0n1q4v,t3_8qy9wy,t1_e0n0vfm,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,why would rosenstein threaten those asking (li...,2018-06-14 02:52:41
4,e0n1u7v,t3_8qy9wy,t1_e0n0ltk,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,interesting i'm reading now. i'm still confuse...,2018-06-14 02:54:46


#### Submissions

In [4]:
basename = "submissions"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_submissions = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_submissions = pd.read_csv(f"orig/{basename}.csv.gz", parse_dates=["date_created"])
    subset_labels = df_submissions.drop(['text'], axis=1).columns.values.tolist()
    # cleaning
    print(df_submissions.shape, "dropna", end=" ... ")
    
    # Make sure text is ignored when dropping na
    df_submissions.dropna(inplace=True, subset=subset_labels)
    print(df_submissions.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_submissions.to_pickle(f"data/{basename}.pickle")

print(df_submissions.shape)
df_submissions.head(5)


Reading pickle file ... (2099686, 13)


Unnamed: 0,subreddit,id,score,numReplies,author,title,text,is_self,domain,url,permalink,upvote_ratio,date_created
0,greatawakening,8xuv4i,1,14,879f283b831c13474e219e88663d95b0763cca9b,I’ve been writing “Trump Lives Here” on my $20...,,False,i.redd.it,https://i.redd.it/h3mbbxvxq7911.jpg,/r/greatawakening/comments/8xuv4i/ive_been_wri...,-1.0,2018-07-11 00:27:24
1,greatawakening,8ydw3e,1,13,879f283b831c13474e219e88663d95b0763cca9b,Trying to take him seriously but...,,False,i.redd.it,https://i.redd.it/62gaw0th4l911.jpg,/r/greatawakening/comments/8ydw3e/trying_to_ta...,-1.0,2018-07-12 21:26:32
2,greatawakening,8ytwg0,1,0,879f283b831c13474e219e88663d95b0763cca9b,“It is all happening!” Crumb?,,False,i.redd.it,https://i.redd.it/yo9zscb1jx911.jpg,/r/greatawakening/comments/8ytwg0/it_is_all_ha...,-1.0,2018-07-14 15:09:25
3,greatawakening,8ytx4z,1,114,879f283b831c13474e219e88663d95b0763cca9b,“It is all happening!” Positive sign hopefully...,,False,i.redd.it,https://i.redd.it/v5c4zxcjjx911.jpg,/r/greatawakening/comments/8ytx4z/it_is_all_ha...,-1.0,2018-07-14 15:12:14
4,greatawakening,8yvgwt,1,23,879f283b831c13474e219e88663d95b0763cca9b,Pedogate is REAL! Happening here in my beloved...,,False,foxnews.com,http://www.foxnews.com/us/2018/07/14/texas-wom...,/r/greatawakening/comments/8yvgwt/pedogate_is_...,-1.0,2018-07-14 18:46:35


#### Subreddits

In [43]:
basename = "subreddits"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_subreddits = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_subreddits = pd.read_csv(f"orig/{basename}.csv.gz", converters={"allModNames": lambda x: x.strip("[]").split(",")})
    # cleaning
    print(df_subreddits.shape, "dropna", end=" ... ")
    
    # Make sure text is ignored when dropping na
    df_subreddits.dropna(inplace=True)
    print(df_subreddits.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_subreddits.to_pickle(f"data/{basename}.pickle")

print(df_subreddits.shape)
df_subreddits.head(5)

Reading csv ... (12987, 38) dropna ... (12987, 38) ... generating pickle file ... (12987, 38)


Unnamed: 0,subreddit,numSubscribers,status,allModNames,allMods,qModNames,qMods,top_qModNames,top_qMods,firstPostSubmission,lastPostSubmission,firstPostComment,lastPostComment,qModsRatio,top_qModsRatio,activePreBanOnly,activePreQ,activePostBan,qAuth,top_qAuth,qSubmissions,top_qSubmissions,nonTop_qSubmissions,qComments,top_qComments,nonTop_qComments,top_qPercent,qPercent,Monthly Average Total Authors,Monthly Average Total Submissions,Monthly Average UQ Authors,Monthly Average UQ Submissions,Monthly Average QAnon Authors,Monthly Average QAnon Submissions,% UQ Submissions,% UQ Authors,% QAnon Submissions,% QAnon Authors
0,Watches,1525243.0,public,"['f1c355408b78fd88ebc13aade4c9a7924005c2ab', '...",13,[],0.0,[],0.0,2016-12-07 03:21:16,2020-11-27 19:24:49,2016-11-26 13:24:54,2021-01-22 18:22:43,0.0,0.0,0,1,1,58,10,219,99,120,1681.0,244.0,1437.0,0.29,0.44,2881.384615,5911.846154,1.2,2.4,3.75,6.0,0.040596,0.041647,0.101491,0.130146
1,MMA,1518451.0,public,"['69e403df92bb49af60d5046c0be60f1a46bfd53d', '...",20,[],0.0,[],0.0,2016-11-01 05:51:55,2021-01-03 00:43:44,2016-10-28 00:39:15,2021-01-23 08:16:57,0.0,0.0,0,1,1,97,29,371,99,272,18910.0,2808.0,16102.0,0.83,0.74,1840.461538,4953.461538,2.916667,4.75,6.384615,9.923077,0.095893,0.158475,0.200326,0.346903
2,Seattle,281450.0,public,"['14a0b21d55a0415e5541be3389d27ee3b3232c90', '...",7,[],0.0,[],0.0,2016-11-07 07:12:42,2021-01-18 23:13:14,2016-11-03 16:45:54,2021-01-21 14:56:00,0.0,0.0,0,1,1,53,13,188,99,89,1559.0,214.0,1345.0,0.37,0.4,732.769231,1140.692308,1.2,1.2,3.272727,3.818182,0.105199,0.163762,0.334725,0.446625
3,UnusAnnus,195890.0,quarantined/private,['f7fd9c68f804acda665d2ab082217bb1583318f2'],1,[],0.0,[],0.0,2020-02-05 05:04:36,2020-11-14 05:24:35,2020-06-17 23:26:32,2020-11-13 09:04:06,0.0,0.0,0,0,1,1,1,99,99,0,3.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GlobalOffensiveTrade,209007.0,public,"['f78b47357eabba6f7580e47da7560a322287eaee', '...",22,[],0.0,[],0.0,2016-12-02 06:33:48,2020-08-30 21:38:25,2017-04-05 12:44:22,2019-12-19 01:06:07,0.0,0.0,0,1,1,6,1,122,99,23,86.0,46.0,40.0,0.03,0.05,2132.923077,17479.307692,1.0,19.0,1.0,19.0,0.1087,0.046884,0.1087,0.046884


#### Paper

In [6]:
basename = "paper"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_paper = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_paper = pd.read_csv(f"orig/{basename}.csv", dtype=str)
    # cleaning
    print(df_paper.shape, "dropna", end=" ... ")
    
    # Make sure text is ignored when dropping na
    df_paper.dropna(inplace=True)
    print(df_paper.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_paper.to_pickle(f"data/{basename}.pickle")

print(df_paper.shape)
df_paper.head(5)

Reading pickle file ... (19, 1)


Unnamed: 0,subreddit
0,greatawakening
1,The_GreatAwakening
2,AFTERTHESTQRM
3,TheGreatAwakening
4,QAnon


## Verify datasets

In [47]:
n_test = 7
n_true = 0
# df_authors.author is a uniqueid

val = df_authors['author'].is_unique
print("df_authors.author is unique: {}".format(val))
if val == True:
    n_true += 1
# no missing values in df_authors

val = df_authors.notna().any().any()
print("no missing values in df_authors: {}".format(val))
if val == True:
    n_true += 1
# df_comments.id is unique id

val = df_comments['id'].is_unique
print("df_comments.id is unique id: {}".format(val))
if val == True:
    n_true += 1
# each comment author is in list of qanon authors

val = df_comments['author'].isin(df_authors['author']).all()
print("each comment author is in list of qanon authors: {}".format(val))
if val == True:
    n_true += 1
    
# df_submissions.id is a unique id

val = df_submissions['id'].is_unique
print("df_submissions.id is a unique id: {}".format(val))
if val == True:
    n_true += 1
    
# each submission author is in list of qanon authors
val = df_submissions['author'].isin(df_authors['author']).all()
print("each submission author is in list of qanon authors: {}".format(val))
if val == True:
    n_true += 1
    
# in df_subreddits, the number of moderates in allmodnames matches count in allmods
val = np.where(df_subreddits['allModNames'].str.len() == df_subreddits['allMods'], True, False).all()
print("in df_subreddits, the number of moderators in allmodnames matches count in allmods: {}".format(val))
if val == True:
    n_true += 1

df_authors.author is unique: True
no missing values in df_authors: True
df_comments.id is unique id: True
each comment author is in list of qanon authors: True
df_submissions.id is a unique id: True
each submission author is in list of qanon authors: True
in df_subreddits, the number of moderators in allmodnames matches count in allmods: False


In [54]:
print("{}/{} verification tests passed".format(n_true, n_test))

6/7 verification tests passed


In [53]:
match_count = np.where(df_subreddits['allModNames'].str.len() == df_subreddits['allMods'], True, False)
np.unique(match_count, return_counts = True)

(array([False,  True]), array([  425, 12562], dtype=int64))

In [35]:
df_subreddits['allModNames']

0        ['f1c355408b78fd88ebc13aade4c9a7924005c2ab', '...
1        ['69e403df92bb49af60d5046c0be60f1a46bfd53d', '...
2        ['14a0b21d55a0415e5541be3389d27ee3b3232c90', '...
3             ['f7fd9c68f804acda665d2ab082217bb1583318f2']
4        ['f78b47357eabba6f7580e47da7560a322287eaee', '...
                               ...                        
12982    ['bfcde1ace3566aca6c58b4e91864cf1dc3580838', '...
12983         ['dba23e7b20b6aabac1013515b3966f5d8162dbac']
12984    ['3125329c1621a3c3e2a68b53d68668dc7a997ace', '...
12985    ['cf35f27bf6511bf07145c8ae3de98c9a7dc0ab8e', '...
12986    ['da6467ebe26a9644834051d737c3385fb244f50d', '...
Name: allModNames, Length: 12987, dtype: object

In [45]:
df_subreddits['allMods']

0        13
1        20
2         7
3         1
4        22
         ..
12982     7
12983     1
12984     7
12985     5
12986     6
Name: allMods, Length: 12987, dtype: int64

In [44]:
df_subreddits['allModNames'].apply(len)

0        13
1        20
2         7
3         1
4        22
         ..
12982     7
12983     1
12984     7
12985     5
12986     6
Name: allModNames, Length: 12987, dtype: int64

In [20]:
len(['cool', 'dog'])
df'

2

In [57]:
df_submissions['numReplies']

0           14
1           13
2            0
3          114
4           23
          ... 
2775258    130
2775259      3
2775260      9
2775261      3
2775262     16
Name: numReplies, Length: 2099686, dtype: object

## Match Paper