# QAnon Reddit NLP

Name: Mateusz Kolodziejczyk

Student Number: 20084190

## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

DEBUG = False
SEED = 666

import os
for d in ['orig','data','output']: os.makedirs(d, exist_ok=True)


## Setup, load and prepare datasets

#### Authors

In [2]:
basename = "authors"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_authors = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_authors = pd.read_csv(f"orig/{basename}.csv.gz")
    # cleaning
    print(df_authors.shape, end=" ... ")
    df_authors.rename(columns={"QAuthor":"author"}, inplace=True)
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_authors.to_pickle(f"data/{basename}.pickle")

print(df_authors.shape)
df_authors.head(5)

Reading pickle file ... (13182, 3)


Unnamed: 0,author,isUQ,status
0,aa65b7dd5d5fa660d058e094669f884bf7d52299,0,Active
1,2b1505f289338751829dfa129c0b52d145c9eceb,1,Active
2,4eeddb9abeb3c4889f1b037016bf2aeb834bb66d,0,Active
3,08a6fae5a56fcdb495b2de8a02625ea2b4abe32f,1,Active
4,01301652214982c57efe894efe7e2c7d57df2801,0,Active


#### Comments

In [3]:
basename = "comments"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_comments = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_comments = pd.read_csv(f"orig/{basename}.csv.gz", dtype=str, parse_dates=["date_created"])
    # cleaning
    print(df_comments.shape, "dropna", end=" ... ")
    df_comments.dropna(inplace=True)
    print(df_comments.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_comments.to_pickle(f"data/{basename}.pickle")

print(df_comments.shape)
df_comments.head(5)

Reading pickle file ... (10831841, 7)


Unnamed: 0,id,link_id,parent_id,author,subreddit,body,date_created
0,e0mztbn,t3_8qy7gp,t3_8qy7gp,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,My account is new because i lost my password t...,2018-06-14 02:17:37
1,e0n0e9q,t3_8qy9wy,t3_8qy9wy,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,new account only because i lost the password t...,2018-06-14 02:28:21
2,e0n11j4,t3_8qy9wy,t3_8qy9wy,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,i appreciate all the comments i'll read thru t...,2018-06-14 02:40:09
3,e0n1q4v,t3_8qy9wy,t1_e0n0vfm,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,why would rosenstein threaten those asking (li...,2018-06-14 02:52:41
4,e0n1u7v,t3_8qy9wy,t1_e0n0ltk,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,interesting i'm reading now. i'm still confuse...,2018-06-14 02:54:46


#### Submissions

In [4]:
basename = "submissions"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_submissions = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_submissions = pd.read_csv(f"orig/{basename}.csv.gz", parse_dates=["date_created"])
    subset_labels = df_submissions.drop(['text'], axis=1).columns.values.tolist()
    # cleaning
    print(df_submissions.shape, "dropna", end=" ... ")
    
    # Make sure text is ignored when dropping na
    df_submissions.dropna(inplace=True, subset=subset_labels)

    print(df_submissions.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_submissions.to_pickle(f"data/{basename}.pickle")

print(df_submissions.shape)
df_submissions.head(5)


Reading pickle file ... (2099686, 13)


Unnamed: 0,subreddit,id,score,numReplies,author,title,text,is_self,domain,url,permalink,upvote_ratio,date_created
0,greatawakening,8xuv4i,1,14,879f283b831c13474e219e88663d95b0763cca9b,I’ve been writing “Trump Lives Here” on my $20...,,False,i.redd.it,https://i.redd.it/h3mbbxvxq7911.jpg,/r/greatawakening/comments/8xuv4i/ive_been_wri...,-1.0,2018-07-11 00:27:24
1,greatawakening,8ydw3e,1,13,879f283b831c13474e219e88663d95b0763cca9b,Trying to take him seriously but...,,False,i.redd.it,https://i.redd.it/62gaw0th4l911.jpg,/r/greatawakening/comments/8ydw3e/trying_to_ta...,-1.0,2018-07-12 21:26:32
2,greatawakening,8ytwg0,1,0,879f283b831c13474e219e88663d95b0763cca9b,“It is all happening!” Crumb?,,False,i.redd.it,https://i.redd.it/yo9zscb1jx911.jpg,/r/greatawakening/comments/8ytwg0/it_is_all_ha...,-1.0,2018-07-14 15:09:25
3,greatawakening,8ytx4z,1,114,879f283b831c13474e219e88663d95b0763cca9b,“It is all happening!” Positive sign hopefully...,,False,i.redd.it,https://i.redd.it/v5c4zxcjjx911.jpg,/r/greatawakening/comments/8ytx4z/it_is_all_ha...,-1.0,2018-07-14 15:12:14
4,greatawakening,8yvgwt,1,23,879f283b831c13474e219e88663d95b0763cca9b,Pedogate is REAL! Happening here in my beloved...,,False,foxnews.com,http://www.foxnews.com/us/2018/07/14/texas-wom...,/r/greatawakening/comments/8yvgwt/pedogate_is_...,-1.0,2018-07-14 18:46:35


#### Subreddits

In [5]:
basename = "subreddits"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_subreddits = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_subreddits = pd.read_csv(f"orig/{basename}.csv.gz", converters={"allModNames": lambda x: x.strip("[]").split(",")})
    # cleaning
    print(df_subreddits.shape, "dropna", end=" ... ")
    
    # Make sure text is ignored when dropping na
    df_subreddits.dropna(inplace=True)
    print(df_subreddits.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_subreddits.to_pickle(f"data/{basename}.pickle")

print(df_subreddits.shape)
df_subreddits.head(5)

Reading pickle file ... (12987, 38)


Unnamed: 0,subreddit,numSubscribers,status,allModNames,allMods,qModNames,qMods,top_qModNames,top_qMods,firstPostSubmission,lastPostSubmission,firstPostComment,lastPostComment,qModsRatio,top_qModsRatio,activePreBanOnly,activePreQ,activePostBan,qAuth,top_qAuth,qSubmissions,top_qSubmissions,nonTop_qSubmissions,qComments,top_qComments,nonTop_qComments,top_qPercent,qPercent,Monthly Average Total Authors,Monthly Average Total Submissions,Monthly Average UQ Authors,Monthly Average UQ Submissions,Monthly Average QAnon Authors,Monthly Average QAnon Submissions,% UQ Submissions,% UQ Authors,% QAnon Submissions,% QAnon Authors
0,Watches,1525243.0,public,"['f1c355408b78fd88ebc13aade4c9a7924005c2ab', '...",13,[],0.0,[],0.0,2016-12-07 03:21:16,2020-11-27 19:24:49,2016-11-26 13:24:54,2021-01-22 18:22:43,0.0,0.0,0,1,1,58,10,219,99,120,1681.0,244.0,1437.0,0.29,0.44,2881.384615,5911.846154,1.2,2.4,3.75,6.0,0.040596,0.041647,0.101491,0.130146
1,MMA,1518451.0,public,"['69e403df92bb49af60d5046c0be60f1a46bfd53d', '...",20,[],0.0,[],0.0,2016-11-01 05:51:55,2021-01-03 00:43:44,2016-10-28 00:39:15,2021-01-23 08:16:57,0.0,0.0,0,1,1,97,29,371,99,272,18910.0,2808.0,16102.0,0.83,0.74,1840.461538,4953.461538,2.916667,4.75,6.384615,9.923077,0.095893,0.158475,0.200326,0.346903
2,Seattle,281450.0,public,"['14a0b21d55a0415e5541be3389d27ee3b3232c90', '...",7,[],0.0,[],0.0,2016-11-07 07:12:42,2021-01-18 23:13:14,2016-11-03 16:45:54,2021-01-21 14:56:00,0.0,0.0,0,1,1,53,13,188,99,89,1559.0,214.0,1345.0,0.37,0.4,732.769231,1140.692308,1.2,1.2,3.272727,3.818182,0.105199,0.163762,0.334725,0.446625
3,UnusAnnus,195890.0,quarantined/private,['f7fd9c68f804acda665d2ab082217bb1583318f2'],1,[],0.0,[],0.0,2020-02-05 05:04:36,2020-11-14 05:24:35,2020-06-17 23:26:32,2020-11-13 09:04:06,0.0,0.0,0,0,1,1,1,99,99,0,3.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GlobalOffensiveTrade,209007.0,public,"['f78b47357eabba6f7580e47da7560a322287eaee', '...",22,[],0.0,[],0.0,2016-12-02 06:33:48,2020-08-30 21:38:25,2017-04-05 12:44:22,2019-12-19 01:06:07,0.0,0.0,0,1,1,6,1,122,99,23,86.0,46.0,40.0,0.03,0.05,2132.923077,17479.307692,1.0,19.0,1.0,19.0,0.1087,0.046884,0.1087,0.046884


#### Paper

In [6]:
basename = "paper"

if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_paper = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_paper = pd.read_csv(f"orig/{basename}.csv", dtype=str)
    # cleaning
    print(df_paper.shape, "dropna", end=" ... ")
    
    # Make sure text is ignored when dropping na
    df_paper.dropna(inplace=True)
    print(df_paper.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_paper.to_pickle(f"data/{basename}.pickle")

print(df_paper.shape)
df_paper.head(5)

Reading pickle file ... (19, 1)


Unnamed: 0,subreddit
0,greatawakening
1,The_GreatAwakening
2,AFTERTHESTQRM
3,TheGreatAwakening
4,QAnon


## Verify datasets

In [7]:
n_test = 7
n_true = 0
# df_authors.author is a uniqueid

val = df_authors['author'].is_unique
print("df_authors.author is unique: {}".format(val))
if val == True:
    n_true += 1
# no missing values in df_authors

val = df_authors.notna().any().any()
print("no missing values in df_authors: {}".format(val))
if val == True:
    n_true += 1
# df_comments.id is unique id

val = df_comments['id'].is_unique
print("df_comments.id is unique id: {}".format(val))
if val == True:
    n_true += 1
# each comment author is in list of qanon authors

val = df_comments['author'].isin(df_authors['author']).all()
print("each comment author is in list of qanon authors: {}".format(val))
if val == True:
    n_true += 1
    
# df_submissions.id is a unique id

val = df_submissions['id'].is_unique
print("df_submissions.id is a unique id: {}".format(val))
if val == True:
    n_true += 1
    
# each submission author is in list of qanon authors
val = df_submissions['author'].isin(df_authors['author']).all()
print("each submission author is in list of qanon authors: {}".format(val))
if val == True:
    n_true += 1
    
# in df_subreddits, the number of moderates in allmodnames matches count in allmods
val = np.where(df_subreddits['allModNames'].str.len() == df_subreddits['allMods'], True, False).all()
print("in df_subreddits, the number of moderators in allmodnames matches count in allmods: {}".format(val))
if val == True:
    n_true += 1

df_authors.author is unique: True
no missing values in df_authors: True
df_comments.id is unique id: True
each comment author is in list of qanon authors: True
df_submissions.id is a unique id: True
each submission author is in list of qanon authors: True
in df_subreddits, the number of moderators in allmodnames matches count in allmods: False


In [8]:
print("{}/{} verification tests passed".format(n_true, n_test))

6/7 verification tests passed


In [9]:
match_count = np.where(df_subreddits['allModNames'].str.len() == df_subreddits['allMods'], True, False)
np.unique(match_count, return_counts = True)

(array([False,  True]), array([  425, 12562], dtype=int64))

In [10]:
difference = np.absolute(df_subreddits['allModNames'].str.len() - df_subreddits['allMods'])
difference.unique()

array([0, 1], dtype=int64)

One verification test failed, but most values match. Furthermore if the values don't match, they're only off by 1.

## Match Paper

### Table 1

Get QAnon enthusiastic (top 25% most active with five or more submissions to the 19 QANON subreddits) Should be 3506

In [11]:
## Remove all non Q subreddits from submissions
filtered_submissions = df_submissions[df_submissions['subreddit'].isin(df_paper['subreddit'])]
authors = filtered_submissions['author']
author_counts = authors.value_counts()
cutoff = 5
    

q_enthusiastic_counts = author_counts[author_counts >= cutoff]
q_interested_counts = author_counts[author_counts < cutoff]

q_enthusiastic = q_enthusiastic_counts.index.to_series()
q_interested = q_interested_counts.index.to_series()

indexes = ['q_interested', 'q_enthusiastic']
q_users={indexes[0]: q_interested, indexes[1]: q_enthusiastic}


table1 = {indexes[0]: {"Total":0, "Active": 0, "Does Not Exist": 0, "Suspended": 0},
                   indexes[1]: {"Total":0, "Active": 0, "Does Not Exist": 0, "Suspended": 0}  }

table1[indexes[0]]["Total"] = q_users[indexes[0]].count()

for k in q_users:
    d = table1[k]
    users = q_users[k]
    d["Total"] = users.count()
    account_statuses = df_authors[df_authors['author'].isin(users)]['status'].value_counts()
    d["Active"] = account_statuses.at['Active']
    d["Does Not Exist"] = account_statuses.at['DNE']
    d["Suspended"] = account_statuses.at['Is_suspended']

    
pd.DataFrame.from_dict(table1).transpose()



Unnamed: 0,Total,Active,Does Not Exist,Suspended
q_interested,9583,8007,1143,433
q_enthusiastic,3599,3007,481,111


### Table 2

Iterate through each suberddit, get date and populate dictionary


In [12]:
table2 = {}
for sub in df_paper['subreddit'].values:
    # All submission rows for this subreddit
    submissions = df_submissions[df_submissions['subreddit'] == sub]
    # Get number of submission authors
    author_count = submissions['author'].value_counts().count()
    # Get number of submissions
    submission_count = submissions['id'].count()
    table2[sub] = {"submissions_authors": author_count, "submissions": submission_count}

table2

pd.DataFrame.from_dict(table2).transpose()

Unnamed: 0,submissions_authors,submissions
greatawakening,12862,95644
The_GreatAwakening,698,3982
AFTERTHESTQRM,46,730
TheGreatAwakening,12,32
QAnon,59,119
QGreatAwakening,23,34
QanonUK,12,27
QanonTools,3,18
greatawakening2,6,7
Quincels,4,5


## Text Preprocessingimport emoji, string, nltk


In [13]:
import emoji, string, nltk
import spacy
from nltk.tokenize import RegexpTokenizer
import gensim
from nltk.util import ngrams
from gensim.models import Phrases
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
sample_text = [
    "The movie was crap ",
    "The book was kind of good.",
    "A really bad, horrible book. 🤨", 
    "Today sux", 
    "12 Today kinda sux! But I'll get by, lol 😃",
    "sentiment analysis is shit. ",
    "sentiment analysis is the shit.",
    "I like to hate Michael Bay films, but I couldn't fault this one",
]


In [15]:

# define three possible tokenizers
tokenizer_1 = str.split
tokenizer_2 = nltk.word_tokenize
tokenizer_3 = RegexpTokenizer("\w+|\$[\d\.]+|http\S+").tokenize

# define two possible default_stopwords
default_stopwords_1 = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_sm")
default_stopwords_2 = nlp.Defaults.stop_words

lemmatizer = nltk.WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()


In [16]:
annoying_quotes = ['“', '”']
def generate_tokens(text, lower=True, 
    tokenizer=None,
    default_stopwords=None, extra_stopwords=None, 
    lemmatizer=None, stemmer=None,
    min_token_length=0):

    """
    Given a string, separate into tokens and clean.
    """
    # lower case
    text = text.lower() if lower else text 

    # remove emoji 
    text = emoji.replace_emoji(text, replace='')

    # tokenize text (always needed)
    assert tokenizer is not None, "Need to specify a tokenizer to split text into tokens"
    tokens = tokenizer(text)

    # TODO drop default_stopwords and extra_stopwords (if parameters are provided) and punctuation
    if default_stopwords is not None:
        [t for t in tokens if t not in default_stopwords]
    if extra_stopwords is not None:
        [t for t in tokens if t not in extra_stopwords]
        
    tokens = [t for t in tokens if t not in string.punctuation and t not in annoying_quotes]
    
    # TODO lemmatize each token (only if lemmatizer parameter is provided)
    if lemmatizer is not None:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        
    # TODO stem each token (only if stemmer parameter is provided)
    if stemmer is not None:
        tokens = [stemmer.stem(t) for t in tokens]
        
    if min_token_length>0:
        tokens = [t for t in tokens if len(t)>=min_token_length]

    return tokens


Lemma

In [17]:
for text in sample_text:
    tokens = generate_tokens(text, tokenizer=tokenizer_3, default_stopwords=default_stopwords_1, lemmatizer=lemmatizer)
    print(f"{text}\n\t{tokens}")
    


The movie was crap 
	['the', 'movie', 'wa', 'crap']
The book was kind of good.
	['the', 'book', 'wa', 'kind', 'of', 'good']
A really bad, horrible book. 🤨
	['a', 'really', 'bad', 'horrible', 'book']
Today sux
	['today', 'sux']
12 Today kinda sux! But I'll get by, lol 😃
	['12', 'today', 'kinda', 'sux', 'but', 'i', 'll', 'get', 'by', 'lol']
sentiment analysis is shit. 
	['sentiment', 'analysis', 'is', 'shit']
sentiment analysis is the shit.
	['sentiment', 'analysis', 'is', 'the', 'shit']
I like to hate Michael Bay films, but I couldn't fault this one
	['i', 'like', 'to', 'hate', 'michael', 'bay', 'film', 'but', 'i', 'couldn', 't', 'fault', 'this', 'one']


Lemma + stem

In [18]:
# create tokens of sample_text for testing
docs = [
    generate_tokens(doc, tokenizer=tokenizer_3, lemmatizer=lemmatizer, default_stopwords=default_stopwords_1, min_token_length=3)
    for doc in sample_text]


def generate_ngrams(selectedTokens, bigram, trigram):     
    return trigram[bigram[selectedTokens]]




In [19]:
def genTokensOrLoad(name, df):
    if os.path.isfile(f"data/{name}_tokens.pickle"):
        # loading
        print("Reading pickle file", end=" ... ")
        return pd.read_pickle(f"data/{name}_tokens.pickle")
    else:
        # loading
        print("Generating Tokens", end=" ... ")
        tokens = df[name].apply(lambda t: generate_tokens(t, tokenizer=tokenizer_3, lemmatizer=lemmatizer, default_stopwords=default_stopwords_1, min_token_length=3) if isinstance(t, str) else t)
        # save as pickle for later use
        print("Generating pickle file", end=" ... ")
        tokens.to_pickle(f"data/{name}_tokens.pickle")
        return tokens

In [20]:
# Generate tokens if vlaue is a string, otherwise put in t
# text_tokens = df_submissions['text'].apply(lambda t: generate_tokens(t, tokenizer=tokenizer_3, lemmatizer=lemmatizer, default_stopwords=default_stopwords_1, min_token_length=3) if isinstance(t, str) else t)
# text_tokens
# title_tokens = df_submissions['title'].apply(lambda t: generate_tokens(t, tokenizer=tokenizer_3, lemmatizer=lemmatizer, default_stopwords=default_stopwords_1, min_token_length=3))
# title_tokens

In [21]:

# # Add the two token lists to the dataframe
# df_submissions['text_tokens'] = genTokensOrLoad('text', df_submissions)
# df_submissions['title_tokens'] = genTokensOrLoad('title', df_submissions)
# df_submissions


In [22]:
df_submissions.dropna()

Unnamed: 0,subreddit,id,score,numReplies,author,title,text,is_self,domain,url,permalink,upvote_ratio,date_created
6,greatawakening,8zbz30,1,169,879f283b831c13474e219e88663d95b0763cca9b,Putin Just Said that $400MM eluded Russian and...,Isn't that a HUGE admission and sign that he w...,True,self.greatawakening,https://www.reddit.com/r/greatawakening/commen...,/r/greatawakening/comments/8zbz30/putin_just_s...,-1.0,2018-07-16 15:46:51
7,greatawakening,8zc0k7,1,25,879f283b831c13474e219e88663d95b0763cca9b,OMG! Where's the SERVER!!! CRAZY! This is happ...,"OMG, OMG! I love him so much!",True,self.greatawakening,https://www.reddit.com/r/greatawakening/commen...,/r/greatawakening/comments/8zc0k7/omg_wheres_t...,-1.0,2018-07-16 15:51:42
8,greatawakening,8zc21l,1,14,879f283b831c13474e219e88663d95b0763cca9b,"Who is that Jackass reporter asking about ""Com...",Also the same one who asked about interference...,True,self.greatawakening,https://www.reddit.com/r/greatawakening/commen...,/r/greatawakening/comments/8zc21l/who_is_that_...,-1.0,2018-07-16 15:56:37
10,greatawakening,90gl0h,1,6,879f283b831c13474e219e88663d95b0763cca9b,Tucker's Final Exam 7/19 - Lights Out/Storm!,Tucker has been making little drops here and t...,True,self.greatawakening,https://www.reddit.com/r/greatawakening/commen...,/r/greatawakening/comments/90gl0h/tuckers_fina...,-1.0,2018-07-20 14:46:01
16,greatawakening,93jk5j,1,5,879f283b831c13474e219e88663d95b0763cca9b,Way to go PATRIOTS in Tampa!,I have chills and tears of joy watching you al...,True,self.greatawakening,https://www.reddit.com/r/greatawakening/commen...,/r/greatawakening/comments/93jk5j/way_to_go_pa...,-1.0,2018-07-31 23:20:55
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773808,howardstern,62004j,12,8,c039f11ff985f7d8547748c10ccd5c8416b8404d,BABABOOEY on the wrap up show,Is talking so godamn fast with his JOHN hein c...,True,self.howardstern,https://www.reddit.com/r/howardstern/comments/...,/r/howardstern/comments/62004j/bababooey_on_th...,-1.0,2017-03-28 15:25:29
2774089,The_Donald,8eky0u,13,3,c039f11ff985f7d8547748c10ccd5c8416b8404d,IF IRAN RESTARTS THEIR NUCLEAR PROGRAM THEY WI...,[removed],True,self.The_Donald,https://www.reddit.com/r/The_Donald/comments/8...,/r/The_Donald/comments/8eky0u/if_iran_restarts...,-1.0,2018-04-24 14:46:14
2774632,The_Donald,9uoze6,1,0,c039f11ff985f7d8547748c10ccd5c8416b8404d,WE NEED HIGH ENERGY TODAY PEDES!! NEVER FORGET!!!,[removed],True,self.The_Donald,https://www.reddit.com/r/The_Donald/comments/9...,/r/The_Donald/comments/9uoze6/we_need_high_ene...,-1.0,2018-11-06 14:57:01
2774930,unpopularopinion,avxrw3,6,4,c039f11ff985f7d8547748c10ccd5c8416b8404d,Miracle whip is much better than mayo.,I don’t know if it’s because of where I grew u...,True,self.unpopularopinion,https://www.reddit.com/r/unpopularopinion/comm...,/r/unpopularopinion/comments/avxrw3/miracle_wh...,-1.0,2019-02-28 23:43:24


## Sample Dataset

In [23]:
subreddit = "TruthLeaks"
# Get only submissions from given subreddit
df = df_submissions[df_submissions['subreddit'] == subreddit]
# Drop na in subset
df = df.dropna(subset=['text'])
df = df[df['text'] != df['title']]

## Removed [removed] from text and title
df = df[df['title'] != "[removed]"]
df = df[df['text'] != "[removed]"]

## Remove duplicates in text/title
df = df.drop_duplicates(subset=['text', 'title'])
df

Unnamed: 0,subreddit,id,score,numReplies,author,title,text,is_self,domain,url,permalink,upvote_ratio,date_created
259450,TruthLeaks,c2bbpp,2,0,ce90110e1a553f8e6c03aed8d5183b5303b3721e,hacks and leaks rough timeline,\n\n{I've been trying to read up on the claim...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/c...,/r/TruthLeaks/comments/c2bbpp/hacks_and_leaks_...,-1.0,2019-06-19 02:24:41
259454,TruthLeaks,c2tbsn,3,3,ce90110e1a553f8e6c03aed8d5183b5303b3721e,Catholic Relief Services (1985) involved in di...,pg1\n\n[https://web.archive.org/web/2017112119...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/c...,/r/TruthLeaks/comments/c2tbsn/catholic_relief_...,-1.0,2019-06-20 08:10:04
260132,TruthLeaks,he84kn,1,2,ce90110e1a553f8e6c03aed8d5183b5303b3721e,Steve Bing to Pay $200K for Clinton Korea Trip...,\n\n# Steve Bing to Pay $200K for Clinton Kor...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/h...,/r/TruthLeaks/comments/he84kn/steve_bing_to_pa...,1.0,2020-06-23 04:42:46
277659,TruthLeaks,6chtnt,4,0,017d9eab0596673d332fd9d916c3775deb4568e1,Mainlining Money - Major Southern California N...,May 21 2017 Orange County Register published t...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/6...,/r/TruthLeaks/comments/6chtnt/mainlining_money...,-1.0,2017-05-21 17:40:13
277664,TruthLeaks,6d007z,3,0,017d9eab0596673d332fd9d916c3775deb4568e1,How Ironic - Podesta Received an Email From Da...,Here's WL Podesta email he received from Daily...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/6...,/r/TruthLeaks/comments/6d007z/how_ironic_podes...,-1.0,2017-05-24 04:42:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2539097,TruthLeaks,j5k1jo,1,2,15ab943a0e8ec63f3b75e841a35ae082b5878521,"The Left's Scaredy-Karen Response to COVID, Sh...","They'll never make the connection, so we have ...",True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/j...,/r/TruthLeaks/comments/j5k1jo/the_lefts_scared...,1.0,2020-10-05 14:08:48
2539098,TruthLeaks,j84m0e,1,0,15ab943a0e8ec63f3b75e841a35ae082b5878521,"Stop Posting ""Different types of aliens"" on ea...",This message is for that one group of people t...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/j...,/r/TruthLeaks/comments/j84m0e/stop_posting_dif...,1.0,2020-10-09 18:37:31
2539099,TruthLeaks,j84oqa,1,3,15ab943a0e8ec63f3b75e841a35ae082b5878521,"Let's Get Real Here: If the ""Fly"" Won the VP D...",Change my mind\n\nEttiquete Robot vs Phoney De...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/j...,/r/TruthLeaks/comments/j84oqa/lets_get_real_he...,1.0,2020-10-09 18:41:38
2539122,TruthLeaks,kpdboe,1,0,15ab943a0e8ec63f3b75e841a35ae082b5878521,Neighborhood News Studio on Bitchute -- George...,Bitchute NNS Channel\n\nhttps://www.bitch yut...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/k...,/r/TruthLeaks/comments/kpdboe/neighborhood_new...,1.0,2021-01-03 04:09:49


## Topic Modellingfrom gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel


In [24]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel


In [25]:
def align_topics(verbose=False):
    """Finds the best relabelling (permutation) to maxamise alignment of title_topic and text_topic.
    """

    from itertools import permutations

    data = pd.crosstab(df.title_topic, df.text_topic).values
    n = data.shape[0]

    max_score = np.finfo(float).min
    max_permutation = None
    for permutation in permutations(range(n)):
        tmp =  np.array([data[k] for k in permutation])
        score = tmp.trace()
        if score>max_score:
            max_score = score
            max_permutation = permutation

    if verbose:
        mapping = ", ".join([f"{k}->{v}"  for k,v in enumerate(max_permutation)])
        print(f"Max score {max_score} obtained with relabeling:\n\t {mapping}\nand resulting cross table of\n{data}")

    return max_score, max_permutation, np.array([data[k] for k in max_permutation])



In [26]:
def gen_tokens(t):
    return generate_tokens(t, tokenizer=tokenizer_3, lemmatizer=lemmatizer, default_stopwords=default_stopwords_1, min_token_length=3)

Generate tokens and ngrams

In [27]:
# Text
df['text_tokens'] = df['text'].apply(gen_tokens)

bigram = Phrases(df['text_tokens'], min_count=10)
trigram = Phrases(bigram[df['text_tokens']])
df['text_ngrams'] = df['text_tokens'].apply(lambda t: generate_ngrams(t, bigram, trigram))

# Title
df['title_tokens'] = df['title'].apply(gen_tokens)

bigram = Phrases(df['title_tokens'], min_count=10)
trigram = Phrases(bigram[df['title_tokens']])
df['title_ngrams'] = df['title_tokens'].apply(lambda t: generate_ngrams(t, bigram, trigram))

df


Unnamed: 0,subreddit,id,score,numReplies,author,title,text,is_self,domain,url,permalink,upvote_ratio,date_created,text_tokens,text_ngrams,title_tokens,title_ngrams
259450,TruthLeaks,c2bbpp,2,0,ce90110e1a553f8e6c03aed8d5183b5303b3721e,hacks and leaks rough timeline,\n\n{I've been trying to read up on the claim...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/c...,/r/TruthLeaks/comments/c2bbpp/hacks_and_leaks_...,-1.0,2019-06-19 02:24:41,"[been, trying, read, the, claim, about, russia...","[been, trying, read, the, claim, about, russia...","[hack, and, leak, rough, timeline]","[hack, and, leak, rough, timeline]"
259454,TruthLeaks,c2tbsn,3,3,ce90110e1a553f8e6c03aed8d5183b5303b3721e,Catholic Relief Services (1985) involved in di...,pg1\n\n[https://web.archive.org/web/2017112119...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/c...,/r/TruthLeaks/comments/c2tbsn/catholic_relief_...,-1.0,2019-06-20 08:10:04,"[pg1, http, web, archive, org, web, 2017112119...","[pg1, http_web_archive, org_web, 2017112119482...","[catholic, relief, service, 1985, involved, di...","[catholic, relief, service, 1985, involved, di..."
260132,TruthLeaks,he84kn,1,2,ce90110e1a553f8e6c03aed8d5183b5303b3721e,Steve Bing to Pay $200K for Clinton Korea Trip...,\n\n# Steve Bing to Pay $200K for Clinton Kor...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/h...,/r/TruthLeaks/comments/he84kn/steve_bing_to_pa...,1.0,2020-06-23 04:42:46,"[steve, bing, pay, $200, for, clinton, korea, ...","[steve, bing, pay, $200, for, clinton, korea, ...","[steve, bing, pay, $200, for, clinton, korea, ...","[steve, bing, pay, $200, for, clinton, korea, ..."
277659,TruthLeaks,6chtnt,4,0,017d9eab0596673d332fd9d916c3775deb4568e1,Mainlining Money - Major Southern California N...,May 21 2017 Orange County Register published t...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/6...,/r/TruthLeaks/comments/6chtnt/mainlining_money...,-1.0,2017-05-21 17:40:13,"[may, 2017, orange, county, register, publishe...","[may_2017, orange, county, register, published...","[mainlining, money, major, southern, californi...","[mainlining, money, major, southern, californi..."
277664,TruthLeaks,6d007z,3,0,017d9eab0596673d332fd9d916c3775deb4568e1,How Ironic - Podesta Received an Email From Da...,Here's WL Podesta email he received from Daily...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/6...,/r/TruthLeaks/comments/6d007z/how_ironic_podes...,-1.0,2017-05-24 04:42:00,"[here, podesta, email, received, from, daily, ...","[here, podesta_email, received, from, daily_ne...","[how, ironic, podesta, received, email, from, ...","[how, ironic, podesta, received, email, from, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2539097,TruthLeaks,j5k1jo,1,2,15ab943a0e8ec63f3b75e841a35ae082b5878521,"The Left's Scaredy-Karen Response to COVID, Sh...","They'll never make the connection, so we have ...",True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/j...,/r/TruthLeaks/comments/j5k1jo/the_lefts_scared...,1.0,2020-10-05 14:08:48,"[they, never, make, the, connection, have, mak...","[they, never, make, the, connection, have, mak...","[the, left, scaredy, karen, response, covid, s...","[the_left, scaredy, karen, response, covid, sh..."
2539098,TruthLeaks,j84m0e,1,0,15ab943a0e8ec63f3b75e841a35ae082b5878521,"Stop Posting ""Different types of aliens"" on ea...",This message is for that one group of people t...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/j...,/r/TruthLeaks/comments/j84m0e/stop_posting_dif...,1.0,2020-10-09 18:37:31,"[this, message, for, that, one, group, people,...","[this, message, for, that, one, group, people,...","[stop, posting, different, type, alien, earth,...","[stop, posting, different, type, alien, earth,..."
2539099,TruthLeaks,j84oqa,1,3,15ab943a0e8ec63f3b75e841a35ae082b5878521,"Let's Get Real Here: If the ""Fly"" Won the VP D...",Change my mind\n\nEttiquete Robot vs Phoney De...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/j...,/r/TruthLeaks/comments/j84oqa/lets_get_real_he...,1.0,2020-10-09 18:41:38,"[change, mind, ettiquete, robot, phoney, demon...","[change, mind, ettiquete, robot, phoney, demon...","[let, get, real, here, the, fly, won, the, deb...","[let, get, real, here, the, fly, won, the, deb..."
2539122,TruthLeaks,kpdboe,1,0,15ab943a0e8ec63f3b75e841a35ae082b5878521,Neighborhood News Studio on Bitchute -- George...,Bitchute NNS Channel\n\nhttps://www.bitch yut...,True,self.TruthLeaks,https://www.reddit.com/r/TruthLeaks/comments/k...,/r/TruthLeaks/comments/kpdboe/neighborhood_new...,1.0,2021-01-03 04:09:49,"[bitchute, nns, channel, http, www, bitch, yut...","[bitchute, nns, channel, http_www, bitch, yute...","[neighborhood, news, studio, bitchute, george,...","[neighborhood, news, studio, bitchute, george_..."


In [38]:
def generate_LDAModel(docs, num_topics=5, chunkize=2000, passes=20, iterations=400, eval_every=1):
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    dictionary[0]
    id2word = dictionary.id2token

    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
        alpha='auto', eta='auto',
        iterations=iterations, num_topics=num_topics,
        passes=passes, eval_every=eval_every)
    
    coherence_model_lda = CoherenceModel(model=model, texts=docs, dictionary=dictionary,coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda)
    
    def get_topic(doc):
        bow = dictionary.doc2bow(doc)               # convert tokens to bow
        topics = model.get_document_topics(bow)     # get topic,prob based on bow
        return np.array(topics).argmax(axis=0)[1]   # pick topic with highest probability

    return docs.apply(get_topic)

In [39]:
print("\nTitle Topic")
df['title_topic'] = generate_LDAModel(df['title_ngrams'])
print("\nText Topic")
df['text_topic'] = generate_LDAModel(df['text_ngrams'])



Title Topic
Coherence Score:  0.5401265280133998

Text Topic
Coherence Score:  0.3008391294331425


In [40]:
score, permutation, data = align_topics(True)

Max score 548 obtained with relabeling:
	 0->0, 1->1, 2->2, 3->3, 4->4
and resulting cross table of
[[303  71  14   0  33]
 [278 188  54   3  23]
 [165 121  55   6   0]
 [134  88  21   0   0]
 [165 106  36   1   2]]
