In [80]:
import csv
import pandas as pd
import seaborn as sns
import re
import numpy as np

from collections import Counter, defaultdict
from html import unescape

In [130]:
# Store of cleaned comments, keyed by source name. The loading code later in
# this cell adds one entry of 50k sampled documents per subreddit.
comment_dict = dict()

# Define function to clean text
def clean(text):
    """Normalise a raw comment into lowercase, space-separated ASCII words.

    HTML entities are decoded first. Every character that is not an ASCII
    letter (digits, punctuation, whitespace, non-ASCII characters) is then
    treated as a separator; the remaining letter runs are lowercased and
    joined with single spaces.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string; empty if ``text`` contains no ASCII letters.
    """
    decoded = unescape(text)
    # Anything outside a-z/A-Z becomes a space, so split() only sees letter runs.
    letters_only = re.sub('[^a-zA-Z]', ' ', decoded)
    tokens = letters_only.lower().split()
    return ' '.join(tokens)

# for the five politics subreddits
pol_df = pd.read_csv("../../0_data/clean/labelled_reddit/total/train_rand_320k.csv")

# Draw a seeded 50k sample of the `clean_text` column for every label value
# and run each sampled comment through clean().
for subreddit in pol_df["label"].unique():
    label_comments = pol_df.loc[pol_df["label"] == subreddit, "clean_text"]
    comment_dict[subreddit] = label_comments.sample(n=50000, random_state=123).apply(clean)


# for the two news subreddits
# r/news and r/worldnews are combined because labels cannot easily be reconstructed from unlabelled data
with open("../../0_data/clean/unlabelled_reddit/total/test_rand_50k.txt",'r', newline='') as f:
    # One document per line; whitespace-only lines are discarded.
    lines = [ln for ln in f.read().rstrip('\r\n').splitlines() if ln.strip()]

# Same sample size and seed as the labelled subreddits above.
comment_dict["news+worldnews"] = pd.Series(lines).sample(n=50000, random_state=123).apply(clean)

In [131]:
# Define function to create dictionary
def create_dict(comment_list):
    """Count how often each whitespace-separated token occurs across comments.

    Args:
        comment_list: Iterable of comment strings (for example a list or a
            pandas Series of cleaned text).

    Returns:
        A ``collections.Counter`` mapping each token to its total number of
        occurrences over all comments. No frequency filtering is applied:
        every token that occurs at least once is kept.
    """
    # Stream tokens straight into the Counter instead of first materialising
    # one list holding every token of every comment. str.split() with no
    # argument already ignores leading/trailing whitespace.
    return Counter(
        token
        for comment in comment_list
        for token in comment.split()
    )

In [132]:
# Build one vocabulary (token -> frequency Counter from create_dict) for
# every source held in comment_dict, under the same key.
vocab_dicts = {}

for key, docs in comment_dict.items():
    vocab_dicts[key] = create_dict(docs)

In [133]:
# Define function to compute Jaccard similarity
def jaccard_sim(vocab_1, vocab_2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two vocabularies.

    Only membership matters: each argument is converted to a set, so for a
    dict or Counter the keys are compared and the counts are ignored.

    Args:
        vocab_1: Iterable of hashable tokens (e.g. a Counter keyed by word).
        vocab_2: Iterable of hashable tokens.

    Returns:
        A float in [0, 1]. Two empty vocabularies are treated as identical
        and yield 1.0 rather than raising ZeroDivisionError.
    """
    # Build each set once and reuse it for both intersection and union.
    set_1, set_2 = set(vocab_1), set(vocab_2)
    union_size = len(set_1 | set_2)
    if union_size == 0:
        # Both inputs empty: the ratio is 0/0. Return 1.0 so that comparing
        # a vocabulary with itself always gives 1.0, even when it is empty.
        return 1.0
    return len(set_1 & set_2) / union_size

In [134]:
# Row and column order of the similarity matrix.
subreddits = ["The_Donald", "Libertarian", "Conservative", "politics", "ChapoTrapHouse", "news+worldnews"]

# table[row] lists the Jaccard similarity of `row` against every entry of
# `subreddits`, in that order, so the self-comparison is included.
table = {}

for key_i in subreddits:
    row_vocab = vocab_dicts[key_i]
    table[key_i] = [jaccard_sim(row_vocab, vocab_dicts[key_j]) for key_j in subreddits]

In [135]:
# Display table
# Short labels used for both the row index and the column headers.
abbreviations = {"Libertarian": "LIB", "ChapoTrapHouse": "CTH", "Conservative": "CON", "politics": "POL", "The_Donald": "T_D", "news+worldnews": "NWN"}

# Build the square similarity matrix and relabel both axes in one chain, so
# re-running the cell always starts from `table` rather than mutating in place.
plot_df = (
    pd.DataFrame.from_dict(table, orient='index', columns=subreddits)
    .rename(columns=abbreviations, index=abbreviations)
)

cm = sns.color_palette('Greens', as_cmap=True)

# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0. An
# explicit format string renders two decimals on both old and new versions.
# The parenthesised Styler is the cell's last expression, so Jupyter renders it.
(
    plot_df.style
    .background_gradient(cmap=cm, axis=0)
    .set_table_attributes('style="font-family: CMU Serif; font-size:26px"')
    .format("{:.2f}")
)

Unnamed: 0,T_D,LIB,CON,POL,CTH,NWN
T_D,1.0,0.44,0.46,0.45,0.42,0.44
LIB,0.44,1.0,0.48,0.47,0.42,0.46
CON,0.46,0.48,1.0,0.48,0.43,0.46
POL,0.45,0.47,0.48,1.0,0.43,0.46
CTH,0.42,0.42,0.43,0.43,1.0,0.42
NWN,0.44,0.46,0.46,0.46,0.42,1.0
