# VOCAB JACCARD SIMILARITY
News and Politics Subreddits + BookCorpus

In [9]:
import csv
import pandas as pd
import seaborn as sns
import re
import numpy as np
import os

from collections import Counter, defaultdict
from html import unescape

In [2]:
# Load 50k documents per corpus.
# Maps a corpus name to the pandas Series of cleaned documents sampled from it.
comment_dict = dict()

# Define function to clean text
def clean(text):
    """Normalise a raw document to lowercase ASCII words joined by single spaces.

    Steps:
      1. Decode HTML entities (e.g. ``&amp;`` becomes ``&``).
      2. Replace every character that is not an ASCII letter with a space;
         this drops punctuation, digits and non-ASCII characters alike.
      3. Lowercase and collapse all runs of whitespace.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text; an empty string if no ASCII letters remain.
    """
    decoded = unescape(text)
    letters_only = re.sub('[^a-zA-Z]', ' ', decoded)
    # Only ASCII letters and spaces are left here, so lowercasing the whole
    # string once gives the same result as lowercasing token by token.
    tokens = letters_only.lower().split()
    return ' '.join(tokens)

# Politics subreddits: one corpus per distinct value in the `label` column.
pol_df = pd.read_csv("../../0_data/clean/labelled_reddit/total/train_rand_320k.csv")

for subreddit in pol_df.label.unique():
    subreddit_rows = pol_df[pol_df.label == subreddit]
    # Fixed seed so the same 50k comments are drawn on every run.
    sampled = subreddit_rows.clean_text.sample(n=50000, random_state=123)
    comment_dict[subreddit] = sampled.apply(clean)


# News subreddits: r/news and r/worldnews are stored as a single corpus
# because labels cannot easily be reconstructed from the unlabelled data.
with open("../../0_data/clean/unlabelled_reddit/total/test_rand_50k.txt", 'r', newline='') as f:
    raw_text = f.read()

# One document per line; blank / whitespace-only lines are discarded.
lines = [x for x in raw_text.rstrip('\r\n').splitlines() if x.strip()]
comment_dict["news+worldnews"] = pd.Series(lines).sample(n=50000, random_state=123).apply(clean)

In [58]:
%%time

# for the BooksCorpus that BERT was pretrained on

directory = "../../0_data/raw/BookCorpus"

SKIP_LINES = 300      # leading lines skipped per book (title, copyright stuff etc.)
LINES_PER_BOOK = 16   # number of lines inspected per book after the skipped header

lines = []
file_counter = 0
# os.listdir returns entries in arbitrary order. Sorting fixes the order of
# `lines`, so the seeded sample below selects the same documents on every
# machine and every run.
for file_counter, filename in enumerate(sorted(os.listdir(directory)), start=1):
    with open(os.path.join(directory, filename), newline='') as f:
        # line_counter is <= 0 inside the skipped header and 1 on the first kept line.
        for line_counter, line in enumerate(f, start=1 - SKIP_LINES):
            # Length is measured on the raw line, i.e. including its line terminator.
            if line_counter > 0 and 15 < len(line) < 240:
                lines.append(line.rstrip('\r\n'))
            if line_counter >= LINES_PER_BOOK:
                break
    if file_counter % 1000 == 0:
        print("progress:", file_counter)

# Raises ValueError if fewer than 50k candidate lines were collected.
comment_dict["bookcorpus"] = pd.Series(lines).sample(n=50000, random_state=123).apply(clean)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
CPU times: user 6.73 s, sys: 5.75 s, total: 12.5 s
Wall time: 1min 59s


In [63]:
# Define function to create dictionary
def create_dict(comment_list):
    """Count how often each whitespace-separated token occurs in a corpus.

    Parameters
    ----------
    comment_list : iterable of str
        The documents to count over (a list or a pandas Series both work).

    Returns
    -------
    collections.Counter
        Token -> number of occurrences across all documents. No frequency
        filtering is applied; every token that appears is kept.
    """
    vocab = Counter()
    for comment in comment_list:
        # str.split() with no argument already ignores leading/trailing whitespace.
        vocab.update(comment.split())
    return vocab

In [64]:
# Build one vocabulary (token -> count) per corpus, keyed like comment_dict
vocab_dicts = {}

for key, comments in comment_dict.items():
    vocab_dicts[key] = create_dict(comments)

In [65]:
# Define function to compute Jaccard similarity
def jaccard_sim(vocab_1, vocab_2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two vocabularies.

    Parameters
    ----------
    vocab_1, vocab_2 : iterable of hashable
        The vocabularies to compare. Only the distinct elements matter; for a
        dict or Counter these are its keys, so counts are ignored.

    Returns
    -------
    float
        A value in [0, 1]. Two empty vocabularies are treated as identical and
        give 1.0, consistent with every vocabulary's similarity to itself,
        instead of raising ZeroDivisionError.
    """
    # Build each set once instead of once per operation.
    words_1, words_2 = set(vocab_1), set(vocab_2)
    union_size = len(words_1 | words_2)
    if union_size == 0:
        return 1.0
    return len(words_1 & words_2) / union_size

In [75]:
# Pairwise similarity table: corpus name -> list of Jaccard similarities
# against every corpus, in comment_dict's iteration order.
table = {}

for key_i in comment_dict:
    table[key_i] = [
        jaccard_sim(vocab_dicts[key_i], vocab_dicts[key_j])
        for key_j in comment_dict
    ]

In [84]:
# Display table
# Short labels used for both the rows and the columns of the heatmap.
short_names = {
    "Libertarian": "LIB",
    "ChapoTrapHouse": "CTH",
    "Conservative": "CON",
    "politics": "POL",
    "The_Donald": "T_D",
    "news+worldnews": "NWN",
    "bookcorpus": "BC",
}

plot_df = pd.DataFrame.from_dict(table, orient='index', columns=list(table))
# Reassign instead of renaming in place so the cell gives the same result when re-run.
plot_df = plot_df.rename(columns=short_names, index=short_names)

cm = sns.color_palette('Blues', as_cmap=True)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0.
# A "{:.2f}" formatter renders floats with two decimals on old and new versions.
(
    plot_df.style
    .background_gradient(cmap=cm, axis=None)
    .set_table_attributes('style="font-family: CMU Serif; font-size:26px"')
    .format("{:.2f}")
)

Unnamed: 0,LIB,CTH,CON,POL,T_D,NWN,BC
LIB,1.0,0.42,0.48,0.47,0.44,0.46,0.33
CTH,0.42,1.0,0.43,0.43,0.42,0.42,0.33
CON,0.48,0.43,1.0,0.48,0.46,0.46,0.34
POL,0.47,0.43,0.48,1.0,0.45,0.46,0.34
T_D,0.44,0.42,0.46,0.45,1.0,0.44,0.34
NWN,0.46,0.42,0.46,0.46,0.44,1.0,0.34
BC,0.33,0.33,0.34,0.34,0.34,0.34,1.0
