# EVALUATION OF PROPN FREQUENCY MEASURES IN PSP TEST SETS

In [1]:
import pandas as pd
import csv
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix
from ast import literal_eval
from scipy.stats import spearmanr, ttest_ind, pearsonr
from collections import Counter

### Import Data for Analysis

In [2]:
%%time

# Build full_df_dict: one DataFrame per monthly test set, keyed by the file stem
# without its "pol_" prefix (e.g. "test_2017_03_5k"). Each frame starts with the
# token breakdown and is then extended, via index-aligned merges, with the
# document-level MLM loss differences and the predictions of three model types.

# load token breakdown of test documents

full_df_dict = {}

directory = '../../0_data/clean/unlabelled_reddit/error_analysis'

token_file_prefix = "pol_"

# NOTE: the loop variable is deliberately not called `csv`, which would shadow
# the `csv` module imported at the top of the notebook.
for fname in sorted(os.listdir(directory)):
    if not fname.endswith("_5k.csv"):
        continue

    print(f"loading {fname} token set")

    # Remove the literal prefix. str.lstrip("pol_") would strip any leading run of
    # the characters p/o/l/_ instead, which only works by accident for "test_..." stems.
    stem = os.path.splitext(fname)[0]
    key = stem[len(token_file_prefix):] if stem.startswith(token_file_prefix) else stem

    tokens_df = pd.read_csv(
        os.path.join(directory, fname),
        converters={"tokens_pos": literal_eval, "text_pos": literal_eval},
    )[["text", "text_pos", "tokens_pos"]]

    # Re-label the rows with a seeded permutation of the index and sort by it.
    # NOTE(review): presumably this reproduces the shuffled row order of the loss /
    # prediction files so the index-based merges below line up - confirm.
    tokens_df.index = tokens_df.sample(frac=1, random_state=123).index
    full_df_dict[key] = tokens_df.sort_index()


# load document-level MLM CE loss and merge with test set DFs

directory = '../../0_data/clean/labelled_reddit/error_analysis'

for fname in os.listdir(directory):
    key = os.path.splitext(fname)[0]
    if key in full_df_dict:  # only load and merge matching test set
        loss_df = pd.read_csv(os.path.join(directory, fname))[["label", "ce_diff_base_rand", "ce_diff_rand_match"]]
        full_df_dict[key] = full_df_dict[key].merge(loss_df, left_index=True, right_index=True)


id_to_label = {
    0: "ChapoTrapHouse",
    1: "Conservative",
    2: "Libertarian",
    3: "The_Donald",
    4: "politics"
}

# load document-level prediction results for different model types
for model, name in [("base+month", "base"), ("rand+month", "rand"),  ("month+month", "match")]:

    directory = f"../../0_results/classification/reddit/month-models/{model}"
    pred_col = f"pred_{name}"

    for fname in os.listdir(directory):

        for key in full_df_dict.keys():

            # a result file belongs to a test set when it names that test set, the
            # training set of the same month, and the "_20k-test" marker
            if (key in fname) and ("train_"+re.search("test_(.*?)_5k", key).group(1) in fname) and ("_20k-test" in fname):

                in_df = (
                    pd.read_csv(os.path.join(directory, fname))[["index", "prediction"]]
                    .rename(columns={"prediction": pred_col})
                    .set_index("index")
                )

                # Map numeric class ids to subreddit names in one step. Assigning the
                # result back avoids the chained in-place replace on a column, which
                # is not guaranteed to propagate (pandas Copy-on-Write).
                in_df[pred_col] = in_df[pred_col].replace(id_to_label)

                full_df_dict[key] = full_df_dict[key].merge(in_df, left_index=True, right_index=True)
    

loading pol_test_2017_03_5k.csv token set
loading pol_test_2017_04_5k.csv token set
loading pol_test_2017_05_5k.csv token set
loading pol_test_2017_06_5k.csv token set
loading pol_test_2017_07_5k.csv token set
loading pol_test_2017_08_5k.csv token set
loading pol_test_2017_09_5k.csv token set
loading pol_test_2017_10_5k.csv token set
loading pol_test_2017_11_5k.csv token set
loading pol_test_2017_12_5k.csv token set
loading pol_test_2018_01_5k.csv token set
loading pol_test_2018_02_5k.csv token set
loading pol_test_2018_03_5k.csv token set
loading pol_test_2018_04_5k.csv token set
loading pol_test_2018_05_5k.csv token set
loading pol_test_2018_06_5k.csv token set
loading pol_test_2018_07_5k.csv token set
loading pol_test_2018_08_5k.csv token set
loading pol_test_2018_09_5k.csv token set
loading pol_test_2018_10_5k.csv token set
loading pol_test_2018_11_5k.csv token set
loading pol_test_2018_12_5k.csv token set
loading pol_test_2019_01_5k.csv token set
loading pol_test_2019_02_5k.csv to

### Concatenate Monthly Sets to Full DF for Analysis

In [3]:
%%time

# Tag every monthly frame with the key of the test set it came from (the column is
# added to the frames stored in full_df_dict themselves), then stack all months
# into one frame with a fresh 0..n-1 index.
for source_name, month_df in full_df_dict.items():
    month_df["source"] = source_name

overall_df = pd.concat(list(full_df_dict.values()), ignore_index=True)
overall_df

CPU times: user 209 ms, sys: 145 ms, total: 353 ms
Wall time: 386 ms


Unnamed: 0,text,text_pos,tokens_pos,label,ce_diff_base_rand,ce_diff_rand_match,pred_base,pred_rand,pred_match,source
0,Nice argument there twinky. Tell your mom I sa...,"[[Nice, ADJ], [argument, NOUN], [there, ADV], ...","[[[CLS], SPECIAL], [nice, ADJ], [argument, NOU...",Libertarian,2.450879,0.389335,Libertarian,politics,Libertarian,test_2017_03_5k
1,"Yes, why should the rest of the world be entit...","[[Yes, INTJ], [,, PUNCT], [why, ADV], [should,...","[[[CLS], SPECIAL], [yes, INTJ], [,, PUNCT], [w...",Libertarian,-0.073872,0.014518,Libertarian,Libertarian,Libertarian,test_2017_03_5k
2,Definitely. They are getting plenty of corpora...,"[[Definitely, ADV], [., PUNCT], [They, PRON], ...","[[[CLS], SPECIAL], [definitely, ADV], [., PUNC...",politics,9.096548,1.122668,politics,politics,politics,test_2017_03_5k
3,"Their data is protected as ""national security""...","[[Their, PRON], [data, NOUN], [is, AUX], [prot...","[[[CLS], SPECIAL], [their, PRON], [data, NOUN]...",politics,0.090673,-1.162980,Libertarian,Libertarian,Libertarian,test_2017_03_5k
4,"they posted a pic months ago from a ""white hou...","[[they, PRON], [posted, VERB], [a, DET], [pic,...","[[[CLS], SPECIAL], [they, PRON], [posted, VERB...",ChapoTrapHouse,1.158620,1.648456,The_Donald,ChapoTrapHouse,politics,test_2017_03_5k
...,...,...,...,...,...,...,...,...,...,...
179995,I’d be disappointed in them if they didn’t. /s,"[[I, PRON], [’, VERB], [d, X], [be, AUX], [dis...","[[[CLS], SPECIAL], [i, PRON], [’, VERB], [d, X...",Conservative,51.887914,0.114969,Conservative,Conservative,Conservative,test_2020_02_5k
179996,Bernie has a wider margin against Trump when y...,"[[Bernie, PROPN], [has, VERB], [a, DET], [wide...","[[[CLS], SPECIAL], [bernie, PROPN], [has, VERB...",politics,5.673599,3.070925,politics,politics,politics,test_2020_02_5k
179997,I just wish Chelsea Handler would wind up as a...,"[[I, PRON], [just, ADV], [wish, VERB], [Chelse...","[[[CLS], SPECIAL], [i, PRON], [just, ADV], [wi...",Conservative,7.047268,2.106946,The_Donald,The_Donald,ChapoTrapHouse,test_2020_02_5k
179998,NO major newspaper coverage. [twatter (sic) li...,"[[NO, DET], [major, ADJ], [newspaper, NOUN], [...","[[[CLS], SPECIAL], [no, DET], [major, ADJ], [n...",The_Donald,14.046416,-1.121283,The_Donald,The_Donald,The_Donald,test_2020_02_5k


### Set up Counter() dictionaries

In [319]:
%%time

# create PROPN counters for each month and subreddit
#
# propn_counter[month][label][token] is the number of documents with that label in
# that month's test set that contain `token` tagged as PROPN at least once
# (document frequency, not token frequency).
subreddit_labels = ['Libertarian', 'politics', 'ChapoTrapHouse', 'Conservative', 'The_Donald']

propn_counter = {}
for month, month_df in full_df_dict.items():
    month_counters = {label: Counter() for label in subreddit_labels}

    # Single pass over the documents: each row is routed to the counter of its own
    # label instead of re-scanning the whole frame once per label with iterrows().
    for label, tokens_pos in zip(month_df["label"], month_df["tokens_pos"]):
        label_counter = month_counters.get(label)
        if label_counter is None:
            # label outside the five subreddits of interest: not counted
            continue

        # count only first occurrence --> how many docs rather than how many tokens;
        # dict.fromkeys de-duplicates while keeping first-seen order, so counter
        # insertion order (and hence tie order in most_common) is unchanged
        doc_propns = dict.fromkeys(elem[0] for elem in tokens_pos if elem[1] == "PROPN")
        label_counter.update(tuple(doc_propns))

    propn_counter[month] = month_counters

CPU times: user 1min 28s, sys: 11.5 s, total: 1min 39s
Wall time: 2min 2s


In [320]:
%%time
# create overall counters for each month
#
# Only the five subreddit counters are added up. Summing every value stored under
# propn_counter[month] would include an already existing "total" entry when this
# cell is run a second time and double the totals.
# Counter.update adds counts in place; this matches Counter addition as long as all
# stored counts are positive (assumed: counts are only ever incremented from zero).
subreddit_labels = ['Libertarian', 'politics', 'ChapoTrapHouse', 'Conservative', 'The_Donald']

for month in propn_counter:
    month_total = Counter()
    for label in subreddit_labels:
        month_total.update(propn_counter[month][label])
    propn_counter[month]["total"] = month_total

# create overall counter across all months
overall_counter = {}
for label in subreddit_labels + ["total"]:
    label_total = Counter()
    for month in propn_counter:
        label_total.update(propn_counter[month][label])
    overall_counter[label] = label_total

CPU times: user 1.12 s, sys: 565 ms, total: 1.69 s
Wall time: 1.82 s


### Analysis of most-improved PROPNs

In [393]:
# Read the PROPN tokens together with the month (source) they come from.
import_df = pd.read_csv("../../0_data/clean/labelled_reddit/error_analysis/most_improved_propn.csv")

# Keep the first N% of rows in file order, then collapse repeated
# (token, source month) pairs to their first occurrence.
# NOTE(review): taking head() as "top" assumes the CSV is already sorted - confirm.
N=10
n_top_rows = int(import_df.shape[0]*N/100)
most_improved_df = (
    import_df
    .head(n_top_rows)
    .drop_duplicates(subset=["masked_token_text", "source"])
    .copy()
)

In [395]:
%%time

# how many subreddits did they appear in?

def count_subs(row):
    """Return in how many of the five subreddits the row's token occurs.

    Looks up ``row.masked_token_text`` in the per-subreddit counters of the
    month given by ``row.source`` (module-level ``propn_counter``) and counts
    the subreddits whose count for that token is greater than zero.
    """
    month_counters = propn_counter[row.source]
    token = row.masked_token_text
    return sum(
        1
        for sub in ('Libertarian', 'politics', 'ChapoTrapHouse', 'Conservative', 'The_Donald')
        if month_counters[sub][token] > 0
    )

def count_total(row):
    """Return the row's token count in the "total" counter of its source month.

    Reads the module-level ``propn_counter`` using ``row.source`` as the month
    key and ``row.masked_token_text`` as the token.
    """
    month_total = propn_counter[row.source]["total"]
    token = row.masked_token_text
    return month_total[token]

def max_count_single_sub(row):
    """Return the largest per-subreddit count of the row's token, at least 0.

    The counts are read from the module-level ``propn_counter`` for the month
    ``row.source`` and the token ``row.masked_token_text`` across the five
    subreddits.
    """
    month_counters = propn_counter[row.source]
    token = row.masked_token_text
    per_sub_counts = [
        month_counters[sub][token]
        for sub in ('Libertarian', 'politics', 'ChapoTrapHouse', 'Conservative', 'The_Donald')
    ]
    # the leading 0 is the floor the running maximum starts from
    return max(0, *per_sub_counts)

# Add one column per row-level statistic, computed row by row (axis=1), in this
# order: number of subreddits with the token, total count, largest single-subreddit count.
for stat_column, row_stat in (
    ("n_sub_with_occurrence", count_subs),
    ("n_total", count_total),
    ("n_max_sub", max_count_single_sub),
):
    most_improved_df[stat_column] = most_improved_df.apply(row_stat, axis=1)

CPU times: user 769 ms, sys: 6.16 ms, total: 775 ms
Wall time: 779 ms


In [400]:
# Summarise the most-improved PROPN tokens by the number of subreddits they occur in.
# Columns of display_df after this cell:
#   n                      number of subreddits with at least one occurrence (group key)
#   n_sub_with_occurrence  number of (token, month) rows in the group
#   n_total_x              sum of n_total over the group
#   n_total_y              mean of n_total over the group
#   n_max_sub              mean of n_max_sub over the group
#   n_sub_average          n_total_y / n, i.e. mean total count per subreddit
by_n_subs = most_improved_df.groupby("n_sub_with_occurrence")

display_df = pd.DataFrame(by_n_subs.n_sub_with_occurrence.count())
# merging two series that are both named n_total yields the _x (sum) / _y (mean) suffixes
display_df = display_df.merge(by_n_subs.n_total.sum(), left_index=True, right_index=True)
display_df = display_df.merge(by_n_subs.n_total.mean(), left_index=True, right_index=True)
display_df = display_df.merge(by_n_subs.n_max_sub.mean(), left_index=True, right_index=True)
# rename the index first: it shares its name with the count column, so resetting
# it under the original name would collide
display_df.index.name = "n"
display_df = display_df.reset_index()
display_df["n_sub_average"] = display_df.n_total_y/display_df.n
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; formatting the
# only float column explicitly renders the same table on old and new pandas versions.
display_df[["n", "n_sub_with_occurrence", "n_total_x", "n_sub_average"]].style.format({"n_sub_average": "{:.2f}"})

Unnamed: 0,n,n_sub_with_occurrence,n_total_x,n_sub_average
0,1,1403,1789,1.28
1,2,769,2316,1.51
2,3,559,3336,1.99
3,4,570,6950,3.05
4,5,819,33090,8.08


In [381]:
# Document frequency of one token in one month, printed per subreddit
# (plus any other counter stored for that month, e.g. "total").

word = "##ugh"
month = "test_2018_10_5k"
for sub, sub_counter in propn_counter[month].items():
    print(sub, sub_counter[word])

Libertarian 9
politics 17
ChapoTrapHouse 2
Conservative 28
The_Donald 11
total 67


In [372]:
# Document frequency of one token over time: print its "total" count for every month.

word = "##ugh"
for month, month_counters in propn_counter.items():
    print(month, month_counters["total"][word])

test_2017_03_5k 0
test_2017_04_5k 1
test_2017_05_5k 0
test_2017_06_5k 0
test_2017_07_5k 0
test_2017_08_5k 0
test_2017_09_5k 0
test_2017_10_5k 0
test_2017_11_5k 1
test_2017_12_5k 0
test_2018_01_5k 0
test_2018_02_5k 0
test_2018_03_5k 0
test_2018_04_5k 1
test_2018_05_5k 0
test_2018_06_5k 2
test_2018_07_5k 9
test_2018_08_5k 1
test_2018_09_5k 107
test_2018_10_5k 67
test_2018_11_5k 11
test_2018_12_5k 6
test_2019_01_5k 7
test_2019_02_5k 6
test_2019_03_5k 9
test_2019_04_5k 3
test_2019_05_5k 4
test_2019_06_5k 3
test_2019_07_5k 4
test_2019_08_5k 2
test_2019_09_5k 8
test_2019_10_5k 1
test_2019_11_5k 6
test_2019_12_5k 4
test_2020_01_5k 1
test_2020_02_5k 0
