In [10]:
import pandas as pd
import csv
import os
import re
import seaborn as sns

## Load data

In [46]:
# Load the result .txt files; each one holds a single pseudo-perplexity value.
# Resulting layout: perplexity[dataset][test_type][result_name] -> float,
# where result_name is the file name without its ".txt" extension.

perplexity = {}
directory = '../../0_results/mlm'
# os.listdir returns entries in arbitrary order; sorting keeps the dict order stable.
for dataset in sorted(os.listdir(directory)):
    dataset_dir = os.path.join(directory, dataset)
    # Stray files next to the dataset folders would make the nested
    # os.listdir call raise NotADirectoryError, so only descend into directories.
    if not os.path.isdir(dataset_dir):
        continue
    perplexity[dataset] = {}
    for test_type in sorted(os.listdir(dataset_dir)):
        test_type_dir = os.path.join(dataset_dir, test_type)
        if not os.path.isdir(test_type_dir):
            continue
        perplexity[dataset][test_type] = {}
        for result_txt in sorted(os.listdir(test_type_dir)):
            if result_txt.endswith(".txt"):
                with open(os.path.join(test_type_dir, result_txt), 'r') as file:
                    # The file content (newlines removed) is parsed as one float.
                    perplexity[dataset][test_type][os.path.splitext(result_txt)[0]] = float(file.read().replace('\n', ''))

# Results on Random Test Set

In [47]:
def results_on_rand(DATASET):
    """Display the pseudo-perplexity of every result stored under "test-on-rand".

    Reads ``perplexity[DATASET]["test-on-rand"]`` (result name -> value), sorts
    the rows by result name and renders them as a styled table with a reversed
    green colour gradient and two-decimal formatting.

    Args:
        DATASET: Key into the module-level ``perplexity`` dict.
    """
    rand_scores = perplexity[DATASET]["test-on-rand"]

    # One row per result name; the single value column is labelled explicitly.
    table = (
        pd.DataFrame.from_dict(rand_scores, orient="index")
        .sort_index()
        .rename(columns={0: 'pseudo-perplexity'})
        .rename_axis("Model")
    )

    greens = sns.color_palette('Greens_r', as_cmap=True)
    styled = table.style.background_gradient(cmap=greens, axis=0).format('{0:,.2f}')
    display(styled)

In [48]:
# Show the "test-on-rand" pseudo-perplexity table for the "reddit" results.
results_on_rand("reddit")

Unnamed: 0_level_0,pseudo-perplexity
Model,Unnamed: 1_level_1
bert-rand_1m-test_rand_10k,7.71
bert-rand_2m-test_rand_10k,7.22
bert-rand_5m-test_rand_10k,6.63


# Results on Monthly Test Sets: Absolute Values & Relative to Random Control Baseline

In [104]:
def results_on_month(DATASET, BASELINE_MODEL=""):
    """Display a models x monthly-test-sets table of pseudo-perplexity.

    Result names in ``perplexity[DATASET]["test-on-month"]`` are split at the
    first ``"-test"``: the part before it is the model name (one table row per
    model) and ``"test"`` plus the remainder is the test-set label (one column
    per label). Models whose name contains ``"bert-rand"`` or ``"bert-base"``
    are left out of the rows.

    Args:
        DATASET: Key into the module-level ``perplexity`` dict.
        BASELINE_MODEL: If empty (default), absolute values are shown with a
            per-column colour gradient. Otherwise every column is converted to
            ``value / baseline - 1`` using the baseline's value for that test
            set, shown as percentages with a table-wide gradient. A model with
            exactly this name is preferred; if there is none, every result
            whose name contains the string is used, so partial names still work.

    Raises:
        ValueError: If a result name does not contain ``"-test"``.
        KeyError: If the baseline has no value for a displayed test set.
    """
    result_dict = perplexity[DATASET]["test-on-month"]

    # Parse every result name once: (result name, model name, test-set label).
    parsed = []
    for key in sorted(result_dict):
        model, separator, suffix = key.partition("-test")
        if not separator:
            raise ValueError("result name {!r} does not contain '-test'".format(key))
        parsed.append((key, model, "test" + suffix))

    # Group by the exact model name. Substring matching would let a model such
    # as "bert-01_1m" absorb results of e.g. "distilbert-01_1m".
    by_model = {}
    for key, model, label in parsed:
        by_model.setdefault(model, {})[label] = result_dict[key]

    perpl_series = {
        model: by_model[model]
        for model in sorted(by_model)
        if "bert-rand" not in model and "bert-base" not in model
    }

    # Rows are models, columns are test-set labels; the column axis is named 'model'.
    plot_df = pd.DataFrame.from_dict(perpl_series).rename_axis('model').T

    cm = sns.color_palette('Greens_r', as_cmap=True)

    if BASELINE_MODEL != "":
        if BASELINE_MODEL in by_model:
            baseline_series = by_model[BASELINE_MODEL]
        else:
            # No model has exactly this name: fall back to substring matching
            # on the full result name.
            baseline_series = {
                label: result_dict[key]
                for key, _, label in parsed
                if BASELINE_MODEL in key
            }

        missing = [column for column in plot_df.columns if column not in baseline_series]
        if missing:
            raise KeyError(
                "baseline {!r} has no result for test set(s): {}".format(
                    BASELINE_MODEL, ", ".join(missing)
                )
            )

        # Align the baseline with the table's columns, then compute the
        # relative difference for the whole table in one step.
        baseline = pd.Series(baseline_series, dtype=float).reindex(plot_df.columns)
        plot_df = plot_df.divide(baseline, axis="columns") - 1

        display(plot_df.style.background_gradient(cmap=cm, axis=None).format('{0:.2%}'))

    else:
        display(plot_df.style.background_gradient(cmap=cm, axis=0).format('{0:,.2f}'))

In [108]:
# No BASELINE_MODEL is passed, so the "gab" monthly table shows absolute values.
results_on_month("gab")

model,test_2018_01_10k,test_2018_02_10k,test_2018_03_10k,test_2018_04_10k,test_2018_05_10k,test_2018_06_10k,test_2018_07_10k,test_2018_08_10k,test_2018_09_10k,test_2018_10_10k
bert-01_1m,10.13,10.87,11.0,11.27,11.33,12.08,11.8,12.45,12.45,12.46
bert-02_1m,10.73,10.02,10.57,11.09,11.14,12.03,11.74,12.25,12.41,12.38
bert-03_1m,11.01,10.65,9.77,10.73,10.96,11.83,11.55,12.06,12.27,12.16
bert-04_1m,11.09,10.91,10.42,9.85,10.57,11.55,11.45,11.94,12.08,12.06
bert-05_1m,11.16,10.97,10.61,10.61,9.86,11.29,11.21,11.82,11.96,11.93
bert-06_1m,11.31,11.27,10.92,10.99,10.57,10.33,10.87,11.72,11.89,11.82
bert-07_1m,11.44,11.34,11.1,11.24,10.93,11.26,10.18,11.45,11.62,11.61
bert-08_1m,11.61,11.48,11.2,11.38,11.09,11.63,11.0,10.31,10.84,11.17
bert-09_1m,11.78,11.57,11.28,11.56,11.37,11.91,11.26,11.12,9.64,10.62
bert-10_1m,11.76,11.68,11.44,11.63,11.41,11.99,11.41,11.46,10.6,9.81
