# EVALUATE TEST SET-LEVEL MLM RESULTS

In [12]:
import pandas as pd
import csv
import os
import re
import numpy as np

from scipy.stats import wilcoxon

# plotting
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import matplotlib.font_manager as font_manager
from matplotlib import colors

sns.set_theme()

## Load data

In [2]:
# Load the result files into a nested dict:
#     perplexity[dataset][test_type][file name without ".txt"] -> float
# The directory tree is read as <directory>/<dataset>/<test_type>/<result>.txt,
# where every .txt file holds a single number (the pseudo-perplexity).

perplexity = {}
directory = '../../0_results/mlm'
for dataset in os.listdir(directory):
    dataset_dir = os.path.join(directory, dataset)
    # Stray files next to the dataset folders (e.g. .DS_Store) would make the
    # nested os.listdir call raise NotADirectoryError, so skip them.
    if not os.path.isdir(dataset_dir):
        continue
    perplexity[dataset]={}
    for test_type in os.listdir(dataset_dir):
        test_type_dir = os.path.join(dataset_dir, test_type)
        if not os.path.isdir(test_type_dir):
            continue
        perplexity[dataset][test_type]={}
        for result_txt in os.listdir(test_type_dir):
            # anything that is not a .txt result file is ignored
            if result_txt.endswith(".txt"):
                with open(os.path.join(test_type_dir, result_txt), 'r') as file:
                    perplexity[dataset][test_type][os.path.splitext(result_txt)[0]] = float(file.read().replace('\n', ''))

# Results on Random Test Set

In [3]:
def results_on_rand(DATASET):
    """Display the pseudo-perplexity of every model on the random test set.

    Reads ``perplexity[DATASET]["test-on-rand"]`` (result name -> score),
    sorts the rows by result name and renders them as a styled table with a
    reversed green background gradient and two-decimal formatting.

    Parameters
    ----------
    DATASET : key of the module-level ``perplexity`` dict, e.g. "reddit".

    Returns
    -------
    None; the table is shown via ``display``.
    """
    scores = perplexity[DATASET]["test-on-rand"]

    # one row per result file; the single unnamed value column (0) gets a readable label
    table = (
        pd.DataFrame.from_dict(scores, orient="index")
        .sort_index()
        .rename(columns={0: 'pseudo-perplexity'})
        .rename_axis("Model")
    )

    green_scale = sns.color_palette('Greens_r', as_cmap=True)
    styled = table.style.background_gradient(cmap=green_scale, axis=0)
    styled = styled.set_table_attributes('style="font-family: CMU Serif; font-size:16px"')
    display(styled.format('{0:,.2f}'))

In [4]:
# Show the pseudo-perplexity of every model on the random test set of the "reddit" data.
results_on_rand("reddit")

Unnamed: 0_level_0,pseudo-perplexity
Model,Unnamed: 1_level_1
bert-2017_03_1m-test_rand_10k,8.02
bert-2017_04_1m-test_rand_10k,8.07
bert-2017_05_1m-test_rand_10k,8.07
bert-2017_06_1m-test_rand_10k,8.02
bert-2017_07_1m-test_rand_10k,7.99
bert-2017_08_1m-test_rand_10k,8.05
bert-2017_09_1m-test_rand_10k,8.02
bert-2017_10_1m-test_rand_10k,7.95
bert-2017_11_1m-test_rand_10k,7.91
bert-2017_12_1m-test_rand_10k,7.89


# Results on Monthly Test Sets: Absolute Values & Relative to Random Control Baseline

In [44]:
# helper function for diverging color palette
def background_gradient(s, m, M, cmap='BuPu', low=0, high=0):
    """Return one CSS ``background-color`` rule per value of ``s``.

    Unlike ``Styler.background_gradient``, the colour scale is anchored to the
    caller-supplied bounds ``m`` and ``M`` instead of the data's own min/max,
    so a symmetric range (``m = -M``) centres a diverging colormap on zero.

    Parameters
    ----------
    s : object exposing ``.values`` (a pandas Series when called through
        ``Styler.apply``).
    m, M : lower / upper bound of the colour scale.
    cmap : colormap name or Colormap instance.
    low, high : fraction of ``M - m`` by which the scale is extended below
        ``m`` / above ``M``.

    Returns
    -------
    list of str, one ``'background-color: #rrggbb'`` entry per value.
    """
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap accepts both a registered name and a Colormap instance.
    # The previously used matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    colormap = plt.get_cmap(cmap)
    c = [colors.rgb2hex(x) for x in colormap(normed)]
    return ['background-color: %s' % color for color in c]

def _period_label(name, prefix):
    """Shorten ``<prefix><a>_<b>_<rest>`` to ``<last two chars of a>-<b>`` (e.g. "17-04")."""
    match = re.search(prefix + "(.*?)_(.*?)_(.*)", name)
    return match.group(1)[-2:] + "-" + match.group(2)


def results_on_month(DATASET, BASELINE_MODEL="",
                     YEAR_FILTER = ("2017", "2018", "2019", "2020"),
                     MONTH_FILTER = ("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"),
                     TEST_TYPE = "politics-test"):
    """Build and display the model x test-set pseudo-perplexity table.

    Result names in ``perplexity[DATASET][TEST_TYPE]`` are split at the first
    ``-test`` into a model part and a test-set part. Names without ``-test``
    are ignored.

    Parameters
    ----------
    DATASET : key of the module-level ``perplexity`` dict, e.g. "reddit".
    BASELINE_MODEL : substring identifying the baseline results. If non-empty,
        every cell is converted to the percentage difference to the baseline
        score on the same test set, labels are shortened to "YY-MM" and a
        diverging colour scale centred on zero is used. If empty, absolute
        scores are shown.
    YEAR_FILTER, MONTH_FILTER : iterables of strings; a model and a test set
        are kept only if their names contain one of the years and one of the
        months.
    TEST_TYPE : which group of results to read (e.g. "politics-test" or
        "test-on-month").

    Returns
    -------
    pandas.DataFrame with one row per model and one column per test set.

    Raises
    ------
    KeyError if the baseline has no result for one of the displayed test sets.
    """
    result_dict = perplexity[DATASET][TEST_TYPE]

    # Parse every result name once: key -> (model part, test-set suffix).
    key_pattern = re.compile(r'(.*?)-test(.*)')
    parsed = {}
    for key in sorted(result_dict):
        match = key_pattern.search(key)
        if match is not None:
            parsed[key] = (match.group(1), match.group(2))

    perpl_series = {}
    for model in sorted({model_part for model_part, _ in parsed.values()}):
        # baseline / unadapted models are not rows of the table
        if "bert-rand" in model or "bert-base" in model:
            continue
        if not any(year in model for year in YEAR_FILTER):
            continue
        if not any("_" + month in model for month in MONTH_FILTER):
            continue

        perpl_series[model] = {}
        for key, (_, suffix) in parsed.items():
            if (model in key
                    and any("_" + year in suffix for year in YEAR_FILTER)
                    and any("_" + month in suffix for month in MONTH_FILTER)):
                perpl_series[model]["test" + suffix] = result_dict[key]

    # from_dict yields models as columns; transpose so that rows are models
    plot_df = pd.DataFrame.from_dict(perpl_series)
    plot_df.index.name = "Adapt."
    plot_df = plot_df.T

    cm = sns.color_palette('Greens_r', as_cmap=True)

    if BASELINE_MODEL != "":
        baseline_series = {}
        for key, (_, suffix) in parsed.items():
            if BASELINE_MODEL in key:
                baseline_series["test" + suffix] = result_dict[key]

        # relative difference to the baseline score on the same test set
        for column in plot_df.columns:
            plot_df[column] = plot_df[column].divide(baseline_series[column]) - 1

        # shorten column (test set) and row (model) labels
        plot_df.columns = [_period_label(column, "test_") for column in plot_df.columns]
        plot_df.index = [_period_label(row, "bert-") for row in plot_df.index]

        # multiply by 100 to get pct values
        plot_df = plot_df.multiply(100)

        # Diverging palette with a range symmetric around zero. nanmax keeps the
        # range finite when some model/test-set combinations are missing (NaN).
        cm = sns.diverging_palette(250, 36, s=100, l=65, as_cmap=True)
        even_range = np.nanmax(np.abs(plot_df.values))

        # Styler.set_precision was removed in pandas 2.0; an explicit format
        # string gives the same two-decimal rendering on every pandas version.
        display(plot_df.style.apply(background_gradient, cmap=cm, m=-even_range, M=even_range)
                .set_table_attributes('style="font-family: CMU Serif; font-size:26px"')
                .format('{:.2f}'))

    else:
        display(plot_df.style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2f}'))

    return plot_df

In [45]:
# Relative results of the April/August/December models against the "bert-rand_1m" baseline.
df = results_on_month("reddit", BASELINE_MODEL="bert-rand_1m", MONTH_FILTER = ["04", "08", "12"])
# Mean of the main diagonal, i.e. of the cells pairing the i-th model row with the i-th test-set column.
np.diag(df).mean()

Adapt.,17-04,17-08,17-12,18-04,18-08,18-12,19-04,19-08,19-12
17-04,-0.56,0.53,2.1,3.24,4.29,5.37,4.34,5.19,5.74
17-08,0.52,-1.62,1.96,2.89,1.98,4.58,4.05,3.2,4.87
17-12,1.48,0.42,-0.87,1.05,1.5,2.61,2.26,2.65,2.24
18-04,1.14,0.69,1.82,-0.95,0.98,2.47,2.14,2.67,2.54
18-08,1.19,-0.2,-0.06,0.34,-1.12,0.38,1.38,0.81,1.38
18-12,0.79,0.78,0.9,0.69,0.08,-1.03,0.98,1.14,1.34
19-04,2.01,1.09,1.64,1.28,0.18,0.62,-0.66,1.1,-0.19
19-08,3.05,2.06,2.21,1.54,1.85,2.08,1.26,-0.12,1.22
19-12,2.04,1.0,1.47,1.82,1.26,1.23,-0.18,0.7,-2.36


-1.0309416512516774

In [305]:
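# Test whether models perform better on past test sets than on future ones:
# one-sided Wilcoxon signed-rank test on mirrored cells above vs. below the diagonal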

def wilcoxon_assymetry(df):
    """Compare the cells above the main diagonal with their mirror cells below it.

    For every offset k >= 1 the k-th upper diagonal is paired element-wise
    with the k-th lower diagonal. The mean paired difference (upper - lower)
    is printed, and a one-sided Wilcoxon signed-rank test
    (``alternative="greater"``) on the pairs is returned.

    Parameters
    ----------
    df : 2-D table (DataFrame or array).

    Returns
    -------
    The result object of ``scipy.stats.wilcoxon``.
    """
    matrix = np.asanyarray(df)
    offsets = range(1, matrix.shape[0])

    # The leading empty array keeps the result well-defined when there are no off-diagonals.
    upper = np.concatenate([np.empty(0)] + [np.diag(matrix, k=offset) for offset in offsets])
    lower = np.concatenate([np.empty(0)] + [np.diag(matrix, k=-offset) for offset in offsets])

    print((upper - lower).mean())

    return wilcoxon(x=upper, y=lower, alternative="greater")

# `df` is the table returned by results_on_month(...) in an earlier cell.
wilcoxon_assymetry(df)

0.7769864840908813


WilcoxonResult(statistic=171004.0, pvalue=1.1873210639523903e-55)