# EVALUATION OF CLASSIFICATION RESULTS

In [107]:
import pandas as pd
import csv
import os
import seaborn as sns
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from matplotlib import colors

# Domain Adaptation vs Finetuning Size

In [187]:
# Read the "total-models" result CSVs into a dict of dicts:
# results[dataset_name][csv name without extension] -> DataFrame
# (one sub-dict per dataset --> ghc, gq and reddit)

results = {}
directory = '../../0_results/classification'
for dataset_name in os.listdir(directory):
    print(f"loading {dataset_name} results to sub-dict")
    results[dataset_name] = {}
    total_models_dir = os.path.join(directory, dataset_name, "total-models")
    for result_csv in os.listdir(total_models_dir):
        # anything that is not a CSV file is ignored
        if not result_csv.endswith(".csv"):
            continue
        run_name = os.path.splitext(result_csv)[0]
        results[dataset_name][run_name] = pd.read_csv(os.path.join(total_models_dir, result_csv))

loading ghc results to sub-dict
loading reddit results to sub-dict
loading gq results to sub-dict


In [209]:
def domain_vs_finetuning_size(dict_of_results, F1_TYPE = "macro"):
    """Display and return F1 scores (in percent) by adaptation size and finetuning size.

    Parameters
    ----------
    dict_of_results : dict
        Maps run names of the form "<model>-train_rand_<size>-test<...>" to
        DataFrames that have 'label' and 'prediction' columns.
    F1_TYPE : str
        Passed to f1_score as `average`.

    Returns
    -------
    pandas.DataFrame
        One row per model, one column per finetuning size, values multiplied
        by 100. The same table is rendered with `display` as a side effect.
    """
    # one F1 score per run
    scores = {
        run: f1_score(frame['label'], frame['prediction'], average=F1_TYPE)
        for run, frame in dict_of_results.items()
    }

    def model_name(run):
        # everything in front of the first "-train" identifies the model
        return run.split('-train')[0]

    score_series = {}
    for model in sorted({model_name(run) for run in scores}):
        score_series[model] = {}
        for key in sorted(scores):
            # Compare the model prefix exactly: a substring test (`model in key`)
            # would also assign e.g. "bert-rand_10m-..." runs to "bert-rand_1m".
            if model_name(key) != model or "train_rand_4k" in key:
                continue
            finetuning_size = int(re.search(r'(.*?)-train_rand_(.*?)-test(.*?)', key).group(2).rstrip("k"))*1000
            # special case for ghc: the size "1552" comes without a trailing "k"
            score_series[model][finetuning_size if finetuning_size != 1552000 else 1552] = scores[key]
    
    plot_df = pd.DataFrame.from_dict(score_series).reset_index().rename(columns={'index':'model'})
    plot_df.set_index('model', inplace=True)
    plot_df.index.name="Adapt."
    
    # order by finetuning size, then transpose to put models in rows and finetuning sizes in columns
    plot_df = plot_df.sort_index().T
    
    # rearrange so that largest pretraining size is at bottom of df
    plot_df = pd.concat([plot_df.loc[plot_df.index != "bert-rand_10m"], plot_df.loc[plot_df.index == "bert-rand_10m"]], axis=0)
    
    cm = sns.color_palette('Greens', as_cmap=True)
    
    # format column names: 1000 -> "1k"; the ghc special case 1552 stays as it is
    plot_df.columns = pd.Series(plot_df.columns).apply(lambda x: str(x)[:-3] + "k" if x!=1552 else 1552)
    
    # format row names --> index: keep only the part after the first "_", e.g. "10m"
    plot_df.rename(index={"bert-base": "bert-rand_0m"}, inplace=True)
    for elem in plot_df.index:
        plot_df.rename(index={elem: re.search("(.*?)_(.*)", elem).group(2)}, inplace=True)
    plot_df.rename(index={"0m": "0"}, inplace=True)
    
    # multiply by 100 to get pct values
    plot_df = plot_df.multiply(100)
    
    # NOTE(review): Styler.set_precision is deprecated since pandas 1.3 and removed in 2.0;
    # switch to .format(precision=2) once the environment is upgraded.
    display(plot_df.style.background_gradient(cmap=cm, axis = None ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))
    
    return plot_df

In [211]:
# one table per dataset, each preceded by the dataset name in capitals
for dataset in ("reddit", "ghc", "gq"):
    print(dataset.upper())
    out = domain_vs_finetuning_size(dict_of_results=results[dataset])
    print()

REDDIT


Adapt.,1k,2k,5k,10k,20k,40k,80k,160k,320k
0,34.19,35.7,39.22,41.65,43.22,44.65,46.65,47.92,49.44
1m,37.76,39.26,41.31,42.91,44.03,45.18,46.87,47.94,49.51
2m,38.45,39.57,42.05,42.69,43.43,45.39,46.93,48.38,48.98
5m,38.78,39.84,42.16,43.42,44.46,45.86,47.37,47.93,49.82
10m,38.7,40.37,42.4,43.47,44.53,45.84,47.23,48.44,50.05



GHC


Adapt.,1k,1552,2k,5k,10k,16k
0,65.28,66.27,68.34,68.53,69.49,71.93
1m,68.78,66.88,67.58,67.99,69.42,71.27
2m,66.37,67.69,68.44,69.11,71.28,69.96
5m,67.52,67.54,68.44,69.87,70.19,70.21
10m,66.63,67.11,70.61,68.47,71.35,69.39



GQ


Adapt.,1k,2k,5k,10k,20k,26k
0,90.48,90.73,90.98,91.4,91.8,92.0
1m,90.99,91.19,91.06,91.46,91.92,91.77
2m,90.77,91.25,91.35,91.66,91.72,91.77
5m,90.75,91.38,91.44,91.45,91.93,91.98
10m,90.93,91.3,91.3,91.5,91.89,91.86





# Results by Month: INDIVIDUAL MODELS

In [212]:
%%time

# Read the "month-models" result CSVs into a dict of dicts of dicts:
# results[dataset_name][model_type][csv name without extension] -> DataFrame
# (gq and the "z_old" folder are skipped). This rebinds the notebook-level `results`.

results = {}
directory = '../../0_results/classification'
for dataset_name in os.listdir(directory):
    if dataset_name == "gq":
        continue
    print(f"loading {dataset_name} results to sub-dict")
    results[dataset_name] = {}
    month_models_dir = os.path.join(directory, dataset_name, "month-models")
    for model_type in sorted(os.listdir(month_models_dir)):
        if model_type == "z_old":
            continue
        print(f"  {model_type} to sub-sub-dict")
        results[dataset_name][model_type] = {}
        model_type_dir = os.path.join(month_models_dir, model_type)
        for result_csv in os.listdir(model_type_dir):
            if not result_csv.endswith(".csv"):
                continue
            run_name = os.path.splitext(result_csv)[0]
            results[dataset_name][model_type][run_name] = pd.read_csv(os.path.join(model_type_dir, result_csv))

loading ghc results to sub-dict
  base+month to sub-sub-dict
  base+rand to sub-sub-dict
  month+month to sub-sub-dict
  month+rand to sub-sub-dict
  rand+month to sub-sub-dict
  rand+rand to sub-sub-dict
loading reddit results to sub-dict
  base+month to sub-sub-dict
  base+rand to sub-sub-dict
  month+month to sub-sub-dict
  month+rand to sub-sub-dict
  rand+month to sub-sub-dict
  rand+rand to sub-sub-dict
  shift+1 to sub-sub-dict
  shift+12 to sub-sub-dict
  shift+3 to sub-sub-dict
  shift+6 to sub-sub-dict
CPU times: user 51.2 s, sys: 14.3 s, total: 1min 5s
Wall time: 3min 42s


In [216]:
# helper function for diverging color palette
def background_gradient(s, m, M, cmap='BuPu', low=0, high=0):
    """Return one CSS 'background-color' declaration per value of `s`.

    Unlike a per-column gradient, the colour scale is fixed by the caller, so
    passing m=-x and M=x centres a diverging colormap on zero.

    Parameters
    ----------
    s : object with a `.values` array (e.g. the Series handed over by Styler.apply)
    m, M : lower and upper end of the colour scale
    cmap : colormap name or Colormap instance
    low, high : fraction of (M - m) by which the scale is widened below m / above M

    Returns
    -------
    list of str
        'background-color: #rrggbb' strings, in the order of `s.values`.
    """
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap accepts a name or a Colormap instance; the former
    # plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9.
    c = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    return [f'background-color: {color}' for color in c]

def plot_F1_monthly(input_df, REDDIT_FILTER=0, F1_TYPE = "macro", BASELINE_MODEL="",
                    YEAR_FILTER = ["2017", "2018", "2019", "2020"],
                    MONTH_FILTER = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]):
    """Display and return a table of F1 scores: one row per model, one column per test set.

    Parameters
    ----------
    input_df : dict
        Maps run names "<model>-test<suffix>" to DataFrames with 'label' and
        'prediction' columns.
    REDDIT_FILTER : int
        If non-zero, only runs whose name contains "_<REDDIT_FILTER>k-test" are used.
    F1_TYPE : str
        Passed to f1_score as `average`.
    BASELINE_MODEL : str
        If non-empty, scores are shown as relative change versus this model type,
        i.e. (score / baseline - 1) * 100. The baseline runs are read from the
        notebook-level `results` dict: results["reddit"] when REDDIT_FILTER is
        non-zero, results["ghc"] otherwise.
    YEAR_FILTER, MONTH_FILTER : list of str
        A model / test set is kept if its name contains "_<year>" for some year and
        "_<month>" for some month. NOTE(review): these are plain substring tests, so
        a month such as "10" also matches other "_10..." fragments of a run name.

    Returns
    -------
    pandas.DataFrame
        The displayed table, values multiplied by 100.
    """
    size_tag = f"_{REDDIT_FILTER}k-test"

    def split_run(key):
        # "<model>-test<suffix>" -> ("<model>", "<suffix>"), cut at the first "-test"
        match = re.search(r'(.*?)-test(.*)', key)
        return match.group(1), match.group(2)

    def in_period(text):
        # True if the text mentions one of the selected years and one of the selected months
        return any("_"+year in text for year in YEAR_FILTER) and any("_"+month in text for month in MONTH_FILTER)

    def month_label(pattern, name, year_group, month_group):
        # "2018" + "03" -> "18-03"
        match = re.search(pattern, name)
        return match.group(year_group)[-2:] + "-" + match.group(month_group)

    # calculate f1
    scores = {}
    for key in input_df:
        if REDDIT_FILTER == 0 or size_tag in key:
            scores[key] = f1_score(input_df[key]['label'], input_df[key]['prediction'], average=F1_TYPE)

    score_series = {}
    for model in sorted({split_run(key)[0] for key in scores}):
        if not in_period(model):
            continue
        score_series[model] = {}
        for key in sorted(scores):
            key_model, test_suffix = split_run(key)
            # exact comparison of the model part; a substring test could also pick up
            # runs of another model whose name merely contains this one
            if key_model == model and in_period(test_suffix):
                score_series[model]["test" + test_suffix] = scores[key]

    plot_df = pd.DataFrame.from_dict(score_series).reset_index().rename(columns={'index':'model'})

    plot_df.set_index('model', inplace=True)
    plot_df.index.name = "Finetune"

    # models in rows, test sets in columns
    plot_df = plot_df.T

    if BASELINE_MODEL!="":
        baseline_runs = results["reddit" if REDDIT_FILTER != 0 else "ghc"][BASELINE_MODEL]
        baseline_series = {}
        for key in baseline_runs:
            # Same size test as for the main runs. The looser f"{REDDIT_FILTER}k" test
            # also matches the size of the test set (e.g. "_5k"), which would let an
            # arbitrary run overwrite the baseline of a test month.
            if REDDIT_FILTER != 0 and size_tag not in key:
                continue
            baseline_series["test" + split_run(key)[1]] = f1_score(baseline_runs[key]['label'],
                                                                    baseline_runs[key]['prediction'],
                                                                    average=F1_TYPE)

        # relative change versus the baseline on the same test set
        for column in plot_df.columns:
            plot_df[column] = plot_df[column].divide(baseline_series[column])-1

    # format column names
    plot_df.columns = pd.Series(plot_df.columns).apply(lambda x: month_label("test_(.*?)_(.*?)_(.*)", x, 1, 2))

    # format row names --> index
    plot_df = plot_df.rename(index=lambda elem: month_label("bert-(.*?)-train_(.*?)_(.*?)_(.*)", elem, 2, 3))

    # multiply by 100 to get pct values
    plot_df = plot_df.multiply(100)

    if BASELINE_MODEL!="":
        # diverging colour palette, symmetric around zero
        cm = sns.diverging_palette(10, 150, as_cmap=True)
        even_range = np.max([np.abs(plot_df.values.min()), np.abs(plot_df.values.max())])

        display(plot_df.style.apply(background_gradient,cmap=cm, m=-even_range, M=even_range)\
                .set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))
    else:
        cm = sns.color_palette('Greens', as_cmap=True)

        display(plot_df.style.background_gradient(cmap=cm, axis = 0 ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))

    return plot_df

In [219]:
# GHC: base model finetuned per month, shown relative to the "base+rand" runs
# (optionally restrict the months with MONTH_FILTER = ["04", "08", "12"])
df = plot_F1_monthly(
    input_df=results["ghc"]["base+month"],
    BASELINE_MODEL="base+rand",
)

Finetune,18-01,18-02,18-03,18-04,18-05,18-06,18-07,18-08,18-09,18-10
18-01,6.24,-6.72,-12.39,-4.77,3.02,-6.42,1.43,3.7,1.68,-4.77
18-02,3.92,-0.31,5.4,-0.06,11.11,4.51,6.25,8.4,11.8,-1.57
18-03,0.83,1.77,-3.06,-11.23,15.08,3.48,-4.81,11.36,9.18,-3.33
18-04,7.0,5.73,5.4,4.28,13.11,6.0,5.31,11.36,1.77,-2.74
18-05,3.79,1.04,3.07,-1.88,14.72,1.21,6.51,6.34,5.89,5.32
18-06,7.73,1.75,-1.28,-2.64,11.65,1.21,-0.68,0.39,6.5,1.9
18-07,1.53,-6.52,-12.3,0.67,10.06,1.44,7.75,4.98,4.15,-3.31
18-08,5.22,1.75,-3.62,0.0,13.11,4.26,-3.8,15.29,5.29,-2.66
18-09,-0.7,-0.19,-0.21,9.2,11.11,-0.1,3.24,3.7,14.79,-6.31
18-10,11.6,-0.96,1.52,8.86,16.22,8.69,6.69,8.14,16.15,-5.75


In [176]:
# Reddit, 20k finetuning size: base model finetuned per month, relative to "base+rand"
# (optionally restrict the months with MONTH_FILTER = ["04", "08", "12"])
df = plot_F1_monthly(
    input_df=results["reddit"]["base+month"],
    REDDIT_FILTER=20,
    BASELINE_MODEL="base+rand",
)
# mean over the positional diagonal of the table
np.diag(df).mean()

Finetune,17-03,17-04,17-05,17-06,17-07,17-08,17-09,17-10,17-11,17-12,18-01,18-02,18-03,18-04,18-05,18-06,18-07,18-08,18-09,18-10,18-11,18-12,19-01,19-02,19-03,19-04,19-05,19-06,19-07,19-08,19-09,19-10,19-11,19-12,20-01,20-02
17-03,8.1,4.55,4.28,0.41,-1.9,-3.75,-0.53,-0.44,-4.99,-4.03,-3.9,-6.08,-0.26,-3.61,-3.92,-4.17,-5.62,-4.37,-5.38,-8.0,-4.45,-2.87,-4.62,-9.47,-8.79,-8.22,-7.49,-5.84,-8.21,-9.4,-6.33,-6.86,-7.96,-10.63,-7.01,-9.37
17-04,7.04,6.81,4.21,-3.08,0.17,0.48,0.3,1.31,-0.8,-0.23,-2.45,-2.57,-1.86,-1.13,-3.61,-2.39,-3.13,-5.83,-4.57,-4.82,-5.17,-1.32,-3.74,-5.92,-3.87,-4.47,-5.75,-6.04,-4.2,-8.58,-5.7,-0.67,-6.72,-8.32,-6.07,-5.03
17-05,3.11,1.31,10.93,4.03,-0.24,-0.36,-2.5,-0.35,-0.17,-0.29,-0.54,-4.28,-0.37,-2.71,-2.07,-4.77,-3.99,-2.42,-2.34,-5.83,-4.89,1.48,-3.51,-5.53,-4.6,-4.77,-5.06,-6.87,-4.72,-7.23,-6.63,-2.7,-1.66,-4.35,-4.69,-6.42
17-06,1.51,2.34,5.19,6.99,-1.79,1.87,-1.26,-0.36,1.23,-0.53,0.81,-2.33,1.84,-0.44,-2.38,-1.53,-4.44,-2.79,-2.16,-4.74,-0.27,0.09,-2.24,-4.97,-4.69,0.16,-3.17,-3.8,-5.19,-4.84,-1.39,0.36,-1.72,-3.82,-2.3,-8.32
17-07,1.72,-0.77,4.84,-0.25,5.53,0.84,-0.92,0.67,1.5,0.61,1.07,-2.01,-0.99,-3.64,-1.13,-1.49,-4.51,-2.33,-2.47,-3.06,-0.97,1.82,-1.62,-5.09,-3.78,-3.53,-2.93,-4.33,-5.92,-6.37,-5.06,-0.2,-1.8,-5.27,-3.54,-7.22
17-08,-0.27,-1.2,1.34,1.6,1.32,6.0,2.38,1.35,1.79,-0.32,0.05,-1.78,1.51,0.67,-2.06,-3.63,-3.42,-3.15,-2.03,-4.26,-2.62,0.28,-2.19,-6.54,-6.22,-2.77,-4.92,-1.59,-4.25,-5.02,-6.99,-3.08,-4.17,-5.23,-3.88,-7.87
17-09,-0.14,1.83,2.42,-0.62,-1.14,2.62,5.85,2.03,0.79,-1.56,-1.48,-1.68,-0.73,0.47,-0.84,-2.71,-3.45,-3.17,-2.58,-3.36,-3.13,0.04,-1.32,-5.82,-3.13,-2.64,-2.84,-4.68,-0.36,-2.97,-5.28,-3.24,-2.89,-7.64,-3.43,-6.13
17-10,0.45,-1.32,0.99,-2.16,-0.29,2.43,3.12,4.97,0.33,2.58,0.83,-2.18,-0.4,-0.75,-1.44,-1.8,-4.74,-3.16,-1.85,-3.03,-2.06,-1.38,-0.6,-4.47,-2.73,-3.08,-3.73,-6.47,-1.96,-4.25,-5.34,-1.58,-3.09,-5.7,-5.06,-9.37
17-11,2.79,-0.27,2.75,-2.18,0.18,-0.35,-0.01,0.77,3.18,2.4,-0.44,-1.43,1.91,0.27,-2.64,-1.06,-3.87,-2.44,-1.51,-2.39,-1.66,1.15,-3.32,-1.32,-0.79,-1.53,-1.35,-5.35,-1.91,-4.94,-2.22,-2.92,-2.94,-5.83,-4.52,-8.54
17-12,0.29,-4.84,3.85,-0.49,-0.29,1.16,1.2,0.64,2.34,4.95,1.82,0.0,-0.83,-2.76,-2.68,-1.98,-2.22,-0.81,-3.23,-2.06,0.27,-1.03,0.96,-7.07,-2.73,-5.16,-3.79,-6.35,-2.07,-5.8,-5.35,-1.17,-3.02,-2.63,-2.51,-7.27


5.088061792947261

# Results by Month: MODEL COMPARISON - MATCH MODELS

In [132]:
def model_comparison_matchmonths(input_dict, F1_TYPE = "macro", REDDIT_FILTER = 0):
    """Display and return the F1 score of six model types, averaged across test months.

    Parameters
    ----------
    input_dict : dict
        Maps model type (e.g. "base+month") to a dict that maps run names to
        DataFrames with 'label' and 'prediction' columns. It has to contain the
        six model types listed in `table_columns`; further types are ignored.
    F1_TYPE : str
        Passed to f1_score as `average`.
    REDDIT_FILTER : int
        If non-zero, only runs whose name contains "_<REDDIT_FILTER>k-test" are used.

    Returns
    -------
    pandas.DataFrame
        One row per model type and a single column (named 0) with the mean score.
        A renamed, percent-formatted copy is rendered with `display`.
    """
    table_columns = ['base+rand', 'rand+rand', 'month+rand', 'base+month', 'rand+month', 'month+month']
    size_tag = f"_{REDDIT_FILTER}k-test"

    # (pattern, year group, month group) for the places a month appears in a run name
    train_month = (r'(.*?)train_(.*?)_(.*?)_(.*?)', 2, 3)
    test_month = (r'(.*?)test_(.*?)_(.*?)_(.*?)', 2, 3)
    adapt_month = (r'bert-(.*?)_(.*?)_(.*?)', 1, 2)

    def year_month(spec, key):
        pattern, year_group, month_group = spec
        match = re.search(pattern, key)
        return match.group(year_group), match.group(month_group)

    # Series of scores for each model type, only for test = train/adapt month where
    # applicable. Only runs that end up in the table are scored; scoring everything
    # first and filtering afterwards gives the same table at a much higher cost.
    score_series = {}

    for model_type in input_dict:
        if model_type not in table_columns:
            # removed by the column selection below anyway
            continue

        if "+month" in model_type:
            must_equal_test = train_month   # finetuned on the test month
        elif "month+rand" in model_type:
            must_equal_test = adapt_month   # adapted to the test month
        else:
            must_equal_test = None          # no month to match: keep every test month

        score_series[model_type] = {}
        for key, frame in input_dict[model_type].items():
            if REDDIT_FILTER != 0 and size_tag not in key:
                continue
            tested = year_month(test_month, key)
            if must_equal_test is not None and year_month(must_equal_test, key) != tested:
                continue
            score_series[model_type][tested[0] + "_" + tested[1]] = f1_score(frame['label'], frame['prediction'], average=F1_TYPE)

    month_results_df = pd.DataFrame.from_dict(score_series, orient = "index").T
    month_results_df = month_results_df[table_columns].sort_index()

    cm = sns.color_palette('Greens', as_cmap=True)

    # per-month table, if needed:
    #display(month_results_df.style.background_gradient(cmap=cm, axis = 1 ).format('{0:,.2%}'))

    plot_df = pd.DataFrame(month_results_df.mean(axis=0))

    display(plot_df.rename(columns={0: f"avg. {F1_TYPE} F1 across months"}).style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}'))

    return plot_df

In [140]:
# average scores per finetuning size (in thousands), keyed by that size
plot_df = {}
for size in (2, 20):
    print(f"REDDIT, {size}K")
    plot_df[size] = model_comparison_matchmonths(results["reddit"], F1_TYPE="macro", REDDIT_FILTER=size)

REDDIT, 2K


Unnamed: 0,avg. macro F1 across months
base+rand,35.95%
rand+rand,39.11%
month+rand,39.01%
base+month,37.50%
rand+month,40.19%
month+month,40.38%


REDDIT, 20K


Unnamed: 0,avg. macro F1 across months
base+rand,43.21%
rand+rand,43.84%
month+rand,43.81%
base+month,45.41%
rand+month,46.02%
month+month,46.12%


In [159]:
# put the 2k and 20k averages side by side, one row per model type
plot = plot_df[2].rename(columns={0: "2k"}).merge(plot_df[20].rename(columns={0: "20k"}),left_index=True, right_index=True)

cm = sns.color_palette('Greens', as_cmap=True)    
    
# multiply by 100 to get pct values
plot = plot.multiply(100)
# number the rows in presentation order (row 6 is month+month; it used to be mislabelled "rand+month")
plot.rename(index={"base+rand": "1) base+rand",
                  "rand+rand": "2) rand+rand",
                  "month+rand": "3) month+rand",
                  "base+month": "4) base+month",
                  "rand+month": "5) rand+month",
                  "month+month": "6) month+month"}, inplace=True)
        
plot.style.background_gradient(cmap=cm, axis = None ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2)

Unnamed: 0,2k,20k
1) base+rand,35.95,43.21
2) rand+rand,39.11,43.84
3) month+rand,39.01,43.81
4) base+month,37.5,45.41
5) rand+month,40.19,46.02
6) rand+month,40.38,46.12


In [216]:
# separate analysis for GQ, which is entirely from October 2018

# load data: the CSVs of every sub-directory of the gq results go into one flat dict
directory = '../../0_results/classification/gq'
results["gq"] = {}
for split in os.listdir(directory):
    split_dir = os.path.join(directory, split)
    for result_csv in os.listdir(split_dir):
        if not result_csv.endswith(".csv"):
            continue
        results["gq"][os.path.splitext(result_csv)[0]] = pd.read_csv(os.path.join(split_dir, result_csv))

# calculate f1 scores --> only for models finetuned on same amount of data (26k)
F1_TYPE="macro"

scores = {}
for key in results["gq"]:
    if "train_rand_26k" not in key:
        continue
    # the part of the run name in front of "-train" identifies the model
    model = re.search("(.*?)-train(.*?)",key).group(1)
    scores[model] = f1_score(results["gq"][key]['label'], results["gq"][key]['prediction'], average=F1_TYPE)

# one row per model, sorted by name; `cm` is the colormap defined in an earlier cell
plot_df = pd.DataFrame.from_dict(scores, orient="index")
plot_df.sort_index(inplace=True)
plot_df.rename(columns={0: f"{F1_TYPE} F1"}).style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}')

Unnamed: 0,macro F1
bert-2018_01_1m,91.82%
bert-2018_02_1m,91.82%
bert-2018_03_1m,91.85%
bert-2018_04_1m,91.69%
bert-2018_05_1m,91.75%
bert-2018_06_1m,91.73%
bert-2018_07_1m,91.72%
bert-2018_08_1m,91.78%
bert-2018_09_1m,91.80%
bert-2018_10_1m,92.18%


# Results by Month: MODEL COMPARISON - SHIFT MODELS

In [150]:
def model_comparison_shiftmonths(DATASET="reddit", SHIFT_OFFSET=12, F1_TYPE = "macro", REDDIT_FILTER = 2):
    """Display and return the average F1 of base, month-adapted and shift-adapted models.

    Reads the notebook-level `results[DATASET]` dict. Of the "shift+<SHIFT_OFFSET>"
    runs it keeps those whose adaptation month ("bert-<month>_1m") equals the test
    month ("-test_<month>_5k") and whose name contains "_<REDDIT_FILTER>k-test".
    "base+month" and "month+month" runs are then selected if they share both the
    finetuning part and the test part of the name with one of those shift runs.
    This answers, for months where adapt = test:
      * how does the base model finetuned on the same month as the shift model perform?
      * how does the month-adapted model finetuned on the same month perform?
      * how does the shift-adapted model perform?
    ("rand+month" is not part of the comparison.)

    Parameters
    ----------
    DATASET : str
        Key into the notebook-level `results` dict.
    SHIFT_OFFSET : int
        Selects the "shift+<SHIFT_OFFSET>" model type.
    F1_TYPE : str
        Passed to f1_score as `average`.
    REDDIT_FILTER : int
        Finetuning size (in thousands) that the run names have to carry.

    Returns
    -------
    pandas.DataFrame
        One row per model type, one column "avg. <F1_TYPE> F1 across months".
        The table is also rendered with `display`.
    """
    runs = results[DATASET]
    shift_type = f"shift+{SHIFT_OFFSET}"
    # Anchored on "-test" like the size filters of the other functions, so that the
    # size of the test set ("_5k") can never be mistaken for the finetuning size.
    size_tag = f"_{REDDIT_FILTER}k-test"

    def adapted_to_test_month(key):
        return re.search("bert-(.*?)_1m", key).group(1) == re.search("(.*?)-test_(.*?)_5k", key).group(2)

    def train_and_test(key):
        # (finetuning part, test part) of a run name
        match = re.search("(.*?)-train_(.*?)-test(.*)", key)
        return match.group(2), match.group(3)

    shift_keys = sorted(key for key in runs[shift_type] if adapted_to_test_month(key) and size_tag in key)

    # set lookup instead of comparing every candidate with every shift run
    wanted = {train_and_test(key) for key in shift_keys}
    base_keys = sorted(key for key in runs["base+month"] if train_and_test(key) in wanted)
    match_keys = sorted(key for key in runs["month+month"] if train_and_test(key) in wanted)

    # calculate f1 scores, keyed by test set
    scores = {}
    for model_type, keys in [("base+month", base_keys), ("month+month", match_keys), (shift_type, shift_keys)]:
        scores[model_type] = {}
        for key in keys:
            frame = runs[model_type][key]
            scores[model_type]["test"+re.search(r'(.*?)-test(.*)',key).group(2)] = f1_score(frame['label'], frame['prediction'], average=F1_TYPE)

    month_results_df = pd.DataFrame.from_dict(scores, orient = "index").T
    month_results_df.sort_index(inplace=True)

    cm = sns.color_palette('Greens', as_cmap=True)

    # table with separate row for each month
    #display(month_results_df.style.background_gradient(cmap=cm, axis = 1 ).format('{0:,.4%}'))

    # table with averages across months
    plot_df = pd.DataFrame(month_results_df.mean(axis=0)).rename(columns={0: f"avg. {F1_TYPE} F1 across months"})
    display(plot_df.style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}'))

    return plot_df

In [151]:
# shift_results["<size>k"]["shift+<offset>"] -> table returned by model_comparison_shiftmonths
shift_results = {}

for size_filter in (2, 20):

    size_label = f"{size_filter}k"
    shift_results[size_label] = {}

    for offset in (1, 3, 6, 12):
        print(f"{size_filter}k finetuning size -- adapted to data from {offset} months ahead of finetuning period")
        shift_results[size_label][f"shift+{offset}"] = model_comparison_shiftmonths(SHIFT_OFFSET=offset, REDDIT_FILTER=size_filter)
        print()

    print()

2k finetuning size -- adapted to data from 1 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,36.82%
month+month,39.52%
shift+1,39.59%



2k finetuning size -- adapted to data from 3 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,36.66%
month+month,39.03%
shift+3,39.03%



2k finetuning size -- adapted to data from 6 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,36.12%
month+month,38.43%
shift+6,38.47%



2k finetuning size -- adapted to data from 12 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,35.65%
month+month,37.76%
shift+12,37.78%




20k finetuning size -- adapted to data from 1 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,43.62%
month+month,44.15%
shift+1,44.13%



20k finetuning size -- adapted to data from 3 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,42.88%
month+month,43.40%
shift+3,43.47%



20k finetuning size -- adapted to data from 6 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,42.27%
month+month,42.61%
shift+6,42.86%



20k finetuning size -- adapted to data from 12 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,41.28%
month+month,41.75%
shift+12,41.79%






In [170]:
def summary_plot_shiftmodels(FINETUNING_SIZE):
    """Display the shift-model averages of one finetuning size as a single table.

    Reads the notebook-level `shift_results[f"{FINETUNING_SIZE}k"]`, joins its
    tables on the row labels (one column per shift offset), multiplies the values
    by 100 and renders the result with `display`. Nothing is returned.
    """
    tables_by_shift = shift_results[f"{FINETUNING_SIZE}k"]

    plot_df = pd.DataFrame()
    for shift, table in tables_by_shift.items():

        # give the shift row a common label and name the column after the offset
        load_df = table.rename(index={shift: "shift"}, columns={"avg. macro F1 across months": shift})

        # the first table starts the result, every further one is joined on the index
        plot_df = load_df if len(plot_df) == 0 else plot_df.merge(load_df,left_index=True, right_index=True)

    plot_df = plot_df.multiply(100)

    # format column names: "shift+3" -> "+3"
    plot_df.columns = pd.Series(plot_df.columns).apply(lambda x: re.search("shift(.*)", x).group(1))

    # format row names --> index
    plot_df = plot_df.rename(index={"base+month": "3) no adapt.",
                                    "month+month": "2) ft-month adapt.",
                                    "shift": "1) test-month adapt."})

    # reverse the row order
    plot_df = plot_df.iloc[::-1]

    cm = sns.color_palette('Greens', as_cmap=True)

    display(plot_df.style.background_gradient(cmap=cm, axis = None ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))
    
# one summary table per finetuning size
for fs in (2, 20):
    print(f"FINETUNING SIZE: {fs}K")
    summary_plot_shiftmodels(FINETUNING_SIZE=fs)
    print()

FINETUNING SIZE: 2K


Unnamed: 0,+1,+3,+6,+12
1) test-month adapt.,39.59,39.03,38.47,37.78
2) ft-month adapt.,39.52,39.03,38.43,37.76
3) no adapt.,36.82,36.66,36.12,35.65



FINETUNING SIZE: 20K


Unnamed: 0,+1,+3,+6,+12
1) test-month adapt.,44.13,43.47,42.86,41.79
2) ft-month adapt.,44.15,43.4,42.61,41.75
3) no adapt.,43.62,42.88,42.27,41.28





In [18]:
# double-check results: recompute the macro F1 of one base+month run by hand
model_type="base+month"
adapt_time="base"
train_time="2019_11"
train_size=2
test_time="2020_02"

run_name = f"bert-{adapt_time}-train_{train_time}_{train_size}k-test_{test_time}_5k"
result = results["reddit"][model_type][run_name]
f1_score(result['label'], result['prediction'], average="macro")

0.3594026419627837

In [293]:
# double-check results: recompute the macro F1 of one shift+3 run by hand
model_type="shift+3"
adapt_time="2020_02"
train_time="2019_11"
train_size=2
test_time="2020_02"

run_name = f"bert-{adapt_time}_1m-train_{train_time}_{train_size}k-test_{test_time}_5k"
result = results["reddit"][model_type][run_name]
f1_score(result['label'], result['prediction'], average="macro")

0.3782501615876449