# EVALUATION OF CLASSIFICATION RESULTS

In [107]:
import pandas as pd
import csv
import os
import seaborn as sns
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from matplotlib import colors

# Domain Adaptation vs Finetuning Size

In [167]:
# load overall results to dict of dicts, with each subdict corresponding to one dataset (--> ghc, gq and reddit)

results = {}
directory = '../../0_results/classification'
for dataset_name in os.listdir(directory):
    print(f"loading {dataset_name} results to sub-dict")
    results[dataset_name] = {}
    for result_csv in os.listdir(os.path.join(directory, dataset_name, "total-models")):
        if result_csv.endswith(".csv"):
            results[dataset_name][os.path.splitext(result_csv)[0]] = pd.read_csv(os.path.join(directory, dataset_name, "total-models", result_csv))

loading ghc results to sub-dict
loading reddit results to sub-dict
loading gq results to sub-dict


In [168]:
def domain_vs_finetuning_size(dict_of_results, F1_TYPE = "macro"):
    
    scores = {}

    for result in dict_of_results:
        scores[result] = f1_score(dict_of_results[result]['label'], dict_of_results[result]['prediction'], average=F1_TYPE)

    score_series = {}
    for model in sorted(pd.unique(pd.Series(scores.keys()).apply(lambda x: x.split('-train')[0]))):
        score_series[model] = {}
        for key in sorted(scores):
            if model in key and "4k" not in key:
                finetuning_size = int(re.search(r'(.*?)-train_rand_(.*?)-test(.*?)', key).group(2).rstrip("k"))*1000
                score_series[model][finetuning_size if finetuning_size != 1552000 else 1552] = scores[key] # if catches special case for ghc: missing "k"
    
    plot_df = pd.DataFrame.from_dict(score_series).reset_index().rename(columns={'index':'model'})
    plot_df.set_index('model', inplace=True)
    plot_df.index.name="Adapt."
    
    # transpose to put models in rows, test sets in columns
    plot_df = plot_df.sort_index().T
    
    # rearrange so that largest pretraining size is at bottom of df
    plot_df = pd.concat([plot_df.loc[plot_df.index != "bert-rand_10m"], plot_df.loc[plot_df.index == "bert-rand_10m"]], axis=0)
    
    cm = sns.color_palette('Greens', as_cmap=True)
    
    # format column names
    plot_df.columns = pd.Series(plot_df.columns).apply(lambda x: str(x)[:-3] + "k")
    
    # format row names --> index
    plot_df.rename(index={"bert-base": "bert-rand_0m"}, inplace=True)
    for elem in plot_df.index:
        plot_df.rename(index={elem: re.search("(.*?)_(.*)", elem).group(2)}, inplace=True)
    plot_df.rename(index={"0m": "0"}, inplace=True)
    
    # multiply by 100 to get pct values
    plot_df = plot_df.multiply(100)
    
    display(plot_df.style.background_gradient(cmap=cm, axis = None ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))
    
    return plot_df

In [169]:
for dataset in ["reddit", "ghc", "gq"]:
    print(dataset.upper())
    out = domain_vs_finetuning_size(results[dataset])
    print()

REDDIT


Adapt.,1k,2k,5k,10k,20k,40k,80k,160k,320k
0,34.19,35.7,39.22,41.65,43.22,44.65,46.65,47.92,49.44
1m,37.76,39.26,41.31,42.91,44.03,45.18,46.87,47.94,49.51
2m,38.45,39.57,42.05,42.69,43.43,45.39,46.93,48.38,48.98
5m,38.78,39.84,42.16,43.42,44.46,45.86,47.37,47.93,49.82
10m,38.7,40.37,42.4,43.47,44.53,45.84,47.23,48.44,50.05



GHC


ValueError: zero-size array to reduction operation fmin which has no identity

<pandas.io.formats.style.Styler at 0x7f8a9b9cb5e0>


GQ


Adapt.,1k,2k,5k,10k,20k,26k
0,90.48,90.73,90.98,91.4,91.8,92.0
1m,90.99,91.19,91.06,91.46,91.92,91.77
2m,90.77,91.25,91.35,91.66,91.72,91.77
5m,90.75,91.38,91.44,91.45,91.93,91.98
10m,90.93,91.3,91.3,91.5,91.89,91.86





# Results by Month: INDIVIDUAL MODELS

In [82]:
%%time

# load monthly results to dict of dicts of dicts, with each subdict corresponding to one dataset (--> ghc and reddit) and subsubdict to one model type

results = {}
directory = '../../0_results/classification'
for dataset_name in os.listdir(directory):
    if dataset_name != "gq":
        print(f"loading {dataset_name} results to sub-dict")
        results[dataset_name] = {}
        for model_type in sorted(os.listdir(os.path.join(directory, dataset_name, "month-models"))):
            if model_type!="z_old":
                print(f"  {model_type} to sub-sub-dict")
                results[dataset_name][model_type] = {}
                for result_csv in os.listdir(os.path.join(directory, dataset_name, "month-models", model_type)):
                    if result_csv.endswith(".csv"):
                        results[dataset_name][model_type][os.path.splitext(result_csv)[0]] = \
                        pd.read_csv(os.path.join(directory, dataset_name, "month-models", model_type, result_csv))

loading ghc results to sub-dict
  base+month to sub-sub-dict
  base+rand to sub-sub-dict
  month+month to sub-sub-dict
  month+rand to sub-sub-dict
  rand+month to sub-sub-dict
  rand+rand to sub-sub-dict
loading reddit results to sub-dict
  base+month to sub-sub-dict
  base+rand to sub-sub-dict
  month+month to sub-sub-dict
  month+rand to sub-sub-dict
  rand+month to sub-sub-dict
  rand+rand to sub-sub-dict
  shift+1 to sub-sub-dict
  shift+12 to sub-sub-dict
  shift+3 to sub-sub-dict
  shift+6 to sub-sub-dict
CPU times: user 49.7 s, sys: 14.3 s, total: 1min 4s
Wall time: 3min 5s


In [163]:
# helper function for diverging color palette
def background_gradient(s, m, M, cmap='BuPu', low=0, high=0):
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

def plot_F1_monthly(input_df, REDDIT_FILTER=0, F1_TYPE = "macro", BASELINE_MODEL="",
                    YEAR_FILTER = ["2017", "2018", "2019", "2020"],
                    MONTH_FILTER = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]):

    # calculate f1 
    scores = {}
    
    for key in input_df :
        if REDDIT_FILTER!=0:
            if f"_{REDDIT_FILTER}k-test" in key:
                scores[key] = f1_score(input_df[key]['label'], input_df[key]['prediction'], average=F1_TYPE)
        else:
            scores[key] = f1_score(input_df[key]['label'], input_df[key]['prediction'], average=F1_TYPE)
    
    score_series = {}
    for model in sorted(pd.unique(pd.Series(scores.keys()).apply(lambda x: re.search(r'(.*?)-test(.*?)', x).group(1)))):
        if any("_"+year in model for year in YEAR_FILTER) and any("_"+month in model for month in MONTH_FILTER):
            score_series[model] = {}
            for key in sorted(scores):
                if model in key and any("_"+year in re.search(r'(.*?)-test(.*)', key).group(2) for year in YEAR_FILTER) \
                and any("_"+month in re.search(r'(.*?)-test(.*)', key).group(2) for month in MONTH_FILTER):
                    score_series[model]["test" +re.search(r'(.*?)-test(.*)', key).group(2)] = scores[key]
    
    
    plot_df = pd.DataFrame.from_dict(score_series).reset_index().rename(columns={'index':'model'})

    plot_df.set_index('model', inplace=True)
    plot_df.index.name = "Finetune"

    plot_df = plot_df.T
    
    cm = sns.color_palette('Greens', as_cmap=True)

    
    if BASELINE_MODEL!="":
        baseline_series = {}
        for key in results["reddit"][BASELINE_MODEL]:
                if f"{REDDIT_FILTER}k" in key:
                    baseline_series["test" + re.search(r'(.*?)-test(.*)', key).group(2)] = f1_score(results["reddit"][BASELINE_MODEL][key]['label'],
                                                                                                    results["reddit"][BASELINE_MODEL][key]['prediction'],
                                                                                                    average=F1_TYPE)
        for column in plot_df.columns:
            plot_df[column] = plot_df[column].divide(baseline_series[column])-1

        
        # format column names
        plot_df.columns = pd.Series(plot_df.columns).apply(lambda x: re.search("test_(.*?)_(.*?)_(.*)", x).group(1)[-2:] + "-" + re.search("test_(.*?)_(.*?)_(.*)", x).group(2))
        
        # format row names --> index
        for elem in plot_df.index:
            plot_df.rename(index={elem: re.search("bert-(.*?)-train_(.*?)_(.*?)_(.*)", elem).group(2)[-2:] + "-" + re.search("bert-(.*?)-train_(.*?)_(.*?)_(.*)", elem).group(3)}, inplace=True)
    
        # multiply by 100 to get pct values
        plot_df = plot_df.multiply(100)
        
        # define diverging color palette
        cm = sns.diverging_palette(10, 150, as_cmap=True)
        even_range = np.max([np.abs(plot_df.values.min()), np.abs(plot_df.values.max())])
        
        display(plot_df.style.apply(background_gradient,cmap=cm, m=-even_range, M=even_range)\
                .set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))
    
    else:
        # format column names
        plot_df.columns = pd.Series(plot_df.columns).apply(lambda x: re.search("test_(.*?)_(.*?)_(.*)", x).group(1)[-2:] + "-" + re.search("test_(.*?)_(.*?)_(.*)", x).group(2))
        
        # format row names --> index
        for elem in plot_df.index:
            plot_df.rename(index={elem: re.search("bert-(.*?)-train_(.*?)_(.*?)_(.*)", elem).group(2)[-2:] + "-" + re.search("bert-(.*?)-train_(.*?)_(.*?)_(.*)", elem).group(3)}, inplace=True)
    
        # multiply by 100 to get pct values
        plot_df = plot_df.multiply(100)
        
        display(plot_df.style.background_gradient(cmap=cm, axis = 0 ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))

In [124]:
plot_F1_monthly(input_df = results["reddit"]["base+month"], REDDIT_FILTER = 20, MONTH_FILTER = ["04", "08", "12"])

model,17-04,17-08,17-12,18-04,18-08,18-12,19-04,19-08,19-12
17-04,49.43,44.66,44.15,42.03,39.44,41.05,40.75,37.18,38.83
17-08,45.73,47.12,44.1,42.8,40.57,41.71,41.47,38.62,40.13
17-12,44.04,44.96,46.43,41.34,41.55,41.17,40.45,38.31,41.24
18-04,45.0,43.68,44.18,44.04,41.87,41.75,41.4,38.58,39.69
18-08,45.02,44.21,43.95,42.09,42.45,42.3,41.42,39.42,39.93
18-12,44.47,43.72,42.68,42.13,41.59,44.1,42.32,39.5,40.14
19-04,43.55,42.31,42.16,41.25,41.26,41.39,45.26,40.43,43.01
19-08,41.27,41.59,40.92,40.97,39.95,40.71,42.18,42.24,41.59
19-12,41.51,41.33,41.67,40.24,39.84,40.79,40.63,41.04,43.63


In [164]:
plot_F1_monthly(input_df = results["reddit"]["base+month"], REDDIT_FILTER = 20, BASELINE_MODEL="base+rand", MONTH_FILTER = ["04", "08", "12"])

Finetune,17-04,17-08,17-12,18-04,18-08,18-12,19-04,19-08,19-12
17-04,6.81,0.48,-0.23,-1.13,-5.83,-1.32,-4.47,-8.58,-8.32
17-08,-1.2,6.0,-0.32,0.67,-3.15,0.28,-2.77,-5.02,-5.23
17-12,-4.84,1.16,4.95,-2.76,-0.81,-1.03,-5.16,-5.8,-2.63
18-04,-2.76,-1.72,-0.14,3.6,-0.04,0.37,-2.93,-5.14,-6.28
18-08,-2.71,-0.54,-0.66,-0.99,1.35,1.7,-2.89,-3.07,-5.7
18-12,-3.91,-1.64,-3.53,-0.89,-0.7,6.01,-0.79,-2.85,-5.22
19-04,-5.89,-4.82,-4.72,-2.96,-1.51,-0.49,6.12,-0.58,1.57
19-08,-10.82,-6.43,-7.52,-3.62,-4.62,-2.13,-1.1,3.88,-1.78
19-12,-10.31,-7.02,-5.82,-5.35,-4.88,-1.93,-4.75,0.92,3.03


# Results by Month: MODEL COMPARISON - MATCH MODELS

In [132]:
def model_comparison_matchmonths(input_dict, F1_TYPE = "macro", REDDIT_FILTER = 0):

    # calculate f1 scores
    scores = {}

    for model_type in input_dict:
        scores[model_type] = {}
        for key in input_dict[model_type]:
            if REDDIT_FILTER!=0:
                if f"_{REDDIT_FILTER}k-test" in key:
                    scores[model_type][key] = f1_score(input_dict[model_type][key]['label'], input_dict[model_type][key]['prediction'], average=F1_TYPE)
            else:
                scores[model_type][key] = f1_score(input_dict[model_type][key]['label'], input_dict[model_type][key]['prediction'], average=F1_TYPE)

    # write series of scores for each model type, only for test = train/adapt month where applicable
    score_series = {}
    
    for model_type in input_dict:
        score_series[model_type]={}
        
        if "+month" in model_type:
            for key in scores[model_type]:
                if (re.search(r'(.*?)train_(.*?)_(.*?)_(.*?)', key).group(2) == re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(2)) and \
                (re.search(r'(.*?)train_(.*?)_(.*?)_(.*?)', key).group(3) == re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(3)):
                    score_series[model_type][re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(2) + "_" +\
                                             re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(3)] = scores[model_type][key]
                    
        elif "month+rand" in model_type:
            for key in scores[model_type]:
                if (re.search(r'bert-(.*?)_(.*?)_(.*?)', key).group(1) == re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(2)) and \
                (re.search(r'bert-(.*?)_(.*?)_(.*?)', key).group(2) == re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(3)):
                    score_series[model_type][re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(2) + "_" +\
                                             re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(3)] = scores[model_type][key]
                    
        else:
            for key in scores[model_type]:
                score_series[model_type][re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(2) + "_" +\
                                         re.search(r'(.*?)test_(.*?)_(.*?)_(.*?)', key).group(3)] = scores[model_type][key]
    
    month_results_df = pd.DataFrame.from_dict(score_series, orient = "index").T
    month_results_df = month_results_df[['base+rand', 'rand+rand', 'month+rand', 'base+month', 'rand+month', 'month+month']]
    month_results_df.sort_index(inplace=True)
    
    cm = sns.color_palette('Greens', as_cmap=True)
    
    #display(month_results_df.style.background_gradient(cmap=cm, axis = 1 ).format('{0:,.2%}'))
    
    plot_df = pd.DataFrame(month_results_df.mean(axis=0))
    
    display(plot_df.rename(columns={0: f"avg. {F1_TYPE} F1 across months"}).style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}'))
    
    return plot_df

In [140]:
plot_df = {}
for size in [2, 20]:
    print(f"REDDIT, {size}K")
    plot_df[size] = model_comparison_matchmonths(input_dict = results["reddit"], REDDIT_FILTER = size, F1_TYPE="macro")

REDDIT, 2K


Unnamed: 0,avg. macro F1 across months
base+rand,35.95%
rand+rand,39.11%
month+rand,39.01%
base+month,37.50%
rand+month,40.19%
month+month,40.38%


REDDIT, 20K


Unnamed: 0,avg. macro F1 across months
base+rand,43.21%
rand+rand,43.84%
month+rand,43.81%
base+month,45.41%
rand+month,46.02%
month+month,46.12%


In [159]:
plot = plot_df[2].rename(columns={0: "2k"}).merge(plot_df[20].rename(columns={0: "20k"}),left_index=True, right_index=True)

cm = sns.color_palette('Greens', as_cmap=True)    
    
# multiply by 100 to get pct values
plot = plot.multiply(100)
plot.rename(index={"base+rand": "1) base+rand",
                  "rand+rand": "2) rand+rand",
                  "month+rand": "3) month+rand",
                  "base+month": "4) base+month",
                  "rand+month": "5) rand+month",
                  "month+month": "6) rand+month"}, inplace=True)
        
plot.style.background_gradient(cmap=cm, axis = None ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2)

Unnamed: 0,2k,20k
1) base+rand,35.95,43.21
2) rand+rand,39.11,43.84
3) month+rand,39.01,43.81
4) base+month,37.5,45.41
5) rand+month,40.19,46.02
6) rand+month,40.38,46.12


In [129]:

print("REDDIT, 20K")
model_comparison_matchmonths(input_dict = results["reddit"], REDDIT_FILTER = 20, F1_TYPE="macro")
print()

print("REDDIT, 2K")
model_comparison_matchmonths(input_dict = results["reddit"], REDDIT_FILTER = 2, F1_TYPE="macro")
print()

print("GHC")
model_comparison_matchmonths(input_dict = results["ghc"], F1_TYPE="macro")

REDDIT, 20K


KeyboardInterrupt: 

In [216]:
# separate analysis for GQ, which is entirely from October 2018

results["gq"] = {}

# load data
directory = '../../0_results/classification/gq'
for split in os.listdir(directory):        
    for result_csv in os.listdir(os.path.join(directory, split)):
        if result_csv.endswith(".csv"):
            results["gq"][os.path.splitext(result_csv)[0]] = pd.read_csv(os.path.join(directory, split, result_csv))
            
# calculate f1 scores --> only for models finetuned on same amount of data (26k)
F1_TYPE="macro"

scores = {}

for key in results["gq"]:
    if "train_rand_26k" in key:
        scores[re.search("(.*?)-train(.*?)",key).group(1)] = f1_score(results["gq"][key]['label'], results["gq"][key]['prediction'], average=F1_TYPE)

plot_df = pd.DataFrame.from_dict(scores, orient="index")
plot_df.sort_index(inplace=True)
plot_df.rename(columns={0: f"{F1_TYPE} F1"}).style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}')

Unnamed: 0,macro F1
bert-2018_01_1m,91.82%
bert-2018_02_1m,91.82%
bert-2018_03_1m,91.85%
bert-2018_04_1m,91.69%
bert-2018_05_1m,91.75%
bert-2018_06_1m,91.73%
bert-2018_07_1m,91.72%
bert-2018_08_1m,91.78%
bert-2018_09_1m,91.80%
bert-2018_10_1m,92.18%


# Results by Month: MODEL COMPARISON - SHIFT MODELS

In [150]:
def model_comparison_shiftmonths(DATASET="reddit", SHIFT_OFFSET=12, F1_TYPE = "macro", REDDIT_FILTER = 2):
    
    # for months where adapt = test
    # how does the base model finetuned on same month as the shift model perform?
    # how does the month-adapt model finetuned on same month as the shift model perform?
    # how does the shift-adapt model perform?

    shift_keys=pd.Series(results[DATASET][f"shift+{SHIFT_OFFSET}"].keys())
    shift_keys=pd.Series(sorted(shift_keys[shift_keys.apply(lambda x: re.search("bert-(.*?)_1m", x).group(1) == re.search("(.*?)-test_(.*?)_5k", x).group(2))]))
    shift_keys=shift_keys[shift_keys.apply(lambda x: f"_{REDDIT_FILTER}k" in x)]
    
    base_keys=pd.Series(results[DATASET]["base+month"].keys())
    base_keys=sorted(base_keys[base_keys.apply(lambda x: any((re.search("(.*?)-train_(.*?)-test(.*)", x).group(2) == re.search("(.*?)-train_(.*?)-test(.*)", sk).group(2)) and \
                                               (re.search("(.*?)-train_(.*?)-test(.*)", x).group(3) == re.search("(.*?)-train_(.*?)-test(.*)", sk).group(3)) for sk in shift_keys))] )
    
    rand_keys=pd.Series(results[DATASET]["rand+month"].keys())
    rand_keys=sorted(rand_keys[rand_keys.apply(lambda x: any((re.search("(.*?)-train_(.*?)-test(.*)", x).group(2) == re.search("(.*?)-train_(.*?)-test(.*)", sk).group(2)) and \
                                               (re.search("(.*?)-train_(.*?)-test(.*)", x).group(3) == re.search("(.*?)-train_(.*?)-test(.*)", sk).group(3)) for sk in shift_keys))] )
    
    match_keys=pd.Series(results[DATASET]["month+month"].keys())
    match_keys=sorted(match_keys[match_keys.apply(lambda x: any((re.search("(.*?)-train_(.*?)-test(.*)", x).group(2) == re.search("(.*?)-train_(.*?)-test(.*)", sk).group(2)) and \
                                               (re.search("(.*?)-train_(.*?)-test(.*)", x).group(3) == re.search("(.*?)-train_(.*?)-test(.*)", sk).group(3)) for sk in shift_keys))] )
    
    
    # calculate f1 scores
    scores = {}
    for model_type, keys in [(f"base+month", base_keys), (f"month+month", match_keys), (f"shift+{SHIFT_OFFSET}", shift_keys)]: #(f"rand+month", rand_keys),
        scores[model_type] = {}
        for key in keys:
            scores[model_type]["test"+re.search(r'(.*?)-test(.*)',key).group(2)] = f1_score(results[DATASET][model_type][key]['label'], results[DATASET][model_type][key]['prediction'], average=F1_TYPE)
    
    month_results_df = pd.DataFrame.from_dict(scores, orient = "index").T
    month_results_df.sort_index(inplace=True)
    
    cm = sns.color_palette('Greens', as_cmap=True)
    
    # table with separate row for each month
    #display(month_results_df.style.background_gradient(cmap=cm, axis = 1 ).format('{0:,.4%}'))
    
    # table with averages across months
    plot_df = pd.DataFrame(month_results_df.mean(axis=0)).rename(columns={0: f"avg. {F1_TYPE} F1 across months"})
    display(plot_df.style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}'))
    
    return plot_df

In [151]:
shift_results = {}

for size_filter in [2, 20]:
    
    shift_results[f"{size_filter}k"] = {}
    
    for offset in [1,3,6,12]:
        print(f"{size_filter}k finetuning size -- adapted to data from {offset} months ahead of finetuning period")
        shift_results[f"{size_filter}k"][f"shift+{offset}"] = model_comparison_shiftmonths(SHIFT_OFFSET=offset, REDDIT_FILTER=size_filter)
        print()
    
    print()

2k finetuning size -- adapted to data from 1 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,36.82%
month+month,39.52%
shift+1,39.59%



2k finetuning size -- adapted to data from 3 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,36.66%
month+month,39.03%
shift+3,39.03%



2k finetuning size -- adapted to data from 6 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,36.12%
month+month,38.43%
shift+6,38.47%



2k finetuning size -- adapted to data from 12 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,35.65%
month+month,37.76%
shift+12,37.78%




20k finetuning size -- adapted to data from 1 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,43.62%
month+month,44.15%
shift+1,44.13%



20k finetuning size -- adapted to data from 3 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,42.88%
month+month,43.40%
shift+3,43.47%



20k finetuning size -- adapted to data from 6 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,42.27%
month+month,42.61%
shift+6,42.86%



20k finetuning size -- adapted to data from 12 months ahead of finetuning period


Unnamed: 0,avg. macro F1 across months
base+month,41.28%
month+month,41.75%
shift+12,41.79%






In [170]:
def summary_plot_shiftmodels(FINETUNING_SIZE):

    plot_df = pd.DataFrame()
    for shift in shift_results[f"{FINETUNING_SIZE}k"]:

        load_df = shift_results[f"{FINETUNING_SIZE}k"][shift].rename(index={shift: "shift"}, columns={"avg. macro F1 across months": shift})

        if plot_df.shape[0] == 0:
            plot_df = load_df
        else:
            plot_df = plot_df.merge(load_df,left_index=True, right_index=True)

    cm = sns.color_palette('Greens', as_cmap=True)
    
    plot_df = plot_df.multiply(100)
    
    # format column names
    plot_df.columns = pd.Series(plot_df.columns).apply(lambda x: re.search("shift(.*)", x).group(1))

    # format row names --> index
    
    plot_df.rename(index={"base+month": "3) no adapt."}, inplace=True)
    plot_df.rename(index={"month+month": "2) ft-month adapt."}, inplace=True)
    plot_df.rename(index={"shift": "1) test-month adapt."}, inplace=True)
    
    plot_df = plot_df.iloc[::-1]
    
    display(plot_df.style.background_gradient(cmap=cm, axis = None ).set_table_attributes('style="font-family: CMU Serif; font-size:26px"').set_precision(2))
    
for fs in [2, 20]:
    print(f"FINETUNING SIZE: {fs}K")
    summary_plot_shiftmodels(fs)
    print()

FINETUNING SIZE: 2K


Unnamed: 0,+1,+3,+6,+12
1) test-month adapt.,39.59,39.03,38.47,37.78
2) ft-month adapt.,39.52,39.03,38.43,37.76
3) no adapt.,36.82,36.66,36.12,35.65



FINETUNING SIZE: 20K


Unnamed: 0,+1,+3,+6,+12
1) test-month adapt.,44.13,43.47,42.86,41.79
2) ft-month adapt.,44.15,43.4,42.61,41.75
3) no adapt.,43.62,42.88,42.27,41.28





In [18]:
# double-check results
model_type="base+month"
adapt_time="base"
train_time="2019_11"
train_size=2
test_time="2020_02"

result=results["reddit"][model_type][f"bert-{adapt_time}-train_{train_time}_{train_size}k-test_{test_time}_5k"]
f1_score(result['label'], result['prediction'], average="macro")

0.3594026419627837

In [293]:
# double-check results
model_type="shift+3"
adapt_time="2020_02"
train_time="2019_11"
train_size=2
test_time="2020_02"

result=results["reddit"][model_type][f"bert-{adapt_time}_1m-train_{train_time}_{train_size}k-test_{test_time}_5k"]
f1_score(result['label'], result['prediction'], average="macro")

0.3782501615876449