In [64]:
import pandas as pd
import csv
import os
import seaborn as sns
import numpy as np

from sklearn.metrics import f1_score

In [41]:
# Read every per-model result CSV in the month-models directory into `results`,
# keyed by the file name without its extension.
results = {}
directory = '../../0_results/classification/month-models'
for filename in os.listdir(directory):
    # Anything that is not a CSV file is ignored.
    if not filename.endswith(".csv"):
        continue
    stem, _ext = os.path.splitext(filename)
    results[stem] = pd.read_csv(os.path.join(directory, filename))

In [58]:
# F1 scores for the base model fine-tuned on different months.
# The displayed table has one row per fine-tuned model and one column per
# test set, both in sorted result-name order.

f1_type = 'macro'  # `average` argument passed to f1_score
scores = {}

# Score only the result files whose name contains "base".
for key in results:
    if "base" in key:
        scores[key] = f1_score(results[key]['label'], results[key]['prediction'], average=f1_type)

# The first 18 characters of a result name identify the fine-tuned model
# (presumably 'bert-base-train-NN' -- confirm against the file names).
# Group by that exact prefix rather than by a substring test, so a result can
# only ever land in its own model's column. Walking the keys in sorted order
# keeps both the models and the scores within each model sorted by name.
model_prefix_len = 18
score_series = {}
for key in sorted(scores):
    score_series.setdefault(key[:model_prefix_len], []).append(scores[key])

# One column per model; reset_index exposes the 0-based row position as a
# column, which is then renamed to 'model'.
out_df = pd.DataFrame.from_dict(score_series).reset_index().rename(columns={'index': 'model'})
# Literal (non-regex) prefix strip, independent of the pandas default for `regex`.
out_df.columns = out_df.columns.str.replace('bert-base-', '', regex=False)
# Turn the row position into a 1-based, zero-padded test label. `:02d` also
# handles more than ten test sets (11 -> 'test-11', not 'test-011').
# NOTE(review): assumes position i of every model's sorted results is test set i+1.
out_df['model'] = out_df['model'].apply(lambda x: f'test-{int(x) + 1:02d}')

out_df = out_df.set_index('model')

# Keep a transposed copy (rows = models, columns = test sets) for later cells.
base_df = out_df.T.copy()

cm = sns.color_palette('Greens', as_cmap=True)

# Column-wise colour gradient (axis=0), values rendered as percentages.
out_df.T.style.background_gradient(cmap=cm, axis=0).format('{0:,.2%}')

model,test-01,test-02,test-03,test-04,test-05,test-06,test-07,test-08,test-09,test-10
train-01,73.92%,64.76%,58.72%,62.67%,61.85%,61.18%,70.08%,65.36%,59.08%,70.08%
train-02,72.30%,69.21%,70.65%,65.76%,66.70%,68.33%,73.41%,68.33%,64.97%,72.43%
train-03,70.15%,70.66%,64.98%,58.41%,69.09%,67.65%,65.76%,70.19%,63.44%,71.14%
train-04,74.44%,73.41%,70.65%,68.62%,67.91%,69.30%,72.76%,70.19%,59.14%,71.57%
train-05,72.21%,70.15%,69.09%,64.57%,68.87%,66.17%,73.59%,67.03%,61.53%,77.50%
train-06,74.95%,70.64%,66.17%,64.06%,67.03%,66.17%,68.62%,63.28%,61.89%,74.99%
train-07,70.64%,64.90%,58.78%,66.24%,66.07%,66.32%,74.44%,66.17%,60.52%,71.15%
train-08,73.21%,70.64%,64.60%,65.80%,67.91%,68.16%,66.47%,72.67%,61.18%,71.63%
train-09,69.09%,69.30%,66.89%,71.85%,66.70%,65.31%,71.33%,65.36%,66.70%,68.94%
train-10,77.64%,68.76%,68.05%,71.63%,69.77%,71.06%,73.71%,68.16%,67.49%,69.35%


In [59]:
# F1 scores for the month-adapted models fine-tuned on different months.
# The displayed table has one row per fine-tuned model and one column per
# test set, both in sorted result-name order.

f1_type = 'macro'  # `average` argument passed to f1_score
scores = {}

# Score only the result files whose name does NOT contain "base".
for key in results:
    if "base" not in key:
        scores[key] = f1_score(results[key]['label'], results[key]['prediction'], average=f1_type)

# The first 19 characters of a result name identify the fine-tuned model
# (presumably 'bert-NN-1m-train-NN' -- confirm against the file names).
# Group by that exact prefix rather than by a substring test, so a result can
# only ever land in its own model's column. Walking the keys in sorted order
# keeps both the models and the scores within each model sorted by name.
model_prefix_len = 19
score_series = {}
for key in sorted(scores):
    score_series.setdefault(key[:model_prefix_len], []).append(scores[key])

# One column per model; reset_index exposes the 0-based row position as a
# column, which is then renamed to 'model'.
out_df = pd.DataFrame.from_dict(score_series).reset_index().rename(columns={'index': 'model'})
# Literal (non-regex) prefix strip, independent of the pandas default for `regex`.
out_df.columns = out_df.columns.str.replace('bert-', '', regex=False)
# Turn the row position into a 1-based, zero-padded test label. `:02d` also
# handles more than ten test sets (11 -> 'test-11', not 'test-011').
# NOTE(review): assumes position i of every model's sorted results is test set i+1.
out_df['model'] = out_df['model'].apply(lambda x: f'test-{int(x) + 1:02d}')

out_df = out_df.set_index('model')

# Keep a transposed copy (rows = models, columns = test sets) for later cells.
adapted_df = out_df.T.copy()

cm = sns.color_palette('Greens', as_cmap=True)

# Column-wise colour gradient (axis=0), values rendered as percentages.
out_df.T.style.background_gradient(cmap=cm, axis=0).format('{0:,.2%}')

model,test-01,test-02,test-03,test-04,test-05,test-06,test-07,test-08,test-09,test-10
01-1m-train-01,77.08%,65.80%,65.69%,65.38%,66.47%,64.60%,71.63%,64.18%,62.74%,68.16%
02-1m-train-02,76.91%,71.57%,68.05%,64.06%,67.09%,68.16%,73.41%,72.91%,61.18%,73.92%
03-1m-train-03,78.97%,66.87%,71.85%,65.38%,61.14%,68.45%,71.10%,73.41%,63.08%,75.46%
04-1m-train-04,80.10%,72.55%,69.30%,70.08%,67.91%,66.07%,72.67%,74.72%,60.84%,71.57%
05-1m-train-05,72.21%,71.15%,71.14%,68.62%,64.57%,69.57%,68.92%,70.08%,67.03%,71.63%
06-1m-train-06,72.78%,70.15%,70.64%,65.34%,65.36%,66.59%,65.80%,71.63%,63.28%,64.97%
07-1m-train-07,76.91%,66.24%,64.57%,64.97%,63.08%,67.65%,72.55%,68.76%,64.60%,70.66%
08-1m-train-08,73.21%,71.15%,73.06%,67.03%,65.76%,62.91%,71.96%,71.63%,62.54%,73.92%
09-1m-train-09,73.77%,71.15%,65.36%,74.99%,65.36%,71.14%,74.44%,64.18%,68.62%,72.05%
10-1m-train-10,76.32%,69.35%,72.61%,71.10%,65.95%,73.05%,71.96%,69.35%,69.35%,74.22%


In [69]:
# Per-cell F1 gain of the month-adapted models over the base models.
# The base scores are subtracted as a raw array, i.e. by position rather than
# by label, and the result keeps the row/column labels of `adapted_df`.
(
    adapted_df
    .sub(base_df.values)
    .style
    .background_gradient(cmap=cm, axis=0)
    .format('{0:,.2%}')
)

model,test-01,test-02,test-03,test-04,test-05,test-06,test-07,test-08,test-09,test-10
01-1m-train-01,3.16%,1.04%,6.96%,2.71%,4.62%,3.42%,1.55%,-1.18%,3.66%,-1.92%
02-1m-train-02,4.61%,2.36%,-2.60%,-1.70%,0.39%,-0.17%,0.00%,4.58%,-3.79%,1.49%
03-1m-train-03,8.82%,-3.79%,6.88%,6.97%,-7.95%,0.80%,5.34%,3.22%,-0.35%,4.33%
04-1m-train-04,5.66%,-0.86%,-1.35%,1.46%,0.00%,-3.23%,-0.09%,4.53%,1.71%,0.00%
05-1m-train-05,0.00%,1.00%,2.05%,4.05%,-4.30%,3.40%,-4.67%,3.05%,5.50%,-5.87%
06-1m-train-06,-2.18%,-0.49%,4.47%,1.28%,-1.66%,0.42%,-2.81%,8.35%,1.39%,-10.02%
07-1m-train-07,6.27%,1.34%,5.78%,-1.28%,-2.99%,1.33%,-1.89%,2.59%,4.08%,-0.49%
08-1m-train-08,0.00%,0.51%,8.46%,1.22%,-2.14%,-5.26%,5.49%,-1.04%,1.36%,2.29%
09-1m-train-09,4.68%,1.85%,-1.52%,3.13%,-1.34%,5.82%,3.12%,-1.18%,1.92%,3.11%
10-1m-train-10,-1.32%,0.59%,4.56%,-0.53%,-3.83%,2.00%,-1.75%,1.19%,1.86%,4.86%


In [71]:
# Mean F1 on the diagonal (test month == fine-tuning month) of each table,
# month-adapted minus base.
# Author's reading of the result:
# --> slight advantage of month-adapted models
# on off-diagonal months, the month-adapted models still have a domain
# advantage but not a temporal advantage
adapted_diag_mean = np.diag(adapted_df).mean()
base_diag_mean = np.diag(base_df).mean()
adapted_diag_mean - base_diag_mean

0.013823256844179177