# EVALUATION OF CLASSIFICATION RESULTS

In [2]:
import pandas as pd
import csv
import os
import seaborn as sns
import numpy as np
from collections import defaultdict
import re

from sklearn.metrics import f1_score, classification_report

# Domain Adaptation vs Finetuning Size

In [70]:
# Load the overall ("total-models") results into a dict of dicts: one sub-dict per
# dataset directory (--> ghc, gq and reddit), mapping each result file's stem
# (file name without ".csv") to the DataFrame read from it.

results = {}
directory = '../../0_results/classification'
for dataset_name in os.listdir(directory):
    # os.listdir also returns plain files (e.g. .DS_Store); only directories are datasets
    if not os.path.isdir(os.path.join(directory, dataset_name)):
        continue
    print(f"loading {dataset_name} results to sub-dict")
    results[dataset_name] = {}
    total_models_dir = os.path.join(directory, dataset_name, "total-models")
    for result_csv in os.listdir(total_models_dir):
        if result_csv.endswith(".csv"):
            results[dataset_name][os.path.splitext(result_csv)[0]] = pd.read_csv(os.path.join(total_models_dir, result_csv))

loading ghc results to sub-dict
loading reddit results to sub-dict
loading gq results to sub-dict


In [85]:
def _finetuning_size_from_key(key):
    """Return the finetuning-set size encoded in a '...-train_rand_<size>-test...' name.

    A trailing 'k' means thousands ('20k' -> 20000); a size without the suffix
    is taken literally ('1552' -> 1552).

    Raises
    ------
    ValueError
        If `key` does not contain a '-train_rand_<size>-test' segment.
    """
    match = re.search(r'-train_rand_(.*?)-test', key)
    if match is None:
        raise ValueError(f"cannot parse finetuning size from result name {key!r}")
    size = match.group(1)
    return int(size[:-1]) * 1000 if size.endswith("k") else int(size)


def domain_vs_finetuning_size(dict_of_results, F1_TYPE = "macro"):
    """Tabulate F1 by model (rows) and finetuning-set size (columns).

    Parameters
    ----------
    dict_of_results : dict
        Maps a result name to a DataFrame with 'label' and 'prediction' columns.
        The model name is everything before the first '-train' in the result
        name; the finetuning size is read from its '-train_rand_<size>-test' part.
    F1_TYPE : str
        Passed to `f1_score` as `average`.

    Returns
    -------
    pandas.DataFrame
        One row per model and one column per finetuning size (ascending), with
        the 'bert-rand_10m' row moved to the bottom. A styled copy (green
        gradient over the whole table, values as percentages) is displayed as
        a side effect.

    Raises
    ------
    ValueError
        If a result name has no parsable finetuning size.
    """
    # Group scores by the *exact* model name. A substring test such as
    # `model in key` would also attribute 'bert-rand_10m-...' results to 'bert-rand_1m'.
    score_series = {}
    for key in sorted(dict_of_results):
        frame = dict_of_results[key]
        model = key.split('-train')[0]
        finetuning_size = _finetuning_size_from_key(key)
        score_series.setdefault(model, {})[finetuning_size] = f1_score(frame['label'], frame['prediction'], average=F1_TYPE)

    # models in sorted order, independent of how the full result names sort
    score_series = {model: score_series[model] for model in sorted(score_series)}

    # index = finetuning sizes (axis labelled 'model', as before), columns = models
    plot_df = pd.DataFrame.from_dict(score_series).rename_axis('model')

    # sort the sizes, then transpose to put models in rows and finetuning sizes in columns
    plot_df = plot_df.sort_index().T

    # rearrange so that largest pretraining size is at bottom of df
    is_largest = plot_df.index == "bert-rand_10m"
    plot_df = pd.concat([plot_df.loc[~is_largest], plot_df.loc[is_largest]], axis=0)

    cm = sns.color_palette('Greens', as_cmap=True)
    display(plot_df.style.background_gradient(cmap=cm, axis = None ).format('{0:,.2%}'))

    return plot_df

In [86]:
# Show the F1-by-finetuning-size table for each dataset under an upper-case heading;
# `out` keeps the DataFrame returned for the last dataset in the loop.
for dataset in ("reddit", "ghc", "gq"):
    print(dataset.upper())
    out = domain_vs_finetuning_size(dict_of_results=results[dataset])
    print()

REDDIT


model,1000,2000,4000,5000,10000,20000,40000,80000,160000,320000
bert-base,34.19%,35.70%,38.61%,39.22%,41.65%,43.22%,44.65%,46.65%,47.92%,49.44%
bert-rand_1m,37.76%,39.26%,41.39%,41.31%,42.91%,44.03%,45.18%,46.87%,47.94%,49.51%
bert-rand_2m,38.45%,39.57%,40.83%,42.05%,42.69%,43.43%,45.39%,46.93%,48.38%,48.98%
bert-rand_5m,38.78%,39.84%,41.76%,42.16%,43.42%,44.46%,45.86%,47.37%,47.93%,49.82%
bert-rand_10m,38.70%,40.37%,41.53%,42.40%,43.47%,44.53%,45.84%,47.23%,48.44%,50.05%



GHC


model,1000,1552,2000,5000,10000,16000
bert-base,65.28%,66.27%,68.34%,68.53%,69.49%,71.93%
bert-rand_1m,68.78%,66.88%,67.58%,67.99%,69.42%,71.27%
bert-rand_2m,66.37%,67.69%,68.44%,69.11%,71.28%,69.96%
bert-rand_5m,67.52%,67.54%,68.44%,69.87%,70.19%,70.21%
bert-rand_10m,66.63%,67.11%,70.61%,68.47%,71.35%,69.39%



GQ


model,1000,2000,5000,10000,20000,26000
bert-base,90.48%,90.73%,90.98%,91.40%,91.80%,92.00%
bert-rand_1m,90.99%,91.19%,91.06%,91.46%,91.92%,91.77%
bert-rand_2m,90.77%,91.25%,91.35%,91.66%,91.72%,91.77%
bert-rand_5m,90.75%,91.38%,91.44%,91.45%,91.93%,91.98%
bert-rand_10m,90.93%,91.30%,91.30%,91.50%,91.89%,91.86%





# Results by Month: INDIVIDUAL MODELS

In [217]:
# Load the monthly ("month-models") results into a dict of dicts of dicts:
# dataset (--> ghc and reddit) -> model type (sub-directory name) -> result file stem -> DataFrame.
# The "gq" dataset and any "z_old" model-type directory are excluded.

results = {}
directory = '../../0_results/classification'
for dataset_name in os.listdir(directory):
    dataset_dir = os.path.join(directory, dataset_name)
    # os.listdir also returns plain files (e.g. .DS_Store); only directories are datasets
    if dataset_name == "gq" or not os.path.isdir(dataset_dir):
        continue
    print(f"loading {dataset_name} results to sub-dict")
    results[dataset_name] = {}
    month_models_dir = os.path.join(dataset_dir, "month-models")
    for model_type in os.listdir(month_models_dir):
        model_type_dir = os.path.join(month_models_dir, model_type)
        if model_type == "z_old" or not os.path.isdir(model_type_dir):
            continue
        print(f"  {model_type} to sub-sub-dict")
        results[dataset_name][model_type] = {}
        for result_csv in os.listdir(model_type_dir):
            if result_csv.endswith(".csv"):
                results[dataset_name][model_type][os.path.splitext(result_csv)[0]] = \
                pd.read_csv(os.path.join(model_type_dir, result_csv))

loading ghc results to sub-dict
  rand+rand to sub-sub-dict
  month+rand to sub-sub-dict
  base+month to sub-sub-dict
  month+month to sub-sub-dict
  base+rand to sub-sub-dict
  rand+month to sub-sub-dict
loading reddit results to sub-dict
  rand+rand to sub-sub-dict
  month+rand to sub-sub-dict
  base+month to sub-sub-dict
  month+month to sub-sub-dict
  base+rand to sub-sub-dict
  rand+month to sub-sub-dict


In [136]:
def plot_F1_monthly(input_df, REDDIT_FILTER=0, F1_TYPE = "macro", YEAR_FILTER = ["2017", "2018", "2019", "2020"]):
    """Display a table of F1 scores with one row per model and one column per test set.

    Parameters
    ----------
    input_df : dict
        Maps a result name to a DataFrame with 'label' and 'prediction' columns.
        Each name is split at its first '-test': the part before it is the
        model name, the part after it identifies the test set.
    REDDIT_FILTER : int
        If non-zero, only result names containing f"_{REDDIT_FILTER}k-test" are used.
    F1_TYPE : str
        Passed to `f1_score` as `average`.
    YEAR_FILTER : list of str
        A model is shown only if its name contains "_<year>" for one of these
        years; likewise a test set is shown only if its suffix does.

    Returns
    -------
    None
        The styled table (green gradient per column, values as percentages)
        is displayed as a side effect.

    Raises
    ------
    ValueError
        If a selected result name contains no '-test' segment.
    """

    def matches_year(text):
        return any("_" + year in text for year in YEAR_FILTER)

    # calculate f1 and split each result name once into (model, test-set suffix)
    scores = {}
    parsed = {}
    for key in input_df:
        if REDDIT_FILTER != 0 and f"_{REDDIT_FILTER}k-test" not in key:
            continue
        match = re.search(r'(.*?)-test(.*)', key)
        if match is None:
            raise ValueError(f"result name {key!r} has no '-test' segment")
        parsed[key] = (match.group(1), match.group(2))
        scores[key] = f1_score(input_df[key]['label'], input_df[key]['prediction'], average=F1_TYPE)

    # Group by the *exact* model name; a substring test (`model in key`) can
    # attribute a result to a different model whose name happens to be contained in the key.
    score_series = {}
    for key in sorted(scores):
        model, test_suffix = parsed[key]
        if not matches_year(model):
            continue
        # create the entry even if no test set passes the year filter
        model_scores = score_series.setdefault(model, {})
        if matches_year(test_suffix):
            model_scores["test" + test_suffix] = scores[key]

    # models in sorted order, independent of how the full result names sort
    score_series = {model: score_series[model] for model in sorted(score_series)}

    # index = test sets (axis labelled 'model', as before), columns = models
    plot_df = pd.DataFrame.from_dict(score_series).rename_axis('model')

    cm = sns.color_palette('Greens', as_cmap=True)

    # transposed for display: models in rows, test sets in columns
    display(plot_df.T.style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}'))


# Reddit "rand+rand" results: only names containing "_20k-test", restricted to year 2017
plot_F1_monthly(
    input_df=results["reddit"]["rand+rand"],
    REDDIT_FILTER=20,
    YEAR_FILTER=["2017"],
)

# Results by Month: MODEL COMPARISON

In [199]:
def model_comparison_matchmonths(input_dict, F1_TYPE = "macro", REDDIT_FILTER = 0):
    """Display F1 per test month for each model type, plus the average across months.

    Parameters
    ----------
    input_dict : dict
        Maps a model type (e.g. 'base+rand', 'month+month') to a dict that maps
        result names to DataFrames with 'label' and 'prediction' columns.
        Year and month are read from the 'test_<year>_<month>_' part of each
        result name, and where needed from its 'train_<year>_<month>_' or
        'bert-<year>_<month>_' part.
    F1_TYPE : str
        Passed to `f1_score` as `average`.
    REDDIT_FILTER : int
        If non-zero, only result names containing f"_{REDDIT_FILTER}k-test" are used.

    Returns
    -------
    None
        Two styled tables are displayed: months x model types, and the mean of
        each model type across months.

    Raises
    ------
    ValueError
        If a result name lacks a segment that its model type requires.

    Notes
    -----
    - Model types containing '+month' keep only results whose train month equals
      the test month. NOTE(review): for 'month+month' the month in the 'bert-'
      prefix is not compared -- confirm this is intended.
    - Other model types containing 'month+rand' keep only results whose 'bert-'
      month equals the test month.
    - All remaining model types keep every result, keyed by test month.
    """
    train_pattern = re.compile(r'(.*?)train_(.*?)_(.*?)_(.*?)')
    test_pattern = re.compile(r'(.*?)test_(.*?)_(.*?)_(.*?)')
    bert_pattern = re.compile(r'bert-(.*?)_(.*?)_(.*?)')

    def year_month(pattern, key, first_group):
        # (year, month) captured by two consecutive groups of `pattern`
        match = pattern.search(key)
        if match is None:
            raise ValueError(f"result name {key!r} does not match {pattern.pattern!r}")
        return match.group(first_group), match.group(first_group + 1)

    # `cm` used to be read from the notebook's global namespace, where the visible
    # cells never define it; build the colormap locally so a fresh kernel works.
    cm = sns.color_palette('Greens', as_cmap=True)

    # write series of scores for each model type, only for test = train/adapt month where applicable
    score_series = {}

    for model_type in input_dict:
        model_scores = score_series[model_type] = {}

        for key in input_dict[model_type]:
            if REDDIT_FILTER != 0 and f"_{REDDIT_FILTER}k-test" not in key:
                continue

            test_month = year_month(test_pattern, key, 2)

            if "+month" in model_type:
                keep = year_month(train_pattern, key, 2) == test_month
            elif "month+rand" in model_type:
                keep = year_month(bert_pattern, key, 1) == test_month
            else:
                keep = True

            if keep:
                frame = input_dict[model_type][key]
                model_scores["_".join(test_month)] = f1_score(frame['label'], frame['prediction'], average=F1_TYPE)

    month_results_df = pd.DataFrame.from_dict(score_series, orient = "index").T

    # fixed presentation order; a model type without any result is left out
    # instead of raising KeyError
    column_order = ['base+rand', 'rand+rand', 'month+rand', 'base+month', 'rand+month', 'month+month']
    month_results_df = month_results_df[[column for column in column_order if column in month_results_df.columns]]
    month_results_df = month_results_df.sort_index()

    display(month_results_df.style.background_gradient(cmap=cm, axis = 1 ).format('{0:,.2%}'))

    display(pd.DataFrame(month_results_df.mean(axis=0)).rename(columns={0: f"avg. {F1_TYPE} F1 across months"}).style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}'))

In [204]:

# Reddit, restricted to result names containing "_20k-test"
print("REDDIT, 20K")
model_comparison_matchmonths(
    input_dict=results["reddit"],
    F1_TYPE="macro",
    REDDIT_FILTER=20,
)
print()

# Reddit, restricted to result names containing "_2k-test"
print("REDDIT, 2K")
model_comparison_matchmonths(
    input_dict=results["reddit"],
    F1_TYPE="macro",
    REDDIT_FILTER=2,
)
print()

# GHC, no filter on the result names
print("GHC")
model_comparison_matchmonths(
    input_dict=results["ghc"],
    F1_TYPE="macro",
)

REDDIT, 20K


Unnamed: 0,base+rand,rand+rand,month+rand,base+month,rand+month,month+month
2017_03,45.75%,45.85%,46.29%,49.45%,50.57%,51.03%
2017_04,46.28%,46.68%,47.04%,49.43%,50.03%,50.29%
2017_05,45.08%,46.39%,45.28%,50.01%,49.71%,49.84%
2017_06,45.43%,45.93%,46.70%,48.60%,48.97%,49.07%
2017_07,46.95%,47.27%,47.35%,49.55%,50.09%,49.94%
2017_08,44.45%,44.11%,44.75%,47.12%,47.54%,47.77%
2017_09,45.27%,45.42%,45.74%,47.92%,48.35%,48.70%
2017_10,44.31%,45.41%,45.58%,46.51%,48.09%,47.29%
2017_11,45.81%,45.33%,45.32%,47.27%,48.84%,48.72%
2017_12,44.25%,44.47%,45.02%,46.43%,47.14%,47.39%


Unnamed: 0,avg. macro F1 across months
base+rand,43.21%
rand+rand,43.84%
month+rand,43.81%
base+month,45.41%
rand+month,46.02%
month+month,46.12%



REDDIT, 2K


Unnamed: 0,base+rand,rand+rand,month+rand,base+month,rand+month,month+month
2017_03,37.01%,41.49%,40.29%,40.67%,43.00%,43.51%
2017_04,37.04%,40.48%,40.66%,40.25%,44.54%,43.87%
2017_05,38.21%,40.56%,40.59%,40.29%,42.88%,43.96%
2017_06,37.13%,40.36%,40.65%,41.14%,42.45%,42.40%
2017_07,38.16%,42.71%,42.65%,42.74%,44.65%,45.15%
2017_08,36.40%,39.16%,39.69%,38.48%,41.79%,42.77%
2017_09,37.84%,40.96%,41.19%,39.85%,43.27%,43.72%
2017_10,37.20%,40.31%,40.53%,38.93%,42.29%,42.78%
2017_11,38.13%,40.95%,40.36%,40.09%,41.67%,41.61%
2017_12,36.63%,40.17%,39.29%,37.63%,40.38%,40.82%


Unnamed: 0,avg. macro F1 across months
base+rand,35.95%
rand+rand,39.11%
month+rand,39.01%
base+month,37.50%
rand+month,40.19%
month+month,40.38%



GHC


Unnamed: 0,base+rand,rand+rand,month+rand,base+month,rand+month,month+month
2018_01,69.57%,71.15%,70.64%,73.92%,75.78%,77.08%
2018_02,69.43%,65.34%,64.06%,69.21%,71.10%,71.57%
2018_03,67.03%,70.19%,66.87%,64.98%,72.61%,71.85%
2018_04,65.80%,63.08%,65.69%,68.62%,66.24%,70.08%
2018_05,60.03%,60.52%,56.98%,68.87%,63.08%,64.57%
2018_06,65.38%,66.87%,69.67%,66.17%,66.17%,66.59%
2018_07,69.09%,73.21%,70.15%,74.44%,73.06%,72.55%
2018_08,63.03%,70.64%,67.72%,72.67%,69.21%,71.63%
2018_09,58.11%,54.01%,57.25%,66.70%,59.71%,68.62%
2018_10,73.59%,71.63%,80.14%,69.35%,73.51%,74.22%


Unnamed: 0,avg. macro F1 across months
base+rand,66.11%
rand+rand,66.66%
month+rand,66.92%
base+month,69.49%
rand+month,69.05%
month+month,70.88%


In [216]:
# separate analysis for GQ, which is entirely from October 2018

results["gq"] = {}

# load data: every CSV in every sub-directory of the gq folder, keyed by file stem
directory = '../../0_results/classification/gq'
for split in os.listdir(directory):
    split_dir = os.path.join(directory, split)
    # os.listdir also returns plain files (e.g. .DS_Store); only directories hold results
    if not os.path.isdir(split_dir):
        continue
    for result_csv in os.listdir(split_dir):
        if result_csv.endswith(".csv"):
            results["gq"][os.path.splitext(result_csv)[0]] = pd.read_csv(os.path.join(split_dir, result_csv))

# calculate f1 scores --> only for models finetuned on same amount of data (26k)
F1_TYPE="macro"

scores = {}

for key in results["gq"]:
    if "train_rand_26k" in key:
        # score is keyed by the model name, i.e. the part of the result name before "-train"
        scores[re.search(r"(.*?)-train(.*?)",key).group(1)] = f1_score(results["gq"][key]['label'], results["gq"][key]['prediction'], average=F1_TYPE)

# In the visible cells `cm` only exists as a local variable inside functions,
# so define the colormap here to keep this cell runnable on a fresh kernel.
cm = sns.color_palette('Greens', as_cmap=True)

plot_df = pd.DataFrame.from_dict(scores, orient="index")
plot_df = plot_df.sort_index()
plot_df.rename(columns={0: f"{F1_TYPE} F1"}).style.background_gradient(cmap=cm, axis = 0 ).format('{0:,.2%}')

Unnamed: 0,macro F1
bert-2018_01_1m,91.82%
bert-2018_02_1m,91.82%
bert-2018_03_1m,91.85%
bert-2018_04_1m,91.69%
bert-2018_05_1m,91.75%
bert-2018_06_1m,91.73%
bert-2018_07_1m,91.72%
bert-2018_08_1m,91.78%
bert-2018_09_1m,91.80%
bert-2018_10_1m,92.18%
