In [1]:
import wandb
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import re

# Loading the data

In [2]:
api = wandb.Api(timeout=30)

entity = "mosaic-ml"
project = "paper-mlm-schedule"

min_perf = 0.3

In [3]:
task_to_metric = {
    "cola": "metrics/glue_cola/MulticlassMatthewsCorrCoef",
    "mnli": ["metrics/glue_mnli/MulticlassAccuracy", "metrics/glue_mnli_mismatched/MulticlassAccuracy"],
    "mrpc": ["metrics/glue_mrpc/BinaryF1Score", "metrics/glue_mrpc/MulticlassAccuracy"],
    "qnli": "metrics/glue_qnli/MulticlassAccuracy",
    "qqp": ["metrics/glue_qqp/BinaryF1Score", "metrics/glue_qqp/MulticlassAccuracy"],
    "rte": "metrics/glue_rte/MulticlassAccuracy",
    "sst-2": "metrics/glue_sst2/MulticlassAccuracy",
    "stsb": "metrics/glue_stsb/SpearmanCorrCoef"
}
tasks = [
    'mnli',
    'qnli',
    'qqp',
    'cola',
    'sst-2',
    'rte',
    'mrpc',
    'stsb',
]
tasks_formatted = [
    "cola/MulticlassMatthewsCorrCoef",
    "mnli/MulticlassAccuracy",
    "mnli_mismatched/MulticlassAccuracy",
    "mrpc/BinaryF1Score",
    "mrpc/MulticlassAccuracy",
    "qnli/MulticlassAccuracy",
    "qqp/BinaryF1Score",
    "qqp/MulticlassAccuracy",
    "rte/MulticlassAccuracy",
    "sst2/MulticlassAccuracy",
    "stsb/SpearmanCorrCoef"
]

In [4]:
min_task_acc = {
    "cola/MulticlassMatthewsCorrCoef": 45,
    "mnli/MulticlassAccuracy": 75,
    "mnli_mismatched/MulticlassAccuracy": 75,
    "mrpc/BinaryF1Score": 82,
    "mrpc/MulticlassAccuracy": 75,
    "qnli/MulticlassAccuracy": 80,
    "qqp/BinaryF1Score": 75,
    "qqp/MulticlassAccuracy": 80,
    "rte/MulticlassAccuracy": 65,
    "sst2/MulticlassAccuracy": 80,
    "stsb/SpearmanCorrCoef": 75,
}

In [6]:
MODEL_SIZE = "base"

In [7]:
def get_runs(skip_ids=[]):
    run_lookup = {}
    runs = api.runs("mosaic-ml/paper-mlm-schedule", filters={
        "$and": [{'tags': "best-ckpt"}, {"tags": f"bert-{MODEL_SIZE}"}, {"tags": "glue"}]})
    for run in tqdm(runs):
        if run.state != "finished":
            continue
        
        if run.id in skip_ids:
            continue

        run_name = run.name
        task = [s for s in run_name.split("_") if "task=" in s][0].split("=")[1]
        tags = run.tags
        group = run.group

        # parsing group to speed up
        # subset_runs = [f"subset-linear-initial-0.3-final-0.15-og-seed-{seed}-ckpt-70000" for seed in [17, 3047, 2048]]
        # linear_runs = [f"linear-initial-0.3-final-0.15-og-seed-{seed}" for seed in [17, 3047, 2048]]
        # constant_runs = [f"constant-initial-0.15-final-0.15-og-seed-{seed}" for seed in [17, 3047, 2048]]
        # if group not in subset_runs + linear_runs + constant_runs:
        #     continue
        if "large-" in group:
            group = group.replace("large-", "")
        if "schedule-" in group:
            group = group.replace("schedule-", "")
        experiment_name = group.split("-seed-")[0]
        if "subset" in group:
            scheduler = experiment_name.split("-")[1]
        else:
            scheduler = experiment_name.split("-")[0]
        init_rate = [float(t.split("-")[1]) for t in tags if "initial" in t][0]
        final_rate = [float(t.split("-")[1]) for t in tags if "final" in t][0]
        experiment_name = f"{scheduler}-{init_rate}-{final_rate}"
        if "subset" in group:
            experiment_name = "subset-" + experiment_name
        pretrain_seed = int(re.search(r"seed-(\d+)", group).group(1))
        glue_seed = int(re.search(r"seed=(\d+)", run_name).group(1))
                
        if task == 'sst2':
            task = 'sst-2'
        
        if task not in tasks:
            raise ValueError(f"Task {task} not recognized.")
        
        try:
            metric_names = task_to_metric[task]
            if isinstance(metric_names, str):
                metric_names = [metric_names]
            elif not isinstance(metric_names, list):
                raise Exception("Unsupported type for 'metric_name'")

            for metric_idx, metric_name in enumerate(metric_names):
                metric_hist = run.history(keys=[metric_name]).to_numpy()[:, 1:].mean(axis=1)
                final_metric = max(metric_hist)
                
                # if final_metric < min_perf:
                #     continue
                
                metric_task = metric_name.replace("metrics/glue_", "")
                run_lookup[run.id + f"-{metric_idx}"] = {
                    'task': metric_task,
                    'experiment_name': experiment_name,
                    'final_metric': 100 * final_metric,
                    'pretrain_seed': pretrain_seed,
                    "glue_seed": glue_seed,
                    'scheduler': scheduler,
                    "init_rate": init_rate,
                    "final_rate": final_rate
                }
        except:
            print(f"Error for run: {group} with id ({run.id})")
                
    return run_lookup

In [8]:
def save_runs(save_name, runs):
    with open(save_name, 'wb') as f:
        pickle.dump(runs, f)

def load_runs(save_name):
    with open(save_name, 'rb') as f:
        runs = pickle.load(f)
    return runs

In [9]:
load_path = f"glue/bert-{MODEL_SIZE}-results.pkl"
save_path = f"glue/bert-{MODEL_SIZE}-results-prime.pkl"

old_run_ids = set(load_runs(load_path).keys())
# old_run_ids = set()

run_lookup = get_runs(skip_ids=old_run_ids)

save_runs(save_path, run_lookup)

100%|██████████| 2520/2520 [25:08<00:00,  1.67it/s]  


In [10]:
run_lookup = load_runs(save_path)

In [32]:
columns=["task", "experiment_name", "final_metric", "pretrain_seed",
         "glue_seed", "scheduler", "init_rate", "final_rate"]
results = []
for run_info in run_lookup.values():
    if run_info is None:
        continue
    if run_info["final_metric"] < min_task_acc[run_info["task"]]:
        continue
    results.append([run_info[c] for c in columns])
base_df = pd.DataFrame(results, columns = columns)
base_df[["final_metric", 'pretrain_seed', "glue_seed", "init_rate", "final_rate"]] = base_df[["final_metric", 'pretrain_seed', "glue_seed", "init_rate", "final_rate"]].apply(pd.to_numeric)
base_df = base_df.sort_values(by=['experiment_name'], ascending=False)

base_df

Unnamed: 0,task,experiment_name,final_metric,pretrain_seed,glue_seed,scheduler,init_rate,final_rate
247,qqp/MulticlassAccuracy,subset-linear-0.3-0.15,91.251546,17,10536,linear,0.30,0.15
204,mrpc/BinaryF1Score,subset-linear-0.3-0.15,92.280704,17,19,linear,0.30,0.15
222,rte/MulticlassAccuracy,subset-linear-0.3-0.15,76.173288,3047,717,linear,0.30,0.15
221,rte/MulticlassAccuracy,subset-linear-0.3-0.15,78.700364,3047,19,linear,0.30,0.15
220,mrpc/MulticlassAccuracy,subset-linear-0.3-0.15,87.990195,3047,8364,linear,0.30,0.15
...,...,...,...,...,...,...,...,...
3164,stsb/SpearmanCorrCoef,constant-0.15-0.15,89.078087,17,90166,constant,0.15,0.15
3160,stsb/SpearmanCorrCoef,constant-0.15-0.15,89.544421,3047,8364,constant,0.15,0.15
3279,qqp/BinaryF1Score,constant-0.15-0.15,88.270420,2048,717,constant,0.15,0.15
3280,qqp/MulticlassAccuracy,constant-0.15-0.15,91.254020,2048,717,constant,0.15,0.15


# Sanity check data

In [33]:
for task in tasks_formatted:
    print(task)
    print(base_df[base_df.task==task].experiment_name.value_counts())
    print(' ')

cola/MulticlassMatthewsCorrCoef
experiment_name
linear-0.3-0.15           30
subset-linear-0.3-0.15    15
step-0.3-0.15             15
constant-0.2-0.2          15
constant-0.25-0.25        15
constant-0.3-0.3          15
constant-0.35-0.35        15
constant-0.4-0.4          15
cosine-0.3-0.15           15
linear-0.15-0.3           15
linear-0.3-0.2            15
linear-0.3-0.35           15
linear-0.3-0.4            15
linear-0.3-0.45           15
linear-0.35-0.15          15
linear-0.35-0.2           15
linear-0.35-0.25          15
linear-0.35-0.3           15
constant-0.15-0.15        15
linear-0.3-0.25           14
Name: count, dtype: int64
 
mnli/MulticlassAccuracy
experiment_name
linear-0.3-0.15           29
subset-linear-0.3-0.15    15
step-0.3-0.15             15
constant-0.2-0.2          15
constant-0.25-0.25        15
constant-0.3-0.3          15
constant-0.35-0.35        15
constant-0.4-0.4          15
cosine-0.3-0.15           15
linear-0.3-0.2            15
linear-0.3-0.3

In [34]:
from scipy.stats import ttest_ind

In [35]:
# base_df = base_df[base_df["glue_seed"] != 8364]
# base_df = base_df[base_df["pretrain_seed"] != 17]

In [36]:
metric_stand_err = base_df.groupby(["experiment_name", "task"])["final_metric"].sem().reset_index()
metric_stand_err
grouped_df = base_df.groupby(["experiment_name", "task"]).mean(numeric_only=True).reset_index()
grouped_df["error"] = metric_stand_err["final_metric"]
grouped_df = grouped_df.round({'final_metric': 2, 'error': 2})
grouped_df

Unnamed: 0,experiment_name,task,final_metric,pretrain_seed,glue_seed,init_rate,final_rate,error
0,constant-0.15-0.15,cola/MulticlassMatthewsCorrCoef,55.89,1704.0,21960.4,0.15,0.15,0.45
1,constant-0.15-0.15,mnli/MulticlassAccuracy,84.30,1704.0,21960.4,0.15,0.15,0.06
2,constant-0.15-0.15,mnli_mismatched/MulticlassAccuracy,84.71,1704.0,21960.4,0.15,0.15,0.04
3,constant-0.15-0.15,mrpc/BinaryF1Score,91.94,1704.0,21960.4,0.15,0.15,0.16
4,constant-0.15-0.15,mrpc/MulticlassAccuracy,88.69,1704.0,21960.4,0.15,0.15,0.24
...,...,...,...,...,...,...,...,...
215,subset-linear-0.3-0.15,qqp/BinaryF1Score,88.31,1704.0,21960.4,0.30,0.15,0.03
216,subset-linear-0.3-0.15,qqp/MulticlassAccuracy,91.31,1704.0,21960.4,0.30,0.15,0.03
217,subset-linear-0.3-0.15,rte/MulticlassAccuracy,74.37,1704.0,21960.4,0.30,0.15,0.68
218,subset-linear-0.3-0.15,sst2/MulticlassAccuracy,92.72,1704.0,21960.4,0.30,0.15,0.07


In [37]:
grouped_df.pivot(index="experiment_name", columns="task", values="final_metric")

task,cola/MulticlassMatthewsCorrCoef,mnli/MulticlassAccuracy,mnli_mismatched/MulticlassAccuracy,mrpc/BinaryF1Score,mrpc/MulticlassAccuracy,qnli/MulticlassAccuracy,qqp/BinaryF1Score,qqp/MulticlassAccuracy,rte/MulticlassAccuracy,sst2/MulticlassAccuracy,stsb/SpearmanCorrCoef
experiment_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
constant-0.15-0.15,55.89,84.3,84.71,91.94,88.69,90.38,88.31,91.31,76.65,92.91,89.38
constant-0.2-0.2,56.45,84.46,84.95,91.63,88.3,90.64,88.24,91.25,76.73,92.59,89.6
constant-0.25-0.25,56.74,84.28,84.79,92.06,89.05,90.61,88.3,91.31,76.27,92.54,89.84
constant-0.3-0.3,57.24,84.5,84.83,92.18,89.2,90.82,88.31,91.31,76.56,92.79,89.85
constant-0.35-0.35,55.62,84.4,84.99,91.67,88.45,90.84,88.31,91.32,77.81,92.86,89.88
constant-0.4-0.4,55.74,84.14,84.67,91.75,88.5,90.6,88.32,91.32,75.76,92.47,89.93
cosine-0.3-0.15,57.45,84.55,84.97,91.94,88.91,90.94,88.39,91.38,77.67,92.91,89.64
linear-0.15-0.3,56.46,84.31,84.74,91.96,88.81,90.62,88.28,91.3,75.28,92.74,89.31
linear-0.3-0.15,58.4,84.64,85.16,91.93,88.82,90.85,88.32,91.31,76.43,92.7,89.86
linear-0.3-0.2,57.29,84.57,84.89,91.38,87.84,90.87,88.33,91.32,77.04,92.84,89.78


In [38]:
metrics_with_errors = []
for task_name in tasks_formatted:
    metrics_with_errors.append(task_name)
    metrics_with_errors.append(task_name + " STE")
metrics_with_errors

['cola/MulticlassMatthewsCorrCoef',
 'cola/MulticlassMatthewsCorrCoef STE',
 'mnli/MulticlassAccuracy',
 'mnli/MulticlassAccuracy STE',
 'mnli_mismatched/MulticlassAccuracy',
 'mnli_mismatched/MulticlassAccuracy STE',
 'mrpc/BinaryF1Score',
 'mrpc/BinaryF1Score STE',
 'mrpc/MulticlassAccuracy',
 'mrpc/MulticlassAccuracy STE',
 'qnli/MulticlassAccuracy',
 'qnli/MulticlassAccuracy STE',
 'qqp/BinaryF1Score',
 'qqp/BinaryF1Score STE',
 'qqp/MulticlassAccuracy',
 'qqp/MulticlassAccuracy STE',
 'rte/MulticlassAccuracy',
 'rte/MulticlassAccuracy STE',
 'sst2/MulticlassAccuracy',
 'sst2/MulticlassAccuracy STE',
 'stsb/SpearmanCorrCoef',
 'stsb/SpearmanCorrCoef STE']

In [39]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
#pd.set_option('max_colwidth', -1)

In [40]:
BASELINE_RUN = "constant-0.15-0.15"

In [41]:
task_grouped = []
for experiment_name in grouped_df["experiment_name"]:
    experiment_grouped = [experiment_name]
    for metric_name in tasks_formatted:
        print(experiment_name, metric_name)
        performance = grouped_df[
            (grouped_df["experiment_name"] == experiment_name) & (grouped_df["task"] == metric_name)]["final_metric"].values[0]

        error = grouped_df[
            (grouped_df["experiment_name"] == experiment_name) & (grouped_df["task"] == metric_name)]["error"].values[0]
        
        experiment_grouped.append(performance)
        experiment_grouped.append(error)
        
    task_grouped.append(experiment_grouped)

task_grouped_df = pd.DataFrame(task_grouped, columns=["experiment_name"]+metrics_with_errors)
task_grouped_df = task_grouped_df.drop_duplicates()
task_grouped_df["glue_mean"] = task_grouped_df[tasks_formatted].mean(axis=1)
constant_avg = task_grouped_df[
    task_grouped_df["experiment_name"] == BASELINE_RUN]["glue_mean"].values[0]
task_grouped_df["glue_delta"] = task_grouped_df["glue_mean"].map(lambda avg: avg - constant_avg)
task_grouped_df = task_grouped_df.sort_values(by=["glue_mean"], ascending=False)
task_grouped_df

constant-0.15-0.15 cola/MulticlassMatthewsCorrCoef
constant-0.15-0.15 mnli/MulticlassAccuracy
constant-0.15-0.15 mnli_mismatched/MulticlassAccuracy
constant-0.15-0.15 mrpc/BinaryF1Score
constant-0.15-0.15 mrpc/MulticlassAccuracy
constant-0.15-0.15 qnli/MulticlassAccuracy
constant-0.15-0.15 qqp/BinaryF1Score
constant-0.15-0.15 qqp/MulticlassAccuracy
constant-0.15-0.15 rte/MulticlassAccuracy
constant-0.15-0.15 sst2/MulticlassAccuracy
constant-0.15-0.15 stsb/SpearmanCorrCoef
constant-0.15-0.15 cola/MulticlassMatthewsCorrCoef
constant-0.15-0.15 mnli/MulticlassAccuracy
constant-0.15-0.15 mnli_mismatched/MulticlassAccuracy
constant-0.15-0.15 mrpc/BinaryF1Score
constant-0.15-0.15 mrpc/MulticlassAccuracy
constant-0.15-0.15 qnli/MulticlassAccuracy
constant-0.15-0.15 qqp/BinaryF1Score
constant-0.15-0.15 qqp/MulticlassAccuracy
constant-0.15-0.15 rte/MulticlassAccuracy
constant-0.15-0.15 sst2/MulticlassAccuracy
constant-0.15-0.15 stsb/SpearmanCorrCoef
constant-0.15-0.15 cola/MulticlassMatthewsCorr

Unnamed: 0,experiment_name,cola/MulticlassMatthewsCorrCoef,cola/MulticlassMatthewsCorrCoef STE,mnli/MulticlassAccuracy,mnli/MulticlassAccuracy STE,mnli_mismatched/MulticlassAccuracy,mnli_mismatched/MulticlassAccuracy STE,mrpc/BinaryF1Score,mrpc/BinaryF1Score STE,mrpc/MulticlassAccuracy,mrpc/MulticlassAccuracy STE,qnli/MulticlassAccuracy,qnli/MulticlassAccuracy STE,qqp/BinaryF1Score,qqp/BinaryF1Score STE,qqp/MulticlassAccuracy,qqp/MulticlassAccuracy STE,rte/MulticlassAccuracy,rte/MulticlassAccuracy STE,sst2/MulticlassAccuracy,sst2/MulticlassAccuracy STE,stsb/SpearmanCorrCoef,stsb/SpearmanCorrCoef STE,glue_mean,glue_delta
66,cosine-0.3-0.15,57.45,0.43,84.55,0.08,84.97,0.06,91.94,0.12,88.91,0.17,90.94,0.04,88.39,0.03,91.38,0.02,77.67,0.23,92.91,0.1,89.64,0.1,85.340909,0.389091
88,linear-0.3-0.15,58.4,0.25,84.64,0.03,85.16,0.04,91.93,0.12,88.82,0.16,90.85,0.05,88.32,0.02,91.31,0.01,76.43,0.16,92.7,0.06,89.86,0.05,85.310909,0.359091
154,linear-0.35-0.15,56.92,0.25,84.61,0.06,84.96,0.08,92.23,0.19,89.22,0.26,91.11,0.07,88.35,0.02,91.35,0.02,76.8,0.42,92.85,0.07,89.92,0.08,85.301818,0.35
165,linear-0.35-0.2,57.37,0.2,84.34,0.05,85.0,0.04,91.67,0.11,88.51,0.14,91.03,0.06,88.26,0.04,91.28,0.03,77.5,0.32,92.9,0.08,89.99,0.1,85.259091,0.307273
33,constant-0.3-0.3,57.24,0.34,84.5,0.04,84.83,0.04,92.18,0.19,89.2,0.24,90.82,0.1,88.31,0.03,91.31,0.02,76.56,0.35,92.79,0.11,89.85,0.09,85.235455,0.283636
198,step-0.3-0.15,57.47,0.51,84.65,0.06,85.09,0.04,91.56,0.19,88.14,0.27,90.85,0.07,88.37,0.03,91.37,0.02,77.71,0.26,92.76,0.1,89.59,0.07,85.232727,0.280909
176,linear-0.35-0.25,56.21,0.41,84.61,0.06,85.02,0.06,91.69,0.15,88.46,0.2,91.05,0.07,88.29,0.04,91.32,0.03,77.57,0.26,93.09,0.09,89.87,0.08,85.198182,0.246364
110,linear-0.3-0.25,57.33,0.29,84.63,0.05,84.93,0.05,92.02,0.27,88.9,0.41,90.84,0.06,88.33,0.04,91.33,0.03,76.1,0.24,92.84,0.1,89.19,0.56,85.130909,0.179091
99,linear-0.3-0.2,57.29,0.43,84.57,0.05,84.89,0.04,91.38,0.22,87.84,0.35,90.87,0.03,88.33,0.04,91.32,0.03,77.04,0.21,92.84,0.05,89.78,0.08,85.104545,0.152727
44,constant-0.35-0.35,55.62,0.36,84.4,0.05,84.99,0.05,91.67,0.25,88.45,0.32,90.84,0.07,88.31,0.03,91.32,0.02,77.81,0.38,92.86,0.09,89.88,0.09,85.104545,0.152727


In [42]:
task_grouped_df[["experiment_name", "glue_delta"]]

Unnamed: 0,experiment_name,glue_delta
66,cosine-0.3-0.15,0.389091
88,linear-0.3-0.15,0.359091
154,linear-0.35-0.15,0.35
165,linear-0.35-0.2,0.307273
33,constant-0.3-0.3,0.283636
198,step-0.3-0.15,0.280909
176,linear-0.35-0.25,0.246364
110,linear-0.3-0.25,0.179091
99,linear-0.3-0.2,0.152727
44,constant-0.35-0.35,0.152727


In [43]:
paper_just_metrics = ["mnli/MulticlassAccuracy", "mnli_mismatched/MulticlassAccuracy", "qnli/MulticlassAccuracy",
                 "qqp/BinaryF1Score", "rte/MulticlassAccuracy", "sst2/MulticlassAccuracy",
                 "mrpc/BinaryF1Score", "cola/MulticlassMatthewsCorrCoef", "stsb/SpearmanCorrCoef"
                ]
# paper_just_metrics = ["mnli/MulticlassAccuracy", "mnli_mismatched/MulticlassAccuracy", "qnli/MulticlassAccuracy",
#                  "qqp/BinaryF1Score", "sst2/MulticlassAccuracy",
#                  "mrpc/BinaryF1Score", "cola/MulticlassMatthewsCorrCoef", "stsb/SpearmanCorrCoef"
#                 ]
paper_just_errors = []
paper_metrics = []
for metric_name in paper_just_metrics:
    paper_metrics.append(metric_name)
    paper_metrics.append(metric_name + " STE")
    paper_just_errors.append(metric_name + " STE")

In [44]:
experiments = ["constant-0.15-0.15", "linear-0.15-0.3", "linear-0.3-0.15"]
# experiments = ["constant-0.4-0.4", "linear-0.4-0.25"]

In [45]:
paper_df = task_grouped_df[["experiment_name", *paper_metrics]]
paper_df["glue_mean"] = paper_df[paper_just_metrics].mean(axis=1)
paper_df["glue_ste"] = paper_df[paper_just_errors].mean(axis=1)
constant_avg = paper_df[
    paper_df["experiment_name"] == BASELINE_RUN]["glue_mean"].values[0]
#paper_df = paper_df[paper_df["experiment_name"].isin(experiments)]
paper_df["glue_delta"] = paper_df["glue_mean"].map(lambda avg: avg - constant_avg)
paper_df = paper_df.sort_values(by="glue_mean", ascending=False)
paper_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_df["glue_mean"] = paper_df[paper_just_metrics].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_df["glue_ste"] = paper_df[paper_just_errors].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_df["glue_delta"] = paper_df["glue_mean"].map(lambda avg: avg - consta

Unnamed: 0,experiment_name,mnli/MulticlassAccuracy,mnli/MulticlassAccuracy STE,mnli_mismatched/MulticlassAccuracy,mnli_mismatched/MulticlassAccuracy STE,qnli/MulticlassAccuracy,qnli/MulticlassAccuracy STE,qqp/BinaryF1Score,qqp/BinaryF1Score STE,rte/MulticlassAccuracy,rte/MulticlassAccuracy STE,sst2/MulticlassAccuracy,sst2/MulticlassAccuracy STE,mrpc/BinaryF1Score,mrpc/BinaryF1Score STE,cola/MulticlassMatthewsCorrCoef,cola/MulticlassMatthewsCorrCoef STE,stsb/SpearmanCorrCoef,stsb/SpearmanCorrCoef STE,glue_mean,glue_ste,glue_delta
66,cosine-0.3-0.15,84.55,0.08,84.97,0.06,90.94,0.04,88.39,0.03,77.67,0.23,92.91,0.1,91.94,0.12,57.45,0.43,89.64,0.1,84.273333,0.132222,0.443333
88,linear-0.3-0.15,84.64,0.03,85.16,0.04,90.85,0.05,88.32,0.02,76.43,0.16,92.7,0.06,91.93,0.12,58.4,0.25,89.86,0.05,84.254444,0.086667,0.424444
165,linear-0.35-0.2,84.34,0.05,85.0,0.04,91.03,0.06,88.26,0.04,77.5,0.32,92.9,0.08,91.67,0.11,57.37,0.2,89.99,0.1,84.228889,0.111111,0.398889
198,step-0.3-0.15,84.65,0.06,85.09,0.04,90.85,0.07,88.37,0.03,77.71,0.26,92.76,0.1,91.56,0.19,57.47,0.51,89.59,0.07,84.227778,0.147778,0.397778
154,linear-0.35-0.15,84.61,0.06,84.96,0.08,91.11,0.07,88.35,0.02,76.8,0.42,92.85,0.07,92.23,0.19,56.92,0.25,89.92,0.08,84.194444,0.137778,0.364444
176,linear-0.35-0.25,84.61,0.06,85.02,0.06,91.05,0.07,88.29,0.04,77.57,0.26,93.09,0.09,91.69,0.15,56.21,0.41,89.87,0.08,84.155556,0.135556,0.325556
33,constant-0.3-0.3,84.5,0.04,84.83,0.04,90.82,0.1,88.31,0.03,76.56,0.35,92.79,0.11,92.18,0.19,57.24,0.34,89.85,0.09,84.12,0.143333,0.29
99,linear-0.3-0.2,84.57,0.05,84.89,0.04,90.87,0.03,88.33,0.04,77.04,0.21,92.84,0.05,91.38,0.22,57.29,0.43,89.78,0.08,84.11,0.127778,0.28
44,constant-0.35-0.35,84.4,0.05,84.99,0.05,90.84,0.07,88.31,0.03,77.81,0.38,92.86,0.09,91.67,0.25,55.62,0.36,89.88,0.09,84.042222,0.152222,0.212222
110,linear-0.3-0.25,84.63,0.05,84.93,0.05,90.84,0.06,88.33,0.04,76.1,0.24,92.84,0.1,92.02,0.27,57.33,0.29,89.19,0.56,84.023333,0.184444,0.193333


In [46]:
paper_df[["experiment_name", "glue_mean", "glue_delta"]]

Unnamed: 0,experiment_name,glue_mean,glue_delta
66,cosine-0.3-0.15,84.273333,0.443333
88,linear-0.3-0.15,84.254444,0.424444
165,linear-0.35-0.2,84.228889,0.398889
198,step-0.3-0.15,84.227778,0.397778
154,linear-0.35-0.15,84.194444,0.364444
176,linear-0.35-0.25,84.155556,0.325556
33,constant-0.3-0.3,84.12,0.29
99,linear-0.3-0.2,84.11,0.28
44,constant-0.35-0.35,84.042222,0.212222
110,linear-0.3-0.25,84.023333,0.193333


# Statistical sig testing for overall fit

In [47]:
base_df

Unnamed: 0,task,experiment_name,final_metric,pretrain_seed,glue_seed,scheduler,init_rate,final_rate
247,qqp/MulticlassAccuracy,subset-linear-0.3-0.15,91.251546,17,10536,linear,0.30,0.15
204,mrpc/BinaryF1Score,subset-linear-0.3-0.15,92.280704,17,19,linear,0.30,0.15
222,rte/MulticlassAccuracy,subset-linear-0.3-0.15,76.173288,3047,717,linear,0.30,0.15
221,rte/MulticlassAccuracy,subset-linear-0.3-0.15,78.700364,3047,19,linear,0.30,0.15
220,mrpc/MulticlassAccuracy,subset-linear-0.3-0.15,87.990195,3047,8364,linear,0.30,0.15
...,...,...,...,...,...,...,...,...
3164,stsb/SpearmanCorrCoef,constant-0.15-0.15,89.078087,17,90166,constant,0.15,0.15
3160,stsb/SpearmanCorrCoef,constant-0.15-0.15,89.544421,3047,8364,constant,0.15,0.15
3279,qqp/BinaryF1Score,constant-0.15-0.15,88.270420,2048,717,constant,0.15,0.15
3280,qqp/MulticlassAccuracy,constant-0.15-0.15,91.254020,2048,717,constant,0.15,0.15


In [48]:
base_df[(base_df["experiment_name"] == "constant-0.15-0.15") & (base_df["task"] == "stsb/SpearmanCorrCoef") & (base_df["pretrain_seed"] == "17") & (base_df["glue_seed"] == "90166")]
base_df[(base_df["experiment_name"] == "constant-0.15-0.15") & (base_df["task"] == "stsb/SpearmanCorrCoef") & (base_df["pretrain_seed"] == 17) &   (base_df["glue_seed"] == 90166)]


Unnamed: 0,task,experiment_name,final_metric,pretrain_seed,glue_seed,scheduler,init_rate,final_rate
3164,stsb/SpearmanCorrCoef,constant-0.15-0.15,89.078087,17,90166,constant,0.15,0.15


In [49]:
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

def build_ttest(experiments):
    raw_results_grouped = {}
    for task in paper_just_metrics:
        task_results = {}
        for experiment_name in experiments:
            experiment_task_perfs = []
            # for pre_seed in [17, 2048, 3047]:
            for pre_seed in [17, 3047]:
                for glue_seed in [19, 8364, 717, 10536, 90166]:
                    try:
                        task_perf = base_df[
                            (base_df["experiment_name"] == experiment_name) & (base_df["task"] == task) & (base_df["pretrain_seed"] == pre_seed) & (base_df["glue_seed"] == glue_seed)]["final_metric"].values[0]
                        experiment_task_perfs.append(task_perf)
                    except Exception as e:
                        # print(e)
                        any_issue = True
                        continue
            task_results[experiment_name] = experiment_task_perfs
        raw_results_grouped[task] = task_results
        
    averages = {}
    for experiment_name in experiments:
        experiment_averages = []
        # for pre_seed in [17, 2048, 3047]:
        for pre_seed in [17, 3047]:
            for glue_seed in [19, 8364, 717, 10536, 90166]:
                any_issue = False
                run_perfs = []
                for task in paper_just_metrics:
                    try:
                        task_perf = base_df[
                            (base_df["experiment_name"] == experiment_name) & (base_df["task"] == task) & (base_df["pretrain_seed"] == pre_seed) & (base_df["glue_seed"] == glue_seed)]["final_metric"].values[0]
                        run_perfs.append(task_perf)
                    except Exception as e:
                        # print(e)
                        any_issue = True
                        continue
                if not any_issue:
                    experiment_averages.append(np.mean(run_perfs))
        averages[experiment_name] = experiment_averages
    raw_results_grouped["glue_mean"] = averages
    # print(raw_results_grouped)
    return raw_results_grouped

# Format the table for the paper

In [50]:
def print_latex_table(experiments):
    subset_paper_df = paper_df[paper_df["experiment_name"].isin(experiments)]
    best_experiment_per_task = {}
    for task in paper_just_metrics + ["glue_mean"]:
        best_experiment_per_task[task] = subset_paper_df[subset_paper_df[task] == subset_paper_df[task].max()]["experiment_name"].values[0]

    raw_results_grouped = build_ttest(experiments)

    sigs = {}
    for task in paper_just_metrics + ["glue_mean"]:
        best = best_experiment_per_task[task]
        pvals = []
        ordered_exp = []
        for experiment in experiments:
            if experiment == best:
                continue
            ordered_exp.append(experiment)
            sig = ttest_ind(raw_results_grouped[task][experiment], raw_results_grouped[task][best], alternative="less").pvalue
            pvals.append(sig)
        # not_reject = ~multipletests(pvals, method="fdr_bh")[0]
        sig_task = {}
        for exp, is_null in zip(ordered_exp, ~multipletests(pvals, method="fdr_bh")[0]):
            sig_task[exp] = is_null
        sigs[task] = sig_task

    for experiment in experiments:
        # print(experiment)
        latex_str = f"{experiment}\t& "
        for task in paper_just_metrics + ["glue_mean"]:
            # print(task)
            exp_perf = subset_paper_df[subset_paper_df["experiment_name"] == experiment][task].values[0]
            # is_not_sig = ttest_ind(raw_results_grouped[task][experiment], raw_results_grouped[task][best_experiment_per_task[task]], alternative="less").pvalue > 0.05
            if experiment == best_experiment_per_task[task] or sigs[task][experiment]:
                latex_str += r"\textbf{" + exp_perf.round(2).astype(str) + r"}"
            else:
                latex_str += exp_perf.round(2).astype(str)
            if task.split("/")[0] == "mnli":
                latex_str += "/"
            else:
                latex_str += "\t& "
        # latex_str += paper_df[paper_df["experiment_name"] == experiment]["glue_mean"].values[0].round(2).astype(str)
        latex_str = latex_str[:-2]  + "\t " + r"\\"
        print(latex_str)

In [51]:
print_latex_table(["constant-0.4-0.4", "linear-0.4-0.25"])

constant-0.4-0.4	& \textbf{84.14}/\textbf{84.67}	& \textbf{90.6}	& \textbf{88.32}	& \textbf{75.76}	& \textbf{92.47}	& \textbf{91.75}	& \textbf{55.74}	& \textbf{89.93}	& \textbf{83.71}		 \\


IndexError: index 0 is out of bounds for axis 0 with size 0

In [199]:
print_latex_table(["constant-0.15-0.15", "constant-0.3-0.3", "linear-0.3-0.15",
                   "linear-0.3-0.2", "linear-0.3-0.25", "linear-0.3-0.35",
                   "linear-0.3-0.4", "linear-0.3-0.45"
                   ])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& \textbf{88.31}	& \textbf{76.65}	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
constant-0.3-0.3	& 84.5/84.83	& \textbf{90.82}	& \textbf{88.31}	& \textbf{76.56}	& \textbf{92.79}	& \textbf{92.18}	& 57.24	& \textbf{89.85}	& 84.12		 \\
linear-0.3-0.15	& \textbf{84.61}/\textbf{85.13}	& \textbf{90.89}	& \textbf{88.34}	& 76.25	& \textbf{92.71}	& \textbf{91.87}	& \textbf{58.96}	& \textbf{89.87}	& \textbf{84.29}		 \\
linear-0.3-0.2	& \textbf{84.57}/84.89	& \textbf{90.87}	& \textbf{88.33}	& \textbf{77.04}	& \textbf{92.84}	& 91.38	& 57.29	& \textbf{89.78}	& 84.11		 \\
linear-0.3-0.25	& \textbf{84.63}/84.93	& \textbf{90.84}	& \textbf{88.33}	& 76.1	& \textbf{92.84}	& \textbf{92.02}	& 57.33	& \textbf{89.19}	& 84.02		 \\
linear-0.3-0.35	& 84.31/84.85	& \textbf{90.73}	& \textbf{88.28}	& \textbf{76.9}	& \textbf{92.91}	& \textbf{91.68}	& 55.85	& 89.7	& 83.91		 \\
linear-0.3-0.4	& 84.19/84.71	& \textbf{90.74}	& \textbf{88.31}	& \textbf{76.82}	& 92

In [52]:
print_latex_table(["constant-0.15-0.15", "linear-0.15-0.3", "linear-0.3-0.15",
                   ])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& \textbf{88.31}	& \textbf{76.65}	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
linear-0.15-0.3	& 84.31/84.74	& \textbf{90.62}	& \textbf{88.28}	& 75.28	& \textbf{92.74}	& \textbf{91.96}	& 56.46	& 89.31	& 83.74		 \\
linear-0.3-0.15	& \textbf{84.64}/\textbf{85.16}	& \textbf{90.85}	& \textbf{88.32}	& \textbf{76.43}	& \textbf{92.7}	& \textbf{91.93}	& \textbf{58.4}	& \textbf{89.86}	& \textbf{84.25}		 \\


In [200]:
print_latex_table(["constant-0.15-0.15", "constant-0.3-0.3", "linear-0.3-0.15",
                   "cosine-0.3-0.15", "step-0.3-0.15"
                   ])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& 88.31	& 76.65	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
constant-0.3-0.3	& 84.5/84.83	& \textbf{90.82}	& 88.31	& 76.56	& \textbf{92.79}	& \textbf{92.18}	& 57.24	& \textbf{89.85}	& 84.12		 \\
linear-0.3-0.15	& \textbf{84.61}/\textbf{85.13}	& \textbf{90.89}	& \textbf{88.34}	& 76.25	& \textbf{92.71}	& \textbf{91.87}	& \textbf{58.96}	& \textbf{89.87}	& \textbf{84.29}		 \\
cosine-0.3-0.15	& \textbf{84.55}/84.97	& \textbf{90.94}	& \textbf{88.39}	& \textbf{77.67}	& \textbf{92.91}	& \textbf{91.94}	& 57.45	& 89.64	& \textbf{84.27}		 \\
step-0.3-0.15	& \textbf{84.65}/\textbf{85.09}	& \textbf{90.85}	& \textbf{88.37}	& \textbf{77.71}	& \textbf{92.76}	& \textbf{91.56}	& 57.47	& 89.59	& \textbf{84.23}		 \\


In [46]:
print_latex_table(["constant-0.15-0.15", "subset-linear-0.3-0.15", "linear-0.3-0.15"])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& \textbf{88.31}	& \textbf{76.65}	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
subset-linear-0.3-0.15	& 84.19/84.44	& 90.38	& \textbf{88.31}	& 74.37	& \textbf{92.72}	& \textbf{91.77}	& 57.53	& 89.58	& 83.7		 \\
linear-0.3-0.15	& \textbf{84.61}/\textbf{85.13}	& \textbf{90.89}	& \textbf{88.34}	& \textbf{76.25}	& \textbf{92.71}	& \textbf{91.87}	& \textbf{58.96}	& \textbf{89.87}	& \textbf{84.29}		 \\


In [64]:
raw = build_ttest(["constant-0.15-0.15", "subset-linear-0.3-0.15", "linear-0.3-0.15"])
ttest_ind(raw["glue_mean"]["constant-0.15-0.15"], raw["glue_mean"]["subset-linear-0.3-0.15"], alternative="less")

Ttest_indResult(statistic=-0.5115432018739686, pvalue=0.30759444814111847)