In [1]:
import wandb
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import re

# Loading the data

In [2]:
api = wandb.Api(timeout=30)

entity = "mosaic-ml"
project = "rebuttal-mlm-schedule"

# min_perf = 0.3

In [3]:
task_to_metric = {
    "cola": "metrics/glue_cola/MulticlassMatthewsCorrCoef",
    "mnli": ["metrics/glue_mnli/MulticlassAccuracy", "metrics/glue_mnli_mismatched/MulticlassAccuracy"],
    "mrpc": ["metrics/glue_mrpc/BinaryF1Score", "metrics/glue_mrpc/MulticlassAccuracy"],
    "qnli": "metrics/glue_qnli/MulticlassAccuracy",
    "qqp": ["metrics/glue_qqp/BinaryF1Score", "metrics/glue_qqp/MulticlassAccuracy"],
    "rte": "metrics/glue_rte/MulticlassAccuracy",
    "sst-2": "metrics/glue_sst2/MulticlassAccuracy",
    "stsb": "metrics/glue_stsb/SpearmanCorrCoef"
}
tasks = [
    'mnli',
    'qnli',
    'qqp',
    'cola',
    'sst-2',
    'rte',
    'mrpc',
    'stsb',
]
tasks_formatted = [
    "cola/MulticlassMatthewsCorrCoef",
    "mnli/MulticlassAccuracy",
    "mnli_mismatched/MulticlassAccuracy",
    "mrpc/BinaryF1Score",
    "mrpc/MulticlassAccuracy",
    "qnli/MulticlassAccuracy",
    "qqp/BinaryF1Score",
    "qqp/MulticlassAccuracy",
    "rte/MulticlassAccuracy",
    "sst2/MulticlassAccuracy",
    "stsb/SpearmanCorrCoef"
]

In [4]:
min_task_acc = {
    "cola/MulticlassMatthewsCorrCoef": 45,
    "mnli/MulticlassAccuracy": 75,
    "mnli_mismatched/MulticlassAccuracy": 75,
    "mrpc/BinaryF1Score": 82,
    "mrpc/MulticlassAccuracy": 75,
    "qnli/MulticlassAccuracy": 80,
    "qqp/BinaryF1Score": 75,
    "qqp/MulticlassAccuracy": 80,
    "rte/MulticlassAccuracy": 65,
    "sst2/MulticlassAccuracy": 80,
    "stsb/SpearmanCorrCoef": 75,
}

In [5]:
MODEL_SIZE = "base"

In [6]:
def get_runs(skip_ids=[]):
    run_lookup = {}
    runs = api.runs("mosaic-ml/rebuttal-mlm-schedule", filters={
        "$and": [{'tags': "best-ckpt"}, {"tags": f"bert-{MODEL_SIZE}"}, {"tags": "glue"}]})
    for run in tqdm(runs):
        if run.state != "finished":
            continue
        
        if run.id in skip_ids:
            continue

        run_name = run.name
        task = [s for s in run_name.split("_") if "task=" in s][0].split("=")[1]
        tags = run.tags
        group = run.group

        # parsing group to speed up
        # subset_runs = [f"subset-linear-initial-0.3-final-0.15-og-seed-{seed}-ckpt-70000" for seed in [17, 3047, 2048]]
        # linear_runs = [f"linear-initial-0.3-final-0.15-og-seed-{seed}" for seed in [17, 3047, 2048]]
        # constant_runs = [f"constant-initial-0.15-final-0.15-og-seed-{seed}" for seed in [17, 3047, 2048]]
        # if group not in subset_runs + linear_runs + constant_runs:
        #     continue
        if "large-" in group:
            group = group.replace("large-", "")
        if "schedule-" in group:
            group = group.replace("schedule-", "")
        experiment_name = group.split("-seed-")[0]
        if "subset" in group:
            scheduler = experiment_name.split("-")[2]
        else:
            scheduler = experiment_name.split("-")[1]
        init_rate = [float(t.split("-")[1]) for t in tags if "initial" in t][0]
        final_rate = [float(t.split("-")[1]) for t in tags if "final" in t][0]
        experiment_name = f"rts-{scheduler}-{init_rate}-{final_rate}"
        if "subset" in group:
            experiment_name = "subset-" + experiment_name
        pretrain_seed = int(re.search(r"seed-(\d+)", group).group(1))
        glue_seed = int(re.search(r"seed=(\d+)", run_name).group(1))
                
        if task == 'sst2':
            task = 'sst-2'
        
        if task not in tasks:
            raise ValueError(f"Task {task} not recognized.")
        
        try:
            metric_names = task_to_metric[task]
            if isinstance(metric_names, str):
                metric_names = [metric_names]
            elif not isinstance(metric_names, list):
                raise Exception("Unsupported type for 'metric_name'")

            for metric_idx, metric_name in enumerate(metric_names):
                metric_hist = run.history(keys=[metric_name]).to_numpy()[:, 1:].mean(axis=1)
                final_metric = max(metric_hist)
                
                # if final_metric < min_perf:
                #     continue
                
                metric_task = metric_name.replace("metrics/glue_", "")
                run_lookup[run.id + f"-{metric_idx}"] = {
                    'task': metric_task,
                    'experiment_name': experiment_name,
                    'final_metric': 100 * final_metric,
                    'pretrain_seed': pretrain_seed,
                    "glue_seed": glue_seed,
                    'scheduler': scheduler,
                    "init_rate": init_rate,
                    "final_rate": final_rate
                }
        except:
            print(f"Error for run: {group} with id ({run.id})")
                
    return run_lookup

In [7]:
def save_runs(save_name, runs):
    with open(save_name, 'wb') as f:
        pickle.dump(runs, f)

def load_runs(save_name):
    with open(save_name, 'rb') as f:
        runs = pickle.load(f)
    return runs

In [8]:
# load_path = f"glue/bert-{MODEL_SIZE}-results-subset.pkl"
save_path = f"rts-glue/bert-{MODEL_SIZE}-results-subset.pkl"

# old_run_ids = set(load_runs(load_path).keys())
old_run_ids = set()

run_lookup = get_runs(skip_ids=old_run_ids)

save_runs(save_path, run_lookup)

100%|██████████| 240/240 [02:17<00:00,  1.75it/s]


In [10]:
run_lookup = load_runs(save_path)

In [11]:
columns=["task", "experiment_name", "final_metric", "pretrain_seed",
         "glue_seed", "scheduler", "init_rate", "final_rate"]
results = []
for run_info in run_lookup.values():
    if run_info is None:
        continue
    # if run_info["final_metric"] < min_task_acc[run_info["task"]]:
    #     continue
    results.append([run_info[c] for c in columns])
base_df = pd.DataFrame(results, columns = columns)
base_df[["final_metric", 'pretrain_seed', "glue_seed", "init_rate", "final_rate"]] = base_df[["final_metric", 'pretrain_seed', "glue_seed", "init_rate", "final_rate"]].apply(pd.to_numeric)
base_df = base_df.sort_values(by=['experiment_name'], ascending=False)

base_df

Unnamed: 0,task,experiment_name,final_metric,pretrain_seed,glue_seed,scheduler,init_rate,final_rate
165,stsb/SpearmanCorrCoef,rts-linear-0.3-0.15,90.068543,3047,90166,linear,0.30,0.15
237,sst2/MulticlassAccuracy,rts-linear-0.3-0.15,91.399086,3047,19,linear,0.30,0.15
115,qnli/MulticlassAccuracy,rts-linear-0.3-0.15,90.810907,42,19,linear,0.30,0.15
114,qnli/MulticlassAccuracy,rts-linear-0.3-0.15,91.140401,42,8364,linear,0.30,0.15
109,qnli/MulticlassAccuracy,rts-linear-0.3-0.15,91.048872,42,10536,linear,0.30,0.15
...,...,...,...,...,...,...,...,...
222,rte/MulticlassAccuracy,rts-constant-0.15-0.15,74.729240,3047,8364,constant,0.15,0.15
221,mrpc/MulticlassAccuracy,rts-constant-0.15-0.15,88.725489,3047,19,constant,0.15,0.15
220,mrpc/BinaryF1Score,rts-constant-0.15-0.15,91.872793,3047,19,constant,0.15,0.15
219,mrpc/MulticlassAccuracy,rts-constant-0.15-0.15,88.970590,3047,8364,constant,0.15,0.15


# Sanity check data

In [12]:
for task in tasks_formatted:
    print(task)
    print(base_df[base_df.task==task].experiment_name.value_counts())
    print(' ')

cola/MulticlassMatthewsCorrCoef
experiment_name
rts-linear-0.3-0.15       10
rts-constant-0.3-0.3      10
rts-constant-0.15-0.15    10
Name: count, dtype: int64
 
mnli/MulticlassAccuracy
experiment_name
rts-linear-0.3-0.15       10
rts-constant-0.3-0.3      10
rts-constant-0.15-0.15    10
Name: count, dtype: int64
 
mnli_mismatched/MulticlassAccuracy
experiment_name
rts-linear-0.3-0.15       10
rts-constant-0.3-0.3      10
rts-constant-0.15-0.15    10
Name: count, dtype: int64
 
mrpc/BinaryF1Score
experiment_name
rts-linear-0.3-0.15       10
rts-constant-0.3-0.3      10
rts-constant-0.15-0.15    10
Name: count, dtype: int64
 
mrpc/MulticlassAccuracy
experiment_name
rts-linear-0.3-0.15       10
rts-constant-0.3-0.3      10
rts-constant-0.15-0.15    10
Name: count, dtype: int64
 
qnli/MulticlassAccuracy
experiment_name
rts-linear-0.3-0.15       10
rts-constant-0.3-0.3      10
rts-constant-0.15-0.15    10
Name: count, dtype: int64
 
qqp/BinaryF1Score
experiment_name
rts-linear-0.3-0.15   

In [31]:
from scipy.stats import ttest_ind

ModuleNotFoundError: No module named 'scipy'

In [15]:
# base_df = base_df[base_df["glue_seed"] != 8364]
# base_df = base_df[base_df["pretrain_seed"] != 17]

In [13]:
metric_stand_err = base_df.groupby(["experiment_name", "task"])["final_metric"].sem().reset_index()
metric_stand_err
grouped_df = base_df.groupby(["experiment_name", "task"]).mean(numeric_only=True).reset_index()
grouped_df["error"] = metric_stand_err["final_metric"]
grouped_df = grouped_df.round({'final_metric': 2, 'error': 2})
grouped_df

Unnamed: 0,experiment_name,task,final_metric,pretrain_seed,glue_seed,init_rate,final_rate,error
0,rts-constant-0.15-0.15,cola/MulticlassMatthewsCorrCoef,57.8,1544.5,21960.4,0.15,0.15,0.54
1,rts-constant-0.15-0.15,mnli/MulticlassAccuracy,83.1,1544.5,21960.4,0.15,0.15,0.06
2,rts-constant-0.15-0.15,mnli_mismatched/MulticlassAccuracy,83.51,1544.5,21960.4,0.15,0.15,0.04
3,rts-constant-0.15-0.15,mrpc/BinaryF1Score,91.58,1544.5,21960.4,0.15,0.15,0.18
4,rts-constant-0.15-0.15,mrpc/MulticlassAccuracy,88.24,1544.5,21960.4,0.15,0.15,0.23
5,rts-constant-0.15-0.15,qnli/MulticlassAccuracy,90.62,1544.5,21960.4,0.15,0.15,0.06
6,rts-constant-0.15-0.15,qqp/BinaryF1Score,88.21,1544.5,21960.4,0.15,0.15,0.04
7,rts-constant-0.15-0.15,qqp/MulticlassAccuracy,91.23,1544.5,21960.4,0.15,0.15,0.03
8,rts-constant-0.15-0.15,rte/MulticlassAccuracy,75.31,1544.5,21960.4,0.15,0.15,0.29
9,rts-constant-0.15-0.15,sst2/MulticlassAccuracy,91.96,1544.5,21960.4,0.15,0.15,0.13


In [14]:
grouped_df.pivot(index="experiment_name", columns="task", values="final_metric")

task,cola/MulticlassMatthewsCorrCoef,mnli/MulticlassAccuracy,mnli_mismatched/MulticlassAccuracy,mrpc/BinaryF1Score,mrpc/MulticlassAccuracy,qnli/MulticlassAccuracy,qqp/BinaryF1Score,qqp/MulticlassAccuracy,rte/MulticlassAccuracy,sst2/MulticlassAccuracy,stsb/SpearmanCorrCoef
experiment_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
rts-constant-0.15-0.15,57.8,83.1,83.51,91.58,88.24,90.62,88.21,91.23,75.31,91.96,89.88
rts-constant-0.3-0.3,56.39,83.21,84.01,91.36,88.01,90.83,88.22,91.26,74.84,92.05,89.9
rts-linear-0.3-0.15,58.02,83.35,83.75,92.04,88.87,90.98,88.25,91.3,75.31,91.86,90.12


In [15]:
metrics_with_errors = []
for task_name in tasks_formatted:
    metrics_with_errors.append(task_name)
    metrics_with_errors.append(task_name + " STE")
metrics_with_errors

['cola/MulticlassMatthewsCorrCoef',
 'cola/MulticlassMatthewsCorrCoef STE',
 'mnli/MulticlassAccuracy',
 'mnli/MulticlassAccuracy STE',
 'mnli_mismatched/MulticlassAccuracy',
 'mnli_mismatched/MulticlassAccuracy STE',
 'mrpc/BinaryF1Score',
 'mrpc/BinaryF1Score STE',
 'mrpc/MulticlassAccuracy',
 'mrpc/MulticlassAccuracy STE',
 'qnli/MulticlassAccuracy',
 'qnli/MulticlassAccuracy STE',
 'qqp/BinaryF1Score',
 'qqp/BinaryF1Score STE',
 'qqp/MulticlassAccuracy',
 'qqp/MulticlassAccuracy STE',
 'rte/MulticlassAccuracy',
 'rte/MulticlassAccuracy STE',
 'sst2/MulticlassAccuracy',
 'sst2/MulticlassAccuracy STE',
 'stsb/SpearmanCorrCoef',
 'stsb/SpearmanCorrCoef STE']

In [16]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
#pd.set_option('max_colwidth', -1)

In [17]:
BASELINE_RUN = "rts-constant-0.15-0.15"

In [18]:
task_grouped = []
for experiment_name in grouped_df["experiment_name"]:
    experiment_grouped = [experiment_name]
    for metric_name in tasks_formatted:
        print(experiment_name, metric_name)
        performance = grouped_df[
            (grouped_df["experiment_name"] == experiment_name) & (grouped_df["task"] == metric_name)]["final_metric"].values[0]

        error = grouped_df[
            (grouped_df["experiment_name"] == experiment_name) & (grouped_df["task"] == metric_name)]["error"].values[0]
        
        experiment_grouped.append(performance)
        experiment_grouped.append(error)
        
    task_grouped.append(experiment_grouped)

task_grouped_df = pd.DataFrame(task_grouped, columns=["experiment_name"]+metrics_with_errors)
task_grouped_df = task_grouped_df.drop_duplicates()
task_grouped_df["glue_mean"] = task_grouped_df[tasks_formatted].mean(axis=1)
constant_avg = task_grouped_df[
    task_grouped_df["experiment_name"] == BASELINE_RUN]["glue_mean"].values[0]
task_grouped_df["glue_delta"] = task_grouped_df["glue_mean"].map(lambda avg: avg - constant_avg)
task_grouped_df = task_grouped_df.sort_values(by=["glue_mean"], ascending=False)
task_grouped_df

rts-constant-0.15-0.15 cola/MulticlassMatthewsCorrCoef
rts-constant-0.15-0.15 mnli/MulticlassAccuracy
rts-constant-0.15-0.15 mnli_mismatched/MulticlassAccuracy
rts-constant-0.15-0.15 mrpc/BinaryF1Score
rts-constant-0.15-0.15 mrpc/MulticlassAccuracy
rts-constant-0.15-0.15 qnli/MulticlassAccuracy
rts-constant-0.15-0.15 qqp/BinaryF1Score
rts-constant-0.15-0.15 qqp/MulticlassAccuracy
rts-constant-0.15-0.15 rte/MulticlassAccuracy
rts-constant-0.15-0.15 sst2/MulticlassAccuracy
rts-constant-0.15-0.15 stsb/SpearmanCorrCoef
rts-constant-0.15-0.15 cola/MulticlassMatthewsCorrCoef
rts-constant-0.15-0.15 mnli/MulticlassAccuracy
rts-constant-0.15-0.15 mnli_mismatched/MulticlassAccuracy
rts-constant-0.15-0.15 mrpc/BinaryF1Score
rts-constant-0.15-0.15 mrpc/MulticlassAccuracy
rts-constant-0.15-0.15 qnli/MulticlassAccuracy
rts-constant-0.15-0.15 qqp/BinaryF1Score
rts-constant-0.15-0.15 qqp/MulticlassAccuracy
rts-constant-0.15-0.15 rte/MulticlassAccuracy
rts-constant-0.15-0.15 sst2/MulticlassAccuracy
rts

Unnamed: 0,experiment_name,cola/MulticlassMatthewsCorrCoef,cola/MulticlassMatthewsCorrCoef STE,mnli/MulticlassAccuracy,mnli/MulticlassAccuracy STE,mnli_mismatched/MulticlassAccuracy,mnli_mismatched/MulticlassAccuracy STE,mrpc/BinaryF1Score,mrpc/BinaryF1Score STE,mrpc/MulticlassAccuracy,mrpc/MulticlassAccuracy STE,qnli/MulticlassAccuracy,qnli/MulticlassAccuracy STE,qqp/BinaryF1Score,qqp/BinaryF1Score STE,qqp/MulticlassAccuracy,qqp/MulticlassAccuracy STE,rte/MulticlassAccuracy,rte/MulticlassAccuracy STE,sst2/MulticlassAccuracy,sst2/MulticlassAccuracy STE,stsb/SpearmanCorrCoef,stsb/SpearmanCorrCoef STE,glue_mean,glue_delta
22,rts-linear-0.3-0.15,58.02,0.44,83.35,0.07,83.75,0.1,92.04,0.26,88.87,0.38,90.98,0.05,88.25,0.03,91.3,0.02,75.31,0.43,91.86,0.14,90.12,0.06,84.895455,0.219091
0,rts-constant-0.15-0.15,57.8,0.54,83.1,0.06,83.51,0.04,91.58,0.18,88.24,0.23,90.62,0.06,88.21,0.04,91.23,0.03,75.31,0.29,91.96,0.13,89.88,0.06,84.676364,0.0
11,rts-constant-0.3-0.3,56.39,0.54,83.21,0.06,84.01,0.08,91.36,0.22,88.01,0.33,90.83,0.07,88.22,0.04,91.26,0.03,74.84,0.59,92.05,0.1,89.9,0.06,84.552727,-0.123636


In [19]:
task_grouped_df[["experiment_name", "glue_delta"]]

Unnamed: 0,experiment_name,glue_delta
22,rts-linear-0.3-0.15,0.219091
0,rts-constant-0.15-0.15,0.0
11,rts-constant-0.3-0.3,-0.123636


In [20]:
paper_just_metrics = ["mnli/MulticlassAccuracy", "mnli_mismatched/MulticlassAccuracy", "qnli/MulticlassAccuracy",
                 "qqp/BinaryF1Score", "rte/MulticlassAccuracy", "sst2/MulticlassAccuracy",
                 "mrpc/BinaryF1Score", "cola/MulticlassMatthewsCorrCoef", "stsb/SpearmanCorrCoef"
                ]
# paper_just_metrics = ["mnli/MulticlassAccuracy", "mnli_mismatched/MulticlassAccuracy", "qnli/MulticlassAccuracy",
#                  "qqp/BinaryF1Score", "sst2/MulticlassAccuracy",
#                  "mrpc/BinaryF1Score", "cola/MulticlassMatthewsCorrCoef", "stsb/SpearmanCorrCoef"
#                 ]
paper_just_errors = []
paper_metrics = []
for metric_name in paper_just_metrics:
    paper_metrics.append(metric_name)
    paper_metrics.append(metric_name + " STE")
    paper_just_errors.append(metric_name + " STE")

In [21]:
experiments = ["rts-constant-0.15-0.15", "rts-constant-0.3-0.3", "rts-linear-0.3-0.15"]
# experiments = ["constant-0.4-0.4", "linear-0.4-0.25"]

In [22]:
paper_df = task_grouped_df[["experiment_name", *paper_metrics]]
paper_df["glue_mean"] = paper_df[paper_just_metrics].mean(axis=1)
paper_df["glue_ste"] = paper_df[paper_just_errors].mean(axis=1)
constant_avg = paper_df[
    paper_df["experiment_name"] == BASELINE_RUN]["glue_mean"].values[0]
#paper_df = paper_df[paper_df["experiment_name"].isin(experiments)]
paper_df["glue_delta"] = paper_df["glue_mean"].map(lambda avg: avg - constant_avg)
paper_df = paper_df.sort_values(by="glue_mean", ascending=False)
paper_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_df["glue_mean"] = paper_df[paper_just_metrics].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_df["glue_ste"] = paper_df[paper_just_errors].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_df["glue_delta"] = paper_df["glue_mean"].map(lambda avg: avg - consta

Unnamed: 0,experiment_name,mnli/MulticlassAccuracy,mnli/MulticlassAccuracy STE,mnli_mismatched/MulticlassAccuracy,mnli_mismatched/MulticlassAccuracy STE,qnli/MulticlassAccuracy,qnli/MulticlassAccuracy STE,qqp/BinaryF1Score,qqp/BinaryF1Score STE,rte/MulticlassAccuracy,rte/MulticlassAccuracy STE,sst2/MulticlassAccuracy,sst2/MulticlassAccuracy STE,mrpc/BinaryF1Score,mrpc/BinaryF1Score STE,cola/MulticlassMatthewsCorrCoef,cola/MulticlassMatthewsCorrCoef STE,stsb/SpearmanCorrCoef,stsb/SpearmanCorrCoef STE,glue_mean,glue_ste,glue_delta
22,rts-linear-0.3-0.15,83.35,0.07,83.75,0.1,90.98,0.05,88.25,0.03,75.31,0.43,91.86,0.14,92.04,0.26,58.02,0.44,90.12,0.06,83.742222,0.175556,0.19
0,rts-constant-0.15-0.15,83.1,0.06,83.51,0.04,90.62,0.06,88.21,0.04,75.31,0.29,91.96,0.13,91.58,0.18,57.8,0.54,89.88,0.06,83.552222,0.155556,0.0
11,rts-constant-0.3-0.3,83.21,0.06,84.01,0.08,90.83,0.07,88.22,0.04,74.84,0.59,92.05,0.1,91.36,0.22,56.39,0.54,89.9,0.06,83.423333,0.195556,-0.128889


In [23]:
paper_df[["experiment_name", "glue_mean", "glue_delta"]]

Unnamed: 0,experiment_name,glue_mean,glue_delta
22,rts-linear-0.3-0.15,83.742222,0.19
0,rts-constant-0.15-0.15,83.552222,0.0
11,rts-constant-0.3-0.3,83.423333,-0.128889


# Statistical sig testing for overall fit

In [24]:
base_df

Unnamed: 0,task,experiment_name,final_metric,pretrain_seed,glue_seed,scheduler,init_rate,final_rate
165,stsb/SpearmanCorrCoef,rts-linear-0.3-0.15,90.068543,3047,90166,linear,0.30,0.15
237,sst2/MulticlassAccuracy,rts-linear-0.3-0.15,91.399086,3047,19,linear,0.30,0.15
115,qnli/MulticlassAccuracy,rts-linear-0.3-0.15,90.810907,42,19,linear,0.30,0.15
114,qnli/MulticlassAccuracy,rts-linear-0.3-0.15,91.140401,42,8364,linear,0.30,0.15
109,qnli/MulticlassAccuracy,rts-linear-0.3-0.15,91.048872,42,10536,linear,0.30,0.15
...,...,...,...,...,...,...,...,...
222,rte/MulticlassAccuracy,rts-constant-0.15-0.15,74.729240,3047,8364,constant,0.15,0.15
221,mrpc/MulticlassAccuracy,rts-constant-0.15-0.15,88.725489,3047,19,constant,0.15,0.15
220,mrpc/BinaryF1Score,rts-constant-0.15-0.15,91.872793,3047,19,constant,0.15,0.15
219,mrpc/MulticlassAccuracy,rts-constant-0.15-0.15,88.970590,3047,8364,constant,0.15,0.15


In [25]:
base_df[(base_df["experiment_name"] == "constant-0.15-0.15") & (base_df["task"] == "stsb/SpearmanCorrCoef") & (base_df["pretrain_seed"] == "17") & (base_df["glue_seed"] == "90166")]
base_df[(base_df["experiment_name"] == "constant-0.15-0.15") & (base_df["task"] == "stsb/SpearmanCorrCoef") & (base_df["pretrain_seed"] == 17) &   (base_df["glue_seed"] == 90166)]


Unnamed: 0,task,experiment_name,final_metric,pretrain_seed,glue_seed,scheduler,init_rate,final_rate


In [26]:
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

def build_ttest(experiments):
    raw_results_grouped = {}
    for task in paper_just_metrics:
        task_results = {}
        for experiment_name in experiments:
            experiment_task_perfs = []
            # for pre_seed in [17, 2048, 3047]:
            for pre_seed in [3047, 42]:
                for glue_seed in [19, 8364, 717, 10536, 90166]:
                    try:
                        task_perf = base_df[
                            (base_df["experiment_name"] == experiment_name) & (base_df["task"] == task) & (base_df["pretrain_seed"] == pre_seed) & (base_df["glue_seed"] == glue_seed)]["final_metric"].values[0]
                        experiment_task_perfs.append(task_perf)
                    except Exception as e:
                        # print(e)
                        any_issue = True
                        continue
            task_results[experiment_name] = experiment_task_perfs
        raw_results_grouped[task] = task_results
        
    averages = {}
    for experiment_name in experiments:
        experiment_averages = []
        # for pre_seed in [17, 2048, 3047]:
        for pre_seed in [17, 3047]:
            for glue_seed in [19, 8364, 717, 10536, 90166]:
                any_issue = False
                run_perfs = []
                for task in paper_just_metrics:
                    try:
                        task_perf = base_df[
                            (base_df["experiment_name"] == experiment_name) & (base_df["task"] == task) & (base_df["pretrain_seed"] == pre_seed) & (base_df["glue_seed"] == glue_seed)]["final_metric"].values[0]
                        run_perfs.append(task_perf)
                    except Exception as e:
                        # print(e)
                        any_issue = True
                        continue
                if not any_issue:
                    experiment_averages.append(np.mean(run_perfs))
        averages[experiment_name] = experiment_averages
    raw_results_grouped["glue_mean"] = averages
    # print(raw_results_grouped)
    return raw_results_grouped

# Format the table for the paper

In [27]:
def print_latex_table(experiments):
    subset_paper_df = paper_df[paper_df["experiment_name"].isin(experiments)]
    best_experiment_per_task = {}
    for task in paper_just_metrics + ["glue_mean"]:
        best_experiment_per_task[task] = subset_paper_df[subset_paper_df[task] == subset_paper_df[task].max()]["experiment_name"].values[0]

    raw_results_grouped = build_ttest(experiments)

    sigs = {}
    for task in paper_just_metrics + ["glue_mean"]:
        best = best_experiment_per_task[task]
        pvals = []
        ordered_exp = []
        for experiment in experiments:
            if experiment == best:
                continue
            ordered_exp.append(experiment)
            sig = ttest_ind(raw_results_grouped[task][experiment], raw_results_grouped[task][best], alternative="less").pvalue
            pvals.append(sig)
        # not_reject = ~multipletests(pvals, method="fdr_bh")[0]
        sig_task = {}
        for exp, is_null in zip(ordered_exp, ~multipletests(pvals, method="fdr_bh")[0]):
            sig_task[exp] = is_null
        sigs[task] = sig_task

    for experiment in experiments:
        # print(experiment)
        latex_str = f"{experiment}\t& "
        for task in paper_just_metrics + ["glue_mean"]:
            # print(task)
            exp_perf = subset_paper_df[subset_paper_df["experiment_name"] == experiment][task].values[0]
            # is_not_sig = ttest_ind(raw_results_grouped[task][experiment], raw_results_grouped[task][best_experiment_per_task[task]], alternative="less").pvalue > 0.05
            if experiment == best_experiment_per_task[task] or sigs[task][experiment]:
                latex_str += r"\textbf{" + exp_perf.round(2).astype(str) + r"}"
            else:
                latex_str += exp_perf.round(2).astype(str)
            if task.split("/")[0] == "mnli":
                latex_str += "/"
            else:
                latex_str += "\t& "
        # latex_str += paper_df[paper_df["experiment_name"] == experiment]["glue_mean"].values[0].round(2).astype(str)
        latex_str = latex_str[:-2]  + "\t " + r"\\"
        print(latex_str)

In [28]:
print_latex_table(["rts-constant-0.15-0.15", "rts-constant-0.3-0.3", "rts-linear-0.3-0.15"])

rts-constant-0.15-0.15	& 83.1/83.51	& 90.62	& \textbf{88.21}	& \textbf{75.31}	& \textbf{91.96}	& \textbf{91.58}	& \textbf{57.8}	& 89.88	& \textbf{83.55}		 \\
rts-constant-0.3-0.3	& \textbf{83.21}/\textbf{84.01}	& 90.83	& \textbf{88.22}	& \textbf{74.84}	& \textbf{92.05}	& \textbf{91.36}	& 56.39	& 89.9	& \textbf{83.42}		 \\
rts-linear-0.3-0.15	& \textbf{83.35}/83.75	& \textbf{90.98}	& \textbf{88.25}	& \textbf{75.31}	& \textbf{91.86}	& \textbf{92.04}	& \textbf{58.02}	& \textbf{90.12}	& \textbf{83.74}		 \\


In [199]:
print_latex_table(["constant-0.15-0.15", "constant-0.3-0.3", "linear-0.3-0.15",
                   "linear-0.3-0.2", "linear-0.3-0.25", "linear-0.3-0.35",
                   "linear-0.3-0.4", "linear-0.3-0.45"
                   ])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& \textbf{88.31}	& \textbf{76.65}	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
constant-0.3-0.3	& 84.5/84.83	& \textbf{90.82}	& \textbf{88.31}	& \textbf{76.56}	& \textbf{92.79}	& \textbf{92.18}	& 57.24	& \textbf{89.85}	& 84.12		 \\
linear-0.3-0.15	& \textbf{84.61}/\textbf{85.13}	& \textbf{90.89}	& \textbf{88.34}	& 76.25	& \textbf{92.71}	& \textbf{91.87}	& \textbf{58.96}	& \textbf{89.87}	& \textbf{84.29}		 \\
linear-0.3-0.2	& \textbf{84.57}/84.89	& \textbf{90.87}	& \textbf{88.33}	& \textbf{77.04}	& \textbf{92.84}	& 91.38	& 57.29	& \textbf{89.78}	& 84.11		 \\
linear-0.3-0.25	& \textbf{84.63}/84.93	& \textbf{90.84}	& \textbf{88.33}	& 76.1	& \textbf{92.84}	& \textbf{92.02}	& 57.33	& \textbf{89.19}	& 84.02		 \\
linear-0.3-0.35	& 84.31/84.85	& \textbf{90.73}	& \textbf{88.28}	& \textbf{76.9}	& \textbf{92.91}	& \textbf{91.68}	& 55.85	& 89.7	& 83.91		 \\
linear-0.3-0.4	& 84.19/84.71	& \textbf{90.74}	& \textbf{88.31}	& \textbf{76.82}	& 92

In [52]:
print_latex_table(["constant-0.15-0.15", "linear-0.15-0.3", "linear-0.3-0.15",
                   ])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& \textbf{88.31}	& \textbf{76.65}	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
linear-0.15-0.3	& 84.31/84.74	& \textbf{90.62}	& \textbf{88.28}	& 75.28	& \textbf{92.74}	& \textbf{91.96}	& 56.46	& 89.31	& 83.74		 \\
linear-0.3-0.15	& \textbf{84.64}/\textbf{85.16}	& \textbf{90.85}	& \textbf{88.32}	& \textbf{76.43}	& \textbf{92.7}	& \textbf{91.93}	& \textbf{58.4}	& \textbf{89.86}	& \textbf{84.25}		 \\


In [200]:
print_latex_table(["constant-0.15-0.15", "constant-0.3-0.3", "linear-0.3-0.15",
                   "cosine-0.3-0.15", "step-0.3-0.15"
                   ])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& 88.31	& 76.65	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
constant-0.3-0.3	& 84.5/84.83	& \textbf{90.82}	& 88.31	& 76.56	& \textbf{92.79}	& \textbf{92.18}	& 57.24	& \textbf{89.85}	& 84.12		 \\
linear-0.3-0.15	& \textbf{84.61}/\textbf{85.13}	& \textbf{90.89}	& \textbf{88.34}	& 76.25	& \textbf{92.71}	& \textbf{91.87}	& \textbf{58.96}	& \textbf{89.87}	& \textbf{84.29}		 \\
cosine-0.3-0.15	& \textbf{84.55}/84.97	& \textbf{90.94}	& \textbf{88.39}	& \textbf{77.67}	& \textbf{92.91}	& \textbf{91.94}	& 57.45	& 89.64	& \textbf{84.27}		 \\
step-0.3-0.15	& \textbf{84.65}/\textbf{85.09}	& \textbf{90.85}	& \textbf{88.37}	& \textbf{77.71}	& \textbf{92.76}	& \textbf{91.56}	& 57.47	& 89.59	& \textbf{84.23}		 \\


In [46]:
print_latex_table(["constant-0.15-0.15", "subset-linear-0.3-0.15", "linear-0.3-0.15"])

constant-0.15-0.15	& 84.3/84.71	& 90.38	& \textbf{88.31}	& \textbf{76.65}	& \textbf{92.91}	& \textbf{91.94}	& 55.89	& 89.38	& 83.83		 \\
subset-linear-0.3-0.15	& 84.19/84.44	& 90.38	& \textbf{88.31}	& 74.37	& \textbf{92.72}	& \textbf{91.77}	& 57.53	& 89.58	& 83.7		 \\
linear-0.3-0.15	& \textbf{84.61}/\textbf{85.13}	& \textbf{90.89}	& \textbf{88.34}	& \textbf{76.25}	& \textbf{92.71}	& \textbf{91.87}	& \textbf{58.96}	& \textbf{89.87}	& \textbf{84.29}		 \\


In [64]:
raw = build_ttest(["constant-0.15-0.15", "subset-linear-0.3-0.15", "linear-0.3-0.15"])
ttest_ind(raw["glue_mean"]["constant-0.15-0.15"], raw["glue_mean"]["subset-linear-0.3-0.15"], alternative="less")

Ttest_indResult(statistic=-0.5115432018739686, pvalue=0.30759444814111847)