In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

from create_datasets import read_json_dataset
from dict_utils import unfold_to_list, fold_from_list
from pipeline import Pipeline, FewShotPipeline, FineTuningFewShotPipeline
# Apply seaborn's "whitegrid" style to the figures drawn in this notebook.
sns.set_style("whitegrid")


def add_lerc_preds(data, lerc_preds_dir, split):
    """Attach rescaled LERC predictions to the examples in ``data``.

    The predictions for ``split`` are read from ``lerc_preds_dir`` with
    ``read_json_dataset``. For each dataset and example id found there, the
    ``"pred_score"`` value is rescaled with ``(score - 1) / (5 - 1)`` (which
    maps the interval [1, 5] onto [0, 1]) and stored under the ``"LERC"`` key
    of ``data[dataset][example_id]``.

    ``data`` is modified in place and also returned.
    """
    lowest, highest = 1, 5
    predictions = read_json_dataset(lerc_preds_dir, split)

    for dataset_name, examples in predictions.items():
        for example_id, prediction in examples.items():
            rescaled = (prediction["pred_score"] - lowest) / (highest - lowest)
            data[dataset_name][example_id]["LERC"] = rescaled

    return data


def add_log(data, metrics, eps=1e-15):
    """Add a natural-log transformed copy of each metric column, in place.

    For every name ``m`` in ``metrics`` a new column ``f"{m}_log"`` holding
    ``log(data[m] + eps)`` is written to ``data``. Nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the metric columns; it is modified in place.
    metrics : iterable of str
        Names of the columns to transform.
    eps : float, default 1e-15
        Offset added before taking the log so that zero-valued entries map
        to a finite number instead of ``-inf``.
    """
    for m in metrics:
        # Vectorised over the whole column rather than a per-element lambda.
        data[f"{m}_log"] = np.log(data[m] + eps)

In [2]:
# Input locations (metric dumps and LERC predictions) and the output folder.
DATASET_DIR = "../data/lr_experiments"
LERC_PREDS_DIR = f"{DATASET_DIR}/lerc_preds"

RESULTS_DIR = "./experiments_20220605/results"

# Read the three splits and report how many top-level entries each one has.
train = read_json_dataset(DATASET_DIR, "train_metrics")
dev = read_json_dataset(DATASET_DIR, "dev_metrics")
test = read_json_dataset(DATASET_DIR, "test_metrics")
print(len(train), len(dev), len(test))

# Attach the LERC prediction of every example (done in place).
add_lerc_preds(train, LERC_PREDS_DIR, "train")
add_lerc_preds(dev, LERC_PREDS_DIR, "dev")
add_lerc_preds(test, LERC_PREDS_DIR, "test")

# Flatten the nested mappings into one row per (dataset, example_id).
train_df = pd.DataFrame(unfold_to_list(train, "dataset", "example_id"))
dev_df   = pd.DataFrame(unfold_to_list(dev, "dataset", "example_id"))
test_df  = pd.DataFrame(unfold_to_list(test, "dataset", "example_id"))
print(train_df.shape, dev_df.shape, test_df.shape)

# Rescale the score column with (score - 1) / (5 - 1), i.e. [1, 5] -> [0, 1].
train_df["score_scaled"] = (train_df["score"] - 1) / (5 - 1)
dev_df["score_scaled"] = (dev_df["score"] - 1) / (5 - 1)
test_df["score_scaled"] = (test_df["score"] - 1) / (5 - 1)

6 6 6
(31069, 49) (4009, 49) (6321, 49)


In [7]:
# Top-level keys of the training split (one per dataset).
DATASETS = list(train.keys())

# Columns passed as input features to the experiments below.
# Commented-out names are alternative implementations that are not used.
METRICS = [
    # Bleu
    'bleu1', 'bleu2', 'bleu3', 'bleu4', 
    # 'hf_bleu1', 'hf_bleu2', 'hf_bleu3', 'hf_bleu4', 
    'rougeL', 
    # 'hf_rougeL', 'hf_rougeLsum',
    'hf_rouge1', 'hf_rouge2',
    'meteor',
    'recall', 'precision', 'f1_score',
    'sari_context', 'sari_question',
    # Token overlap when 1st error occurred
    'precision_at_err1', 'recall_at_err1',
    # Confusion matrix
    'tp', 'fn', 'fp',
    # Edit scores ------
    'char_edit_score',
    'word_edit_score',
    # Learned metrics -------
    'bertscore', 
    'bleurt',
    # LERC prediction, rescaled and attached by add_lerc_preds
    "LERC",
    # Input statistics ------
    'candidatelength_word',
    'candidatelength_char',
    'candidatenunique_words',
    'referencelength_word',
    'referencelength_char',
    'referencenunique_words',
    'contextlength_word',
    'contextlength_char',
    'contextnunique_words',
    'questionlength_word',
    'questionlength_char',
    'questionnunique_words',
]

# Regression target: the score column rescaled with (score - 1) / (5 - 1).
TARGET = "score_scaled"

**Validate numbers reported in original MOCHA paper**

Most of the values are close to the numbers reported in the paper. Those that differ are consistently higher than the reported ones.

In [8]:
from scipy.stats import pearsonr

# (label printed, column read) for each metric whose Pearson correlation
# with the scaled score is reported.
_reported_metrics = (
    ("bleu1", "bleu1"),
    ("meteor", "meteor"),
    ("rougeL", "rougeL"),
    ("bert-score", "bertscore"),
)

for dataset in DATASETS:
    print()
    for header, split_df in (("---- DEV SET ----", dev_df), ("TEST SET", test_df)):
        print(header)
        _df = split_df[split_df.dataset == dataset]
        for label, column in _reported_metrics:
            print(dataset, label, round(pearsonr(_df["score_scaled"], _df[column])[0], 3))
        print()


---- DEV SET ----
cosmosqa bleu1 0.66
cosmosqa meteor 0.697
cosmosqa rougeL 0.702
cosmosqa bert-score 0.805

TEST SET
cosmosqa bleu1 0.671
cosmosqa meteor 0.712
cosmosqa rougeL 0.701
cosmosqa bert-score 0.78


---- DEV SET ----
drop bleu1 0.409
drop meteor 0.664
drop rougeL 0.48
drop bert-score 0.174

TEST SET
drop bleu1 0.388
drop meteor 0.568
drop rougeL 0.366
drop bert-score 0.329


---- DEV SET ----
mcscript bleu1 0.182
mcscript meteor 0.461
mcscript rougeL 0.225
mcscript bert-score 0.173

TEST SET
mcscript bleu1 0.261
mcscript meteor 0.503
mcscript rougeL 0.297
mcscript bert-score 0.195


---- DEV SET ----
narrativeqa bleu1 0.403
narrativeqa meteor 0.606
narrativeqa rougeL 0.434
narrativeqa bert-score 0.419

TEST SET
narrativeqa bleu1 0.472
narrativeqa meteor 0.616
narrativeqa rougeL 0.496
narrativeqa bert-score 0.535


---- DEV SET ----
quoref bleu1 0.675
quoref meteor 0.729
quoref rougeL 0.713
quoref bert-score 0.208

TEST SET
quoref bleu1 0.578
quoref meteor 0.716
quoref rouge

# Regression Experiments

In [24]:
def get_subset(df, dataset = None, col="dataset"):
    """Return a copy of the rows of ``df`` whose ``col`` equals ``dataset``.

    When ``dataset`` is falsy (e.g. ``None``) ``df`` itself is returned,
    without copying.
    """
    if not dataset:
        return df

    matching_rows = df[col] == dataset
    return df[matching_rows].copy()
    
def get_all_datasets(df, datasets, include_all=True):
    """Build a ``{name: frame}`` mapping with one entry per dataset.

    Each entry is produced by ``get_subset(df, name)``. When ``include_all``
    is true, an ``"all"`` entry holding a copy of the whole frame comes first.
    """
    per_dataset = {name: get_subset(df, name) for name in datasets}

    if not include_all:
        return per_dataset
    return {"all": df.copy(), **per_dataset}

def get_loov_datasets(df, datasets):
    """Build the leave-one-out training frames.

    For every name in ``datasets`` the result holds, under the key
    ``f"except_{name}"``, the concatenation of the subsets of all the
    *other* datasets (in the order given by ``datasets``).
    """
    # Each per-dataset subset is extracted once and reused for every fold.
    subsets = [(name, get_subset(df, name)) for name in datasets]

    return {
        f"except_{held_out}": pd.concat(
            [part for name, part in subsets if name != held_out]
        )
        for held_out, _ in subsets
    }

# Per-split lookup tables: an "all" entry (copy of the full frame) plus one
# entry per dataset name.
TRAIN_DATASETS = get_all_datasets(train_df, DATASETS)
DEV_DATASETS   = get_all_datasets(dev_df, DATASETS)
TEST_DATASETS  = get_all_datasets(test_df, DATASETS)

# Leave-one-out training frames, keyed "except_<dataset>".
TRAIN_LOO_DATASETS = get_loov_datasets(train_df, DATASETS)

# Baselines

In [25]:
def fit(
        model_class,
        model_hparams,
        features,
        target,
        train_datasets,
        split_frac=None,
        with_std=True,
        with_pca=False,
        seed=817237,
        pipeline_class=Pipeline,
    ) -> dict:
    """Fit one pipeline per training dataset.

    For every ``(name, data)`` pair in ``train_datasets`` a
    ``pipeline_class`` instance is created, fed ``data`` through
    ``load_data``, optionally split, preprocessed and fitted.

    ``split_frac`` only triggers ``pipeline.split`` when it is a non-zero
    ``float``; it is then passed as ``holdout_fraction``. ``with_std`` and
    ``with_pca`` are forwarded unchanged to ``pipeline.preprocess``.

    Returns a dict mapping each training-set name to its fitted pipeline.
    """
    wants_split = bool(split_frac) and isinstance(split_frac, float)

    def _fit_single(train_name, train_data):
        fitted = pipeline_class(model_class, model_hparams, train_name, features, target, seed=seed)
        fitted.load_data(train_data)
        if wants_split:
            fitted.split(holdout_fraction=split_frac)
        fitted.preprocess(with_std=with_std, with_pca=with_pca)
        fitted.fit()
        return fitted

    return {
        train_name: _fit_single(train_name, train_data)
        for train_name, train_data in train_datasets.items()
    }


def evaluate(pipelines, eval_datasets):
    """Evaluate every pipeline on every evaluation dataset.

    Concatenates, in iteration order of ``pipelines``, the lists returned by
    ``pipeline.evaluate_multiple(eval_datasets)``.
    """
    collected = []
    for fitted in pipelines.values():
        collected += fitted.evaluate_multiple(eval_datasets)
    return collected

def evaluate_loo(pipelines, eval_datasets):
    """Evaluate each leave-one-out pipeline on its held-out dataset only.

    Parameters
    ----------
    pipelines : dict
        Maps a training-set name to a fitted pipeline. Names of the form
        ``"except_<dataset>"`` identify the held-out dataset.
    eval_datasets : dict
        Maps dataset names to evaluation data.

    Returns
    -------
    list
        Concatenation of the lists returned by ``evaluate_multiple``.

    Raises
    ------
    KeyError
        If the held-out dataset is not a key of ``eval_datasets``.
    """
    prefix = "except_"
    results = []
    for train_name, pipeline in pipelines.items():
        if train_name.startswith(prefix):
            # Strip the prefix so dataset names that contain "_" resolve
            # correctly (taking the last "_"-separated token would truncate them).
            loo_dataset = train_name[len(prefix):]
        else:
            # Any other naming scheme keeps the previous rule: last token.
            loo_dataset = train_name.rpartition("_")[-1]

        result = pipeline.evaluate_multiple({loo_dataset: eval_datasets[loo_dataset]})
        results.extend(result)

    return results

## Individual metrics

In [12]:
class IndividualMetric:
    """Pseudo-model whose prediction is the value of one input column."""

    def __init__(self, feature=None):
        # Key used to index the input in ``predict``.
        self.feature = feature

    def fit(self, X, y):
        """Nothing to learn; present only to satisfy the model interface."""
        return None

    def predict(self, X):
        """Return ``X[self.feature]``.

        Raises ``RuntimeError`` when no feature was configured.
        """
        feature = self.feature
        if feature is None:
            raise RuntimeError("No feature specified")

        predictions = X[feature]

        # One prediction per input row.
        assert len(predictions) == X.shape[0]
        return predictions

In [26]:
# Score every metric on its own: the "model" simply returns the metric value,
# so standardisation and PCA are switched off.
individual_results = []

# "LERC" is listed first but is also part of METRICS; dict.fromkeys removes
# the repeat (keeping order) so LERC is not evaluated and stored twice.
for metric in dict.fromkeys(["LERC"] + METRICS):
    ps = fit(
        IndividualMetric,
        {"feature": metric},
        [metric],
        TARGET,
        {"all": TRAIN_DATASETS["all"]},
        with_pca=False,
        with_std=False,
    )
    results = evaluate(ps, DEV_DATASETS)
    for r in results:
        # Label the rows with the metric name.
        r["model_classpath"] = metric

    individual_results.extend(results)

individual_results = pd.DataFrame(individual_results)
individual_results.head()

Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'all'

Unnamed: 0,mse,r2,pearson,spearman,features,target,model_classpath,model_hparams,trained_on,evaluated_on
0,0.054807,0.609562,0.810055,0.78491,[LERC],score_scaled,LERC,{'feature': 'LERC'},all,all
1,0.052269,0.652098,0.859841,0.820316,[LERC],score_scaled,LERC,{'feature': 'LERC'},all,cosmosqa
2,0.049659,0.630796,0.81646,0.739219,[LERC],score_scaled,LERC,{'feature': 'LERC'},all,drop
3,0.054287,0.629135,0.812387,0.786814,[LERC],score_scaled,LERC,{'feature': 'LERC'},all,mcscript
4,0.05973,0.601472,0.793851,0.786998,[LERC],score_scaled,LERC,{'feature': 'LERC'},all,narrativeqa


In [29]:
# Create the target folder first so the write does not fail on a fresh
# checkout where the directory is missing.
os.makedirs(f"{RESULTS_DIR}/baselines", exist_ok=True)
individual_results.to_csv(f"{RESULTS_DIR}/baselines/individual_metrics.csv")

### Average

In [30]:
class AverageBaseline:
    """Pseudo-model that predicts the row-wise mean of its inputs.

    When both ``features`` and ``subset`` are given, only the columns whose
    name (taken from ``features``) appears in ``subset`` are averaged;
    otherwise every input column is used.
    """

    def __init__(self, features=None, subset=None):
        self.features = None
        self.subset = None
        self.subset_feat_ids = None

        if features is not None and subset is not None:
            self.features = features
            self.subset = subset
            # Positions, within ``features``, of the columns to average.
            self.subset_feat_ids = [
                position for position, feature in enumerate(features) if feature in subset
            ]

    def fit(self, X, y):
        """Nothing to learn; present only to satisfy the model interface."""
        return None

    def predict(self, X):
        """Return the mean over axis 1 of (the selected columns of) ``X``."""
        if self.features is None:
            print("Since no features were specified, using all input to make prediction")
        else:
            X = X[:, self.subset_feat_ids]

        y_pred = np.mean(X, axis=1)
        # One prediction per input row.
        assert len(y_pred) == X.shape[0]
        return y_pred

In [33]:
# Display the current feature list.
METRICS

['bleu1',
 'bleu2',
 'bleu3',
 'bleu4',
 'rougeL',
 'hf_rouge1',
 'hf_rouge2',
 'meteor',
 'recall',
 'precision',
 'f1_score',
 'sari_context',
 'sari_question',
 'precision_at_err1',
 'recall_at_err1',
 'tp',
 'fn',
 'fp',
 'char_edit_score',
 'word_edit_score',
 'bertscore',
 'bleurt',
 'LERC',
 'candidatelength_word',
 'candidatelength_char',
 'candidatenunique_words',
 'referencelength_word',
 'referencelength_char',
 'referencenunique_words',
 'contextlength_word',
 'contextlength_char',
 'contextnunique_words',
 'questionlength_word',
 'questionlength_char',
 'questionnunique_words']

In [35]:
# Three metrics per dataset, used as the reduced feature set of the average
# baseline. NOTE(review): presumably the best individual metrics per dataset;
# confirm against the individual-metric results.
top3_features  = {
    "cosmosqa": ["bleurt", "bertscore", "meteor"],
    "drop": ["hf_rouge1", "meteor", "f1_score"],
    "mcscript": ["bleurt", "meteor", "hf_rouge1"],
    "narrativeqa": ["bleurt", "bertscore", "meteor"],
    "quoref": ["hf_rouge1", "meteor", "bleurt"],
    "socialiqa": ["bleurt", "meteor", "precision"],
}

# Flat list of result records. `extend` (not `append`) keeps the same shape
# as the other experiments, so it can be fed directly to pd.DataFrame.
ad_avg_results = []

# All datasets experiment: average over every feature in METRICS.
print("ALL Baseline")
ad_avg_pipelines = fit(AverageBaseline, {}, METRICS, TARGET, TRAIN_DATASETS)
ad_avg_results.extend(evaluate(ad_avg_pipelines, DEV_DATASETS))

# Same baseline, restricted to each dataset's three metrics in turn.
for dataset in DATASETS:
    features = top3_features[dataset]

    ad_avg_pipelines = fit(AverageBaseline, {}, features, TARGET, TRAIN_DATASETS)
    ad_avg_results.extend(evaluate(ad_avg_pipelines, DEV_DATASETS))

ALL Baseline
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'cosmosqa': ['cosmosqa']
Loading dataset 'drop': ['drop']
Loading dataset 'mcscript': ['mcscript']
Loading dataset 'narrativeqa': ['narrativeqa']
Loading dataset 'quoref': ['quoref']
Loading dataset 'socialiqa': ['socialiqa']
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, u

Since no features were specified, using all input to make prediction
Loading dataset 'all': ['cosmosqa' 'drop' 'mcscript' 'narrativeqa' 'quoref' 'socialiqa']
Loading dataset 'cosmosqa': ['cosmosqa']
Loading dataset 'drop': ['drop']
Loading dataset 'mcscript': ['mcscript']
Loading dataset 'narrativeqa': ['narrativeqa']
Loading dataset 'quoref': ['quoref']
Loading dataset 'socialiqa': ['socialiqa']
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all input to make prediction
Since no features were specified, using all inpu

In [None]:
# TODO
# Stronger avgs
# (per dataset)
# Learn avg in training and predict that

### Linear Regression

In [None]:
# Suffix appended to the result file names written below; it records
# whether LERC is among the features.
name = "_w_lerc" if "LERC" in METRICS else ""

In [None]:
# The CSVs below go to the relative "results" folder; create it up front so
# the first write does not fail when the folder is missing.
os.makedirs("results", exist_ok=True)

# All datasets experiment: one linear regression per training table,
# evaluated on every dev table.
ad_lr_pipelines = fit(LinearRegression, {}, METRICS, TARGET, TRAIN_DATASETS)
ad_lr_results = evaluate(ad_lr_pipelines, DEV_DATASETS)
ad_lr_results = pd.DataFrame(ad_lr_results)
ad_lr_results.to_csv(f"results/ad_lr{name}.csv")

# Leave-one-out experiment: models trained on "except_<dataset>" tables,
# also evaluated on every dev table.
loo_lr_pipelines = fit(LinearRegression, {}, METRICS, TARGET, TRAIN_LOO_DATASETS)
loo_lr_results = evaluate(loo_lr_pipelines, DEV_DATASETS)
loo_lr_results = pd.DataFrame(loo_lr_results)
loo_lr_results.to_csv(f"results/loo_lr{name}.csv")

# All (baseline) LR results
lr_results = pd.concat((ad_lr_results, loo_lr_results)).reset_index(drop=True)
# lr_results.to_csv("results/lr.csv")

lr_results.head()

In [None]:
# Pearson correlation of the model trained on all datasets, per dev dataset.
trained_on_all = lr_results["trained_on"] == "all"
for dataset in DATASETS:
    evaluated_here = lr_results["evaluated_on"] == dataset
    print("all", dataset, lr_results.loc[trained_on_all & evaluated_here, "pearson"])

print("\n", "#" * 20)
print("LOO")
# Leave-one-out models: every evaluation row of the model trained without
# `dataset`.
for dataset in DATASETS:
    held_out_rows = lr_results["trained_on"] == f"except_{dataset}"
    print(dataset, ":", lr_results.loc[held_out_rows, ["evaluated_on", "pearson"]])

## L1 Regression (Lasso Regression)

In [None]:
from sklearn.linear_model import Lasso

from sklearn.model_selection import ParameterSampler
from scipy.stats import expon


def get_alpha(args):
    """Extract the ``"alpha"`` hyper-parameter from a stored hparams value.

    Parameters
    ----------
    args : dict or str
        Either the hyper-parameter mapping itself or its textual
        representation (e.g. ``"{'alpha': 0.1}"``).

    Returns
    -------
    The value stored under ``"alpha"``.

    Raises
    ------
    KeyError
        If the hyper-parameters have no ``"alpha"`` entry.
    """
    import ast

    if isinstance(args, dict):
        return args["alpha"]

    try:
        # Safe path: only Python literals are accepted.
        hparams = ast.literal_eval(args)
    except (ValueError, SyntaxError):
        # SECURITY NOTE(review): the text is not a plain literal (for example
        # it embeds a call such as ``np.float64(...)``). Falling back to
        # ``eval`` keeps the previous behaviour, but it executes arbitrary
        # code; only use it on strings produced by this notebook.
        hparams = eval(args)

    return hparams["alpha"]


def plot_metric_by_alpha(data, metric, **kwargs):
    """Draw ``metric`` against ``alpha`` in one panel per training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``"trained_on"``, ``"alpha"`` and ``metric``.
    metric : str
        Name of the column plotted on the y axis.
    **kwargs
        Forwarded to ``seaborn.lineplot`` (e.g. ``hue="evaluated_on"``).

    The panels are laid out on a grid with three columns in a newly created
    figure, which is left as the current figure so that callers can follow up
    with ``plt.savefig``. Nothing is returned.
    """
    trained_on_values = data.trained_on.unique()
    n_plots = len(trained_on_values)
    n_cols = 3

    # Ceiling division: exactly as many rows as needed. The previous
    # ``n // n_cols + n % n_cols`` over-allocated rows (e.g. 5 panels -> 3 rows).
    n_rows = -(-n_plots // n_cols)

    # Always start from a new figure. Requesting figure number 1 explicitly
    # would re-activate an already open figure and draw on top of it.
    fig = plt.figure(figsize=(10, 10), dpi=150)

    for position, trained_on in enumerate(trained_on_values, start=1):
        d = data[data["trained_on"] == trained_on]
        ax = fig.add_subplot(n_rows, n_cols, position)
        sns.lineplot(data=d, x="alpha", y=metric, ax=ax, **kwargs)
        ax.set_title(f"Trained_on={trained_on}")

    plt.tight_layout()

In [None]:
# Model selection is done with L1 (Lasso) regression because the L1 penalty
# is known to produce sparse solutions.
N_L1_MODELS = 100

# Regularisation strengths are drawn from an exponential distribution.
L1_GRID = dict(alpha=expon(loc=0, scale=0.20))
L1_PARAMS = list(
    ParameterSampler(L1_GRID, n_iter=N_L1_MODELS, random_state=81723)
)

# Histogram of the sampled strengths, saved next to the other results.
sampled_alphas = [params["alpha"] for params in L1_PARAMS]
plt.figure(figsize=(5, 3), dpi=150)
plt.hist(sampled_alphas)
plt.title("Distribution of the sampled alpha values for Lasso")
plt.savefig("results/l1_alphas_dist.png", dpi=150)
plt.show()

In [None]:
# Lasso sweep: for every sampled alpha, fit on all training tables ("AD")
# and on the leave-one-out tables ("LOO"), then collect the dev-set results.
l1_pipelines = {}   # sample index -> {"AD": pipelines, "LOO": pipelines}
ad_results = []     # one results frame per sampled alpha (AD experiment)
loo_results = []    # one results frame per sampled alpha (LOO experiment)
for i, l1_hparams in enumerate(L1_PARAMS):
    # Skip samples whose alpha is above 2.
    if l1_hparams["alpha"] > 2:
        continue

    # All datasets experiment
    ad_l1_pipelines = fit(Lasso, l1_hparams, METRICS, TARGET, TRAIN_DATASETS)
    ad_l1_results = evaluate(ad_l1_pipelines, DEV_DATASETS)
    ad_l1_results = pd.DataFrame(ad_l1_results)
    ad_l1_results["i"] = i
    ad_results.append(ad_l1_results)

    # Leave-one-out experiment: each model is evaluated only on the dataset
    # it was trained without (evaluate_loo), not on every dev table.
    loo_l1_pipelines = fit(Lasso, l1_hparams, METRICS, TARGET, TRAIN_LOO_DATASETS)
    # loo_l1_results = evaluate(loo_l1_pipelines, DEV_DATASETS)
    loo_l1_results = evaluate_loo(loo_l1_pipelines, DEV_DATASETS)
    loo_l1_results = pd.DataFrame(loo_l1_results)
    loo_l1_results["i"] = i
    loo_results.append(loo_l1_results)

    # Keep the fitted pipelines of this sample for the later coefficient
    # inspection.
    l1_pipelines[i] = {"AD": ad_l1_pipelines, "LOO": loo_l1_pipelines}
    
# Stack the per-alpha frames and expose alpha as its own column.
l1_ad_results = pd.concat(ad_results).reset_index(drop=True)
l1_ad_results["alpha"] = l1_ad_results["model_hparams"].apply(get_alpha)

l1_loo_results = pd.concat(loo_results).reset_index(drop=True)
l1_loo_results["alpha"] = l1_loo_results["model_hparams"].apply(get_alpha)

In [None]:
def get_feat_information(pipelines, results, experiment_type, metrics):
    """Annotate a results table with the features each fitted model kept.

    ``pipelines`` maps a run index to ``{experiment_type: {trained_on:
    pipeline}}``. For each pipeline, the coefficients in
    ``pipeline.model.coef_`` whose absolute value exceeds ``1e-6`` are treated
    as selected. The rows of the table matching both the run index (column
    ``"i"``) and ``trained_on`` receive three new columns:

    * ``n_features`` - number of selected coefficients;
    * ``feat_names`` - ``str`` of the selected feature names, ordered by
      decreasing absolute coefficient;
    * ``feat_importance`` - ``str`` of a ``{name: coefficient}`` dict in the
      same order, plus an ``"intercept_"`` entry.

    ``metrics`` supplies the feature name for each coefficient position.
    A modified copy is returned; ``results`` itself is left untouched.
    """
    annotated = results.copy()

    for run_id, experiments in pipelines.items():
        for trained_on, pipeline in experiments[experiment_type].items():
            coefs = pipeline.model.coef_
            magnitudes = np.abs(coefs)
            kept = magnitudes > 1e-6

            # Positions of the selected features, largest |coefficient| first.
            ranked = [ix for ix in np.argsort(magnitudes)[::-1] if kept[ix]]

            feat_names = tuple(metrics[ix] for ix in ranked)
            feat_weights = {metrics[ix]: coefs[ix] for ix in ranked}
            feat_weights["intercept_"] = pipeline.model.intercept_

            rows = (annotated["trained_on"] == trained_on) & (annotated["i"] == run_id)
            annotated.loc[rows, "n_features"] = sum(kept)
            annotated.loc[rows, "feat_names"] = str(feat_names)
            annotated.loc[rows, "feat_importance"] = str(feat_weights)

    return annotated


# Add the selected-feature columns to both result tables and save them.
l1_ad_results = get_feat_information(l1_pipelines, l1_ad_results, "AD", METRICS)
l1_ad_results.to_csv("results/l1_ad.csv")

l1_loo_results = get_feat_information(l1_pipelines, l1_loo_results, "LOO", METRICS)
l1_loo_results.to_csv("results/l1_loo.csv")

In [None]:
# MSE as a function of alpha for the all-datasets experiment, one panel per
# training set (no split by evaluation set).
plot_metric_by_alpha(l1_ad_results, "mse")
plt.savefig(f"results/l1_ad_avg_mse_by_alpha{name}.png", dpi=200)

In [None]:
# Same plot, with one line per evaluation set (hue="evaluated_on").
plot_metric_by_alpha(l1_ad_results, "mse", hue="evaluated_on")
plt.savefig(f"results/l1_ad_mse_by_alpha_discriminated_by_evaluation_set{name}.png", dpi=200)

In [None]:
# MSE by alpha for the leave-one-out experiment, one line per evaluation set.
plot_metric_by_alpha(l1_loo_results, "mse", hue="evaluated_on")
plt.savefig(f"results/l1_loo_mse_by_alpha_discriminated_by_evaluation_set{name}.png", dpi=200)

## Random Forest experiments

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# All datasets experiment: random forest with default hyper-parameters,
# fitted on every training table and evaluated on every dev table.
ad_rf_def_pipelines = fit(RandomForestRegressor, {}, METRICS, TARGET, TRAIN_DATASETS)
ad_rf_def_results = evaluate(ad_rf_def_pipelines, DEV_DATASETS)
ad_rf_def_results = pd.DataFrame(ad_rf_def_results)
ad_rf_def_results.to_csv(f"results/ad_rf{name}.csv")

# Leave-one-out experiment with the same default random forest.
loo_rf_def_pipelines = fit(RandomForestRegressor, {}, METRICS, TARGET, TRAIN_LOO_DATASETS)
loo_rf_def_results = evaluate(loo_rf_def_pipelines, DEV_DATASETS)
loo_rf_def_results = pd.DataFrame(loo_rf_def_results)
loo_rf_def_results.to_csv(f"results/loo_rf{name}.csv")

# All (default) random-forest results: AD and LOO stacked in one table.
rf_def_results = pd.concat((ad_rf_def_results, loo_rf_def_results)).reset_index(drop=True)
rf_def_results

## MLP

In [None]:
from sklearn.neural_network import MLPRegressor
# MLP with scikit-learn's default architecture, early stopping enabled.
ad_mlp_pipelines1 = fit(
    MLPRegressor,
    {"random_state": 42, "early_stopping": True},
    METRICS,
    TARGET,
    TRAIN_DATASETS,
)
ad_mlp_results1 = pd.DataFrame(evaluate(ad_mlp_pipelines1, DEV_DATASETS))

In [None]:
# MLP with three hidden layers (128, 64, 32), early stopping enabled.
ad_mlp_pipelines2 = fit(
    MLPRegressor,
    {"hidden_layer_sizes": (128, 64, 32), "random_state": 42, "early_stopping": True},
    METRICS,
    TARGET,
    TRAIN_DATASETS,
)
ad_mlp_results2 = pd.DataFrame(evaluate(ad_mlp_pipelines2, DEV_DATASETS))

In [None]:
# Stack the results of both MLP configurations and save them to one file.
ad_mlp_results = pd.concat((ad_mlp_results1, ad_mlp_results2))
ad_mlp_results.to_csv(f"results/ad_mlp_default{name}.csv")

## Few shot experiment 

We can perform this experiment in multiple ways. It builds on the leave-one-out (LOO) experiment.
We can weight the training data differently, and we can use a different number of examples from the left-out dataset.

For the first experiment, we will use all the available training data ($100\%$) and vary the number of examples taken from the left-out dataset. To ensure comparable results, we restrict our _few-shot_ examples to the ones available in the training split (that weren't used in the first place) and we evaluate on the same development set. Future experiments may consider enlarging the few-shot pool by also using examples from the dev set.


In general, we devise the following steps for a few-shot experiment:
1. create dataset $D_{PT}=(D_1, \dots, D_5)$;
2. train __model__ $m$ on $D_{PT}$;
3. assign weight $w_{PT}$ to the examples used in pre-training according to ratio $\tau$;
4. select a fraction $f$ of the examples from $D_6$;
5. assign weight $w_{FS}$ to the selected $D_6$ examples according to ratio $\tau$;
6. train __model__ $m$;
7. evaluate on the dev set of $D_6$;
8. repeat the evaluation for 20 seeds.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
# Weighting scheme proof of concept: two groups of examples of very
# different size (n1 and n2) that together hold n examples.
n1, n2 = 24_000, 1000
n = n1 + n2

# If we want the n1 examples to carry a fraction `a` of the total weight n
# (and the n2 examples the remaining 1 - a), each example needs the
# per-example weight computed below: n1 * n1_w == n * a and
# n2 * n2_w == n * (1 - a).
a = 0.2
target_n1, target_n2 = n * a, n * (1-a)
n1_w = target_n1 / n1 
n2_w = target_n2 / n2
print(n1_w, n2_w)

In [None]:
def run_few_shot_experiment(
        train_datasets,
        dataset_name,
        fewshot_datasets,
        fewshot_dataset_name,
        eval_datasets,
        fewshot_weights,
        fewshot_pct_examples,
        features,
        target,
        nruns=5,
        seed=81723,
        model_class=LinearRegression,
        model_hparams=None,
        pipeline=None,
    ):
    """Run a grid of few-shot experiments and collect results and pipelines.

    For every combination of ``fewshot_pct_examples`` x ``fewshot_weights``
    the experiment is repeated ``nruns`` times. Each repetition draws a new
    seed, samples that fraction of the few-shot data without replacement,
    builds a pipeline, calls ``load_data`` and ``fewshot_fit`` on it, and
    evaluates it on ``eval_datasets``.

    Parameters
    ----------
    train_datasets : dict
        Training tables; ``train_datasets[dataset_name]`` is used.
    dataset_name : str
        Key of the training table.
    fewshot_datasets : dict
        Tables to draw few-shot examples from;
        ``fewshot_datasets[fewshot_dataset_name]`` is used.
    fewshot_dataset_name : str
        Key of the few-shot table.
    eval_datasets : dict
        Passed unchanged to ``evaluate_multiple``.
    fewshot_weights : iterable
        Values passed as ``fewshot_weight`` to the pipeline.
    fewshot_pct_examples : iterable of float
        Fractions of the few-shot table to sample.
    features, target
        Forwarded to the pipeline constructor.
    nruns : int
        Repetitions per grid point.
    seed : int
        Seeds the generator from which the per-run seeds are drawn.
    model_class, model_hparams
        Forwarded to the pipeline constructor; ``model_hparams=None`` means
        an empty dict.
    pipeline : class, optional
        Pipeline class to instantiate; defaults to ``FewShotPipeline``.

    Returns
    -------
    (list, list)
        The result records (each extended with ``i``, ``seed``,
        ``fewshot_weight`` and ``fewshot_pct``) and the fitted pipelines.
    """
    from itertools import product

    # Resolve the defaults once, outside the loops. A fresh dict is used
    # instead of a shared mutable default argument.
    pipeline_class = FewShotPipeline if pipeline is None else pipeline
    model_hparams = {} if model_hparams is None else model_hparams

    rand = np.random.default_rng(seed)
    fewshot_data = fewshot_datasets[fewshot_dataset_name]

    all_results = []
    all_pipelines = []
    grid = product(fewshot_pct_examples, fewshot_weights)
    for i, (fewshot_pct, fewshot_weight) in enumerate(grid):
        for _ in range(nruns):
            # Per-run seed, kept in its own name so the `seed` argument is
            # not overwritten.
            run_seed = rand.integers(10**6)

            # Get subset of few shot data:
            fewshot_fraction = fewshot_data.sample(frac=fewshot_pct, replace=False, random_state=run_seed)
            print(len(fewshot_fraction))

            fs_pipeline = pipeline_class(
                fewshot_dataset=fewshot_dataset_name,
                fewshot_weight=fewshot_weight,
                model_class=model_class,
                # Copy so that one run cannot alter the hparams of another.
                model_hparams=dict(model_hparams),
                dataset=dataset_name,
                features=features,
                target=target,
                seed=run_seed,
            )

            fs_pipeline.load_data(train_datasets[dataset_name], fewshot_data=fewshot_fraction)
            fs_pipeline.fewshot_fit()
            results = fs_pipeline.evaluate_multiple(eval_datasets)

            # Tag every record with the grid point and seed that produced it.
            for r in results:
                r["i"] = i
                r["seed"] = run_seed
                r["fewshot_weight"] = fewshot_weight
                r["fewshot_pct"] = fewshot_pct

            all_results.extend(results)
            all_pipelines.append(fs_pipeline)

    return all_results, all_pipelines

### Experiments 


- [ ] GMMs
- [x] Feature engineering: logs
- [ ] Ordinal regression
- [ ] Create self-contained script to launch fewshot experiment for individual dataset and model.



In [None]:
import joblib
import os

# Output folder for the few-shot experiments.
# NOTE(review): this reassigns RESULTS_DIR, which the data-loading cell
# earlier set to a different path; cells re-run after this point will write
# here instead. Confirm that this is intended.
RESULTS_DIR = "results_20220602"
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# Grid of the few-shot sweep: 15 evenly spaced fractions from 1% to 100% of
# the few-shot table, combined with the candidate weights.
FEWSHOT_PCTS = np.linspace(0.01, 1, num=15)
FEWSHOT_WEIGHTS = [0.25, 0.5, 0.75, 0.9, 1, None]

print(len(FEWSHOT_PCTS), len(FEWSHOT_WEIGHTS))
print("Fewshot pcts:", FEWSHOT_PCTS)
print("Fewshot weights:", FEWSHOT_WEIGHTS)

In [None]:
# NOTE(review): bare literal with no effect other than being displayed;
# looks like a leftover scratch cell.
0.01

In [None]:
class Model:
    """Plain container describing the model used in a few-shot run.

    All attributes start as ``None`` and are filled in by the notebook.
    """

    def __init__(self):
        self.name = None       # short label used in output file names
        self.classpath = None  # estimator class passed as ``model_class``
        self.hparams = None    # hyper-parameters passed as ``model_hparams``
        self.pipeline = None   # pipeline class; None lets the callee pick its default
        self.nruns = None      # number of repetitions per grid point

# Choose the model for the few-shot runs. The commented-out lines are the
# alternative configurations (linear regression and random forest).
model = Model()
# model.name, model.classpath, model.hparams, model.nruns = "lr", LinearRegression, {}, 10
# model.name, model.classpath, model.hparams, model.nruns = "rf", RandomForestRegressor, {"n_jobs": 15}, 2
model.name, model.classpath, model.hparams, model.nruns, model.pipeline = "mlp", MLPRegressor, {"learning_rate": "adaptive", "random_state": 42, "early_stopping": True}, 5, FineTuningFewShotPipeline

# When True, the log-transformed metric columns are appended to the features.
USE_LOG_METRICS = False

# Feature list for the few-shot runs (re-declared here; it overrides the
# METRICS defined earlier in the notebook).
METRICS = [
    # Bleu
    'bleu1', 'bleu2', 'bleu3', 'bleu4', 
    # 'hf_bleu1', 'hf_bleu2', 'hf_bleu3', 'hf_bleu4', 
    'rougeL', 
    # 'hf_rougeL', 'hf_rougeLsum',
    'hf_rouge1', 'hf_rouge2',
    'meteor',
    'recall', 'precision', 'f1_score',
    'sari_context', 'sari_question',
    # Token overlap when 1st error occurred
    'precision_at_err1', 'recall_at_err1',
    # Confusion matrix
    'tp', 'fn', 'fp',
    # Edit scores ------
    'char_edit_score', 'word_edit_score',
    # Learned metrics -------
    'bertscore', 
    'bleurt',
    "LERC",
    # Input statistics ------
    'candidatelength_word', 'candidatelength_char',
    'candidatenunique_words', 'referencelength_word',
    'referencelength_char', 'referencenunique_words',
    'contextlength_word', 'contextlength_char',
    'contextnunique_words', 'questionlength_word',
    'questionlength_char', 'questionnunique_words',
]

# NOTE(review): LOG_METRICS and LOG_METRICS_NAMES are not defined anywhere in
# the visible notebook. The short-circuit on USE_LOG_METRICS hides this while
# the flag is False; setting it to True would presumably raise a NameError.
if USE_LOG_METRICS and len(LOG_METRICS) > 0:
    METRICS += LOG_METRICS_NAMES
    model.name += "_w_log_metrics"

# Encode the feature configuration in the model name, which is later used
# in the output file names.
if "bleurt" not in METRICS:
    model.name += '_no_bleurt'
if "bertscore" not in METRICS:
    model.name += "_no_bertscore"
if "LERC" in METRICS:
    model.name += "_w_LERC"
    
print(model.name)
DATASETS

In [None]:
# Few-shot sweep: for each dataset, start from the model trained without it
# ("except_<dataset>") and add few-shot examples drawn from that dataset's
# own training table. The commented-out loops restrict the run to a subset.
#for dataset in ["quoref", "socialiqa"]:
#for dataset in ['narrativeqa', 'quoref', 'socialiqa']:
for dataset in DATASETS:
    print("Experiment for dataset", dataset)
    loo_fewshot, loo_ps =  run_few_shot_experiment(
        train_datasets=TRAIN_LOO_DATASETS,
        dataset_name=f"except_{dataset}",
        fewshot_datasets=TRAIN_DATASETS,
        fewshot_dataset_name=dataset,
        eval_datasets=DEV_DATASETS,
        fewshot_weights=FEWSHOT_WEIGHTS,
        fewshot_pct_examples=FEWSHOT_PCTS,
        features=METRICS,
        target=TARGET,
        nruns=model.nruns,
        seed=81723,
        model_class=model.classpath,
        model_hparams=model.hparams,
        pipeline=model.pipeline,
    )

    loo_results = pd.DataFrame(loo_fewshot)
    # A weight of None shows up as missing; label it "default" instead.
    loo_results.fewshot_weight = loo_results.fewshot_weight.fillna("default")
    
    # One sub-folder per dataset, holding the results table and the
    # serialised pipelines.
    dataset_dir = f"{RESULTS_DIR}/{dataset}"
    os.makedirs(dataset_dir, exist_ok=True)
    loo_results.to_csv(f"{dataset_dir}/fewshot_loo_{model.name}_{model.nruns}.csv")
    joblib.dump(loo_ps, f"{dataset_dir}/fewshot_loo_{model.name}_{model.nruns}.pipelines")
    # Drop the references to this dataset's results and pipelines before
    # moving on to the next dataset.
    del loo_fewshot
    del loo_ps
    del loo_results