In [1]:
from create_datasets import read_json_dataset, write_json_dataset
from dict_utils import update_examples

## Preprocess datasets for metric modelling

In [6]:
import os

# Repository root, expressed relative to the directory the notebook runs from.
ROOT_DIR = "../.."
# Directory the raw dataset splits are read from.
DATASET_RAW_DIR = f"{ROOT_DIR}/data/raw_splits"

# Directory the preprocessed datasets are written to; created here if missing.
DATASET_PREPROC_DIR = f"{ROOT_DIR}/data/preprocessing"
os.makedirs(DATASET_PREPROC_DIR, exist_ok=True)


# Names of the per-example columns used as model inputs. Every column listed
# here is cast to float, checked for NaN/inf and standardized by the
# preprocessing functions defined further down in this notebook.
FEATURES = [
    'char_edit_score',
    'word_edit_score',
    'recall', 
    'tp', 
    'fn',
    'precision',
    'fp',
    'f1_score',
    'sari_context',
    'sari_question',
    'bleu1',
    'bleu2',
    'bleu3',
    'bleu4',
    'hf_bleu1', 
    'hf_bleu2',
    'hf_bleu3',
    'hf_bleu4',
    'rougeL',
    'hf_rouge1',
    'hf_rouge2',
    'hf_rougeL',
    'hf_rougeLsum',
    'precision_at_err1',
    'recall_at_err1',
    'meteor',
    'bertscore',
    'bleurt',
    'wmd',
]

# Name of the label column; it is cast to float and additionally min-max
# scaled into a "<TARGET>_scaled" column by the preprocessing functions.
TARGET = 'score'

### Preprocess the AD (all datasets) experiment

In [7]:
import pandas as pd 
import numpy as np

In [8]:
def to_dataframe(data: dict) -> pd.DataFrame:
    """Turn a mapping of ``{row_key: {column: value}}`` into a DataFrame.

    The outer keys become the index (one row each) and the inner keys become
    the columns.
    """
    # from_dict puts the outer keys on the columns; flip so they index the rows.
    keyed_by_column = pd.DataFrame.from_dict(data)
    return keyed_by_column.transpose()


def cast_types(df: pd.DataFrame, features: list, label: str) -> pd.DataFrame:
    """Convert every feature column and the label column to float.

    Prints the number of rows received. The conversion is written back into
    ``df`` itself, and that same object is returned.

    Args:
        df: frame containing all ``features`` columns plus ``label``.
        features: list of feature column names.
        label: name of the label column.

    Returns:
        ``df``, with the listed columns now holding floats.
    """
    print("Loaded", len(df), "examples")

    numeric_columns = features + [label]
    for column in numeric_columns:
        as_float = df[column].astype(float)
        df[column] = as_float

    return df


def imputation(df: pd.DataFrame, features: list) -> pd.DataFrame:
    """Remove rows whose feature values are infinite; NaN is rejected outright.

    Despite the name, nothing is filled in: features are visited in order and,
    for each one, the rows holding +inf or -inf are filtered out of the frame
    carried into the next iteration.

    Raises:
        AssertionError: if a feature column (of the rows still kept at that
            point) contains NaN, since missing-value imputation is unsupported.
    """
    for feature in features:
        # Missing values are not handled yet, so fail loudly instead.
        contains_nan = df[feature].isna().any()
        assert not contains_nan, f"NaN found for feature: {feature}"

        # Keep only the rows where this feature is neither +inf nor -inf.
        is_finite = df[feature].abs() != np.inf
        df = df[is_finite]

    return df
    
    
def standardize(data, mean, std):
    """Centre ``data`` on ``mean`` and express the result in units of ``std``."""
    centered = data - mean
    return centered / std


def min_max_scaling(data, min, max):
    """Linearly rescale ``data`` so that ``min`` maps to 0 and ``max`` maps to 1."""
    offset = data - min
    span = max - min
    return offset / span


def whitening(df: pd.DataFrame, features, params=None) -> tuple:
    """Standardize the given feature columns on a copy of ``df``.

    When ``params`` is None the per-feature mean and standard deviation are
    computed from ``df`` (fit + transform). Otherwise the statistics in
    ``params`` are applied as-is (transform only), so that other splits can
    reuse the statistics fitted on the training split.

    Args:
        df: input frame; it is copied, the caller's frame is not modified.
        features: iterable of column names to standardize.
        params: optional ``{feature: {"mean": ..., "std": ...}}`` mapping, as
            returned by a previous call. It is only read, never modified.

    Returns:
        A ``(frame, params)`` tuple: the standardized copy and the statistics
        that were used (freshly computed, or the ``params`` object passed in).

    Raises:
        KeyError: if ``params`` is given but lacks an entry for a feature.
    """
    df = df.copy()
    fit = params is None
    transform_params = {} if fit else params

    for feature in features:
        if fit:
            transform_params[feature] = {
                "mean": df[feature].mean(),
                "std": df[feature].std(),
            }

        stats = transform_params[feature]
        # A constant feature has std == 0; dividing by it would turn the whole
        # column into NaN/inf. Divide by 1 instead so the column is centred
        # only. The recorded std is left untouched so the stored params still
        # report the real statistic.
        scale = stats["std"] if stats["std"] != 0 else 1.0
        df[feature] = standardize(df[feature], mean=stats["mean"], std=scale)

    return df, transform_params

In [9]:
def preprocess_all_datasets_experiment(raw_dir, output_dir, features, target, target_min=1, target_max=5):
    """Preprocess the "train", "dev" and "test" splits found in ``raw_dir``.

    Each split is loaded, tagged with an "original_filepath" value, converted
    to a DataFrame, cast to float, stripped of rows with infinite features and
    standardized. Standardization statistics are computed on "train" only and
    reused for "dev" and "test". A ``<target>_scaled`` column holding the
    min-max scaled target is added to every split.

    Outputs are written through ``write_json_dataset`` under ``output_dir``
    with the names "train", "preproc_params", "dev" and "test".

    NOTE(review): ``read_json_dataset``, ``write_json_dataset`` and
    ``update_examples`` are imported project helpers; this function assumes
    the reader returns a dict of examples and that ``update_examples`` sets
    the given key on each example - confirm against their definitions.

    Args:
        raw_dir: directory holding the raw splits.
        output_dir: directory passed to ``write_json_dataset``.
        features: list of feature column names.
        target: name of the label column.
        target_min: lowest possible target value, mapped to 0 (default 1).
        target_max: highest possible target value, mapped to 1 (default 5).

    Returns:
        The preprocessed training DataFrame.
    """
    def load_frame(name):
        # Read one split and record which file every example came from.
        examples = read_json_dataset(raw_dir, name)
        rows = examples.values()
        update_examples(rows, "original_filepath", [f"{raw_dir}/{name}.json"] * len(rows))
        return to_dataframe(examples)

    def clean(frame, params=None):
        # Cast -> drop infinite rows -> standardize (fitting when params is None).
        frame = cast_types(frame, features, target)
        frame = imputation(frame, features)
        return whitening(frame, features, params=params)

    df_train, df_params = clean(load_frame("train"))
    df_train[target + "_scaled"] = min_max_scaling(df_train[target], target_min, target_max)

    write_json_dataset(df_train.T.to_dict(), output_dir, "train")
    write_json_dataset(df_params, output_dir, "preproc_params")

    for split in ("dev", "test"):
        df = load_frame(split)
        print(split, len(df))  # row count before cleaning

        # Reuse the training statistics so no information leaks from this split.
        df, _ = clean(df, params=df_params)
        df[target + "_scaled"] = min_max_scaling(df[target], target_min, target_max)

        print(split, len(df))  # row count after infinite rows were dropped
        write_json_dataset(df.T.to_dict(), output_dir, split)

    return df_train

# Run the all-datasets preprocessing and summarise the processed training
# frame it returns; the bare trailing expression is what the cell displays.
preprocess_all_datasets_experiment(
    raw_dir=DATASET_RAW_DIR + "/all_datasets",
    output_dir=DATASET_PREPROC_DIR + "/all_datasets",
    features=FEATURES,
    target=TARGET,
).describe()

Loaded 31068 examples
dev 4007
Loaded 4007 examples
dev 3897
test 6321
Loaded 6321 examples
test 6161


Unnamed: 0,score,bleu1,bleu2,bleu3,bleu4,hf_bleu1,hf_bleu2,hf_bleu3,hf_bleu4,rougeL,...,precision_at_err1,recall_at_err1,char_edit_score,word_edit_score,sari_context,sari_question,bertscore,bleurt,wmd,score_scaled
count,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,...,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0
mean,2.439354,-4.13079e-17,1.595987e-17,0.0,-7.510526e-18,9.388158e-19,-2.5348030000000002e-17,2.816447e-18,-5.6328950000000004e-18,-4.694079e-18,...,-2.065395e-17,-2.910329e-17,6.102303e-18,-2.7695070000000002e-17,-6.792332e-16,-3.92425e-16,-5.149405e-16,1.288055e-15,-1.0326970000000001e-17,0.359838
std,1.593652,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.398413
min,1.0,-0.8618692,-0.4802753,-0.304062,-0.2028705,-0.9322605,-0.5203978,-0.3319088,-0.2232683,-0.9405319,...,-0.5347742,-0.5403622,-0.4489977,-0.5690573,-2.815802,-4.987661,-3.390892,-26.55677,-1.936443,0.0
25%,1.0,-0.8618692,-0.4802753,-0.304062,-0.2028705,-0.9322605,-0.5203978,-0.3319088,-0.2232683,-0.9405319,...,-0.5347742,-0.5403622,-0.2798836,-0.3329889,-0.7592215,-0.7699267,-0.6751251,-0.3899368,-0.6843684,0.0
50%,2.0,-0.3327049,-0.4802753,-0.304062,-0.2028705,-0.2778152,-0.5203978,-0.3319088,-0.2232683,-0.2521664,...,-0.5347742,-0.5403622,-0.2080101,-0.2149547,-0.01164469,-0.107542,-0.1670626,0.0376428,-0.0112208,0.25
75%,4.0,0.5624568,-0.4802752,-0.304062,-0.2028705,0.6630428,-0.01497835,-0.3319088,-0.2232683,0.6925558,...,0.07921652,0.07075278,-0.1065416,-0.09692046,0.6970322,0.7073919,0.5913318,0.4743118,0.6716581,0.75
max,5.0,3.411109,4.568934,6.36042,8.896545,3.076227,4.061226,5.594109,7.761579,2.941398,...,2.535179,2.515213,47.44412,38.38223,2.377325,2.475758,2.289381,3.336418,3.388624,1.0


### LOOV experiment

In [10]:
def preprocess_loov(raw_dir, output_dir, features, target, train_filename="train", splits=("dev", "test"), target_min=1, target_max=5):
    """Preprocess one training file and a set of splits for a LOOV run.

    The training file is loaded, tagged with an "original_filepath" value,
    converted to a DataFrame, cast to float, stripped of rows with infinite
    features and standardized. The standardization statistics computed on it
    are then applied unchanged to every entry in ``splits``. A
    ``<target>_scaled`` column holding the min-max scaled target is added to
    every frame.

    Outputs are written through ``write_json_dataset`` under ``output_dir``:
    the training data as ``train_filename``, the statistics as
    ``"{train_filename}_preproc_params"`` and each split under its own name.

    NOTE(review): ``read_json_dataset``, ``write_json_dataset`` and
    ``update_examples`` are imported project helpers; this function assumes
    the reader returns a dict of examples and that ``update_examples`` sets
    the given key on each example - confirm against their definitions.

    Args:
        raw_dir: directory holding the raw files.
        output_dir: directory passed to ``write_json_dataset``.
        features: list of feature column names.
        target: name of the label column.
        train_filename: name of the file the statistics are fitted on.
        splits: names of the files transformed with those statistics.
        target_min: lowest possible target value, mapped to 0 (default 1).
        target_max: highest possible target value, mapped to 1 (default 5).

    Returns:
        The preprocessed training DataFrame.
    """
    def load_frame(name):
        # Read one file and record which file every example came from.
        examples = read_json_dataset(raw_dir, name)
        rows = examples.values()
        update_examples(rows, "original_filepath", [f"{raw_dir}/{name}.json"] * len(rows))
        return to_dataframe(examples)

    def clean(frame, params=None):
        # Cast -> drop infinite rows -> standardize (fitting when params is None).
        frame = cast_types(frame, features, target)
        frame = imputation(frame, features)
        return whitening(frame, features, params=params)

    df_train, df_params = clean(load_frame(train_filename))
    df_train[target + "_scaled"] = min_max_scaling(df_train[target], target_min, target_max)

    write_json_dataset(df_train.T.to_dict(), output_dir, train_filename)
    write_json_dataset(df_params, output_dir, f"{train_filename}_preproc_params")

    for split in splits:
        df = load_frame(split)
        print("Before imputation", split, len(df))

        # Reuse the training statistics so no information leaks from this split.
        df, _ = clean(df, params=df_params)
        print("After imputation", split, len(df))
        df[target + "_scaled"] = min_max_scaling(df[target], target_min, target_max)

        write_json_dataset(df.T.to_dict(), output_dir, split)

    return df_train


In [11]:
# One LOOV preprocessing run per dataset:
#   - statistics are fitted on "except_{dataset}_train";
#   - they are then applied to "except_{dataset}_dev", "except_{dataset}_test"
#     and to "{dataset}_test", the split used to evaluate the loov experiment.
for dataset in ("cosmosqa", "drop", "mcscript", "narrativeqa", "quoref", "socialiqa"):
    preprocess_loov(
        DATASET_RAW_DIR + "/loov_datasets",
        DATASET_PREPROC_DIR + "/loov_datasets",
        FEATURES,
        TARGET,
        train_filename=f"except_{dataset}_train",
        splits=(
            f"except_{dataset}_dev",
            f"except_{dataset}_test",
            f"{dataset}_test",
        ),
    )

Loaded 26035 examples
Before imputation except_cosmosqa_dev 3324
Loaded 3324 examples
After imputation except_cosmosqa_dev 3215
Before imputation except_cosmosqa_test 5304
Loaded 5304 examples
After imputation except_cosmosqa_test 5145
Before imputation cosmosqa_test 1017
Loaded 1017 examples
After imputation cosmosqa_test 1016
Loaded 30381 examples
Before imputation except_drop_dev 3910
Loaded 3910 examples
After imputation except_drop_dev 3801
Before imputation except_drop_test 6169
Loaded 6169 examples
After imputation except_drop_test 6009
Before imputation drop_test 152
Loaded 152 examples
After imputation drop_test 152
Loaded 23858 examples
Before imputation except_mcscript_dev 3029
Loaded 3029 examples
After imputation except_mcscript_dev 3024
Before imputation except_mcscript_test 4912
Loaded 4912 examples
After imputation except_mcscript_test 4896
Before imputation mcscript_test 1409
Loaded 1409 examples
After imputation mcscript_test 1265
Loaded 23598 examples
Before imputati