In [1]:
from create_datasets import read_json_dataset, write_json_dataset
from dict_utils import update_examples

## Preprocess datasets for metric modelling

In [2]:
import os

# Base directory for all data paths below.
# NOTE(review): relative path, so it resolves against the kernel's working
# directory -- presumably the notebook lives two levels below the repo root.
ROOT_DIR = "../.."
# Base of the `raw_dir` arguments passed to the preprocessing functions.
DATASET_RAW_DIR = f"{ROOT_DIR}/data/raw_splits"

# Base of the `output_dir` arguments; created here if it does not exist yet.
DATASET_PREPROC_DIR = f"{ROOT_DIR}/data/preprocessing_minmaxscale"
os.makedirs(DATASET_PREPROC_DIR, exist_ok=True)


# Columns treated as model features: the preprocessing functions cast each of
# them to float, reject NaN, drop rows with +/-inf and rescale them.
FEATURES = [
    'char_edit_score',
    'word_edit_score',
    'recall', 
    'tp', 
    'fn',
    'precision',
    'fp',
    'f1_score',
    'sari_context',
    'sari_question',
    'bleu1',
    'bleu2',
    'bleu3',
    'bleu4',
    'hf_bleu1', 
    'hf_bleu2',
    'hf_bleu3',
    'hf_bleu4',
    'rougeL',
    'hf_rouge1',
    'hf_rouge2',
    'hf_rougeL',
    'hf_rougeLsum',
    'precision_at_err1',
    'recall_at_err1',
    'meteor',
    'bertscore',
    'bleurt',
    'wmd',
]

# Label column; it is cast to float and a min-max-scaled version is stored
# next to it under the name `<TARGET>_scaled`.
TARGET = 'score'

### Preprocess AD datasets

In [3]:
import pandas as pd 
import numpy as np

In [5]:
def to_dataframe(data: dict) -> pd.DataFrame:
    """Build a DataFrame from ``data`` with its outer keys as the row index.

    ``DataFrame.from_dict`` places the outer keys of ``data`` on the columns,
    so the frame is transposed before being returned.
    """
    keys_as_columns = pd.DataFrame.from_dict(data)
    return keys_as_columns.transpose()


def cast_types(df: pd.DataFrame, features: list, label: str) -> pd.DataFrame:
    """Return a copy of ``df`` whose feature and label columns are ``float``.

    Also prints the number of rows in ``df``.

    Args:
        df: frame containing at least the ``features`` and ``label`` columns.
        features: names of the feature columns (list, tuple or any iterable).
        label: name of the target column.

    Returns:
        A new DataFrame with the requested columns converted; all other
        columns are carried over unchanged and the caller's ``df`` is not
        modified.

    Raises:
        KeyError: if a requested column is not present in ``df``.
    """
    print("Loaded", len(df), "examples")

    # Unpacking accepts any iterable of names (``features + [label]`` only
    # worked for lists). ``astype`` with a mapping converts just the listed
    # columns and returns a new frame rather than writing into the argument.
    float_columns = {name: float for name in [*features, label]}
    return df.astype(float_columns)


def imputation(df: pd.DataFrame, features: list) -> pd.DataFrame:
    """Remove every row that holds +/-inf in one of ``features``.

    No value is actually imputed: a NaN in a feature trips an assertion, and
    infinite values cause the whole row to be dropped. Features are handled
    in order, so each NaN check only sees rows that survived the filtering
    of the preceding features.
    """
    infinities = [np.inf, -np.inf]

    for column in features:
        values = df[column]

        # Missing-data imputation is not supported yet, so fail loudly.
        assert not values.isna().any(), f"NaN found for feature: {column}"

        # Infinite entries are discarded for now instead of being imputed.
        df = df[~values.isin(infinities)]

    return df
    
    
def standardize(data, mean, std):
    """Z-score ``data``: subtract ``mean`` and divide by ``std``.

    Args:
        data: scalar, array or Series to transform.
        mean: value subtracted from ``data``.
        std: value ``data`` is divided by after centring.

    Returns:
        ``(data - mean) / std``. Where ``std`` is exactly zero (a constant
        feature) the divisor is taken as 1, so the result is the centred
        data instead of NaN/inf or a ``ZeroDivisionError``.
    """
    # ``std == 0`` is True (i.e. 1) exactly where the deviation is zero, so
    # adding it turns those zeros into 1 and leaves every other value
    # untouched. Works the same for scalars, arrays and Series.
    scale = std + (std == 0)
    return (data - mean) / scale


def min_max_scaling(data, min, max):
    """Linearly rescale ``data`` so that ``min`` maps to 0 and ``max`` to 1.

    Values outside ``[min, max]`` are not clipped and land outside ``[0, 1]``.

    Args:
        data: scalar, array or Series to transform.
        min: value mapped to 0.
        max: value mapped to 1.

    Returns:
        ``(data - min) / (max - min)``. When ``max == min`` (a constant
        feature) the range is taken as 1, so the result is ``data - min``
        instead of NaN/inf or a ``ZeroDivisionError``.
    """
    # The parameter names shadow the builtins but must stay as they are:
    # callers pass the bounds by keyword ("min"/"max").
    span = max - min
    # ``span == 0`` is True (i.e. 1) exactly where the range is degenerate,
    # so adding it swaps those zeros for 1 and leaves other values untouched.
    span = span + (span == 0)
    return (data - min) / span


def whitening_min_max(df: pd.DataFrame, features, params=None) -> tuple:
    """Min-max scale the ``features`` columns of a copy of ``df``.

    Args:
        df: input frame; it is copied, the caller's object is not modified.
        features: names of the columns to scale.
        params: optional ``{feature: {"max": ..., "min": ...}}`` mapping such
            as the one returned by an earlier call. When ``None``, the bounds
            are computed from ``df`` itself.

    Returns:
        A ``(scaled_df, transform_params)`` tuple: the scaled copy and the
        per-feature bounds that were applied (``params`` itself when it was
        supplied).

    Raises:
        KeyError: if ``params`` is supplied but has no entry for a feature.
    """
    df = df.copy()
    fit_on_df = params is None
    transform_params = {} if fit_on_df else params

    for feature in features:
        if fit_on_df:
            column = df[feature]
            transform_params[feature] = {
                "max": column.max(),
                "min": column.min(),
            }

        df[feature] = min_max_scaling(df[feature], **transform_params[feature])

    return df, transform_params


def whitening(df: pd.DataFrame, features, params=None) -> tuple:
    """Standardize (z-score) the ``features`` columns of a copy of ``df``.

    Args:
        df: input frame; it is copied, the caller's object is not modified.
        features: names of the columns to standardize.
        params: optional ``{feature: {"mean": ..., "std": ...}}`` mapping such
            as the one returned by an earlier call. When ``None``, the
            statistics are computed from ``df`` itself.

    Returns:
        A ``(standardized_df, transform_params)`` tuple: the transformed copy
        and the per-feature statistics that were applied (``params`` itself
        when it was supplied).

    Raises:
        KeyError: if ``params`` is supplied but has no entry for a feature.
    """
    df = df.copy()
    fit_on_df = params is None
    transform_params = {} if fit_on_df else params

    for feature in features:
        if fit_on_df:
            column = df[feature]
            transform_params[feature] = {
                "mean": column.mean(),
                "std": column.std(),
            }

        df[feature] = standardize(df[feature], **transform_params[feature])

    return df, transform_params

In [9]:
def preprocess_all_datasets_experiment(raw_dir, output_dir, features, target, whitening_fn: callable):
    """Preprocess the "train", "dev" and "test" splits found in ``raw_dir``.

    Each split is read with ``read_json_dataset``, passed through
    ``update_examples`` with an "original_filepath" value per example
    (presumably recording the source file -- confirm in ``dict_utils``),
    converted to a frame, cast to float, stripped of rows with infinite
    features, transformed by ``whitening_fn`` and given a
    ``<target>_scaled`` column (``min_max_scaling`` with bounds 1 and 5).

    The transform parameters come from the train split and are reused for
    dev and test. Every processed split, plus the parameters (under the name
    "preproc_params"), is written to ``output_dir``.

    Returns:
        The preprocessed train DataFrame.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Train split: the transform parameters are derived from this data.
    train_examples = read_json_dataset(raw_dir, "train")
    update_examples(
        train_examples.values(),
        "original_filepath",
        [f"{raw_dir}/train.json"] * len(train_examples.values()),
    )

    df_train = cast_types(to_dataframe(train_examples), features, target)
    df_train = imputation(df_train, features)
    df_train, df_params = whitening_fn(df_train, features)
    df_train[target + "_scaled"] = min_max_scaling(df_train[target], 1, 5)

    write_json_dataset(df_train.T.to_dict(), output_dir, "train")
    write_json_dataset(df_params, output_dir, "preproc_params")

    # Remaining splits: same pipeline, but with the train parameters.
    for split in ("dev", "test"):
        examples = read_json_dataset(raw_dir, split)
        update_examples(
            examples.values(),
            "original_filepath",
            [f"{raw_dir}/{split}.json"] * len(examples.values()),
        )

        df_split = to_dataframe(examples)
        print(split, len(df_split))  # size before rows are dropped
        df_split = imputation(cast_types(df_split, features, target), features)
        df_split, _ = whitening_fn(df_split, features, params=df_params)
        df_split[target + "_scaled"] = min_max_scaling(df_split[target], 1, 5)

        print(split, len(df_split))  # size after rows are dropped
        write_json_dataset(df_split.T.to_dict(), output_dir, split)

    return df_train

# Preprocess the "all_datasets" splits with min-max scaling and display
# summary statistics of the resulting train frame.
preprocess_all_datasets_experiment(
    raw_dir=f"{DATASET_RAW_DIR}/all_datasets",
    output_dir=f"{DATASET_PREPROC_DIR}/all_datasets",
    features=FEATURES,
    target=TARGET,
    whitening_fn=whitening_min_max,
).describe()

Loaded 31068 examples
dev 4007
Loaded 4007 examples
dev 3897
test 6321
Loaded 6321 examples
test 6161


Unnamed: 0,score,bleu1,bleu2,bleu3,bleu4,hf_bleu1,hf_bleu2,hf_bleu3,hf_bleu4,rougeL,...,precision_at_err1,recall_at_err1,char_edit_score,word_edit_score,sari_context,sari_question,bertscore,bleurt,wmd,score_scaled
count,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,...,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0,30274.0
mean,2.439354,0.2017022,0.09511891,0.04562428,0.0222949,0.232572,0.113584,0.056009,0.027962,0.242285,...,0.174196,0.176845,0.009375,0.014609,0.542217,0.668281,0.596959,0.888389,0.363647,0.359838
std,1.593652,0.2340288,0.1980508,0.1500492,0.1098972,0.249471,0.218263,0.168747,0.125237,0.257604,...,0.325738,0.327271,0.02088,0.025673,0.192562,0.133987,0.176048,0.033452,0.187791,0.398413
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,3.333332e-16,7.070087e-16,7.937004e-13,5.842795e-13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.003531,0.006061,0.39602,0.565121,0.478105,0.875344,0.235128,0.0
50%,2.0,0.1238397,4.229931e-09,2.554365e-11,2.659147e-11,0.163265,0.0,0.0,0.0,0.177326,...,0.0,0.0,0.005032,0.009091,0.539975,0.653872,0.567548,0.889648,0.36154,0.25
75%,4.0,0.3333333,2.236068e-08,7.937005e-08,8.6334e-09,0.397981,0.110314,0.0,0.0,0.42069,...,0.2,0.2,0.00715,0.012121,0.676439,0.763062,0.701062,0.904256,0.489778,0.75
max,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### LOOV experiment

In [10]:
def preprocess_loov(raw_dir, output_dir, features, target, train_filename="train", splits=("dev", "test"), whitening_fn: callable=whitening):
    """Preprocess one training file and the splits evaluated against it.

    The file named ``train_filename`` is read from ``raw_dir``, passed
    through ``update_examples`` with an "original_filepath" value per example
    (presumably recording the source file -- confirm in ``dict_utils``),
    cast to float, stripped of rows with infinite features, transformed by
    ``whitening_fn`` and given a ``<target>_scaled`` column
    (``min_max_scaling`` with bounds 1 and 5). Each name in ``splits`` goes
    through the same steps using the parameters obtained from the training
    file.

    All processed files keep their name in ``output_dir``; the parameters
    are written as ``<train_filename>_preproc_params``.

    Returns:
        The preprocessed training DataFrame.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Training file: the transform parameters are derived from this data.
    train_examples = read_json_dataset(raw_dir, train_filename)
    update_examples(
        train_examples.values(),
        "original_filepath",
        [f"{raw_dir}/{train_filename}.json"] * len(train_examples.values()),
    )

    df_train = cast_types(to_dataframe(train_examples), features, target)
    df_train = imputation(df_train, features)
    df_train, df_params = whitening_fn(df_train, features)
    df_train[target + "_scaled"] = min_max_scaling(df_train[target], 1, 5)

    write_json_dataset(df_train.T.to_dict(), output_dir, train_filename)
    write_json_dataset(df_params, output_dir, f"{train_filename}_preproc_params")

    # Other splits: same pipeline, but with the training parameters.
    for split in splits:
        examples = read_json_dataset(raw_dir, split)
        update_examples(
            examples.values(),
            "original_filepath",
            [f"{raw_dir}/{split}.json"] * len(examples.values()),
        )

        df_split = to_dataframe(examples)
        print("Before imputation", split, len(df_split))
        df_split = imputation(cast_types(df_split, features, target), features)
        df_split, _ = whitening_fn(df_split, features, params=df_params)
        print("After imputation", split, len(df_split))
        df_split[target + "_scaled"] = min_max_scaling(df_split[target], 1, 5)

        write_json_dataset(df_split.T.to_dict(), output_dir, split)

    return df_train


In [11]:
# For every held-out dataset: the training file is "except_{dataset}_train",
# and the splits processed with its parameters are the matching
# "except_{dataset}_dev" / "except_{dataset}_test" plus the held-out
# "{dataset}_test" used for evaluation.
for dataset in ("cosmosqa", "drop", "mcscript", "narrativeqa", "quoref", "socialiqa"):
    preprocess_loov(
        raw_dir=f"{DATASET_RAW_DIR}/loov_datasets",
        output_dir=f"{DATASET_PREPROC_DIR}/loov_datasets",
        features=FEATURES,
        target=TARGET,
        train_filename=f"except_{dataset}_train",
        splits=(
            *(f"except_{dataset}_{part}" for part in ("dev", "test")),
            f"{dataset}_test",
        ),
        whitening_fn=whitening_min_max,
    )

Loaded 26035 examples
Before imputation except_cosmosqa_dev 3324
Loaded 3324 examples
After imputation except_cosmosqa_dev 3215
Before imputation except_cosmosqa_test 5304
Loaded 5304 examples
After imputation except_cosmosqa_test 5145
Before imputation cosmosqa_test 1017
Loaded 1017 examples
After imputation cosmosqa_test 1016
Loaded 30381 examples
Before imputation except_drop_dev 3910
Loaded 3910 examples
After imputation except_drop_dev 3801
Before imputation except_drop_test 6169
Loaded 6169 examples
After imputation except_drop_test 6009
Before imputation drop_test 152
Loaded 152 examples
After imputation drop_test 152
Loaded 23858 examples
Before imputation except_mcscript_dev 3029
Loaded 3029 examples
After imputation except_mcscript_dev 3024
Before imputation except_mcscript_test 4912
Loaded 4912 examples
After imputation except_mcscript_test 4896
Before imputation mcscript_test 1409
Loaded 1409 examples
After imputation mcscript_test 1265
Loaded 23598 examples
Before imputati