In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

DIV_CMAP = sns.diverging_palette(220, 0, as_cmap=True) # diverging colour map passed to the correlation heatmaps
sns.set_style("whitegrid")


# Directory the metrics CSV files are read from and the correlation table is written to
METRICS_DIR = "../outputs/proxy_metrics_20220426"

## Compute correlations for individual metrics in DEV set

In [None]:
SPLIT = "dev"  # split name: part of the input file name and of the heatmap titles
DATASET = "all_datasets"  # dataset component of the input file name
TARGET_COL = "human_correctness"  # default column the metrics are correlated against

# Gzipped CSV holding the metrics for the selected split and dataset
DEV_FILEPATH = f"{METRICS_DIR}/{SPLIT}_{DATASET}_metrics.csv.gz"

In [None]:
# Load the dev metrics; the first CSV column is used as the row index
dev_df = pd.read_csv(DEV_FILEPATH, index_col=0)
print("Read dataset with", len(dev_df), "examples from", DEV_FILEPATH)
print("Data file contains data for:", dev_df.dataset.unique())
# Summary statistics, shown as the cell output
dev_df.describe()

### Compute Pearson and Spearman Correlations

In [None]:
def collect_correlations(df, dataset=None, metric_cols=None, target_col=TARGET_COL):
    """Correlate every metric column with the target column.

    Args:
        df: DataFrame holding the target column, the metric columns and,
            when ``dataset`` is given, a "dataset" column.
        dataset: If not None, only rows whose "dataset" value equals it are used.
        metric_cols: Columns to correlate with the target. When None, every
            numeric column of the (possibly filtered) frame is used.
        target_col: Column each metric is correlated against.

    Returns:
        Dict mapping each metric column to a dict with the keys "pearson",
        "pearson_pval", "spearman", "spearman_pval" and "n" (the number of
        rows the statistics were computed on).
    """
    # Function-scope import: the notebook-level scipy import lives in a later cell.
    from scipy.stats import pearsonr, spearmanr

    rows = df if dataset is None else df[df["dataset"] == dataset].copy()
    columns = rows.select_dtypes("number").columns if metric_cols is None else metric_cols

    def _correlate(column):
        # Pearson and Spearman statistics of one metric against the target.
        gold, values = rows[target_col], rows[column]
        pearson_r, pearson_p = pearsonr(gold, values)
        spearman_r, spearman_p = spearmanr(gold, values)
        return {
            "pearson": pearson_r,
            "pearson_pval": pearson_p,
            "spearman": spearman_r,
            "spearman_pval": spearman_p,
            "n": len(rows),
        }

    return {column: _correlate(column) for column in columns}


def get_all_correlations(df):
    """Compute metric/target correlations overall and for each dataset.

    Args:
        df: DataFrame with a "dataset" column, accepted by ``collect_correlations``.

    Returns:
        Dict whose "all_datasets" entry holds the correlations over the whole
        frame, followed by one entry per distinct dataset name (in sorted
        order) holding the correlations restricted to that dataset.
    """
    dataset_names = sorted(df.dataset.unique())

    correlations = {"all_datasets": collect_correlations(df)}
    for name in dataset_names:
        correlations[name] = collect_correlations(df, dataset=name)

    return correlations


# Sanity check: run both helpers once on the dev set. The trailing `;` hides the
# first call's output, so only the per-dataset dictionary is displayed.
collect_correlations(dev_df);
get_all_correlations(dev_df)

In [None]:
# Correlations of every metric with the target, overall and per dataset
dev_correlations = get_all_correlations(dev_df)

# One (metric x statistic) frame per entry, in the same order as the dict keys
per_dataset_frames = [
    pd.DataFrame.from_dict(metric_stats).T
    for metric_stats in dev_correlations.values()
]

# Put the frames side by side under an outer column level named after each entry
dev_corr_dfs = pd.concat(per_dataset_frames, axis=1, keys=list(dev_correlations))
dev_corr_dfs.to_csv(f"{METRICS_DIR}/dev_individual_correlations.csv")
dev_corr_dfs

In [None]:
def plot_correlation_heatmaps(data_correlations: dict):
    """Show one annotated heatmap of correlation coefficients per dataset.

    Args:
        data_correlations: Mapping from dataset name to the per-metric
            statistics dict, in the layout returned by ``get_all_correlations``.
    """
    not_plotted = ["pearson_pval", "spearman_pval", "n"]

    for name, metric_stats in data_correlations.items():
        # Rows are metrics; p-values and the sample size are left out of the plot.
        coefficients = pd.DataFrame.from_dict(metric_stats).T.drop(columns=not_plotted)

        plt.figure(figsize=(10, 7))
        plt.title(f"{METRICS_DIR} - {SPLIT} - {name}")
        # Colour scale is pinned to the full [-1, 1] correlation range
        sns.heatmap(coefficients, vmin=-1, vmax=1, cmap=DIV_CMAP, annot=True)
        plt.show()
        

# Heatmaps of the dev-set correlations: one figure per entry of dev_correlations
plot_correlation_heatmaps(dev_correlations)

In [None]:
def plot_col_distribution(df, col, split, figsize=(5, 3), **kwargs):
    """Plot histograms of one column: first over all rows, then per dataset.

    Args:
        df: DataFrame with a "dataset" column and the column to plot.
        col: Name of the column whose distribution is shown.
        split: Split name, used only in the figure titles.
        figsize: Size of each figure.
        **kwargs: Forwarded unchanged to ``sns.histplot``.
    """
    def _draw(frame, label):
        # One figure per call, titled with the metrics dir, split and label.
        plt.figure(figsize=figsize)
        plt.title(f"{METRICS_DIR} - {split} - {label}")
        sns.histplot(data=frame, x=col, **kwargs)
        plt.show()

    _draw(df, "all_datasets")
    for name in sorted(df.dataset.unique()):
        _draw(df[df["dataset"] == name], name)


# Distribution of the target column on the dev split, overall and per dataset
plot_col_distribution(dev_df, TARGET_COL, split="dev", binrange=(0, 1), bins=20, stat="probability")

## Regression


In [None]:
# Gzipped CSV with the training-split metrics for all datasets
TRAIN_FILEPATH = f"{METRICS_DIR}/train_all_datasets_metrics.csv.gz"

# Read original training set (the first CSV column is used as the row index)
train_df = pd.read_csv(TRAIN_FILEPATH, index_col=0)
print("Read dataset with", len(train_df), "examples from", TRAIN_FILEPATH)

# All numeric columns of the training frame, in file order.
# Features are every numeric column from the third one onwards.
# NOTE(review): this positional slicing assumes the first two numeric columns are
# the human-correctness columns (presumably raw and normalized) - confirm against the CSV.
METRIC_COLS = train_df.select_dtypes("number").columns
features = list(METRIC_COLS[2:])

# The second numeric column is the regression target; per the original note this is the
# normalized human correctness - TODO confirm it is the column named by TARGET_COL.
target = METRIC_COLS[1]
print("Target:", target, "\nFeatures:", features)

# Summary statistics, shown as the cell output
train_df.describe()

In [None]:
# Distribution of the regression target on the training split, overall and per dataset
plot_col_distribution(train_df, target, split="train", binrange=(0, 1), bins=20, stat="probability")

### Fit and evaluate linear regression models

In [None]:
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics


def preprocess(data, scalers = None):
    """Standardize every feature column of a copy of ``data``.

    Args:
        data: DataFrame containing the columns listed in the notebook-level
            ``features``.
        scalers: Optional dict mapping feature name to an already fitted
            scaler. When None, a new ``StandardScaler`` is fitted per feature
            on ``data``; otherwise the given scalers are applied as they are.

    Returns:
        Tuple ``(scaled, fitted)``: the transformed copy of ``data`` and the
        dict of scalers used (the same object as ``scalers`` when one was passed).
    """
    scaled = data.copy()
    fit_new = scalers is None
    fitted = {} if fit_new else scalers

    for name in features:
        # The scaler API works on 2-D input, hence the single-column reshape
        column = scaled[name].values.reshape(-1, 1)
        if fit_new:
            scaler = StandardScaler()
            scaled[name] = scaler.fit_transform(column)
            fitted[name] = scaler
        else:
            scaled[name] = fitted[name].transform(column)

    return scaled, fitted


def fit_model(data, estimator, dataset=None, test_size=0.20, random_state=78452):
    """Fit an estimator on a train split of ``data`` and score it on the held-out rest.

    Args:
        data: DataFrame with a "dataset" column plus the columns named by the
            notebook-level ``features`` and ``target``.
        estimator: Zero-argument callable returning an object with ``fit`` and
            ``predict`` methods (e.g. an estimator class).
        dataset: If not None, only rows whose "dataset" value equals it are used.
        test_size: Fraction of rows held out for validation (forwarded to
            ``train_test_split``).
        random_state: Seed of the split (forwarded to ``train_test_split``).

    Returns:
        Tuple ``(clf, scalers, results)``: the fitted estimator, the per-feature
        scalers fitted on the training part, and a dict with the held-out
        "mse", "r2", "pearson" and "spearman" values (all plain numbers).
    """
    if dataset is not None:
        data = data[data["dataset"] == dataset]

    print("Considering dataset with", len(data), "examples, spanning datasets:", data.dataset.unique())

    # NOTE(review): stratifying on the target treats its values as class labels; this
    # presumably relies on the target taking few distinct values - confirm for new data.
    X_train, X_test, y_train, y_test = train_test_split(
        data[features],
        data[target],
        test_size=test_size,
        random_state=random_state,
        stratify=data[target],
    )
    print(X_train.shape, X_test.shape)

    # Preprocessing data (since LR may be sensitive to it): scalers are fitted on
    # the training part only and then reused for the held-out part.
    X_train_prec, scalers = preprocess(X_train)
    X_test_prec, _ = preprocess(X_test, scalers=scalers)

    # Create estimator
    clf = estimator()
    clf.fit(X_train_prec, y_train)

    # Evaluate on the held-out part
    scores = clf.predict(X_test_prec)
    results = {
        "mse": metrics.mean_squared_error(y_test, scores),
        "r2": metrics.r2_score(y_test, scores),
        "pearson": pearsonr(scores, y_test)[0],
        # Keep only the coefficient (not the p-value), matching the "pearson"
        # entry above and the values reported by eval_datasets.
        "spearman": spearmanr(scores, y_test)[0],
    }
    return clf, scalers, results


def eval_datasets(model, eval_datasets: dict, scalers: dict):
    """Score a fitted model on several evaluation frames.

    Args:
        model: Fitted object exposing ``predict``.
        eval_datasets: Mapping from a dataset key to a DataFrame containing
            the notebook-level ``features`` and ``target`` columns.
        scalers: Fitted per-feature scalers, passed on to ``preprocess``.

    Returns:
        Tuple ``(metrics_by_dataset, predictions_by_dataset)``: for each key,
        a dict with "mse", "r2", "pearson" and "spearman", and the raw
        predictions of the model respectively.
    """
    metrics_by_dataset, predictions_by_dataset = {}, {}

    for name, frame in eval_datasets.items():
        gold = frame[target]
        inputs, _ = preprocess(frame[features].copy(), scalers=scalers)

        predicted = model.predict(inputs)
        predictions_by_dataset[name] = predicted
        metrics_by_dataset[name] = {
            "mse": metrics.mean_squared_error(gold, predicted),
            "r2": metrics.r2_score(gold, predicted),
            "pearson": pearsonr(predicted, gold)[0],
            "spearman": spearmanr(predicted, gold)[0],
        }

    return metrics_by_dataset, predictions_by_dataset

# Sanity check: fit a linear regression on the "narrativeqa" rows only and
# display its held-out validation metrics.
lr, lr_scalers, valid_results = fit_model(train_df, LinearRegression, dataset="narrativeqa")
valid_results

In [None]:
from collections import defaultdict

# Names of the datasets present in the training data
unique_datasets = list(train_df.dataset.unique())

# Dev-set slices to evaluate on: the None key holds the full dev set
# (all datasets together), every other key holds the rows of one dataset.
dev_orig_datasets = {
    None: dev_df,
    **{name: dev_df[dev_df.dataset == name] for name in unique_datasets},
}

# Fit one linear regression per training subset (None -> all datasets) and
# evaluate each fitted model on every dev slice.
models, results_by_dataset = {}, {}
for dataset_name in dev_orig_datasets:
    train_label = dataset_name if dataset_name is not None else "all"
    print("Fitting model using", train_label, "datasets")

    model, model_scalers, valid_results = fit_model(train_df, LinearRegression, dataset=dataset_name)
    models[dataset_name] = model

    # The raw predictions are returned as well but only the metrics are stored
    results, scores = eval_datasets(model, dev_orig_datasets, model_scalers)
    results_by_dataset[dataset_name] = results

In [None]:
# Flatten the nested {train dataset -> {eval dataset -> metrics}} results into
# one row per (train, eval) pair; the None key is shown as "all_datasets".
table_rows = [
    {
        "train_dataset": "all_datasets" if train_key is None else train_key,
        "eval_dataset": "all_datasets" if eval_key is None else eval_key,
        **eval_metrics,
    }
    for train_key, per_eval in results_by_dataset.items()
    for eval_key, eval_metrics in per_eval.items()
]

table_results = pd.DataFrame(table_rows)
table_results

In [None]:
def plot_model_coeffs(train_dataset):
    """Bar-plot the coefficients of the model fitted on ``train_dataset``.

    Args:
        train_dataset: Key into the notebook-level ``models`` dict; None is
            labelled "all_datasets" in the figure title.
    """
    fitted = models[train_dataset]
    label = "all_datasets" if train_dataset is None else train_dataset

    plt.figure(figsize=(10, 5))
    plt.title(f"Feature importance for train dataset: {label}")
    # One horizontal bar per feature
    sns.barplot(y=features, x=fitted.coef_, orient="h")
    # Same fixed x-range on every figure
    plt.xlim(-1, 1)
    plt.show()
    

# One coefficient plot per fitted model (the None key is the model trained on all datasets)
for train_dataset in models.keys():
    plot_model_coeffs(train_dataset)
