## Regression

Train and evaluate the baselines. Baselines are: 

- Linear Regression
- L1 regression (Lasso Regression)
- Decision Tree 
- Random Forests 

For every baseline above (except L1), we'll also consider a PCA version where we first reduce the dimensionality. 

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

METRICS_DIR = "../outputs/metrics/proxy_metrics_20220426"

# ---------------------------------------------------------------
# DATA LOADING
# ---------------------------------------------------------------
TRAIN_FILEPATH = f"{METRICS_DIR}/train_all_datasets_metrics.csv.gz"
DEV_FILEPATH = f"{METRICS_DIR}/dev_all_datasets_metrics.csv.gz"

train_df = pd.read_csv(TRAIN_FILEPATH, index_col=0)
print(f"Read dataset with {len(train_df)} examples from {TRAIN_FILEPATH}")

dev_df = pd.read_csv(DEV_FILEPATH, index_col=0)
print(f"Read (holdout) dataset with {len(dev_df)} examples from {DEV_FILEPATH}")

# ---------------------------------------------------------------
# FEATURES and TARGET 
# ---------------------------------------------------------------
METRIC_COLS = train_df.select_dtypes("number").columns
features = list(METRIC_COLS[2:])

# Target column will be the normalized human correctness
target = METRIC_COLS[1]
print("Features:\t", features, "\n\nTarget:\t", target)
      
DATASET_NAMES = sorted(train_df["dataset"].unique())

TRAIN_DATASETS = {"all": train_df}
TRAIN_DATASETS.update({d: train_df[train_df["dataset"] == d] for d in DATASET_NAMES})

EVAL_DATASETS = {"all": dev_df}
EVAL_DATASETS.update({d: dev_df[dev_df["dataset"] == d] for d in DATASET_NAMES})

Read dataset with 31069 examples from ../outputs/metrics/proxy_metrics_20220426/train_all_datasets_metrics.csv.gz
Read (holdout) dataset with 4009 examples from ../outputs/metrics/proxy_metrics_20220426/dev_all_datasets_metrics.csv.gz
Features:	 ['exact_match', 'meteor', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'bleurt', 'bert-score', 'bleu1', 'bleu2', 'bleu3', 'bleu4', 'bleu-precision0', 'bleu-precision1', 'bleu-precision2', 'bleu-precision3', 'precision', 'recall', 'f1_score', 'csi', 'num_edits', 'edit_score'] 

Target:	 human_correctness


TODO 
- [ ] Compute Baselines (Avg metric)
- [ ] Compute Linear Regression 
- [ ] L1 regression
- [ ] Decision Tree
- [ ] Random Forest
- 

In [86]:
class NoPreprocessing:
    def fit(self, *args, **kwargs): 
        pass
    
    def transform(self, X, *args, **kwargs): 
        return X
    
    def fit_transform(self, X, *args, **kwargs): 
        return X
    

class Pipeline:
    """"""
    
    def __init__(self, model_class, model_hparams, dataset, features, target, seed=81263):
        self.model_class = model_class
        self.model_hparams = model_hparams
        self.dataset = dataset
        self.features = features
        self.target = target
        self.seed = seed
        
    def load_data(self, data):
        """"""
        print(f"Loading dataset '{self.dataset}'")
        if self.dataset == "all":
            data = data.copy()
        else:
            data = data[data["dataset"] == self.dataset].copy()

        self.X_train = data[self.features]
        self.y_train = data[self.target]
        
    def split(self, holdout_fraction=0.2):
        """"""
        print(f"Splitting dataset holdout_fraction={holdout_fraction}")
        from sklearn.model_selection import train_test_split
        
        X_train, X_test, y_train, y_test = train_test_split(
            self.X_train, self.y_train,
            test_size=holdout_fraction, 
            random_state=self.seed, 
            stratify=self.y_train,
        )
            
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test        
        
    def preprocess(self, with_std=True, with_pca=False, **kwargs):
        """"""
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import make_pipeline
        from sklearn.decomposition import PCA
        
        operations = []
        
        if with_std:
            print("Using StandardScaler")
            operations.append(StandardScaler())
        if with_pca:
            print("Using PCA")
            operations.append(PCA(random_state=self.seed, **kwargs))
        
        self.preproc_fn = make_pipeline(*operations) \
            if len(operations) > 0 else NoPreprocessing()
        
        self.preproc_fn.fit(self.X_train)
        self.X_train = self.preproc_fn.transform(self.X_train)
        
        if getattr(self, "X_test", None) is not None:
            self.X_test = self.preproc_fn.transform(self.X_test)
        
    def fit(self):
        """"""
        self.model = self.model_class(**self.model_hparams)
        self.model.fit(self.X_train, self.y_train)
        
    def evaluate(self, eval_dataset=None):
        """"""
        import sklearn.metrics as m
        import scipy.stats as st
        
        if eval_dataset is None:
            print("Evaluating holdout dev set.")
            X_test, y_test = self.X_test, self.y_test
        else:
            X_test = eval_dataset[self.features]
            y_test = eval_dataset[self.target]    
            X_test = self.preproc_fn.transform(X_test)
        
        # Evaluation
        scores = self.model.predict(X_test)
        
        return {
            "mse": m.mean_squared_error(y_pred=scores, y_true=y_test),
            "r2": m.r2_score(y_pred=scores, y_true=y_test),
            "pearson": st.pearsonr(scores, y_test)[0],
            "spearman": st.spearmanr(scores, y_test)[0],
            "trained_on": self.dataset,
        }
        
    def evaluate_multiple(self, eval_datasets: dict):
        all_results = []
        
        for name, eval_dataset in eval_datasets.items():
            eval_result = self.evaluate(eval_dataset)
            eval_result["evaluated_on"] = name
            all_results.append(eval_result)
            
        return all_results


from functools import partial

# Abstract features and targets
general_pipeline = partial(Pipeline, features=features, target=target)

In [95]:
def run_model_selection(model_class, model_hyperparams={}):
    pipelines = {}
    all_results = []
    for train_name, train_data in TRAIN_DATASETS.items(): 
        p = general_pipeline(model_class, model_hyperparams, dataset=train_name)

        p.load_data(train_data)
        p.split(holdout_fraction=0.2)
        p.preprocess(with_std=True)
        p.fit()
        
        result = p.evaluate()
        result["model_class"] = model_class.__name__
        result["model_hyperparams"] = str(model_hyperparams)
        
        pipelines["dataset"] = p
        all_results.append(result)

    return pd.DataFrame(all_results), pipelines


def run_eval(model_class, model_hyperparams={}):
                           
    all_results = []
    for train_name, train_data in TRAIN_DATASETS.items(): 
        p = general_pipeline(model_class, model_hyperparams, dataset=train_name)

        p.load_data(train_data)
        # p.split(holdout_fraction=0.2)
        # ^Note: We want to train with the whole training data when evaluating
        p.preprocess(with_std=True)
        p.fit()
        all_results.append(p.evaluate_multiple(EVAL_DATASETS))

    return pd.DataFrame(all_results)

In [96]:
# Linear regression
from sklearn.linear_model import LinearRegression
run_model_selection(LinearRegression)[0]

Loading dataset 'all'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.
Loading dataset 'cosmosqa'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.
Loading dataset 'drop'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.
Loading dataset 'mcscript'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.
Loading dataset 'narrativeqa'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.
Loading dataset 'quoref'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.
Loading dataset 'socialiqa'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.


Unnamed: 0,mse,r2,pearson,spearman,trained_on,model_class,model_hyperparams
0,0.093295,0.414422,0.643876,0.641206,all,LinearRegression,{}
1,0.050313,0.678596,0.82392,0.761769,cosmosqa,LinearRegression,{}
2,0.072685,0.432717,0.664384,0.67496,drop,LinearRegression,{}
3,0.110449,0.356385,0.597036,0.594506,mcscript,LinearRegression,{}
4,0.108332,0.373252,0.611243,0.612935,narrativeqa,LinearRegression,{}
5,0.042555,0.534231,0.732197,0.721967,quoref,LinearRegression,{}
6,0.070787,0.536266,0.732457,0.721446,socialiqa,LinearRegression,{}


In [99]:
from sklearn.linear_model import LinearRegression, Lasso

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


results = []

# -------------------------------------------
# Linear regression
# -------------------------------------------
rs = run_model_selection(LinearRegression)[0]
results.append(rs)



# -------------------------------------------
# Lasso regression
# -------------------------------------------
alpha_configs = {"alpha": }
rs = run_model_selection(Lasso)[0]
results.append(rs)



SyntaxError: invalid syntax (2979854791.py, line 20)

In [106]:
from sklearn.model_selection import ParameterSampler
from scipy.stats import lognorm

param_grid = {'a':[1, 2], 'b': lognorm(loc=0, scale=1)}
param_list = list(ParameterSampler(param_grid, n_iter=4, random_state=1234))

TypeError: _parse_args() missing 1 required positional argument: 's'

In [90]:
from sklearn.linear_model import Lasso

p = general_pipeline(Lasso, {"alpha": 1}, dataset=train_name)

p.load_data(train_data)
p.split(holdout_fraction=0.2)
p.preprocess(with_std=True)
p.fit()
p.evaluate()


Loading dataset 'socialiqa'
Splitting dataset holdout_fraction=0.2
Using StandardScaler
Evaluating holdout dev set.




{'mse': 0.15264512887306073,
 'r2': -4.773249147049796e-07,
 'pearson': nan,
 'spearman': nan,
 'trained_on': 'socialiqa'}

In [91]:
p.model.coef_

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0., -0., -0.])

In [70]:
features[6]

'bleurt'

In [65]:
Lasso(alpha=0)

Lasso(alpha=0)

In [43]:
pd.crosstab()

AttributeError: 'DataFrameGroupBy' object has no attribute 'unstack'

In [40]:
all_results.pivot(index="trained_on", columns="evaluated_on", values=["mse"])

Unnamed: 0_level_0,mse,mse,mse,mse,mse,mse,mse
evaluated_on,all,cosmosqa,drop,mcscript,narrativeqa,quoref,socialiqa
trained_on,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
all,0.068703,0.052023,0.101517,0.094995,0.076208,0.043676,0.053389
cosmosqa,0.094798,0.037583,0.191385,0.131905,0.115565,0.125797,0.059669
drop,0.096497,0.071429,0.070072,0.131983,0.115645,0.034393,0.085979
mcscript,0.08336,0.066408,0.16975,0.089846,0.08121,0.133941,0.06504
narrativeqa,0.081593,0.077619,0.17743,0.093936,0.074912,0.099072,0.063187
quoref,0.090523,0.062699,0.076701,0.136007,0.106423,0.025276,0.074943
socialiqa,0.081158,0.043246,0.195842,0.108009,0.089985,0.117177,0.04995


In [11]:
p.X_test.shape

(75, 22)

In [6]:
all_results

[{'mse': 0.06866390421449335,
  'r2': 0.5108459457282504,
  'pearson': 0.7160323651395801,
  'spearman': 0.6973840280825205,
  'trained_on': 'all',
  'evaluated_on': 'all'},
 {'mse': 0.051919156141483284,
  'r2': 0.6544272576304422,
  'pearson': 0.8496092613008309,
  'spearman': 0.8070434965692682,
  'trained_on': 'all',
  'evaluated_on': 'cosmosqa'},
 {'mse': 0.10178139070316412,
  'r2': 0.24327344336532886,
  'pearson': 0.6433292806727148,
  'spearman': 0.6417500276605237,
  'trained_on': 'all',
  'evaluated_on': 'drop'},
 {'mse': 0.09478800133442038,
  'r2': 0.3524520806796013,
  'pearson': 0.6098660630245658,
  'spearman': 0.5850000860484836,
  'trained_on': 'all',
  'evaluated_on': 'mcscript'},
 {'mse': 0.07609741456403131,
  'r2': 0.49226871096094127,
  'pearson': 0.7101910798185168,
  'spearman': 0.7233652532110437,
  'trained_on': 'all',
  'evaluated_on': 'narrativeqa'},
 {'mse': 0.0440114168748348,
  'r2': 0.4116223570589844,
  'pearson': 0.7301193495597244,
  'spearman': 0.69

In [None]:
DATASET_NAMES = train_df.dataset.unique()
DATASET_NAMES

In [None]:
from sklearn.linear_model import LinearRegression

lr_pipeline = Pipeline(LinearRegression, {}, "drop", features, target, seed=1295532)
lr_pipeline.load_data(train_df)
lr_pipeline.split(holdout_fraction=0.2)
lr_pipeline.preprocess(with_std=True)
lr_pipeline.fit()
lr_pipeline.evaluate()
pd.DataFrame(lr_pipeline.evaluate_multiple({k: dev_df[dev_df.dataset == k] for k in DATASET_NAMES}))

In [None]:
features

###  Average Baseline

In [None]:
class AvgBaseline:
    def fit(self, X, y):
        pass
    
    def predict(self, X):
        y_pred = np.mean(X, axis=1)
        assert len(y_pred) == X.shape[0]
        return y_pred

In [None]:
AvgBaseline().predict(dev_df[features])

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

import sklearn.metrics as metrics
import scipy.stats as st

import os
import joblib


EXPERIMENT_DIR = "../outputs/experiment"
print("Persisting experiment results at", EXPERIMENT_DIR)
os.makedirs(EXPERIMENT_DIR, exist_ok=True)

### Fit models

In [None]:
train_df[train_df.dataset == "drop"]

In [None]:
dataset_name = "drop"
dataset = TRAIN_DATASETS[dataset_name]

# preprocess dataset
data = train_test_split(dataset[features], dataset[target], preprocessor, test_fraction=0.2, seed=1295532)

# fit model
model, model_results = fit_model(AvgBaseline, data)

# evaluate in validation
model_results


In [None]:
for dataset_name, dataset in TRAIN_DATASETS.items():
    # Dataset preprocessor
    preprocessor = scaler()
    
    data = train_test_split(dataset[features], dataset[target], preprocessor, test_fraction=0.2, seed=SEED)
    model, model_results = fit_model(AvgBaseline, data, features=features[1:])
    model_results["dataset"] = dataset_name
    model_results["seed"] = SEED
    model_results["test_fraction"] = 0.2
    model_results["preprocessing"] = "StandardScaler"
    results.append(model_results)

In [None]:
def fit_model(model_class, data: tuple, **kwargs) -> tuple:
    X_train, X_test, y_train, y_test = data
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    
    # Estimator
    model = model_class(**kwargs)
    model.fit(X_train, y_train)

    # Evaluate in holdout set
    scores = model.predict(X_test)
    results = {
        "mse": metrics.mean_squared_error(y_true=y_test, y_pred=scores),
        "r2": metrics.r2_score(y_true=y_test, y_pred=scores),
        "pearson": st.pearsonr(scores, y_test)[0],
        "spearman": st.spearmanr(scores, y_test)[0],
    }

    return model, results

In [None]:
def compute_m

In [None]:
from sklearn.linear_model import LinearRegression 

train_datasets = TRAIN_DATASETS
preproc_fn = scaler
test_fraction = 0.2
seed = seed

model_selection_results = []
for dataset_name, dataset in train_datasets.items():
    # Dataset preprocessor
    preprocessor = preproc_fn()
    
    data = train_test_split(dataset[features], dataset[target], preprocessor, test_fraction=test_fraction, seed=SEED)
    model, model_results = fit_model(LinearRegression, data)
    model_results["dataset"] = dataset_name
    model_results["seed"] = SEED
    model_results["test_fraction"] = test_fraction
    model_results["preprocessing"] = "StandardScaler"
    results.append(model_results)

In [None]:
pd.DataFrame(results)

In [None]:
results

In [None]:




def fit_model(data, estimator, dataset=None):

    if dataset is not None:
        data = data[data["dataset"] == dataset]
    
    print("Considering dataset with", len(data), "examples, spanning datasets:", data.dataset.unique())
    X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.20, random_state=78452, stratify=data[target])
    print(X_train.shape, X_test.shape)
    
    # Preprocessing data (since LR may be sensitive to it)
    X_train_prec, scalers = preprocess(X_train)
    X_test_prec, _ = preprocess(X_test, scalers=scalers)

    # Create estimator
    clf = estimator()
    clf.fit(X_train_prec, y_train)

    # Evaluate
    scores = clf.predict(X_test_prec)
    results = {
        "mse": metrics.mean_squared_error(y_test, scores),
        "r2": metrics.r2_score(y_test, scores),
        "pearson": pearsonr(scores, y_test)[0],
        "spearman": spearmanr(scores, y_test),
    }
    return clf, scalers, results


def eval_datasets(model, eval_datasets: dict, scalers: dict):
    eval_results = {}
    eval_scores = {}
    for dataset_name, dataset in eval_datasets.items():
        X, y = dataset[features], dataset[target]

        X_prec, _ = preprocess(X.copy(), scalers=scalers)

        scores = model.predict(X_prec)
        eval_results[dataset_name] = {
            "mse": metrics.mean_squared_error(y, scores),
            "r2": metrics.r2_score(y, scores),
            "pearson": pearsonr(scores, y)[0],
            "spearman": spearmanr(scores, y)[0],
        }
        eval_scores[dataset_name] = scores
        
    return eval_results, eval_scores


In [None]:
from sklearn.linear_model import LinearRegression

# Sanity check
lr, lr_scalers, valid_results = fit_model(train_df, LinearRegression, dataset="narrativeqa")
valid_results

In [None]:
dev_df[features]

In [None]:
from scipy.stats import pearsonr, spearmanr

    
    
model = AvgBaseline(features[1:])
print("Using metrics:", model._features)
avg_metrics = model.predict(dev_df)

from sklearn.metrics import mean_squared_error, mean_absolute_error
print("MSE:", mean_squared_error(y_pred=avg_metrics, y_true=dev_df[target]))
print("MAE:", mean_absolute_error(y_pred=avg_metrics, y_true=dev_df[target]))
print("Pearson:", pearsonr(avg_metrics, dev_df[target])[0])
print("Spearman:", spearmanr(avg_metrics, dev_df[target])[0])

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
print("MSE:", mean_squared_error(y_pred=avg_metrics, y_true=dev_df[target]))
print("MAE:", mean_absolute_error(y_pred=avg_metrics, y_true=dev_df[target]))
print("Pearson:", pearsonr(avg_metrics, dev_df[target])[0])
print("Spearman:", spearmanr(avg_metrics, dev_df[target])[0])

In [None]:
from collections import defaultdict

# Unique datasets
unique_datasets = list(train_df.dataset.unique())

# Evaluation datasets
# includes all_datasets (macro eval), as well as individual datasets
dev_orig_datasets = {None: dev_df}
dev_orig_datasets.update({dataset: dev_df[dev_df.dataset == dataset] for dataset in unique_datasets})

models = {}
results_by_dataset = {}
for dataset_name in dev_orig_datasets.keys():
    print("Fitting model using", "all" if dataset_name is None else dataset_name, "datasets")
    model, model_scalers, valid_results = fit_model(train_df, LinearRegression, dataset=dataset_name)
    
    models[dataset_name] = model
    results, scores = eval_datasets(model, dev_orig_datasets, model_scalers)
    
    results_by_dataset[dataset_name] = results
    
    
    


def parse_table_results(results_by_dataset, filename, output_dir=METRICS_DIR):
    table_results = defaultdict(list)

    for train_dataset, test_values in results_by_dataset.items():

        for test_dataset, test_results in test_values.items():
            table_results["train_dataset"].append("all_datasets" if train_dataset is None else train_dataset)
            table_results["eval_dataset"].append("all_datasets" if test_dataset is None else test_dataset)

            for metric, metric_value in test_results.items():
                table_results[metric].append(metric_value)
            
    table_results = pd.DataFrame(table_results)
    table_results.to_csv((f"{output_dir}/{filename}.csv"))
    return table_results


table_results = parse_table_results(results_by_dataset, output_dir=METRICS_DIR, filename="dev_lr_correlations")
table_results

In [None]:
def plot_model_coeffs(models, train_dataset): 
    clf = models[train_dataset]
    plt.figure(figsize=(10, 5))
    plt.title(f"Feature importance for train dataset: {train_dataset if train_dataset is not None else 'all_datasets'}")
    sns.barplot(y=features, x=clf.coef_, orient="h")
    plt.xlim(-1, 1)
    plt.show()
    

for train_dataset in models.keys():
    plot_model_coeffs(models, train_dataset)


### Leave-one-out (LOO) Regression

In [None]:
from collections import defaultdict

# Unique datasets
_unique_datasets = list(train_df.dataset.unique())

# Evaluation datasets
# includes all_datasets (macro eval), as well as individual datasets
_dev_orig_datasets = {None: dev_df}
_dev_orig_datasets.update({dataset: dev_df[dev_df.dataset == dataset] for dataset in unique_datasets})

_models = {}
_results_by_dataset = {}
for _dataset_name in _dev_orig_datasets.keys():
    if _dataset_name is None: continue
    
    # Compute other dataset names except `_dataset_name`
    _remaining_datasets = [k for k in _dev_orig_datasets.keys() if k != _dataset_name]
    
    # Select subset of trainin data that does not include `_dataset_name`
    _train_remain_df = train_df[train_df.dataset.isin(_remaining_datasets)]
    _train_remain_name = f"all_except_{_dataset_name}"
    
    print("Fitting model on", _train_remain_name, f"with {len(_train_remain_df)} examples (instead of {len(train_df)})")

    _model, _model_scalers, _valid_results = fit_model(_train_remain_df, LinearRegression)
    _models[_train_remain_name] = _model
    _results, _scores = eval_datasets(_model, _dev_orig_datasets, _model_scalers)
    _results_by_dataset[_train_remain_name] = _results
    

parse_table_results(_results_by_dataset, "dev_lr_loo_correlations", METRICS_DIR)

In [None]:
for train_dataset in _models.keys():
    plot_model_coeffs(_models, train_dataset)

### Principal Component Analysis

We've seen there is a large correlation between the different metrics.. In particular, it might explain the coefficients, we see in the image above. In the presence of redundancy, the model may be [non-identifiable](https://en.wikipedia.org/wiki/Identifiability), i.e., have two or more parameterizations that are observationally equivalent.

In this section of the notebook, we are interested in knowing whether there will be a set of orthogonal components that fully explain the model. 

In [None]:
def eval_datasets_with_pca(model, eval_datasets: dict, pca: dict):
    eval_results = {}
    eval_scores = {}
    for dataset_name, dataset in eval_datasets.items():
        X, y = dataset[features], dataset[target]

        X_prec = pca.transform(X.copy())

        scores = model.predict(X_prec)
        eval_results[dataset_name] = {
            "mse": metrics.mean_squared_error(y, scores),
            "r2": metrics.r2_score(y, scores),
            "pearson": pearsonr(scores, y)[0],
            "spearman": spearmanr(scores, y)[0],
        }
        eval_scores[dataset_name] = scores
    return eval_results, eval_scores

In [None]:
from sklearn.decomposition import PCA

In [None]:
data = train_df.copy()
print("Considering dataset with", len(data), "examples, spanning datasets:", data.dataset.unique())

X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target], test_size=0.20, random_state=78452, stratify=data[target])
print(X_train.shape, X_test.shape)
    
# Preprocessing data (since LR may be sensitive to it)
# X_train_prec, scalers = preprocess(X_train)
# X_test_prec, _ = preprocess(X_test, scalers=scalers)


# Iterate over several components
eval_results = defaultdict(list)
_pca_models = {}
_pca = {}
for n in range(2, 20):
    print(X_train.shape, X_test.shape)
    print("Fitting PCA w/ n_components =", n)
    for seed in (123124, 1295532, 875843):
        # Create estimator
        pca = PCA(n_components=n, random_state=seed)
        X_train_transf = pca.fit_transform(X_train.copy())
        X_test_transf = pca.transform(X_test.copy())
        # print(X_train_transf.shape, X_test_transf.shape)

        # Fit LR on top of new representation
        lr = LinearRegression()
        lr.fit(X_train_transf, y_train)

        scores = lr.predict(X_test_transf)
        eval_results["n"].append(n)
        eval_results["seed"].append(n)
        eval_results["mse"].append(metrics.mean_squared_error(y_test, scores))
        eval_results["r2"].append(metrics.r2_score(y_test, scores))
        eval_results["pearson"].append(pearsonr(scores, y_test)[0])
        eval_results["spearman"].append(spearmanr(scores, y_test)[0])
        
        _pca[(n, seed)] = pca
        _pca_models[(n, seed)] = lr
        
eval_results = pd.DataFrame(eval_results)

In [None]:
sns.lineplot(data=eval_results, x="n", y="mse")
plt.xlabel("N components (PCA)")
plt.title("MSE of fit in function of number of PCA components on dev set")
plt.xlim(0, 18)
plt.ylim(0.08, 0.15)
plt.show()

In [None]:
n_components, seed = 10, 1295532
model = _pca_models[(n_components, seed)]

# Unique datasets
unique_datasets = list(train_df.dataset.unique())

# Evaluation datasets
# includes all_datasets (macro eval), as well as individual datasets
dev_orig_datasets = {None: dev_df}
dev_orig_datasets.update({dataset: dev_df[dev_df.dataset == dataset] for dataset in unique_datasets})

results_by_dataset = {}
for dataset_name in dev_orig_datasets.keys():
    print("Fitting model using", "all" if dataset_name is None else dataset_name, "datasets")
    results, scores = eval_datasets_with_pca(
        model=model,
        pca=_pca[(n_components, seed)], 
        eval_datasets=dev_orig_datasets,
    )
    
    results_by_dataset[dataset_name] = results 

parse_table_results(results_by_dataset, f"dev_pca_{n_components}+lr_correlations", METRICS_DIR)

In [None]:

# Reduce dimension to 2 with PCA
pca = {i: lambda: make_pipeline(StandardScaler(), PCA(n_components=i, random_state=SEED)) for i in range(2, 15)}