In [None]:
%pylab inline
import seaborn as sns
sns.set_palette("colorblind")

In [None]:
from io import StringIO
import pandas as pd
import json

In [None]:
def get_df(param_key, filename):
    """Load a cross-validation results file into a long-format DataFrame.

    The file is expected to contain a JSON collection of result entries,
    each with a ``params`` mapping and positional ``train_scores`` /
    ``test_scores`` lists. Every position in ``test_scores`` becomes one
    output row, labelled in order as "total", "q1", "q2".

    Parameters
    ----------
    param_key : str
        Key of the hyper-parameter to extract from each entry's ``params``
        (for example ``"ridge__alpha"``).
    filename : str
        Path to the JSON results file.

    Returns
    -------
    pandas.DataFrame
        Columns ``score_key``, ``test_score``, ``train_score`` and ``param``
        with a default integer index; one row per (entry, score position).
    """
    score_keys = ("total", "q1", "q2")
    columns = ["score_key", "test_score", "train_score", "param"]

    # Parse the file in-process instead of shelling out to `jq`: no external
    # binary is required and neither argument is ever interpolated into a
    # shell command line.
    with open(filename, encoding="utf-8") as handle:
        results = json.load(handle)

    # A top-level object is treated as a collection of its values, which is
    # how the previous jq `.[]` iteration behaved.
    if isinstance(results, dict):
        results = list(results.values())

    records = []
    for result in results:
        train_scores = result.get("train_scores") or []
        param = (result.get("params") or {}).get(param_key)
        for position, test_score in enumerate(result.get("test_scores") or []):
            # Out-of-range lookups yield None (null) rather than raising,
            # mirroring jq's array-indexing semantics.
            records.append({
                "score_key": score_keys[position] if position < len(score_keys) else None,
                "test_score": test_score,
                "train_score": train_scores[position] if position < len(train_scores) else None,
                "param": param,
            })

    # Explicit columns keep the schema stable even when there are no records.
    return pd.DataFrame(records, columns=columns)


def plot_cv(df, xlabel=r"$\alpha$", plotter=plt.semilogx):
    """Plot test and train scores against the hyper-parameter, one hue per score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``param``, ``test_score`` and ``train_score`` columns and
        expose ``score_key`` either as a column or as an index level (the
        index is reset before plotting).
    xlabel : str
        Label for the x-axis.
    plotter : callable
        Pyplot-style function called as ``plotter(x, y, **kwargs)``; the
        default draws on a logarithmic x-axis.

    Returns
    -------
    seaborn.FacetGrid
        The grid that was drawn, so callers can customise it further.
    """
    # `height` is the current name of FacetGrid's former `size` argument.
    fg = sns.FacetGrid(df.reset_index(), hue="score_key", aspect=1.5, height=4, legend_out=False)
    fg.map(plotter, "param", "test_score")
    # Train scores share the hue colours but are drawn dashed.
    fg.map(plotter, "param", "train_score", ls='--')

    fg.add_legend()
    # Set the labels last so they replace the column names that map() puts
    # on the axes; previously `xlabel` was accepted but silently ignored.
    fg.set_axis_labels(xlabel, r'$R^2$')
    return fg

In [None]:
# Load the cross-validation results of each model: the first argument names
# the hyper-parameter to extract, the second is the results file.
cv_ridge = get_df("ridge__alpha", "cv/ridge/cv.json")
cv_mca = get_df("mca__n_components", "cv/mca/cv.json")
cv_pcr = get_df("pca__n_components", "cv/pcr/cv.json")

# Stack the per-model frames under a "model" key and index the result by
# (model, score_key). reset_index() turns the unnamed inner row level into a
# "level_1" column, which carries no information and is dropped.
cv = (
    pd.concat({"pcr": cv_pcr, "mca": cv_mca, "ridge": cv_ridge}, names=["model"])
    .reset_index()
    .set_index(["model", "score_key"])
    .drop('level_1', axis=1)
)

Ridge regression performance

In [None]:
# Ridge sweep, drawn with plot_cv's default plotter.
ridge_title = """Cross Validation performance of Ridge Regression
test (-), train(--)
"""
plot_cv(cv.loc["ridge"])
plt.title(ridge_title)
plt.xlabel(r"$\alpha$")

MCA performance

In [None]:
# MCA sweep, drawn with plt.plot instead of plot_cv's default plotter.
mca_title = """Cross Validation performance of MCA Regression
test (-), train(--)
"""
plot_cv(cv.loc["mca"], plotter=plt.plot)
plt.title(mca_title)
plt.xlabel(r"number of modes retained")

The performance on the test dataset rapidly declines for a large number of modes. Both models have a best-case total performance of around $R^2 = 0.15$.

PCR Performance

In [None]:
# PCR sweep, drawn with plt.plot instead of plot_cv's default plotter.
# The title previously read "MCA Regression", copied from the MCA cell,
# which mislabelled this figure.
plot_cv(cv.loc["pcr"], plotter=plt.plot)
plt.title("""Cross Validation performance of Principal Component Regression (PCR)
test (-), train(--)
""")
plt.xlabel(r"number of modes retained")

PCR performs similarly to MCA, but with slightly worse statistics.

Now let's make a table comparing our cross-validation options.

In [None]:
def get_best_cv(cv):
    """Select, for each model, the scores at its best hyper-parameter value.

    The best value is the ``param`` whose "total" ``test_score`` is highest
    (the first such value on ties). All score rows recorded for that value
    are kept.

    Parameters
    ----------
    cv : pandas.DataFrame
        Long-format scores providing ``model``, ``score_key``, ``param``,
        ``test_score`` and ``train_score``, as columns or index levels
        (the index is reset before processing).

    Returns
    -------
    pandas.DataFrame
        Indexed by (model, param), with the remaining columns unstacked by
        ``score_key``.

    Raises
    ------
    ValueError
        If a model has no "total" rows, or if ``cv`` contains no rows.
    """
    flat = cv.reset_index()

    winners = []
    # Iterating the groups (rather than groupby.apply) keeps the "model"
    # column available regardless of how apply treats grouping columns.
    for model, rows in flat.groupby("model"):
        totals = rows.loc[rows["score_key"] == "total"]
        if totals.empty:
            raise ValueError('model %r has no "total" scores' % (model,))
        # idxmax() returns the row *label*; argmax() returns an integer
        # position in current pandas and must not be fed to .loc.
        best_param = totals.loc[totals["test_score"].idxmax(), "param"]
        winners.append(rows.loc[rows["param"] == best_param])

    return (pd.concat(winners)
              .set_index(["model", "param", "score_key"])
              .unstack(level=2))
# Best hyper-parameter setting per model; the bare name on the last line
# renders the table as the cell output.
best_df = cv.pipe(get_best_cv)
best_df

As you can see, ridge regression significantly outperforms the decomposition-based approaches, but the $R^2$ is low for all options.