In [None]:
# %pylab loads numpy and matplotlib into the interactive namespace (this is
# where the `plt` name used by later cells comes from) and shows figures inline.
%pylab inline
import seaborn as sns
# Apply the "colorblind" palette and the "darkgrid" style to the figures below.
sns.set_palette("colorblind")
sns.set_style("darkgrid")

In [None]:
from io import StringIO
import pandas as pd
import json

In [None]:
def read_cv(file, param=None):
    """Load cross-validation results from a JSON file into a score table.

    The file must contain a list of records. Each record is read through
    three keys: ``params`` (a mapping of parameter names to values),
    ``test_scores`` and ``train_scores`` (mappings keyed by variable name).

    Parameters
    ----------
    file : str
        Path of the JSON file to read.
    param : str
        Key looked up in every record's ``params`` mapping; its value
        becomes the ``param`` index level. It must be supplied in practice:
        JSON object keys are strings, so the default ``None`` never matches.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(variable, param)`` with one column per score type
        (``test`` and ``train``).

    Raises
    ------
    KeyError
        If ``param`` is missing from a record's ``params``, or a variable in
        ``test_scores`` has no counterpart in ``train_scores``.
    """
    # Context manager: the handle is closed even when JSON parsing fails.
    with open(file) as handle:
        records = json.load(handle)

    def parse_cv(cv):
        # Emit one (variable, score_type, param, value) row per score.
        for item in cv:
            params = item['params']
            for key in item['test_scores']:
                yield (key, 'test', params[param], item['test_scores'][key])
                yield (key, 'train', params[param], item['train_scores'][key])

    df = pd.DataFrame(parse_cv(records), columns='variable score_type param value'.split(' '))

    # Pivot score_type into columns, leaving (variable, param) as the index.
    dplot = df.set_index(['score_type', 'variable', 'param']).unstack(level=[0])
    # Drop the constant outer 'value' level so the columns are just test/train.
    dplot.columns = dplot.columns.get_level_values(1)

    return dplot
    
    
def plot_cv(df, xlabel=r"$\alpha$", plotter=plt.semilogx):
    """Plot test and train scores against the swept parameter.

    One color is used per ``variable``; test scores are drawn with the
    plotter's default line style and train scores with dashed lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose ``reset_index()`` exposes ``variable``, ``param``,
        ``test`` and ``train`` columns (the layout returned by ``read_cv``).
    xlabel : str
        Label written on the x axis.
    plotter : callable
        Plotting function handed to ``FacetGrid.map``; defaults to
        ``plt.semilogx`` (logarithmic x axis).
    """
    # `height` replaces the facet-size argument formerly spelled `size`
    # (renamed in seaborn 0.9; newer releases no longer accept the old name).
    fg = sns.FacetGrid(df.reset_index(), hue="variable", aspect=1.5, height=4, legend_out=False)
    fg.map(plotter, "param", "test")
    fg.map(plotter, "param", "train", ls='--')

    fg.add_legend()
    # FacetGrid.map labels the axes with the column names; replace them with
    # the requested x label (previously ignored) and the score name.
    plt.xlabel(xlabel)
    plt.ylabel(r'$R^2$')
    

In [None]:
# One results file per model; `param` names the entry of each record's
# `params` mapping that is used as that model's x-axis value.
cv_ridge = read_cv(file="cv/ridge/cv.json", param="ridge__alpha")
cv_mca = read_cv(file="cv/mca/cv.json", param="mca__n_components")
cv_pcr = read_cv(file="cv/pcr/cv.json", param="pca__n_components")

Ridge regression performance

In [None]:
# Show the first rows of the ridge table to check its layout.
cv_ridge.head()

In [None]:
# Ridge scores; plot_cv's default plotter (plt.semilogx) puts the parameter on a log axis.
plot_cv(cv_ridge)
# Annotate the axes that plot_cv just drew on.
plt.gca().set_title("""Cross Validation performance of Ridge Regression
test (-), train(--)
""")
plt.gca().set_xlabel(r"$\alpha$")

MCA performance

In [None]:
# MCA scores on a linear x axis (plt.plot instead of the default semilogx).
plot_cv(cv_mca, plotter=plt.plot)
# Annotate the axes that plot_cv just drew on.
plt.gca().set_title("""Cross Validation performance of MCA Regression
test (-), train(--)
""")
plt.gca().set_xlabel(r"number of modes retained")

The performance on the test dataset rapidly declines for a large number of modes. Both models have a best-case total performance of around $R^2 = 0.15$.

PCR Performance

In [None]:
# PCR scores on a linear x axis (plt.plot instead of the default semilogx).
plot_cv(cv_pcr, plotter=plt.plot)
# The title names PCR: this figure shows cv_pcr, not the MCA results.
plt.title("""Cross Validation performance of PCR
test (-), train(--)
""")
plt.xlabel(r"number of modes retained")

PCR performs similarly to MCA, but with slightly worse statistics.

Now let's make a table comparing our cross-validation options.

In [None]:
# Stack the three per-model tables into one frame; the dict keys become a new
# outermost index level named 'model'.
cv = pd.concat({'ridge': cv_ridge, 'mca': cv_mca, 'pcr': cv_pcr}, names=['model'])

In [None]:
# Show the first rows of the combined table to check the added 'model' level.
cv.head()

In [None]:
def f(df):
    """Return one model's scores at its best parameter value.

    "Best" is the parameter value whose ``test`` score for the ``"total"``
    variable is largest.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single model, indexed by the parameter value, with
        ``variable``, ``test`` and ``train`` columns. A ``model`` column is
        dropped from the result if present; the input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A single row (the best parameter value) whose columns are keyed by
        ``(score column, variable)``.

    Raises
    ------
    ValueError
        If ``df`` contains no ``"total"`` rows (raised by ``idxmax``).
    """
    total_test = df.loc[df['variable'] == "total", 'test']
    # idxmax yields the index *label* (the parameter value). argmax returns an
    # integer position in current pandas, which .loc would treat as a label.
    best_param = total_test.idxmax()
    # Drop on a copy rather than `del`: the caller's frame stays intact and a
    # frame that arrives without the grouping column is tolerated.
    scores = df.drop(columns='model', errors='ignore')
    # A list selector keeps a DataFrame even when only one row matches.
    best_rows = scores.loc[[best_param]]
    return best_rows.set_index('variable', append=True).unstack(level=1)

# Build the comparison table: flatten the index, key rows by parameter value,
# then let `f` reduce each model's rows.
(
    cv.reset_index()
    .set_index('param')
    .groupby('model')
    .apply(f)
)

As you can see, ridge regression significantly outperforms the decomposition-based approaches, but the $R^2$ is low for all options.