In [31]:
from sklearn.cross_validation import KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, Lars
from sklearn.metrics import mean_squared_error, r2_score
import math
import numpy as np
import pylab as pl
import statsmodels.api as sm
# Handle to the `ccard` dataset module that ships with statsmodels; later
# cells call `load_pandas()` on it.
ccard = sm.datasets.ccard

# Print the long-form description string attached to the dataset module.
print(ccard.DESCRLONG)

More information on this data can be found on the
homepage for Greene's `Econometric Analysis`. See source.



In [32]:
# Load the dataset through its pandas loader; the returned object exposes the
# table via its `.data` attribute (read in the next cell).
dataset_ccard = ccard.load_pandas()

In [33]:
# Pull the tabular data out of the loaded dataset object.
df_ccard = dataset_ccard.data
# Preview the first rows (columns shown: AVGEXP, AGE, INCOME, INCOMESQ, OWNRENT).
df_ccard.head()

Unnamed: 0,AVGEXP,AGE,INCOME,INCOMESQ,OWNRENT
0,124.98,38.0,4.52,20.4304,1.0
1,9.85,33.0,2.42,5.8564,0.0
2,15.0,34.0,4.5,20.25,1.0
3,137.87,31.0,2.54,6.4516,0.0
4,546.5,32.0,9.79,95.8441,1.0


In [34]:
# Single predictor: AGE as an (n_samples, 1) column, the 2-D layout the
# scikit-learn estimators expect for their feature matrix.
x = np.array(df_ccard['AGE']).reshape(-1, 1)
# Regression target: INCOMESQ as a flat 1-D array.
y = np.array(df_ccard['INCOMESQ'])

In [35]:
# Regularisation strength, passed as `alpha` to the penalised models below.
a = 0.8

# The KFold imported at the top of the notebook comes from
# `sklearn.cross_validation`, which was removed in scikit-learn 0.20. The
# splitter is therefore taken from `sklearn.model_selection`, accessed through
# the module so the notebook-level `KFold` name is not rebound.
from sklearn import model_selection

for name, met in [
        ('linear regression', LinearRegression()),
        ('lasso', Lasso(fit_intercept=True, alpha=a)),
        ('ridge', Ridge(fit_intercept=True, alpha=a)),
        ('elastic-net', ElasticNet(fit_intercept=True, alpha=a)),
        ('lars', Lars(fit_intercept=True, n_nonzero_coefs=1)),
        ]:
    # In-sample error: fit on every row, then score on those same rows.
    met.fit(x, y)
    p = met.predict(x)
    e = p - y
    total_error = np.dot(e, e)
    rmse_train = np.sqrt(total_error / len(p))

    # Out-of-sample error: 10-fold cross-validation with the splitter's default
    # (unshuffled) fold assignment. Squared errors are pooled across folds, so
    # every row contributes exactly once and the sum is divided by len(x).
    kf = model_selection.KFold(n_splits=10)
    err = 0
    for train, test in kf.split(x):
        met.fit(x[train], y[train])
        p = met.predict(x[test])
        e = p - y[test]
        err += np.dot(e, e)

    rmse_10cv = np.sqrt(err / len(x))
    print('Method: %s' % name)
    print('RMSE on training: %.4f' % rmse_train)
    print('RMSE on 10-fold CV: %.4f' % rmse_10cv)
    print("\n")

Method: linear regression
RMSE on training: 16.6212
RMSE on 10-fold CV: 17.0115


Method: lasso
RMSE on training: 16.6216
RMSE on 10-fold CV: 17.0093


Method: ridge
RMSE on training: 16.6212
RMSE on 10-fold CV: 17.0114


Method: elastic-net
RMSE on training: 16.6215
RMSE on 10-fold CV: 17.0075


Method: lars
RMSE on training: 16.6212
RMSE on 10-fold CV: 17.0115


