In [26]:
import math

import numpy as np
import pylab as pl
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, Lars
from sklearn.metrics import mean_squared_error, r2_score

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20, so importing it unconditionally makes this cell fail on current
# releases.  Try the legacy module first (keeps old environments behaving
# exactly as before) and fall back to its replacement otherwise.
# The two classes are called differently: the legacy one is built as
# `KFold(n, n_folds=k)` and is itself iterable, whereas the replacement is
# built as `KFold(n_splits=k)` and yields folds from `.split(X)`.
try:
    from sklearn.cross_validation import KFold
except ImportError:
    from sklearn.model_selection import KFold

# Handle on the copper dataset module that ships with statsmodels.
copper = sm.datasets.copper

# Show the dataset's long description so the columns have context.
print(copper.DESCRLONG)

This data describes the world copper market from 1951 through 1975.  In an
example, in Gill, the outcome variable (of a 2 stage estimation) is the world
consumption of copper for the 25 years.  The explanatory variables are the
world consumption of copper in 1000 metric tons, the constant dollar adjusted
price of copper, the price of a substitute, aluminum, an index of real per
capita income base 1970, an annual measure of manufacturer inventory change,
and a time trend.



In [27]:
# Load the copper dataset; the next cell reads its table from `.data`.
dataset_copper = copper.load_pandas()

In [28]:
# Unwrap the table from the loader result, then preview its first two rows.
df_copper = dataset_copper.data
df_copper.head(n=2)

Unnamed: 0,WORLDCONSUMPTION,COPPERPRICE,INCOMEINDEX,ALUMPRICE,INVENTORYINDEX,TIME
0,3173.0,26.56,0.7,19.76,0.98,1.0
1,3281.1,27.31,0.71,20.78,1.04,2.0


In [29]:
# Single-feature setup: ALUMPRICE is the predictor, COPPERPRICE the target.
# Selecting with a one-element column list yields a 2-D (n_samples, 1) array,
# which is the shape the estimators expect for the feature matrix.
y = np.array(df_copper['COPPERPRICE'])
x = np.array(df_copper[['ALUMPRICE']])

In [30]:
# Compare several linear models on the single-feature problem (x -> y) by
# training RMSE and by RMSE pooled over 10-fold cross-validation.
#
# The splitter is imported here because the call style used below
# (`n_splits=` / `.split(X)`) only exists on `sklearn.model_selection.KFold`;
# the `KFold(n, n_folds=...)` form belongs to `sklearn.cross_validation`,
# which was removed in scikit-learn 0.20.  With shuffling off (the default)
# the splitter yields consecutive blocks of indices, so fold membership does
# not depend on any random state.
from sklearn.model_selection import KFold

a = 0.5  # regularisation strength passed as `alpha` to lasso, ridge, elastic-net

# Loop-invariant, so computed once.  NOTE(review): nothing in this cell reads
# it; kept only in case a later cell does -- delete if not.
std_dev = np.std(x)

for name,met in [
        ('linear regression', LinearRegression()),
        ('lasso', Lasso(fit_intercept=True, alpha=a)),
        ('ridge', Ridge(fit_intercept=True, alpha=a)),
        ('elastic-net', ElasticNet(fit_intercept=True, alpha=a)),
        ('lars', Lars(fit_intercept=True, n_nonzero_coefs=1)),
        ]:
    # In-sample error: fit on every observation and score on the same data.
    met.fit(x,y)
    p = met.predict(x)
    e = p-y
    total_error = np.dot(e,e)
    rmse_train = np.sqrt(total_error/len(p))

    # Out-of-sample error: each observation is predicted by a model that was
    # fitted without it; squared residuals are summed over all folds and
    # averaged over the full sample before taking the root.
    kf = KFold(n_splits=10)
    err = 0
    for train,test in kf.split(x):
        met.fit(x[train],y[train])
        p = met.predict(x[test])
        e = p-y[test]
        err += np.dot(e,e)

    rmse_10cv = np.sqrt(err/len(x))
    print('Method: %s' %name)
    print('RMSE on training: %.4f' %rmse_train)
    print('RMSE on 10-fold CV: %.4f' %rmse_10cv)
    print ("\n")

Method: linear regression
RMSE on training: 6.7686
RMSE on 10-fold CV: 8.6587


Method: lasso
RMSE on training: 6.7714
RMSE on 10-fold CV: 8.4255


Method: ridge
RMSE on training: 6.7686
RMSE on 10-fold CV: 8.6519


Method: elastic-net
RMSE on training: 6.7696
RMSE on 10-fold CV: 8.4759


Method: lars
RMSE on training: 6.7686
RMSE on 10-fold CV: 8.6587


