In [68]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [69]:
bean = datasets.load_boston()
print(bean.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [70]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [71]:
X_train, X_test, y_train, y_test = load_boston()

In [72]:
X_train.shape

(379L, 13L)

In [73]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [74]:
list(zip (y_test, clf.predict(X_test)))

[(17.199999999999999, 14.758818393108733),
 (12.800000000000001, 13.448520958628283),
 (16.300000000000001, 12.987556037754853),
 (25.0, 22.096746829353268),
 (50.0, 40.880541272713415),
 (25.0, 23.139287646643325),
 (19.399999999999999, 23.893389744336783),
 (10.199999999999999, 17.055257042045184),
 (21.399999999999999, 22.678557652222764),
 (12.0, 12.465786523183688),
 (19.800000000000001, 18.546326878152559),
 (19.899999999999999, 19.236965197970775),
 (23.699999999999999, 10.31518114931489),
 (8.3000000000000007, 10.151848641055224),
 (24.800000000000001, 26.291052234142654),
 (16.699999999999999, 19.956510606480936),
 (19.600000000000001, 24.263372576222366),
 (24.100000000000001, 30.270229599649472),
 (43.5, 39.592464064359383),
 (7.2000000000000002, 17.987023989737491),
 (24.100000000000001, 20.35989624243259),
 (5.0, 6.277977823010719),
 (29.0, 32.017297724298878),
 (19.699999999999999, 14.273319442216724),
 (27.5, 32.805018220098091),
 (28.199999999999999, 33.350266132930216)

In [84]:
r2_score(y_test, clf.predict(X_test))

0.74888233880555566

In [85]:
mean_squared_error(y_test, clf.predict(X_test))

19.954555752005351

In [86]:
#Impliment sklearn.linear_model.Lasso
from sklearn.linear_model import Lasso

In [119]:
alpha = 0.01
lasso = Lasso(alpha=alpha)

y_lassopredict = lasso.fit(X_train, y_train).predict(X_test)
r2_lassoscore = r2_score(y_test, y_lassopredict)
print(lasso)
print("r^2: %f" % r2_lassoscore)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
r^2: 0.750892


In [145]:
#Adjust alpha to get the maximum r2 within (0,1)
alpha = 0.213
lasso = Lasso(alpha=alpha)

y_lassopredict = lasso.fit(X_train, y_train).predict(X_test)
r2_lassoscore = r2_score(y_test, y_lassopredict)
print(lasso)
print("r^2: %f" % r2_lassoscore)

Lasso(alpha=0.213, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
r^2: 0.763197
