In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [2]:
# Load the Boston housing dataset object so its description can be inspected
bean=datasets.load_boston()

In [3]:
# Show the dataset's built-in description text.
# The function-call form of print works on both Python 2 and Python 3.
print(bean.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [5]:
def load_boston(test_size=0.25, random_state=None):
    """Load the Boston housing data and return a standardized train/test split.

    The data is split first and the StandardScaler is fit on the training
    portion only; the test portion is then transformed with the training
    statistics. Fitting the scaler on the full dataset before splitting would
    leak test-set means/variances into the training features.

    Parameters
    ----------
    test_size : float or int, optional
        Forwarded to train_test_split. The default of 0.25 is the value
        train_test_split itself uses when no size is given.
    random_state : int or None, optional
        Forwarded to train_test_split. Pass an integer to get the same split
        on every run; None (the default) keeps the split random.

    Returns
    -------
    list
        [X_train, X_test, y_train, y_test], with both X arrays standardized.
    """
    boston = datasets.load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data,
        boston.target,
        test_size=test_size,
        random_state=random_state,
    )

    # Learn the scaling parameters from the training features only,
    # then apply that same transformation to the held-out features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return [X_train, X_test, y_train, y_test]

In [50]:
# Let's divide the Boston housing data into training and test sets using load_boston.
# X refers to the independent variables and y refers to the dependent variable.
X_train, X_test, y_train, y_test = load_boston()

In [51]:
# Let's check the shape of the training data for the independent variables
X_train.shape

(379L, 13L)

In [8]:
# Fitting a linear regression:
# instantiate a new LinearRegression object
nlr=LinearRegression()

In [13]:
# Pass the training data (independent variables X_train, dependent variable y_train)
# to the linear regression object by calling .fit(X, y)
nlr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
# Let's compare the real values to the predicted values as (actual, predicted) tuples.
# list(...) materialises the pairs so they are displayed on Python 3 as well,
# where zip() returns a lazy iterator instead of a list.
list(zip(y_test,nlr.predict(X_test)))

[(32.0, 33.698108652649978),
 (8.5, 8.4116468966136804),
 (15.199999999999999, 20.00409274211982),
 (19.100000000000001, 17.461997738864767),
 (50.0, 33.899700871629165),
 (20.399999999999999, 19.124087768964788),
 (12.5, 20.010054874835049),
 (22.699999999999999, 24.374170145522001),
 (32.5, 30.34563285601083),
 (15.6, 19.958628771686353),
 (22.199999999999999, 21.828025060360975),
 (36.200000000000003, 27.613262656122107),
 (13.300000000000001, 21.564229594252421),
 (28.399999999999999, 30.947636769761111),
 (12.6, 18.248388261272545),
 (30.300000000000001, 33.196840557966993),
 (13.1, 14.536925520679274),
 (14.0, 15.394646652213682),
 (15.0, 13.523717679423441),
 (20.100000000000001, 16.260656715836628),
 (15.699999999999999, 16.392376028231176),
 (18.699999999999999, 21.11285140432533),
 (18.800000000000001, 20.919431364405785),
 (8.5, 16.468979139125572),
 (18.699999999999999, 17.600448508476589),
 (13.300000000000001, 17.074850817625034),
 (48.799999999999997, 41.854996865087116)

In [16]:
# Let's assign the predicted values to a variable
y_LrPred=nlr.predict(X_test)

In [17]:
# Let's measure the performance of the linear regressor nlr using R^2 and MSE.
# R^2 (coefficient of determination) regression score function:
r2Score=r2_score(y_test,y_LrPred)

In [18]:
# Let's display the R^2 score
r2Score

0.65844946060822163

In [20]:
# Now let's compute the MSE (mean squared error)
mseValue=mean_squared_error(y_test,y_LrPred)

In [21]:
# Let's display the MSE value
mseValue

24.274009328517412


The R^2 score and MSE of the linear regressor are:

R^2=0.65844946060822163

mse=24.274009328517412

In [42]:
# Let's now try Ridge linear regression.
# Assign a value to alpha and instantiate a new Ridge object.
alpha=0.001
ridge=Ridge(alpha=alpha)

In [43]:
# Let's fit the Ridge regression object on the training data
# (independent variables X_train, dependent variable y_train) by calling .fit(X, y)
ridge.fit(X_train,y_train)

Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [44]:
# Let's compare the real values to the Ridge model's predictions as (actual, predicted) tuples.
# list(...) materialises the pairs so they are displayed on Python 3 as well,
# where zip() returns a lazy iterator instead of a list.
list(zip(y_test,ridge.predict(X_test)))

[(32.0, 33.698043308342818),
 (8.5, 8.4116718391259511),
 (15.199999999999999, 20.004089276958357),
 (19.100000000000001, 17.46195109655886),
 (50.0, 33.899658245877312),
 (20.399999999999999, 19.12412986956183),
 (12.5, 20.010031821352491),
 (22.699999999999999, 24.374140781710061),
 (32.5, 30.345586249663352),
 (15.6, 19.958716216979603),
 (22.199999999999999, 21.828049173584713),
 (36.200000000000003, 27.613217472658675),
 (13.300000000000001, 21.564201717241275),
 (28.399999999999999, 30.947549442023146),
 (12.6, 18.24839273252455),
 (30.300000000000001, 33.196849943345399),
 (13.1, 14.536953696792484),
 (14.0, 15.394666643342683),
 (15.0, 13.523717677731216),
 (20.100000000000001, 16.260785566309483),
 (15.699999999999999, 16.392234161098692),
 (18.699999999999999, 21.11284409616146),
 (18.800000000000001, 20.919457169201632),
 (8.5, 16.468953453304859),
 (18.699999999999999, 17.600436156343545),
 (13.300000000000001, 17.074804484503851),
 (48.799999999999997, 41.85497650273895),


In [45]:
# Let's assign the Ridge model's predicted values to a variable
y_RgPred=ridge.predict(X_test)

In [46]:
# Now let's measure our Ridge model using R^2 and MSE.
# First, let's calculate R^2:
r2RgScore=r2_score(y_test,y_RgPred)

In [47]:
# Let's display the Ridge R^2 score
r2RgScore

0.65844954171896286

In [48]:
# Now let's compute the MSE (mean squared error) of the Ridge predictions
mseRgScore=mean_squared_error(y_test,y_RgPred)

In [49]:
# Let's display the MSE score of our Ridge model
mseRgScore

24.274003563974059


The R^2 score and MSE of the Ridge linear regressor model are:

R^2=0.65844954171896286

mse=24.274003563974059


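When we set alpha to 0.001 we get an optimal result. Note, however, that with such a small alpha the Ridge scores are almost identical to those of the plain linear regression above.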
