Fire up GraphLab Create

In [2]:
import graphlab

Load in House Sales Data

In [3]:
# Load the house-sales data as an SFrame from the 'kc_house_data.gl/' directory
# (a relative path, so it is resolved against the notebook's working directory).
sales = graphlab.SFrame('kc_house_data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to 160645l@uom.lk and will expire on July 20, 2021.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Nipuna\AppData\Local\Temp\graphlab_server_1595847622.log.0


Split data into training and testing

In [4]:
# Randomly split the rows into a training and a testing SFrame. 0.8 is the
# fraction requested for the first (training) part; the fixed seed makes the
# split repeatable from run to run.
train_data, test_data = sales.random_split(0.8, seed=0)

Learning a multiple regression model

In [5]:
# Baseline model: regress 'price' on three raw columns of the training data.
# validation_set=None is passed explicitly so no validation set is supplied.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=example_features,
    validation_set=None,
)

In [6]:
# Fetch the fitted coefficient table (name / index / value / stderr) and show it.
example_weight_summary = example_model.get("coefficients")
print(example_weight_summary)

+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | 87910.0724924  |  7873.3381434 |
| sqft_living |  None | 315.403440552  | 3.45570032585 |
|   bedrooms  |  None | -65080.2155528 | 2717.45685442 |
|  bathrooms  |  None | 6944.02019265  | 3923.11493144 |
+-------------+-------+----------------+---------------+
[4 rows x 4 columns]



Making Predictions

In [7]:
# Predict a price for every training row with the baseline model, then
# spot-check the first prediction against the expected reference value.
example_predictions = example_model.predict(train_data)
first_prediction = example_predictions[0]
print(first_prediction)  # should be 271789.505878

271789.505878


Compute RSS

In [8]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares (RSS) of ``model`` on ``data``.

    Args:
        model: Fitted model; only its ``predict(data)`` method is used.
        data: Rows to predict on, handed to ``model.predict`` unchanged.
        outcome: Observed target values, lined up element-for-element with
            the predictions returned by the model.

    Returns:
        The sum over all rows of ``(outcome - prediction) ** 2``.
    """
    predicted = model.predict(data)
    # Residual = observed value minus predicted value, element-wise.
    errors = outcome - predicted
    squared_errors = errors * errors
    return squared_errors.sum()

Create New Features

In [9]:
# Square the bedroom count with element-wise SArray arithmetic instead of a
# per-row Python lambda: the values are the same, but the work stays inside
# the SArray engine rather than calling back into Python for every row.
train_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']
test_data['bedrooms_squared'] = test_data['bedrooms'] * test_data['bedrooms']

In [13]:
import math

# create the remaining 3 features in both TEST and TRAIN data
# The same three columns are added to each split, so build them in one loop;
# column assignment modifies the SFrame that `split` refers to.
for split in (train_data, test_data):
    # Interaction term: bedrooms x bathrooms.
    split['bed_bath_rooms'] = split['bedrooms'] * split['bathrooms']
    # Natural logarithm of the living area.
    split['log_sqft_living'] = split['sqft_living'].apply(lambda sqft: math.log(sqft))
    # Plain sum of latitude and longitude.
    split['lat_plus_long'] = split['lat'] + split['long']

In [14]:

# Preview the source columns side by side with the engineered ones.
# The bare expression on the last line is what the notebook displays.
preview_columns = [
    'bedrooms', 'bathrooms', 'lat', 'long',
    'bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long',
]
train_data[preview_columns].head()

bedrooms,bathrooms,lat,long,bedrooms_squared,bed_bath_rooms,log_sqft_living
3.0,1.0,47.51123398,-122.25677536,9.0,3.0,7.07326971746
3.0,2.25,47.72102274,-122.3188624,9.0,6.75,7.85166117789
2.0,1.0,47.73792661,-122.23319601,4.0,2.0,6.64639051485
4.0,3.0,47.52082,-122.39318505,16.0,12.0,7.58069975222
3.0,2.0,47.61681228,-122.04490059,9.0,6.0,7.4265490724
4.0,4.5,47.65611835,-122.00528655,16.0,18.0,8.59785109443
3.0,2.25,47.30972002,-122.32704857,9.0,6.75,7.4471683596
3.0,1.5,47.40949984,-122.31457273,9.0,4.5,6.96602418711
3.0,1.0,47.51229381,-122.33659507,9.0,3.0,7.48436864329
3.0,2.5,47.36840673,-122.0308176,9.0,7.5,7.54433210805

lat_plus_long
-74.74554138
-74.59783966
-74.4952694
-74.87236505
-74.42808831
-74.3491682
-75.01732855
-74.90507289
-74.82430126
-74.66241087


In [15]:
# Print the TEST-set mean of each engineered feature, one value per line.
for engineered_column in ('bedrooms_squared', 'bed_bath_rooms',
                          'log_sqft_living', 'lat_plus_long'):
    print(test_data[engineered_column].mean())

12.4466777016
7.50390163159
7.55027467965
-74.6533349722


Learning Multiple Regression

In [16]:
# Three nested feature sets: each list starts as a copy of the previous one.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

# Model 2 = model 1 + the bedrooms x bathrooms interaction.
model_2_features = list(model_1_features)
model_2_features.append('bed_bath_rooms')

# Model 3 = model 2 + the remaining engineered columns.
model_3_features = list(model_2_features)
model_3_features.extend(['bedrooms_squared', 'log_sqft_living', 'lat_plus_long'])

In [17]:
# Learn the three models: (don't forget to set validation_set = None)
model_1 = graphlab.linear_regression.create(train_data, target = 'price', features = model_1_features, 
                                                  validation_set = None)
model_2 = graphlab.linear_regression.create(train_data, target = 'price', features = model_2_features, 
                                                  validation_set = None)
model_3 = graphlab.linear_regression.create(train_data, target = 'price', features = model_3_features, 
                                                  validation_set = None)

In [18]:
# Examine/extract each model's coefficients:
model_1_weight_summary = model_1.get("coefficients")
model_2_weight_summary = model_2.get("coefficients")
model_3_weight_summary = model_3.get("coefficients")
print model_1_weight_summary 
print model_2_weight_summary
print model_3_weight_summary

+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | -56140675.7451 | 1649985.42026 |
| sqft_living |  None | 310.263325777  | 3.18882960408 |
|   bedrooms  |  None | -59577.116068  | 2487.27977322 |
|  bathrooms  |  None | 13811.8405418  | 3593.54213296 |
|     lat     |  None | 629865.789514  | 13120.7100326 |
|     long    |  None | -214790.285181 | 13284.2851608 |
+-------------+-------+----------------+---------------+
[6 rows x 4 columns]

+----------------+-------+----------------+---------------+
|      name      | index |     value      |     stderr    |
+----------------+-------+----------------+---------------+
|  (intercept)   |  None | -54410676.1159 | 1650405.16539 |
|  sqft_living   |  None | 304.449298056  | 3.20217535637 |
|    bedrooms    |  None | -116366.04323  | 4805.54966545 |
|   bathrooms    |  None | -77972.3305135 |  756

Comparing Multiple Models

In [19]:
# Compute the RSS on TRAINING data for each of the three models and record the values:
rss_model_1_train = get_residual_sum_of_squares(model_1, train_data, train_data['price'])
rss_model_2_train = get_residual_sum_of_squares(model_2, train_data, train_data['price'])
rss_model_3_train = get_residual_sum_of_squares(model_3, train_data, train_data['price'])
print rss_model_1_train
print rss_model_2_train
print rss_model_3_train

9.71328233543e+14
9.61592067855e+14
9.05276314555e+14


In [20]:
# Compute the RSS on TESTING data for each of the three models and record the values:
rss_model_1_test = get_residual_sum_of_squares(model_1, test_data, test_data['price'])
rss_model_2_test = get_residual_sum_of_squares(model_2, test_data, test_data['price'])
rss_model_3_test = get_residual_sum_of_squares(model_3, test_data, test_data['price'])
print rss_model_1_test
print rss_model_2_test
print rss_model_3_test

2.26568089093e+14
2.24368799994e+14
2.51829318951e+14
