In [11]:
import pandas as pd
import math
from sklearn import linear_model

## Importing house sales, training and test data

In [2]:
sales = pd.read_csv('../kc_house_data.csv')
train_data = pd.read_csv('../kc_house_train_data.csv')
test_data = pd.read_csv('../kc_house_test_data.csv')

## Adding additional features to training and test data

In [7]:
train_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']
test_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']

train_data['bed_bath_rooms'] = train_data['bedrooms'] * train_data['bathrooms']
test_data['bed_bath_rooms'] = test_data['bedrooms'] * test_data['bathrooms']

train_data['log_sqft_living'] = [math.log(sqft_living) for sqft_living in train_data['sqft_living']]
test_data['log_sqft_living'] = [math.log(sqft_living) for sqft_living in test_data['sqft_living']]

train_data['lat_plus_long'] = train_data['lat'] + train_data['long']
test_data['lat_plus_long'] = test_data['lat'] + test_data['long']

In [9]:
mean_test_bedrooms_squared = test_data['bedrooms_squared'].mean()
mean_test_bed_bath_rooms = test_data['bed_bath_rooms'].mean()
mean_test_log_sqft_living = test_data['log_sqft_living'].mean()
mean_test_lat_plus_long = test_data['lat_plus_long'].mean()

## Creating a Regression Model for sqft_living, bedrooms, bathrooms, lat, long

In [25]:
model1 = linear_model.LinearRegression()

In [26]:
model1.fit(train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']], train_data['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
model1.coef_

array([  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
         6.58619264e+05,  -3.09374351e+05])

## Creating a Regression Model for sqft_living, bedrooms, bathrooms, lat, long and bed_bath_rooms

In [28]:
model2 = linear_model.LinearRegression()

In [29]:
model2.fit(train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']], train_data['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
model2.coef_

array([  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
         6.54844630e+05,  -2.94298969e+05,   2.55796520e+04])

## Creating a Regression Model for sqft_living, bedrooms, bathrooms, lat, long, bed_bath_rooms, bedrooms_squared, log_sqft_living, lat_plus_long

In [31]:
model3 = linear_model.LinearRegression()

In [33]:
model3.fit(train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']], train_data['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
model3.coef_

array([  5.29422820e+02,   3.45142296e+04,   6.70607813e+04,
         5.34085611e+05,  -4.06750711e+05,  -8.57050439e+03,
        -6.78858667e+03,  -5.61831484e+05,   1.27334900e+05])

## Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?

1.57067421e+04

## Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?

In [35]:
-7.14613083e+04

-71461.3083

## Is the sign for the coefficient the same in both models? Think about why this might be the case.

No, the sign of the coeffecient is not the same in both the models. In model1 we only had bathrooms and in model2 with the addition of bath_bed_rooms the impact of bed_bath_rooms reduces the weight of the bathrooms

## Computing RSS on Training Data

In [42]:
# RSS on Model1
model1_predicted_values = model1.predict(train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']])
model1_rss = ((model1_predicted_values - train_data['price']) ** 2).sum()

In [43]:
# RSS on Model2
model2_predicted_values = model2.predict(train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']])
model2_rss = ((model2_predicted_values - train_data['price']) ** 2).sum()

In [44]:
# RSS on Model3
model3_predicted_values = model3.predict(train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']])
model3_rss = ((model3_predicted_values - train_data['price']) ** 2).sum()

In [45]:
print(model1_rss, model2_rss, model3_rss)

9.6787996305e+14 9.58419635074e+14 9.0343645505e+14


## Quiz Question: Which model (1, 2 or 3) had the lowest RSS on TRAINING data?

Model 3 has low RSS compared to other models

## Computing RSS on Testing Data

In [46]:
# RSS on Model1
model1_test_predicted_values = model1.predict(test_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']])
model1_test_rss = ((model1_test_predicted_values - test_data['price']) ** 2).sum()

In [51]:
# RSS on Model2
model2_test_predicted_values = model2.predict(test_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']])
model2_test_rss = ((model2_test_predicted_values - test_data['price']) ** 2).sum()

In [52]:
# RSS on Model3
model3_test_predicted_values = model3.predict(test_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']])
model3_test_rss = ((model3_test_predicted_values - test_data['price']) ** 2).sum()

In [54]:
print(model1_test_rss, model2_test_rss, model3_test_rss)

2.25500469795e+14 2.23377462976e+14 2.25490414586e+14
