In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from sklearn.linear_model import LinearRegression 

In [10]:
# Load the full house-sales CSV into a DataFrame.
# NOTE(review): the path is specific to one machine's home directory.
sales = pd.read_csv(
    filepath_or_buffer="~/Desktop/ML_Washington/WashingtonML/Regression/Week2/kc_house_data.csv"
)

In [11]:
# Preview the first five rows of the full dataset.
sales.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [12]:
# (number of rows, number of columns) of the full dataset.
sales.shape

(21613, 21)

In [20]:
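# Disabled: the pre-split train/test CSV files are loaded in the following cells instead of splitting `sales` here.
#train_data, test_data = train_test_split(sales, test_size = 0.2, random_state = 0)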

In [23]:
# Load the pre-split training rows.
# NOTE(review): the path is specific to one machine's home directory.
train_data = pd.read_csv(
    filepath_or_buffer="~/Desktop/ML_Washington/WashingtonML/Regression/Week2/kc_house_train_data.csv"
)

In [27]:
# Load the pre-split test rows.
# NOTE(review): the path is specific to one machine's home directory.
test_data = pd.read_csv(
    filepath_or_buffer="~/Desktop/ML_Washington/WashingtonML/Regression/Week2/kc_house_test_data.csv"
)

In [28]:
# Small 4x2 integer matrix used as a toy example.
X = np.array([
    (1, 1),
    (1, 2),
    (2, 2),
    (2, 3),
])

In [29]:
# Echo the toy array as the cell output.
X

array([[1, 1],
       [1, 2],
       [2, 2],
       [2, 3]])

In [186]:
# Regression inputs taken from the training frame:
#   ys -> price as a column vector of shape (n_rows, 1)
#   xs -> the three predictor columns as a 2-D array
ys = train_data['price'].values[:, np.newaxis]
xs = train_data.loc[:, ['sqft_living', 'bedrooms', 'bathrooms']].values

In [187]:
# Least-squares fit of price on the three predictors. The fit call stays the
# last expression of the cell so its return value is echoed as the output.
regressor = LinearRegression()
regressor.fit(X=xs, y=ys)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [188]:
# Fitted slopes as a flat float array, in the same order as the columns of xs.
regressor.coef_.ravel().astype(float)

array([   315.40669062, -65081.88711588,   6942.16598637])

## Read the coefficients and weight

In [189]:

# One row per predictor, holding its fitted slope.
coeff_df = pd.DataFrame(
    data=regressor.coef_.flatten(),
    index=['sqft_living', 'bedrooms', 'bathrooms'],
    columns=['coefficient'],
)

In [190]:
# Render floats with thousands separators and four decimals (field width 20).
# This is a pandas-wide display option, so it also affects every output
# produced after this cell runs.
pd.set_option('display.float_format', '{:20,.4f}'.format)
coeff_df

Unnamed: 0,coefficient
sqft_living,315.4067
bedrooms,-65081.8871
bathrooms,6942.166


In [191]:
#Each coefficient is the change in predicted price for a one-unit increase in that feature, holding the other two features fixed.
#Here one extra square foot of living space adds about 315.41 to the predicted price, and one extra bathroom adds about 6,942.17.
#One extra bedroom, with sqft_living and bathrooms held fixed, lowers the predicted price by about 65,081.89.
#Screen Shot 2020-03-11 at 11.47.03 PM


In [192]:
# In-sample predictions: score the training rows with the fitted model.
y_pred = regressor.predict(
    train_data.loc[:, ['sqft_living', 'bedrooms', 'bathrooms']].values
)

In [193]:
# Wrap the raw predictions in a DataFrame (rebinding the same name).
y_pred = pd.DataFrame(data=y_pred)

In [194]:
# Column 0 of the predictions frame as a NumPy array.
y_pred.iloc[:,0].values

array([271789.26537997, 718882.27281845, 207554.4093435 , ...,
       392594.85607494, 414673.32441803, 284670.54050078])

In [195]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares (RSS) of ``model`` on ``data``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(data)``; the result must be 1-D, or 2-D
        in which case only the first column is scored.
    data : array-like
        Feature matrix, passed to ``model.predict`` unchanged.
    outcome : array-like
        Observed target values, one per row of ``data``. May be flat ``(n,)``
        or a column vector ``(n, 1)``; it is flattened before subtraction so
        that it can never broadcast against the predictions into an
        ``(n, n)`` matrix.

    Returns
    -------
    float
        ``sum((prediction - outcome) ** 2)``; ``0.0`` for empty input.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of outcomes.
    """
    predicted = np.asarray(model.predict(data), dtype=float)
    if predicted.ndim == 2:
        # Score only the first output column when predictions are 2-D.
        predicted = predicted[:, 0]
    observed = np.asarray(outcome, dtype=float).ravel()
    if predicted.shape != observed.shape:
        raise ValueError(
            "prediction/outcome size mismatch: {} vs {}".format(
                predicted.shape, observed.shape
            )
        )
    residuals = predicted - observed
    # Dot product of the residual vector with itself == sum of squares,
    # computed in NumPy instead of a Python-level loop.
    return float(np.dot(residuals, residuals))

In [196]:
# RSS of the three-feature model on the TEST rows. The original variable name
# said "train" although test_data is scored, so the value now lives in
# `rss_example_test`; the old name is kept as an alias for existing references.
# Features are passed as a NumPy array (`.values`) to match how `regressor`
# was fitted.
rss_example_test = get_residual_sum_of_squares(
    regressor,
    test_data[['sqft_living', 'bedrooms', 'bathrooms']].values,
    test_data['price'].values,
)
rss_example_train = rss_example_test  # legacy name
print(rss_example_test)  # should be approximately 2.7376153833e+14

273761940583134.03


In [127]:
from math import log

Now we will learn the weights for three (nested) models for predicting house prices. The first model will have the fewest features, the second model will add one more feature, and the third will add a few more:
* Model 1: square feet, # bedrooms, # bathrooms, latitude & longitude
* Model 2: add bedrooms\*bathrooms
* Model 3: add log square feet, bedrooms squared, and the (nonsensical) latitude + longitude

In [130]:
# Engineered features for the training rows, computed with vectorised column
# arithmetic instead of per-row `apply` lambdas.
# Note: np.log yields -inf/NaN (with a warning) for non-positive inputs,
# where math.log would raise.
# The column name 'lat_plus_lang' (sic) is kept because the model feature
# lists refer to it by that name.
train_data['bedrooms_squared'] = train_data['bedrooms'] ** 2
train_data['bed_bath_rooms'] = train_data['bedrooms'] * train_data['bathrooms']
train_data['log_sqft_living'] = np.log(train_data['sqft_living'])
train_data['lat_plus_lang'] = train_data['lat'] + train_data['long']


In [131]:
# Preview the first five training rows, including the new feature columns.
train_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_lang
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340,5650,9,3.0,7.0733,-74.7458
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690,7639,9,6.75,7.8517,-74.598
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720,8062,4,2.0,6.6464,-74.4951
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360,5000,16,12.0,7.5807,-74.8722
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800,7503,9,6.0,7.4265,-74.4282


In [132]:
# Engineered features for the test rows, computed with vectorised column
# arithmetic instead of per-row `apply` lambdas.
# Note: np.log yields -inf/NaN (with a warning) for non-positive inputs,
# where math.log would raise.
# The column name 'lat_plus_lang' (sic) is kept because the model feature
# lists refer to it by that name.
test_data['bedrooms_squared'] = test_data['bedrooms'] ** 2
test_data['bed_bath_rooms'] = test_data['bedrooms'] * test_data['bathrooms']
test_data['log_sqft_living'] = np.log(test_data['sqft_living'])
test_data['lat_plus_lang'] = test_data['lat'] + test_data['long']

In [133]:
# Preview the first five test rows, including the new feature columns.
test_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_lang
0,114101516,20140528T000000,310000.0,3,1.0,1430,19901,1.5,0,0,...,0,98028,47.7558,-122.229,1780,12697,9,3.0,7.2654,-74.4732
1,9297300055,20150124T000000,650000.0,4,3.0,2950,5000,2.0,0,3,...,0,98126,47.5714,-122.375,2140,4000,16,12.0,7.9896,-74.8036
2,1202000200,20141103T000000,233000.0,3,2.0,1710,4697,1.5,0,0,...,0,98002,47.3048,-122.218,1030,4705,9,6.0,7.4442,-74.9132
3,8562750320,20141110T000000,580500.0,3,2.5,2320,3980,2.0,0,0,...,0,98027,47.5391,-122.07,2580,3980,9,7.5,7.7493,-74.5309
4,7589200193,20141110T000000,535000.0,3,1.0,1090,3000,1.5,0,0,...,0,98117,47.6889,-122.375,1570,5080,9,3.0,6.9939,-74.6861


In [140]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric test columns.
test_data.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_lang
count,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,...,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0,4229.0
mean,4604770649.9686,543054.043,3.3769,2.1136,2079.3663,15168.8591,1.4946,0.0071,0.227,3.4039,...,89.7226,98077.9518,47.5631,-122.2164,1988.8442,12735.8777,12.4467,7.5039,7.5503,-74.6533
std,2894016943.2937,356421.2458,1.0214,0.7636,905.3175,41265.627,0.5423,0.0839,0.7594,0.6547,...,413.7369,53.4266,0.138,0.1405,680.8376,27829.2002,17.869,4.227,0.4266,0.1822
min,1000102.0,85000.0,0.0,0.0,370.0,600.0,1.0,0.0,0.0,1.0,...,0.0,98001.0,47.1559,-122.514,700.0,660.0,0.0,0.0,5.9135,-75.1772
25%,2110900050.0,325000.0,3.0,1.75,1430.0,5027.0,1.0,0.0,0.0,3.0,...,0.0,98032.0,47.4766,-122.33,1490.0,5105.0,9.0,4.5,7.2654,-74.7672
50%,4019300906.0,453000.0,3.0,2.25,1920.0,7633.0,1.5,0.0,0.0,3.0,...,0.0,98065.0,47.5734,-122.239,1840.0,7611.0,9.0,7.0,7.5601,-74.6671
75%,7338220280.0,650000.0,4.0,2.5,2550.0,10760.0,2.0,0.0,0.0,4.0,...,0.0,98118.0,47.6795,-122.125,2370.0,10159.0,16.0,10.0,7.8438,-74.5281
max,9895000040.0,6885000.0,33.0,7.75,9890.0,1024068.0,3.5,1.0,4.0,5.0,...,2015.0,98199.0,47.7776,-121.315,5030.0,858132.0,1089.0,57.75,9.1993,-73.602


In [142]:
# Mean of each engineered feature over the test rows, unpacked in column order.
(
    bedrooms_squared_mean,
    bed_bath_rooms_mean,
    log_sqft_living_mean,
    lat_plus_lang_mean,
) = (
    test_data[column].mean()
    for column in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_lang')
)

In [147]:
# Collect the four test-set means into one array, in the order:
# bedrooms_squared, bed_bath_rooms, log_sqft_living, lat_plus_lang.
test_new_features_mean = np.array((
    bedrooms_squared_mean,
    bed_bath_rooms_mean,
    log_sqft_living_mean,
    lat_plus_lang_mean,
))
                                  

In [207]:
# Echo the array of engineered-feature means as the cell output.
test_new_features_mean

array([ 12.4466777 ,   7.50390163,   7.55027468, -74.65333355])

In [175]:
# Nested feature sets: each model starts from a copy of the previous list
# and appends its extra columns.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

model_2_features = list(model_1_features)
model_2_features.append('bed_bath_rooms')

model_3_features = list(model_2_features)
model_3_features.extend(['bedrooms_squared', 'log_sqft_living', 'lat_plus_lang'])

In [210]:
# Model 1: fit on the training rows, then tabulate one coefficient per feature.
regressor_1_train = LinearRegression()
model_1_fit_train = regressor_1_train.fit(
    X=train_data[model_1_features].values,
    y=ys,
)

model_1_coeff_train = pd.DataFrame(
    data=model_1_fit_train.coef_.flatten(),
    index=model_1_features,
    columns=['coefficients_1_train'],
)
model_1_coeff_train

Unnamed: 0,coefficients_1_train
sqft_living,312.2586
bedrooms,-59586.5332
bathrooms,15706.7421
lat,658619.2639
long,-309374.3513


# Read this page for interpreting the coefficients 
#https://www.theanalysisfactor.com/interpreting-regression-coefficients/

In [208]:
# Model 2: fit on the training rows, then tabulate one coefficient per feature.
regressor_2_train = LinearRegression()
model_2_fit_train = regressor_2_train.fit(
    X=train_data[model_2_features].values,
    y=ys,
)
model_2_coeff_train = pd.DataFrame(
    data=model_2_fit_train.coef_.flatten(),
    index=model_2_features,
    columns=['coefficients_2_train'],
)
model_2_coeff_train

Unnamed: 0,coefficients_2_train
sqft_living,306.6101
bedrooms,-113446.3681
bathrooms,-71461.3083
lat,654844.6295
long,-294298.9691
bed_bath_rooms,25579.652


In [209]:
# Model 3: fit on the training rows, then tabulate one coefficient per feature.
regressor_3_train = LinearRegression()
model_3_fit_train = regressor_3_train.fit(
    X=train_data[model_3_features].values,
    y=ys,
)
model_3_coeff_train = pd.DataFrame(
    data=model_3_fit_train.coef_.flatten(),
    index=model_3_features,
    columns=['coefficients_3_train'],
)
model_3_coeff_train

Unnamed: 0,coefficients_3_train
sqft_living,529.4228
bedrooms,34514.2296
bathrooms,67060.7813
lat,534085.6109
long,-406750.7109
bed_bath_rooms,-8570.5044
bedrooms_squared,-6788.5867
log_sqft_living,-561831.4841
lat_plus_lang,127334.9


In [199]:
# Training-set RSS for each nested model.
# Features are passed as NumPy arrays (`.values`) because the regressors were
# fitted on NumPy arrays; giving a DataFrame to an estimator that was fitted
# without column names makes newer scikit-learn releases emit a
# feature-name warning.
RSS_1_train = get_residual_sum_of_squares(regressor_1_train, train_data[model_1_features].values,
                                          train_data['price'].values)
RSS_2_train = get_residual_sum_of_squares(regressor_2_train, train_data[model_2_features].values,
                                          train_data['price'].values)
RSS_3_train = get_residual_sum_of_squares(regressor_3_train, train_data[model_3_features].values,
                                          train_data['price'].values)

In [201]:
# Report the three training-set RSS values on one line.
print(
    "RSS_1_Train: {}, RSS_2_Train: {}, RSS_3_Train: {}".format(
        RSS_1_train, RSS_2_train, RSS_3_train
    )
)

RSS_1_Train: 967879963049549.6, RSS_2_Train: 958419635074068.9, RSS_3_Train: 903436455050480.5


In [205]:

def _fit_on_test(features, column_label):
    """Fit a LinearRegression on the test rows using `features`.

    Returns (estimator, value returned by fit, coefficient table), where the
    table has one row per feature and a single column named `column_label`.
    """
    estimator = LinearRegression()
    fitted = estimator.fit(test_data[features].values, test_data['price'].values)
    table = pd.DataFrame(fitted.coef_.flatten(), features, columns=[column_label])
    return estimator, fitted, table


# These three models are fitted on the TEST rows themselves, so they are only
# useful for comparing coefficients against the train-fitted models.
regressor_1_test, model_1_fit_test, model_1_coeff_test = _fit_on_test(
    model_1_features, 'coefficients_1_test')
regressor_2_test, model_2_fit_test, model_2_coeff_test = _fit_on_test(
    model_2_features, 'coefficients_2_test')
regressor_3_test, model_3_fit_test, model_3_coeff_test = _fit_on_test(
    model_3_features, 'coefficients_3_test')

# Separate the tables with a blank line; the default single-space separator
# made the last row of one table run into the header of the next.
print(model_1_coeff_test, model_2_coeff_test, model_3_coeff_test, sep="\n\n")

             coefficients_1_test
sqft_living             291.5230
bedrooms            -36,344.2805
bathrooms            22,478.4634
lat                 649,781.1484
long               -322,967.6642                 coefficients_2_test
sqft_living                284.1100
bedrooms               -93,857.1172
bathrooms              -77,384.4701
lat                    646,266.3922
long                  -305,020.2515
bed_bath_rooms          29,194.7505                   coefficients_3_test
sqft_living                  469.7167
bedrooms                 -32,885.7029
bathrooms                 50,463.7585
lat                      525,504.3263
long                    -408,891.5784
bed_bath_rooms            -2,285.9238
bedrooms_squared           1,081.0017
log_sqft_living         -433,486.8880
lat_plus_lang            116,612.7479


In [211]:
# Test-set RSS for each nested model, scored with the regressors that were
# fitted on the TRAINING data.
# Features are passed as NumPy arrays (`.values`) because those regressors were
# fitted on NumPy arrays; giving a DataFrame to an estimator that was fitted
# without column names makes newer scikit-learn releases emit a
# feature-name warning.
RSS_1_test = get_residual_sum_of_squares(regressor_1_train, test_data[model_1_features].values,
                                          test_data['price'].values)
RSS_2_test = get_residual_sum_of_squares(regressor_2_train, test_data[model_2_features].values,
                                          test_data['price'].values)
RSS_3_test = get_residual_sum_of_squares(regressor_3_train, test_data[model_3_features].values,
                                          test_data['price'].values)
print("RSS_1_test: {}, RSS_2_test: {}, RSS_3_test: {}".format(RSS_1_test,RSS_2_test,RSS_3_test))

RSS_1_test: 225500469795489.66, RSS_2_test: 223377462976467.22, RSS_3_test: 259236319207179.4
