In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn

In [2]:
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
train_data = pd.read_csv('kc_house_train_data.csv')

In [4]:
test_data = pd.read_csv('kc_house_test_data.csv')

In [5]:
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


## Adding 4 new Variables to train and test data

In [6]:
temp = train_data

In [7]:
def add_columns(data):
    data['bedrooms_squared'] = data['bedrooms'] * data['bedrooms']
    data['bed_bath_rooms'] = data['bedrooms'] * data['bathrooms']
    data['log_sqft_living'] = np.log(data['sqft_living'])
    data['lat_plus_long'] = data['lat'] + data['long']


In [8]:
list(map(add_columns, (train_data, test_data)))


[None, None]

In [9]:
train_data[['bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long']]

Unnamed: 0,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,9,3.00,7.073270,-74.7458
1,9,6.75,7.851661,-74.5980
2,4,2.00,6.646391,-74.4951
3,16,12.00,7.580700,-74.8722
4,9,6.00,7.426549,-74.4282
5,16,18.00,8.597851,-74.3489
6,9,6.75,7.447168,-75.0173
7,9,4.50,6.966024,-74.9055
8,9,3.00,7.484369,-74.8247
9,9,7.50,7.544332,-74.6626


In [11]:
test_data[['bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long']]

Unnamed: 0,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,9,3.00,7.265430,-74.4732
1,16,12.00,7.989560,-74.8036
2,9,6.00,7.444249,-74.9132
3,9,7.50,7.749322,-74.5309
4,9,3.00,6.993933,-74.6861
5,16,10.00,7.870930,-74.6499
6,16,9.00,8.347590,-74.9020
7,16,10.00,7.718685,-74.7477
8,9,5.25,7.138867,-74.8984
9,16,8.00,7.919356,-74.5702


In [12]:
train_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'bedrooms_squared',
       'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'],
      dtype='object')

## Quiz Question: what are the mean (arithmetic average) values of your 4 new variables on TEST data? (round to 2 digits)

In [13]:
[np.mean(np.array(test_data[col])) for col in ['bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long']]

[12.4466777015843, 7.5039016315913925, 7.550274679645938, -74.65333355403168]

In [14]:
np.mean(np.array(test_data['lat_plus_long']))

-74.65333355403168

## Fitting Multiple Linear Regression Model

In [14]:
from sklearn.linear_model import LinearRegression as LR

In [15]:
def prediction(features):
    X_train = train_data[features]
    y_train = train_data[['price']]
    
    X_test = test_data[features]
    y_test = test_data[['price']]
    
    lr = LR()
    model = lr.fit(X_train, y_train)
    
    #Predict on test set
    prediction_test = model.predict(X_test)
    
    #Predict on train data
    prediction_train = model.predict(X_train)
    
    #RMSE and R-Squared Error
    print("Test Data\nRSS:- {}\tR-Squared Error:- {}\n" .format(mean_squared_error(y_test, prediction_test) * len(y_test), r2_score(y_test, prediction_test)))
    print("Train Data\nRSS:- {}\tR-Squared Error:- {}\n" .format(mean_squared_error(y_train, prediction_train) * len(y_train), r2_score(y_train, prediction_train)))
    
    return (model, prediction_test)

## Model 1

In [16]:
model_1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

In [17]:
model_one, prediction_test_model_1 = prediction(model_1)

Test Data
RSS:- 225500469795490.34	R-Squared Error:- 0.5801585583483893

Train Data
RSS:- 967879963049545.8	R-Squared Error:- 0.5926022811353866



## Model 2

In [18]:
model_2 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']

In [19]:
model_two, prediction_test_model_2 = prediction(model_2) 

Test Data
RSS:- 223377462976467.5	R-Squared Error:- 0.5841112163820648

Train Data
RSS:- 958419635074069.9	R-Squared Error:- 0.596584299757589



## Model 3

In [20]:
model_3 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat','long', 'bed_bath_rooms','bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [21]:
model_three, prediction_test_model_3 = prediction(model_3)

Test Data
RSS:- 259236319207180.12	R-Squared Error:- 0.5173484557122805

Train Data
RSS:- 903436455050477.8	R-Squared Error:- 0.6197276883725955



## Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?



In [22]:
model_one.coef_
#Positive

array([[ 3.12258646e+02, -5.95865332e+04,  1.57067421e+04,
         6.58619264e+05, -3.09374351e+05]])

In [23]:
model_one.intercept_

array([-69075726.79256983])

## Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?

In [24]:
model_two.coef_
#Negative

array([[ 3.06610053e+02, -1.13446368e+05, -7.14613083e+04,
         6.54844630e+05, -2.94298969e+05,  2.55796520e+04]])

In [25]:
model_two.intercept_

array([-66867968.87107886])

## Quiz Question: Which model (1, 2 or 3) had the lowest RSS on TRAINING data?

In [26]:
model_three.coef_

array([[ 5.29422820e+02,  3.45142296e+04,  6.70607813e+04,
         5.34085611e+05, -4.06750711e+05, -8.57050439e+03,
        -6.78858667e+03, -5.61831484e+05,  1.27334900e+05]])

In [27]:
model_three.intercept_

array([-62036084.9860983])

## Quiz Question: Which model (1, 2, or 3) had the lowest RSS on TESTING data?


### Model 1

### Model 2

In [79]:
#Lowest RSS on test data

### Model 3

In [80]:
#Lowest RSS on train data