In [31]:
import pandas as pd
import numpy as np
import io

import matplotlib.pyplot as plt

import sklearn.metrics as metrics

import pickle

from google.colab import files

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

In [32]:
# from google.colab import files
uploaded = files.upload()

Saving data_all_raw.csv to data_all_raw.csv


In [33]:
### Single family homes in SF Bay Area from June 2019, more details in:
### https://towardsdatascience.com/house-hunting-in-the-san-francisco-bay-area-deal-hunting-using-machine-learning-3ed6fc8e8991
data_all_raw = pd.read_csv('data_all_raw.csv')

In [34]:
# format/clean -- select columns of interest, drop rows with zeros, NaNs
data_all_temp2 = data_all_raw.dropna()
data_all_temp3 = data_all_temp2[(data_all_temp2 != 0).all(1)]

# get rid of some outliers
### Remove all houses with home size > 5000 sq ft, or lot size > 20,000 sq ft, or price > $5 million, etc.
data_all_temp4 = data_all_temp3[data_all_temp3['Home size'] < 5000]
data_all_temp5 = data_all_temp4[data_all_temp4['Lot size'] < 20000]
data_all_temp6 = data_all_temp5[data_all_temp5['Price'] < 5000000]
data_all_temp7 = data_all_temp6[data_all_temp6['Beds'] < 6]
data_all_temp8 = data_all_temp7[data_all_temp7['Baths'] < 6]

# remove spaces in column names - necessary for OLS?
data_all_temp8.columns = data_all_temp8.columns.str.replace(' ', '_')
data_all = data_all_temp8

# check for multicollinearity
correlations = data_all.corr()

# query count, mean, stdev etc. of selected data
data_all.describe()

## option to select one zipcode
#zipcode = 94401
#data_subset = data_all[data_all['Zip'] == zipcode]

Unnamed: 0,Unnamed:_0,Zip,Price,Beds,Baths,Home_size,Lot_size,Latitude,Longitude,SF_time,PA_time,School_score,Commute_time
count,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0,5758.0
mean,3641.553838,94619.864536,1194509.0,3.426016,2.318861,1906.304967,6704.043175,37.698938,-122.115052,84.225947,84.442862,50.217367,68.383814
std,2020.278018,331.993813,739402.6,0.851077,0.842316,773.425567,3411.103629,0.260709,0.231021,26.95054,33.735555,19.45398,28.831007
min,1.0,94002.0,129000.0,1.0,1.0,107.0,436.0,37.150065,-122.687775,8.0,5.0,17.1,5.0
25%,1939.25,94517.0,679000.0,3.0,2.0,1329.0,4500.0,37.465193,-122.274202,65.0,60.0,33.1,47.0
50%,3658.5,94579.0,995975.0,3.0,2.0,1750.0,6064.0,37.739942,-122.10269,90.0,85.0,48.8,65.0
75%,5283.75,94903.0,1469722.0,4.0,3.0,2330.0,8102.0,37.926986,-121.943836,100.0,115.0,65.3,90.0
max,7145.0,95391.0,4999888.0,5.0,5.5,4955.0,19602.0,38.149218,-121.532356,170.0,150.0,89.8,130.0


In [35]:
# Regression metrics

# https://stackoverflow.com/questions/26319259/how-to-get-a-regression-summary-in-scikit-learn-like-r-does
# https://machinelearningmastery.com/regression-metrics-for-machine-learning/

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html
# R^2 (coefficient of determination) regression score function.
# Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
# A constant model that always predicts the expected value of y, disregarding the input features, would get a
# score of 0.0.

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
# MSE is Mean squared error regression loss.

# The MSE is calculated as the mean or average of the squared differences between predicted and expected target values in a dataset.
# The units of the MSE are squared units.
# For example, if the value represents “dollars,” then the MSE will be “squared dollars.” 
# This can be confusing for stakeholders; therefore, when reporting results, often the root mean squared error (RMSE) is used instead
def regression_results(y_true, y_pred):
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred) 
    mse = metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # print('explained_variance: ', round(explained_variance,4))    
    # print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    # print('R_squared: ', round(r2,4))
    # print('MAE: ', round(mean_absolute_error,4))
    # print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [36]:
def display_scores (scores):
    print("Scores:", scores)
    print("\n Mean:", scores.mean())
    print("\n Standard deviation:", scores.std(),"\n")

### Train and Test Model

Listing data only

If model has no cross-validation, data is split into train and test.

If model has cross-validation, data is split into train, validation, and test.

Best model so far: **Decision tree** but it overfits


Model results, rounded to 2 decimal places:

**Linear regression**

Test set RMSE:  $605,096.22

**Decision tree**

Test set RMSE:  $19,314.05

**10-fold cross validated decision tree**

Eval set RMSE Mean: $861,902.94

Test set RMSE:  $19,314.05

**10-fold cross validated linear regression**

Eval set RMSE Mean: $615,793.63

Test set RMSE:  $605,096.22

**10-fold cross validated random forest regressor**

Eval set RMSE Mean: $65,8179.03

Test set RMSE:  $189,566.31

In [37]:
### sklearn ###

# set features (independent) and labels (dependent)
y = data_all['Price']

# before commute, school quality data
x2 = data_all[['Home_size', 'Lot_size', 'Beds', 'Baths']]

# fit data
# regressor = LinearRegression()
regressor = DecisionTreeRegressor()
# regressor = RandomForestRegressor()
regressor.fit(x2, y)

# Train a model with 10-fold cross validation.
# Randomly splits the training set into 10 distinct subsets called folds, 
# then it trains and evaluates the Decision Tree model 10 times, 
# picking a different fold for evaluation every time and training on the 
# other 9 folds. The result is an array containing the 10 evaluation scores
# Note: sklearn cross validation expects a utility function, i.e. greater is better,
# rather than cost function like MSE where lower is better.
# That's why -scores is computed to get the RMSE.
scores = cross_val_score(regressor, x2, y, scoring="neg_mean_squared_error", cv = 10)
tree_rmse_scores  = np.sqrt(-scores)
display_scores(tree_rmse_scores)

# predict values based on model
y_pred2 = regressor.predict(x2) # listing data only

# calculate difference between predicted and actual prices
diff = round((y - y_pred2), 6)

# add difference to full data set
data_all['Price_difference'] = diff

Scores: [1131510.70027451 1084661.84878058 1080441.36893986  687650.27617917
  762602.14989779  688480.18842043  704042.49569219  866978.71015081
  873386.85142693  797034.66368465]

 Mean: 867678.9253446913

 Standard deviation: 164109.97716175174 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [38]:
# Root Mean Squared Error as the evaluation metric
regression_results(y, y_pred2)

RMSE:  19314.0491


In [39]:
data_all.head()

Unnamed: 0,Unnamed:_0,Address,City,State,Zip,Price,Beds,Baths,Home_size,Lot_size,Latitude,Longitude,SF_time,PA_time,School_score,Commute_time,Price_difference
1,1,1909 Hillman Ave,Belmont,CA,94002,1595000,4,2.0,2220.0,3999.0,37.521972,-122.294079,63,33,77.9,33,0.0
2,2,641 Waltermire St,Belmont,CA,94002,899999,2,1.0,840.0,4234.0,37.520233,-122.273144,63,33,77.9,33,0.0
3,3,2706 Sequoia Way,Belmont,CA,94002,1588000,3,2.0,1860.0,5210.0,37.520192,-122.309437,63,33,77.9,33,0.0
4,4,1568 Winding Way,Belmont,CA,94002,1999000,4,3.5,2900.0,16117.2,37.52428,-122.291241,63,33,77.9,33,0.0
5,5,1440 5th Ave,Belmont,CA,94002,1249950,2,1.0,1170.0,5001.0,37.516227,-122.272763,63,33,77.9,33,0.0


In [40]:
# save the model
file = open("house_price_model.pkl", 'wb')
pickle.dump(regressor, file)
file.close()

In [41]:
# check model can run prediction given inputs 
# with arbitrary first row of training values, expect prediction = $1,595,000
with open('house_price_model.pkl', 'rb') as pickle_in:
  model = pickle.load(pickle_in) 
  home_size = 2220.0 
  lot_size = 3999.0
  beds = 4
  baths = 2.0 
  # get prediction
  input_cols = [[home_size, lot_size, beds, baths]]
  prediction = model.predict(input_cols)
  output = round(prediction[0], 2)
  print('Your predicted house price: $' + str(output))

Your predicted house price: $1595000.0


  "X does not have valid feature names, but"
