In [1]:
from sklearn.metrics import mean_squared_error
from math import sqrt # rms = sqrt(mean_squared_error(y_true, y_predicted))
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import pickle

# linear regression models

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet,BayesianRidge
from sklearn.svm import SVR

# cross val, k-folds
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import make_pipeline

  from numpy.core.umath_tests import inner1d


In [2]:
import torch
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

from keras.regularizers import l1
from keras.layers.normalization import BatchNormalization
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Load the data set from the working directory; its 'price' column is the regression target below.
df = pd.read_csv('FinalREW.csv')

In [4]:
# Remove the 'Unnamed: 0' column first, then the columns found at positions 12..543
# of the remaining frame (the positional slice is evaluated after the first drop).
df = df.drop(['Unnamed: 0'], axis=1)
df = df.drop(df.columns[12:544], axis=1)

In [98]:
# Features are every column except the target; 20% of the rows are held out for testing.
X = df.drop('price', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['price'],
    test_size=0.2,
    random_state=42,
)

In [99]:
# Base model saved as 'svr_f1.pkl': support vector regression with a linear kernel.
# `kernel` is passed by keyword because recent scikit-learn releases reject positional
# estimator arguments; the keyword form behaves the same on old and new versions.
svr_model = SVR(kernel='linear', C=500)
svr_model.fit(X_train, y_train)
y_pred = svr_model.predict(X_test)

# Score on the held-out split.
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Support Vector Regression Scores- rmse:",rmse," r2:",r2)

Support Vector Regression Scores- rmse: 734916.5834675301  r2: 0.4844043556316916


In [100]:
# Persist the fitted SVR. The context manager closes (and flushes) the file handle,
# which the bare open() call never did.
filename = 'svr_f1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(svr_model, model_file)

In [39]:
# Base model saved as 'gbr_f2.pkl': gradient boosting with hand-picked (untuned) settings.
# 'loss' and 'max_features' are left at their defaults (least-squares loss, all features):
# that is what the former 'ls' / 'auto' values selected, and those spellings are rejected
# by newer scikit-learn releases. random_state pins the remaining randomness.
params = {'n_estimators': 600, 'max_depth': 3,
          'learning_rate': 0.1, 'random_state': 42}

gbr_model = GradientBoostingRegressor(**params)
gbr_model.fit(X_train, y_train)

y_pred = gbr_model.predict(X_test)

# Score on the held-out split.
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Gradient Boost(No tuning) Regression Scores- rmse:",rmse," r2:",r2)

Gradient Boost(No tuning) Regression Scores- rmse: 399439.27864794794  r2: 0.8476878029902623


In [0]:
# Persist the fitted gradient boosting model. The context manager closes (and flushes)
# the file handle, which the bare open() call never did.
filename = 'gbr_f2.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(gbr_model, model_file)

In [106]:
# Base model saved as 'rfr_f3.pkl': random forest regressor.
# max_features is left at its default, which uses all features for regression forests
# just like the former 'auto' value; 'auto' itself is rejected by newer scikit-learn.
# random_state makes the bootstrap sampling, and therefore the scores, repeatable.
rfr_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=2,
    min_samples_split=10,
    random_state=42,
)
rfr_model.fit(X_train, y_train)
y_pred = rfr_model.predict(X_test)

# Score on the held-out split.
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Random Forest Scores- rmse:",rmse," r2:",r2)

Random Forest Scores- rmse: 446583.2423091945  r2: 0.8096126933618631


In [107]:
# Persist the fitted random forest. The context manager closes (and flushes)
# the file handle, which the bare open() call never did.
filename = 'rfr_f3.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rfr_model, model_file)

In [72]:

# The input layer is as wide as the number of feature columns.
cols = X_train.shape[1]

# Fully connected regression network: 200 -> 100 -> 30 ReLU units, then one linear output.
model_Neural = Sequential([
    Dense(200, activation='relu', input_shape=(cols,)),
    Dense(100, activation='relu'),
    Dense(30, activation='relu'),
    Dense(1),
])
model_Neural.compile(optimizer='adam', loss='mean_squared_error')

# Give up after 70 epochs without improvement of the monitored quantity;
# 10% of the training rows are set aside as the validation set.
early_stopping_monitor = EarlyStopping(patience=70)
model_Neural.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=300,
    callbacks=[early_stopping_monitor],
)

# Evaluate on the held-out split.
predictions = model_Neural.predict(X_test)
print("R2 score:",r2_score(y_test, predictions))
rms = sqrt(mean_squared_error(y_test, predictions))
print("RMSE Score:",rms)

Train on 8624 samples, validate on 959 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300


In [0]:
# The Keras model is written with model.save (HDF5), not pickled, so the file name uses
# the .h5 extension; the previous 'nn_f4.pkl' value was never used and was misleading.
filename = 'nn_f4.h5'
model_Neural.save(filename)

In [0]:
# Build the data set for the stacked (meta) model: one row per row of `df`, holding the
# actual price and each base model's prediction for that row.
#
# NOTE(review): f1..f4 are the base models loaded in the "Load models" cell further down;
# on a fresh kernel that cell has to be run before this one.
# NOTE(review): the base models were fitted on X_train only, yet predictions are taken for
# every row here, so most meta-model inputs are in-sample predictions. Consider
# out-of-fold predictions if an unbiased meta-model is needed.

# Every column after the first is treated as a feature (same selection as a per-row
# `df.loc[i, df.columns[1:]]` lookup).
base_features = df[df.columns[1:]].values

# One batched predict call per model replaces a predict call per row, and the frame is
# created in one go instead of being grown cell by cell with .loc.
ndf = pd.DataFrame(
    {
        'price': df['price'].values,
        'f1': f1.predict(base_features),
        'f2': f2.predict(base_features),
        'f3': f3.predict(base_features),
        'f4': f4.predict(base_features)[:, 0],  # network output has one column per row
    },
    index=df.index,
    columns=['price', 'f1', 'f2', 'f3', 'f4'],
)
  

In [0]:
# Split the stacked data set. The seed makes the meta-model scores reproducible; it is the
# same seed as the first split, so with an unchanged row count the held-out rows should
# be the ones the base models did not train on either (worth confirming).
X = ndf.drop(['price'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, ndf['price'], test_size=0.2, random_state=42
)

In [119]:
# Meta-model: ordinary least squares on the four base-model predictions.
# The former `normalize=True` argument is omitted: rescaling the inputs does not change an
# unregularized least-squares fit, and the parameter no longer exists in newer scikit-learn.
lm_model = LinearRegression()
lm_model.fit(X_train, y_train)
y_pred = lm_model.predict(X_test)

# Score on the held-out split of the stacked data set.
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Linear regression Scores- rmse:",rmse," r2:",r2)

Linear regression Scores- rmse: 312813.0787349889  r2: 0.9020859017485927


In [150]:
# Spot check: compare the stacked prediction with the recorded price for one row.
i = 66

# Everything after the first column of ndf is a base-model prediction; shape it as one sample.
x = np.array(ndf.loc[i, ndf.columns[1:]]).reshape(1, -1)

print('predicted price: %.2f' % lm_model.predict(x)[0])
print('actual price: ', df.loc[i, 'price'])

predicted price: 552918.29
actual price:  518800


In [0]:
# Persist the fitted meta-model. The context manager closes (and flushes)
# the file handle, which the bare open() call never did.
filename = 'finalmodel.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lm_model, model_file)

In [None]:
# Three-character postal area codes. encode_data() below carries its own copy of this list
# and emits one 0/1 flag per entry in this order, so the order must not change.
areas = ['V3S', 'V7C', 'V3W', 'V4N', 'V6Y', 'V6X', 'V4A', 'V2X', 'V7E', 'V3B','V3T', 'V4B', 'V3R', 'V3A', 'V6B',
         'V5R', 'V3M', 'V5H','V3E', 'V2Y', 'V3Z', 'V7S', 'V6P', 'V6Z', 'V7A', 'V3K', 'V3J', 'V3N', 'V7L', 'V3H',
         'V3C', 'V4C', 'V3X', 'V5J', 'V1M', 'V3V', 'V5N', 'V6E', 'V5E', 'V7V', 'V4P', 'V5C', 'V4K', 'V4R', 'V5X',
         'V5Z', 'V2W', 'V6S','V2Z', 'V6M', 'V3L', 'V7M', 'V6G', 'V5M', 'V7W', 'V5A', 'V4M', 'V6N', 'V5Y', 'V5S',
         'V7R', 'V7J', 'V5P', 'V6R', 'V7T', 'V5B', 'V7P', 'V3Y', 'V4W', 'V6J', 'V4L', 'V7G', 'V6K', 'V6A', 'V5G',
         'V5T', 'V7N','V4E', 'V6H', 'V6V', 'V5K', 'V6T', 'V6C', 'V5L', 'V6L', 'V5V', 'V5W', 'V7H', 'V7K', 'V6W',
         'V0N', 'V0V', 'V7B', 'V3G', 'V0X', 'V2S', 'V2T', 'V4S', 'V4X', 'V1V', 'V2E', 'V0T', 'V9L', 'V8K', 'V0H',
         'V0Y','V0M', 'V8V', 'V2J', 'V2A', 'V1L', 'V2H', 'V0B', 'V0L', 'V4G']

In [None]:
# Per-area numeric value keyed by postal area code; encode_data() below holds an identical
# table and labels it "Crime percentage". Areas missing from the table are encoded as 0.0 there.
# NOTE(review): how the percentages were derived is not shown in this notebook.
crimep = {'V6E':9.652823,'V6H':8.577192,'V5T':8.09922,'V5M':6.480475,'V5L':6.361571,'V6J':6.026049,'V5N':5.794912,
          'V6A':5.666982,'V5K':4.266424,'V6M':3.76216,'V5W':3.643648,'V6L':3.232389,'V5V':3.110738,'V6P':2.989087,
          'V5P':2.39692,'V6S':2.266636,'V6R':1.380544,'V5Z':1.316187,'V6G':0.784061,'V6N':0.096536}

In [0]:
# Per-area numeric score keyed by postal area code.
# NOTE(review): presumably a school rating per area (encode_data() has a section called
# "Encoding School type") -- confirm. encode_data() defines its own local `srs` that maps
# areas to three-element 0/1 lists instead, so this numeric table is not what the function reads.
srs = {'V3R':5.214286,'V3S':5.940625,'V3T':4.837500,'V3V':3.827273,'V3W':5.425000,'V3X':5.785714,'V4A':6.916667,'V4N':6.286364,
       'V4P':7.000000,'V5K':7.040000,'V5L':5.450000,'V5M':7.140000,'V5N':6.883333,'V5P':6.216667,'V5R':6.800000,'V5S':6.166667,
       'V5T':8.000000,'V5V':5.933333,'V5W':5.425000,'V5X':6.650000,'V5Z':7.600000,'V6B':6.800000,'V6G':5.500000,'V6H':7.600000,
       'V6J':8.280000,'V6K':7.875000,'V6L':6.475000,'V6M':7.280000,'V6N':7.466667,'V6P':7.660000,'V6R':7.657143,'V6S':9.400000,
       'V6T':6.550000,'V6Z':6.600000}

### Model for Front-end

In [108]:
# Load models
#
# f1..f3 are the pickled scikit-learn base models, f4 is the Keras network and
# stackedmodel is the linear meta-model that combines their four predictions.
# NOTE: pickle.load can execute arbitrary code -- only load model files you created yourself.

f1n = 'svr_f1.pkl' # Try svr_f1_new.pkl if predict_price function fails
f2n = 'gbr_f2.pkl'
f3n = 'rfr_f3.pkl' # Try rfr_f3_new.pkl if predict_price function fails
f4n = 'nn_f4.h5'
finall = 'finalmodel.pkl'

# Context managers close each file as soon as its model has been read.
with open(f1n, 'rb') as model_file:
    f1 = pickle.load(model_file)
with open(f2n, 'rb') as model_file:
    f2 = pickle.load(model_file)
with open(f3n, 'rb') as model_file:
    f3 = pickle.load(model_file)
f4 = load_model(f4n)
with open(finall, 'rb') as model_file:
    stackedmodel = pickle.load(model_file)



In [109]:
# Lookup tables used by encode_data; defined once here instead of on every call.

# Crime percentage per postal area. Areas that are not listed are encoded as 0.0.
_CRIME_PERCENT = {
    'V6E': 9.652823, 'V6H': 8.577192, 'V5T': 8.09922, 'V5M': 6.480475, 'V5L': 6.361571,
    'V6J': 6.026049, 'V5N': 5.794912, 'V6A': 5.666982, 'V5K': 4.266424, 'V6M': 3.76216,
    'V5W': 3.643648, 'V6L': 3.232389, 'V5V': 3.110738, 'V6P': 2.989087, 'V5P': 2.39692,
    'V6S': 2.266636, 'V6R': 1.380544, 'V5Z': 1.316187, 'V6G': 0.784061, 'V6N': 0.096536,
}

# Three-slot 0/1 "school type" encoding per postal area. Areas that are not listed get
# [0, 0, 1]. NOTE(review): the meaning of each slot is not documented in this notebook.
_SCHOOL_TYPE_ONEHOT = {
    'V4A': [0, 1, 0], 'V4N': [0, 1, 0], 'V4P': [0, 1, 0], 'V5K': [0, 1, 0],
    'V5M': [0, 1, 0], 'V5N': [0, 1, 0], 'V5P': [0, 1, 0], 'V5R': [0, 1, 0],
    'V5S': [0, 1, 0], 'V5T': [0, 1, 0], 'V5X': [0, 1, 0], 'V5Z': [0, 1, 0],
    'V6B': [0, 1, 0], 'V6H': [0, 1, 0], 'V6J': [1, 0, 0], 'V6K': [0, 1, 0],
    'V6L': [0, 1, 0], 'V6M': [0, 1, 0], 'V6N': [0, 1, 0], 'V6P': [0, 1, 0],
    'V6R': [0, 1, 0], 'V6S': [1, 0, 0], 'V6T': [0, 1, 0], 'V6Z': [0, 1, 0],
}

# Postal areas in the exact order of the one-hot block; the order must not change
# because each position corresponds to one input column of the trained models.
_ONEHOT_AREAS = [
    'V3S', 'V7C', 'V3W', 'V4N', 'V6Y', 'V6X', 'V4A', 'V2X', 'V7E', 'V3B', 'V3T', 'V4B', 'V3R', 'V3A', 'V6B',
    'V5R', 'V3M', 'V5H', 'V3E', 'V2Y', 'V3Z', 'V7S', 'V6P', 'V6Z', 'V7A', 'V3K', 'V3J', 'V3N', 'V7L', 'V3H',
    'V3C', 'V4C', 'V3X', 'V5J', 'V1M', 'V3V', 'V5N', 'V6E', 'V5E', 'V7V', 'V4P', 'V5C', 'V4K', 'V4R', 'V5X',
    'V5Z', 'V2W', 'V6S', 'V2Z', 'V6M', 'V3L', 'V7M', 'V6G', 'V5M', 'V7W', 'V5A', 'V4M', 'V6N', 'V5Y', 'V5S',
    'V7R', 'V7J', 'V5P', 'V6R', 'V7T', 'V5B', 'V7P', 'V3Y', 'V4W', 'V6J', 'V4L', 'V7G', 'V6K', 'V6A', 'V5G',
    'V5T', 'V7N', 'V4E', 'V6H', 'V6V', 'V5K', 'V6T', 'V6C', 'V5L', 'V6L', 'V5V', 'V5W', 'V7H', 'V7K', 'V6W',
    'V0N', 'V0V', 'V7B', 'V3G', 'V0X', 'V2S', 'V2T', 'V4S', 'V4X', 'V1V', 'V2E', 'V0T', 'V9L', 'V8K', 'V0H',
    'V0Y', 'V0M', 'V8V', 'V2J', 'V2A', 'V1L', 'V2H', 'V0B', 'V0L', 'V4G',
]


def encode_data(htype,parea):
    """Encode house type and postal area as a flat list of numeric features.

    Layout of the returned list:
      [0]    crime percentage of the area (0.0 if the area has no entry)
      [1:3]  house type: [0, 1] for 'Apt/Condo', 'Mfd/Mobile Home' or 'Townhouse',
             [1, 0] for anything else
      [3:6]  school type encoding of the area ([0, 0, 1] if the area has no entry)
      [6:]   one 0/1 flag per known postal area, all zeros for an unknown area

    Parameters:
      htype: house type string; compared exactly (case-sensitive).
      parea: three-character postal area code such as 'V5J'. Strings are stripped
             of surrounding whitespace and upper-cased before lookup, so ' v5j ' is
             treated like 'V5J' instead of silently falling into the "unknown area"
             encoding. Non-string values are looked up unchanged.

    Returns:
      A new list of numbers in the layout described above.
    """
    if isinstance(parea, str):
        parea = parea.strip().upper()

    # Crime percentage
    encoded = [_CRIME_PERCENT.get(parea, 0.0)]

    # House type
    if htype in ('Apt/Condo', 'Mfd/Mobile Home', 'Townhouse'):
        encoded += [0, 1]
    else:
        encoded += [1, 0]

    # School type (+= copies the values, the table entry itself is never modified)
    encoded += _SCHOOL_TYPE_ONEHOT.get(parea, [0, 0, 1])

    # Postal area one-hot
    encoded += [1 if code == parea else 0 for code in _ONEHOT_AREAS]

    return encoded


In [110]:
def predict_price(bed,bath,area_sqft,age,fireplaces,housetype,area):
    """Predict a price with the stacked model.

    The five numeric inputs, followed by the output of encode_data(housetype, area),
    form one feature row. That row is scored by the four base models (globals f1, f2,
    f3, f4) and their four predictions, in that order, are fed to the global
    `stackedmodel`.

    Parameters:
      bed, bath, area_sqft, age, fireplaces: numeric values, used in this order as
        the first five features.
      housetype, area: passed unchanged to encode_data.

    Returns:
      The first element of stackedmodel.predict(...) for the single input row.

    Requires f1, f2, f3, f4 and stackedmodel to be loaded before the call.
    """
    features = [bed, bath, area_sqft, age, fireplaces] + encode_data(housetype, area)

    # One sample as a (1, n_features) float array; built directly instead of going
    # through a tuple and a temporary DataFrame.
    inp_sample = np.asarray([features], dtype=float)

    base_predictions = [
        f1.predict(inp_sample)[0],
        f2.predict(inp_sample)[0],
        f3.predict(inp_sample)[0],
        f4.predict(inp_sample)[0][0],  # the network returns one row with one column
    ]

    stacked_input = np.asarray([base_predictions], dtype=float)
    predicted_price = stackedmodel.predict(stacked_input)[0]

    return predicted_price
    

In [111]:
# Inputs from front-end (example values; passed positionally to predict_price in the next cell)

bedrooms = 3 # Any Integer
bathrooms = 2 # Any Integer
Area_sqft = 1800 # Any Integer
builtin_last = 12 # Property built in last x years (passed as the `age` argument). Any Integer
fireplaces_in_house = 1 # Any Integer
house_type = 'Townhouse' # Can only be 'Apt/Condo','Mfd/Mobile Home','Townhouse','Duplex','House','Land/Lot','Multifamily'
postal_area = 'V5J' # Three-character postal area code as listed in encode_data's area list

In [112]:
# Output: score the inputs defined above with the stacked model and print the
# result with two decimals.

p = predict_price(bedrooms,bathrooms,Area_sqft,builtin_last,fireplaces_in_house,house_type,postal_area)
print('Predicted price: %.2f'%p)

Predicted price: 1004599.99
