# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset 

In [2]:
df = pd.read_csv('final_house_scrape.csv')
df.head(10)

Unnamed: 0,BHK,Bathroom,price(L),rate_persqft,area_insqft,construction_statusUnder Construction,construction_statusReady to move,Sale_typenew,Sale_typeresale,location_Thane West,...,location_Mazgaon,location_Sector-18 Ulwe,location_Ulhasnagar,location_vasant vihar thane west,location_New Balaji Nagar,location_Bhayandar West,location_Roadpali,location_Bhandup East,location_Mahim,location_Agripada
0,1,1,18.49,7034.0,263.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,40.0,11869.0,337.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,149.0,22513.0,664.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,2,49.0,7802.0,628.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,96.0,14222.0,675.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,2,45.0,7745.0,581.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2,2,81.99,10419.0,787.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2,2,135.0,19679.0,686.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2,2,125.0,19562.0,639.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,1,32.5,8689.0,374.0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# select the dependent and independent features for prediction.

In [3]:
X = df.drop("price(L)", axis=1)
y = df['price(L)']
print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)

Shape of X =  (81212, 154)
Shape of y =  (81212,)


# split the data into train and test 

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 51)
print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (64969, 154)
Shape of y_train =  (64969,)
Shape of X_test =  (16243, 154)
Shape of y_test =  (16243,)


# Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train= sc.transform(X_train)
X_test = sc.transform(X_test)

# Model Building

# Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
lr = LinearRegression()
lr_lasso = Lasso()

In [7]:
def rmse(y_test, y_pred):
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [8]:
lr.fit(X_train, y_train)
lr_score = lr.score(X_test, y_test) 
lr_rmse = rmse(y_test, lr.predict(X_test))
lr_score, lr_rmse

(0.9230118142259816, 33.24236339816179)

# Lasso 

In [9]:
lr_lasso.fit(X_train, y_train)
lr_lasso_score=lr_lasso.score(X_test, y_test) 
lr_lasso_rmse = rmse(y_test, lr_lasso.predict(X_test))
lr_lasso_score, lr_lasso_rmse

(0.9153004156388886, 34.86747430885102)

# Random Forest

In [10]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
rfr_score=rfr.score(X_test,y_test) 
rfr_rmse = rmse(y_test, rfr.predict(X_test))
rfr_score, rfr_rmse

(0.9996326889523914, 2.2961319695663436)

# Cross validation

In [12]:
from sklearn.model_selection import cross_val_score

cvs_rfr = cross_val_score(rfr, X_train,y_train, cv = 10)
cvs_rfr, cvs_rfr.mean()

(array([0.99868624, 0.99963287, 0.99945367, 0.9994672 , 0.99961462,
        0.99961539, 0.99925568, 0.99967598, 0.9995487 , 0.99944481]),
 0.999439515609599)

In [13]:
cvs_rfr2 = cross_val_score(RandomForestRegressor(), X_train,y_train, cv = 10)
cvs_rfr2, cvs_rfr2.mean()

(array([0.99866336, 0.99964392, 0.99946825, 0.99947739, 0.99961585,
        0.99960322, 0.99926379, 0.99967454, 0.99958261, 0.99946422]),
 0.999445715132658)

# Serialize Models

In [14]:
import pickle
# open a file, where you ant to store the data
file = open('lasso_model.pkl', 'wb')

# dump information to that file
pickle.dump(lr_lasso, file)

In [15]:
import pickle
# open a file, where you ant to store the data
file = open('random_model.pkl', 'wb')

# dump information to that file
pickle.dump(rfr, file)

In [16]:
import pickle
# open a file, where you ant to store the data
file = open('linear_model.pkl', 'wb')

# dump information to that file
pickle.dump(lr, file)

In [17]:
list(X.columns)

['BHK',
 'Bathroom',
 'rate_persqft',
 'area_insqft',
 'construction_statusUnder Construction',
 'construction_statusReady to move',
 'Sale_typenew',
 'Sale_typeresale',
 'location_Thane West',
 'location_Mira Road East',
 'location_Kharghar',
 'location_Kandivali East',
 'location_Ulwe',
 'location_Dombivali',
 'location_Virar',
 'location_Chembur',
 'location_Kalyan West',
 'location_Panvel',
 'location_Andheri West',
 'location_Ambernath East',
 'location_Dombivali East',
 'location_Badlapur East',
 'location_Malad West',
 'location_Mulund West',
 'location_Taloja',
 'location_Andheri East',
 'location_Powai',
 'location_Goregaon East',
 'location_Kandivali West',
 'location_Borivali East',
 'location_Badlapur West',
 'location_Kamothe',
 'location_Goregaon West',
 'location_Naigaon East',
 'location_Borivali West',
 'location_Bhandup West',
 'location_Malad East',
 'location_Kanjurmarg',
 'location_Vasai',
 'location_Kalyan East',
 'location_Ghansoli',
 'location_Ambernath West',
 

# Method for Prediction

In [18]:
def predict_house_price(model,BHK,Bathroom,rate_persqft,area_insqft,construction_status,Sale_type,location):
    x =np.zeros(len(X.columns)) # create zero numpy array, len = 107 as input value for model
    # adding feature's value accorind to their column index
    x[0]=BHK
    x[1]=Bathroom
    x[2]=rate_persqft
    x[3]=area_insqft
 
    if "construction_status"=="Ready To Move":
        x[5]=1
     
    if "Sale_type"=="resale":
        x[7]=1
 
    if 'location_'+location in X.columns:
        loc_index = np.where(X.columns=="location_"+location)[0][0]
        x[loc_index] =1
 
    #print(loc_index)
 
  #print(x)
 
  # feature scaling
    x = sc.transform([x])[0] # give 2d np array for feature scaling and get 1d scaled np array
  #print(x)
 
    return model.predict([x])[0] # return the predicted value by train Random forest model


In [26]:
#Test on Linear Part-1
predict_house_price(model=lr ,BHK=1,Bathroom=1,rate_persqft=14545,area_insqft=550,construction_status="Ready to move",Sale_type="resale",location="Ghatkopar") 

-613343954092892.8

In [27]:
# Test on Linear part 2
predict_house_price(model=lr,BHK=1,Bathroom=2,rate_persqft=14222,area_insqft=675,construction_status="Under Construction",Sale_type="new",location="Jogeshwari West")

-699780469738048.1

In [25]:
#Test on Lasso Part-1
predict_house_price(model=lr_lasso,BHK=1,Bathroom=1,rate_persqft=14545,area_insqft=550,construction_status="Ready to move",Sale_type="resale",location="Ghatkopar") 

86.59369534009585

In [20]:
# Test on lassso part 2
predict_house_price(model=lr_lasso,BHK=1,Bathroom=2,rate_persqft=14222,area_insqft=675,construction_status="Under Construction",Sale_type="new",location="Jogeshwari West")

103.25889568621106

In [21]:
# Test on Random forest part 1
predict_house_price(model=rfr,BHK=1,Bathroom=1,rate_persqft=14545,area_insqft=550,construction_status="Ready to move",Sale_type="resale",location="Ghatkopar")

80.0

In [22]:
# Test on Random forest part 2
predict_house_price(model=rfr,BHK=1,Bathroom=2,rate_persqft=14222,area_insqft=675,construction_status="Under Construction",Sale_type="new",location="Jogeshwari West")

95.88

In [None]:
# From the Above predictions we see that in Lasso model there is error in price value around 5-6 lacs from original price
#While RandomForest has negligible error compared to Lasso and Linear regression so we will use Random Forest for further user-interface
#Linear Regression model totally failed to predict the values.