In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

Data Cleaning

In [None]:
# Read the raw housing listings (with locations) into a DataFrame and show it.
housing_csv_path = '/content/Housing_with_Locations.csv'
data1 = pd.read_csv(housing_csv_path)
data1

Unnamed: 0,Price,Area in sqft.,Bedrooms,Bathrooms,Stories,Mainroad,Guestrooms,Basement,Air Conditioning,Parking Area,Preferred Area,Furnishing Status,Location
0,1750000,3620,2,1,1,yes,no,no,yes,2,yes,furnished,"Whitefield, Bangalore"
1,1750000,2910,3,1,1,yes,no,no,yes,3,no,furnished,"Sector 62, Noida"
2,1750000,3850,3,1,2,yes,no,no,yes,2,yes,semi-furnished,"Gachibowli, Hyderabad"
3,1767150,2400,3,1,1,yes,no,no,no,0,no,unfurnished,"Gachibowli, Hyderabad"
4,1820000,3000,2,1,1,yes,yes,no,yes,1,yes,unfurnished,"Bandra, Mumbai"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,11410000,7420,4,1,2,yes,no,yes,no,1,no,furnished,"Sector 49, Gurgaon"
541,12215000,7500,4,2,2,yes,no,yes,no,0,no,unfurnished,"Whitefield, Bangalore"
542,12250000,8960,4,4,4,no,no,yes,no,0,no,semi-furnished,"Vashi, Navi Mumbai"
543,12250000,9960,3,2,2,yes,no,yes,no,0,no,unfurnished,"Andheri, Mumbai"


In [None]:
# Remove the columns that are not used as model features. `columns=` already
# selects the column axis, so no separate `axis` argument is needed.
unused_columns = ['Stories', 'Mainroad', 'Preferred Area']
data1.drop(columns=unused_columns, inplace=True)

In [None]:
# Display the frame again to confirm the three columns were removed.
data1

Unnamed: 0,Price,Area in sqft.,Bedrooms,Bathrooms,Guestrooms,Basement,Air Conditioning,Parking Area,Furnishing Status,Location
0,1750000,3620,2,1,no,no,yes,2,furnished,"Whitefield, Bangalore"
1,1750000,2910,3,1,no,no,yes,3,furnished,"Sector 62, Noida"
2,1750000,3850,3,1,no,no,yes,2,semi-furnished,"Gachibowli, Hyderabad"
3,1767150,2400,3,1,no,no,no,0,unfurnished,"Gachibowli, Hyderabad"
4,1820000,3000,2,1,yes,no,yes,1,unfurnished,"Bandra, Mumbai"
...,...,...,...,...,...,...,...,...,...,...
540,11410000,7420,4,1,no,yes,no,1,furnished,"Sector 49, Gurgaon"
541,12215000,7500,4,2,no,yes,no,0,unfurnished,"Whitefield, Bangalore"
542,12250000,8960,4,4,no,yes,no,0,semi-furnished,"Vashi, Navi Mumbai"
543,12250000,9960,3,2,no,yes,no,0,unfurnished,"Andheri, Mumbai"


In [None]:
# Encode the text flags as integers so the model can consume them:
#   yes/no columns    -> 1 for 'yes', 0 for anything else
#   Furnishing Status -> 2 furnished, 1 semi-furnished, 0 for anything else
#
# Each column is only encoded while it is still non-numeric. Without that guard,
# re-running this cell would compare the already-encoded integers against the
# original strings and silently overwrite every value with 0.
furnishing_levels = {'furnished': 2, 'semi-furnished': 1}

for flag_column in ['Guestrooms', 'Air Conditioning', 'Basement']:
    if not pd.api.types.is_numeric_dtype(data1[flag_column]):
        data1[flag_column] = data1[flag_column].apply(lambda x: 1 if x == 'yes' else 0)

if not pd.api.types.is_numeric_dtype(data1['Furnishing Status']):
    data1['Furnishing Status'] = data1['Furnishing Status'].apply(
        lambda status: furnishing_levels.get(status, 0)
    )

In [None]:
# Inspect the frame after the yes/no and furnishing columns were encoded as integers.
data1

Unnamed: 0,Price,Area in sqft.,Bedrooms,Bathrooms,Guestrooms,Basement,Air Conditioning,Parking Area,Furnishing Status,Location
0,1750000,3620,2,1,0,0,1,2,2,"Whitefield, Bangalore"
1,1750000,2910,3,1,0,0,1,3,2,"Sector 62, Noida"
2,1750000,3850,3,1,0,0,1,2,1,"Gachibowli, Hyderabad"
3,1767150,2400,3,1,0,0,0,0,0,"Gachibowli, Hyderabad"
4,1820000,3000,2,1,1,0,1,1,0,"Bandra, Mumbai"
...,...,...,...,...,...,...,...,...,...,...
540,11410000,7420,4,1,0,1,0,1,2,"Sector 49, Gurgaon"
541,12215000,7500,4,2,0,1,0,0,0,"Whitefield, Bangalore"
542,12250000,8960,4,4,0,1,0,0,1,"Vashi, Navi Mumbai"
543,12250000,9960,3,2,0,1,0,0,0,"Andheri, Mumbai"


In [None]:
# Count missing values per column.
# NOTE: the encoded flag columns cannot report missing values here, because the
# encoding step maps every value that is not an expected label (NaN included) to 0.
missing_per_column = data1.isna().sum()
missing_per_column

Unnamed: 0,0
Price,0
Area in sqft.,0
Bedrooms,0
Bathrooms,0
Guestrooms,0
Basement,0
Air Conditioning,0
Parking Area,0
Furnishing Status,0
Location,0


Correlation and Covariance

In [None]:
# Correlation matrix between living area and sale price.
area_and_price = data1[['Area in sqft.', 'Price']]
area_and_price.corr()

Unnamed: 0,Area in sqft.,Price
Area in sqft.,1.0,0.535997
Price,0.535997,1.0


In [None]:
# Covariance matrix between living area and sale price (unscaled, so the
# magnitudes reflect the raw units of both columns).
area_price_pair = data1[['Area in sqft.', 'Price']]
area_price_pair.cov()

Unnamed: 0,Area in sqft.,Price
Area in sqft.,4709512.0,2175676000.0
Price,2175676000.0,3498544000000.0


In [None]:
# List the columns that are still stored as Python objects (text); these are
# the ones that need encoding before modelling.
object_columns = data1.select_dtypes(include='object').columns
object_columns

Index(['Location'], dtype='object')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the only text column ('Location'); every other column is
# passed through unchanged.
# handle_unknown='ignore' encodes a location that was not seen during fit as an
# all-zero row instead of raising, so a location that lands only in the test
# split (or arrives at inference time) cannot crash predict()/score().
col_transfor=make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'),['Location']),
    remainder='passthrough'
)

In [None]:
# Separate the feature matrix from the regression target ('Price').
X = data1.drop(columns='Price')
Y = data1['Price']

In [None]:
# Feature standardiser, used as the middle step of the modelling pipeline.
# NOTE(review): tree ensembles are generally insensitive to feature scaling, so
# this step is probably unnecessary for the random forest - confirm before removing.
Scaler=StandardScaler()

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel restarts.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:
# Sanity-check the split sizes: features and target for train, then for test.
tuple(part.shape for part in (X_train, Y_train, X_test, Y_test))

((436, 9), (436,), (109, 9), (109,))

In [None]:
# Random forest regressor with default hyper-parameters. The seed fixes the
# bootstrap samples and per-split feature draws so repeated runs give the same
# fitted model and the same scores.
rfr=RandomForestRegressor(random_state=42)

In [None]:
# Chain preprocessing and model into a single estimator: the column transformer
# (one-hot 'Location', other columns passed through), then the scaler, then the
# random forest.
# NOTE(review): despite its name, `predicts` is the pipeline object itself, not
# a set of predictions.
predicts=make_pipeline(col_transfor,Scaler,rfr)

In [None]:
# Fit every pipeline step on the training split only, so the encoder and scaler
# are not fitted on the held-out rows.
predicts.fit(X_train,Y_train)

In [None]:
# Score on the held-out split; for scikit-learn regressors `score` is the
# coefficient of determination (R^2).
predicts.score(X_test,Y_test)

0.7576133999467122

In [None]:
# Mean squared error on the held-out split (in squared price units).
test_predictions = predicts.predict(X_test)
mean_squared_error(Y_test, test_predictions)

778921333623.5215

In [None]:
# Show the cleaned frame once more before building a hand-written example row.
data1

Unnamed: 0,Price,Area in sqft.,Bedrooms,Bathrooms,Guestrooms,Basement,Air Conditioning,Parking Area,Furnishing Status,Location
0,1750000,3620,2,1,0,0,1,2,2,"Whitefield, Bangalore"
1,1750000,2910,3,1,0,0,1,3,2,"Sector 62, Noida"
2,1750000,3850,3,1,0,0,1,2,1,"Gachibowli, Hyderabad"
3,1767150,2400,3,1,0,0,0,0,0,"Gachibowli, Hyderabad"
4,1820000,3000,2,1,1,0,1,1,0,"Bandra, Mumbai"
...,...,...,...,...,...,...,...,...,...,...
540,11410000,7420,4,1,0,1,0,1,2,"Sector 49, Gurgaon"
541,12215000,7500,4,2,0,1,0,0,0,"Whitefield, Bangalore"
542,12250000,8960,4,4,0,1,0,0,1,"Vashi, Navi Mumbai"
543,12250000,9960,3,2,0,1,0,0,0,"Andheri, Mumbai"


In [None]:
# Single-row example for a manual prediction. It carries only the nine feature
# columns the pipeline was fitted on, in the same order as the training frame.
# The target 'Price' is deliberately not included: the model is supposed to
# estimate it, and the pipeline was never fitted with that column.
# NOTE: the name `input` shadows Python's built-in input(); it is kept because
# the following predict cell refers to it.
input=pd.DataFrame(
    [[9960,4,4,0,0,0,0,1,'Andheri, Mumbai']],
    columns=['Area in sqft.','Bedrooms','Bathrooms','Guestrooms','Basement','Air Conditioning','Parking Area','Furnishing Status','Location'],
)

In [None]:
# Predict the price for the hand-built single-row example.
predicts.predict(input)

array([4524730.])

In [None]:
import pickle as pk

In [None]:
# Serialise the fitted pipeline (preprocessing + model) to disk. The context
# manager guarantees the file is flushed and closed even if pickling fails;
# a bare open() passed straight to dump() would leave the handle open.
# NOTE: only unpickle this file from a trusted source - pickle can execute
# arbitrary code on load.
with open('House_price_prediction_model.pkl','wb') as model_file:
    pk.dump(predicts,model_file)

In [None]:
# Persist the cleaned frame (unused columns dropped, flags encoded as integers)
# without writing the row index as an extra column.
data1.to_csv('Housing_Dataset.csv',index=False)