In [640]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20, 10)

#Data Load

In [641]:
df1 = pd.read_csv("bengaluru_house_prices.csv")
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [642]:
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [643]:
df1.shape

(13320, 9)

In [644]:
df1.groupby('area_type')['area_type'].agg('count')

Unnamed: 0_level_0,area_type
area_type,Unnamed: 1_level_1
Built-up Area,2418
Carpet Area,87
Plot Area,2025
Super built-up Area,8790


##Drop features that are not required to build our model

In [645]:
df2 = df1.drop(['area_type','society','availability'], axis='columns')
df2.shape

(13320, 6)

#Data Cleaning

In [646]:
df2.isnull().sum()

Unnamed: 0,0
location,1
size,16
total_sqft,0
bath,73
balcony,609
price,0


In [647]:
df2.groupby('balcony')['balcony'].agg('count')

Unnamed: 0_level_0,balcony
balcony,Unnamed: 1_level_1
0.0,1029
1.0,4897
2.0,5113
3.0,1672


In [648]:
df3 = df2.dropna()
df3.isnull().sum()

Unnamed: 0,0
location,0
size,0
total_sqft,0
bath,0
balcony,0
price,0


In [649]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12710 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    12710 non-null  object 
 1   size        12710 non-null  object 
 2   total_sqft  12710 non-null  object 
 3   bath        12710 non-null  float64
 4   balcony     12710 non-null  float64
 5   price       12710 non-null  float64
dtypes: float64(3), object(3)
memory usage: 695.1+ KB


#Feature Engineering

In [650]:
df2.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [651]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df3.bhk.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


array([ 2,  4,  3,  1,  6,  8,  7,  5, 11,  9, 27, 43, 14, 12, 10, 13])

##Explore total_sqft feature

In [652]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [653]:
df3[~df3['total_sqft'].apply(is_float)]

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,0.0,186.000,4
122,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.000,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,0.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,0.0,43.490,2
188,KR Puram,2 BHK,1015 - 1540,2.0,0.0,56.800,2
...,...,...,...,...,...,...,...
12975,Whitefield,2 BHK,850 - 1060,2.0,0.0,38.190,2
12990,Talaghattapura,3 BHK,1804 - 2273,3.0,0.0,122.000,3
13059,Harlur,2 BHK,1200 - 1470,2.0,0.0,72.760,2
13265,Hoodi,2 BHK,1133 - 1384,2.0,0.0,59.135,2


In [654]:
def convert_sqft_to_num(x):
  tokens = x.split('-')
  if(len(tokens) == 2):
    return (float(tokens[0])+float(tokens[1]))/2
  try:
        return float(x)
  except:
        return None

In [655]:
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]
df4.head(2)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4


###Add new feature called price per square feet

In [656]:
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


In [657]:
df5['price_per_sqft'].describe()

Unnamed: 0,price_per_sqft
count,12668.0
mean,6876.277
std,22633.54
min,267.8298
25%,4242.721
50%,5376.344
75%,7142.857
max,2300000.0


In [658]:
df5.to_csv("bhp.csv",index=False)

In [659]:
df5.location = df5.location.apply(lambda x: x.strip())
location_stats = df5['location'].value_counts()
location_stats

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,514
Sarjapur Road,372
Electronic City,302
Kanakpura Road,259
Thanisandra,233
...,...
12th cross srinivas nagar banshankari 3rd stage,1
Tilak Nagar,1
Pattegarhpalya,1
Sarvobhogam Nagar,1


In [660]:
len(location_stats)

1248

In [661]:
len(location_stats[location_stats <= 10])

1013

##Dimensionality Reduction

Any location having less than 10 data points should be tagged as "other" location. This way number of categories can be reduced by huge amount. Later on when we do one hot encoding, it will help us with having fewer dummy columns

In [662]:
location_stats_less_than_10 = location_stats[location_stats<=10]
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(df5.location.unique())

236

CHAT GPT advised me that usually the square footage per bedroom is 300 (i.e. a 2 bedroom apartment is a minimum of 600 square feet). If you have a 2 bedroom apartment with 400 sq ft, for example, then this looks suspicious and may be removed as an outlier. We will remove such outliers while maintaining the minimum threshold for two-bedroom apartments at 300 sq ft.

In [663]:
df6 = df5[~(df5.total_sqft/df5.bhk<300)]
df6.shape

(12013, 8)

##Outlier Removal Using Standard Deviation and Mean

In [664]:
df6.price_per_sqft.describe()

Unnamed: 0,price_per_sqft
count,12013.0
mean,6206.082347
std,3985.518807
min,267.829813
25%,4199.363057
50%,5252.525253
75%,6823.529412
max,176470.588235



Here we find that min price per sqft is 267 rs/sqft whereas max is 12000000, this shows a wide variation in property prices. We should remove outliers per location using mean and one standard deviation

In [665]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

df7 = remove_pps_outliers(df6)
df7.shape

(9852, 8)

##Use One Hot Encoding For Location

In [666]:
dummies = pd.get_dummies(df7.location)
dummies.head(3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [667]:
df8 = pd.concat([df7,dummies.drop('other',axis='columns')],axis='columns')
df8.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,4 BHK,2850.0,4.0,1.0,428.0,4,15017.54386,True,False,...,False,False,False,False,False,False,False,False,False,False
1,1st Block Jayanagar,3 BHK,1630.0,3.0,2.0,194.0,3,11901.840491,True,False,...,False,False,False,False,False,False,False,False,False,False
2,1st Block Jayanagar,3 BHK,1875.0,2.0,3.0,235.0,3,12533.333333,True,False,...,False,False,False,False,False,False,False,False,False,False
3,1st Block Jayanagar,3 BHK,1200.0,2.0,0.0,130.0,3,10833.333333,True,False,...,False,False,False,False,False,False,False,False,False,False
4,1st Block Jayanagar,2 BHK,1235.0,2.0,2.0,148.0,2,11983.805668,True,False,...,False,False,False,False,False,False,False,False,False,False


In [668]:
df8 = df8.drop(['location'],axis='columns')
df8 = df8.drop(['size'],axis='columns')
df8 = df8.drop(['price_per_sqft'],axis='columns')
df8.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,1.0,428.0,4,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1630.0,3.0,2.0,194.0,3,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1875.0,2.0,3.0,235.0,3,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1200.0,2.0,0.0,130.0,3,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1235.0,2.0,2.0,148.0,2,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [669]:
df = df8

#Model

In [670]:
df.shape

(9852, 240)

In [671]:
X = df.drop(['price'], axis='columns')
X.shape

(9852, 239)

In [672]:
y = df.price
len(y)

9852

In [673]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [674]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.790191855604815

##Use K Fold cross validation to measure accuracy of our LinearRegression model

In [675]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.8209707 , 0.74771638, 0.7945179 , 0.82886274, 0.76911856])

##Find best model using GridSearchCV

In [676]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
              'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X_scaled,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.792237,{'fit_intercept': True}
1,lasso,0.775292,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.692671,"{'criterion': 'squared_error', 'splitter': 'be..."


Based on above results we can say that Linear Regression gives the best score. Hence we will use that.

In [677]:
X.head(1)

Unnamed: 0,total_sqft,bath,balcony,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,1.0,4,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [678]:
def predict_price(location, sqft, bath, balcony, bhk):
  loc_index = np.where(X.columns == location)[0][0]
  x = np.zeros(len(X.columns))
  x[0] = sqft
  x[1] = bath
  x[2] = balcony
  x[3] = bhk
  if loc_index >= 0:
    x[loc_index] = 1
  return lr_clf.predict([x])[0]

In [679]:
print(predict_price("Electronic City Phase II", 1060, 2, 1, 2))

33.91905023204986




##Conclusions

The Linear Regression model performs quite well with an accuracy close to 0.792237. However, in order to achieve even better results, it is necessary to perform more thorough data cleaning.

####Improving data quality can significantly increase model performance.

#Export the tested model to a pickle file

In [680]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf, f)

###Export location and column information to a file that will be useful later on in our prediction application

In [681]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))