Libraries


In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import copy
from pandas_profiling import ProfileReport

# Load dataset and overview 

In [3]:
df = pd.read_csv("Bengaluru_House_Data.csv")

In [4]:
df.shape

(13320, 9)

In [5]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [6]:
df.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# Drop unnecessary columns


In [7]:
df2 = df.drop(['availability','society','area_type'],axis='columns')
df2.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


# Handiling NULL value


In [8]:
df2.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [9]:
df2.balcony.mean()

1.5843757375501535

In [10]:
df3 = df2.fillna({"balcony" : df2.balcony.mean()})
df3.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
balcony        0
price          0
dtype: int64

In [11]:
df4= df3.dropna()

In [63]:
df4.isnull().sum()

location        0
size            0
total_sqft    190
bath            0
balcony         0
price           0
dtype: int64

# Strip location columns values

In [65]:
df4['location']  = df4.location.apply(lambda x: x.strip().lower())
df4.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,electronic city phase ii,2,1056.0,2.0,1.0,39.07
1,chikka tirupathi,4,2600.0,5.0,3.0,120.0
2,uttarahalli,3,1440.0,2.0,3.0,62.0
3,lingadheeranahalli,3,1521.0,3.0,1.0,95.0
4,kothanur,2,1200.0,2.0,1.0,51.0


# Feature Engineering

In [67]:
df4['size'].unique()

array(['2', '4', '3', '6', '1', '8', '7', '5', '11', '9', '27', '10',
       '19', '16', '43', '14', '12', '13', '18'], dtype=object)

In [69]:
df4["size"] = df4["size"].apply(lambda x : x.split(" ")[0])

In [71]:
df4.head(5)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,electronic city phase ii,2,1056.0,2.0,1.0,39.07
1,chikka tirupathi,4,2600.0,5.0,3.0,120.0
2,uttarahalli,3,1440.0,2.0,3.0,62.0
3,lingadheeranahalli,3,1521.0,3.0,1.0,95.0
4,kothanur,2,1200.0,2.0,1.0,51.0


In [73]:
df4.total_sqft.unique()

array([1056., 2600., 1440., ..., 2758.,  774., 4689.])

In [75]:
def total_s (x):
    try:
        float(x)
    except:
        return True
    return False

In [76]:
df4 [df4['total_sqft'].apply(total_s)]

Unnamed: 0,location,size,total_sqft,bath,balcony,price


In [77]:
def convert_to_int(x):
    y = x.split("-")
    if len(y) == 2:
        return (float(y[0])+float(y[1]))/2
    else:
        return x

In [78]:
df4["total_sqft"] = df["total_sqft"].apply(convert_to_int)
df4.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,electronic city phase ii,2,1056,2.0,1.0,39.07
1,chikka tirupathi,4,2600,5.0,3.0,120.0
2,uttarahalli,3,1440,2.0,3.0,62.0
3,lingadheeranahalli,3,1521,3.0,1.0,95.0
4,kothanur,2,1200,2.0,1.0,51.0


In [79]:
def clear_totalsqft(x):
    try:
        float(x)
        return float(x)
    except:
        return None

In [80]:
df4["total_sqft"] = df["total_sqft"].apply(clear_totalsqft)
df4.head()
df4.shape

(13246, 6)

In [81]:
df4["total_sqft"].isnull().sum()

190

In [82]:
df5 = df4.dropna().copy()

# Outlier detection and remove


In [83]:
df5['total_sqft']=df5["total_sqft"].apply(lambda x : float(x))
df5['price']=df5["price"].apply(lambda x : float(x))
df5['size']=df5["size"].apply(lambda x : float(x))

In [84]:
df5.total_sqft
df5.price

0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13315    231.00
13316    400.00
13317     60.00
13318    488.00
13319     17.00
Name: price, Length: 13056, dtype: float64

In [85]:
df5['price_per_sqft'] = (df5.price / df5.total_sqft)*100000
df5.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,price_per_sqft
0,electronic city phase ii,2.0,1056.0,2.0,1.0,39.07,3699.810606
1,chikka tirupathi,4.0,2600.0,5.0,3.0,120.0,4615.384615
2,uttarahalli,3.0,1440.0,2.0,3.0,62.0,4305.555556
3,lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0,6245.890861
4,kothanur,2.0,1200.0,2.0,1.0,51.0,4250.0


In [86]:
df6 = df5[~(df5["size"]>20)]
df6.shape

(13054, 7)

In [87]:
df6 =df6[~((df6.total_sqft/df6['size'])<300)]
df6.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,price_per_sqft
0,electronic city phase ii,2.0,1056.0,2.0,1.0,39.07,3699.810606
1,chikka tirupathi,4.0,2600.0,5.0,3.0,120.0,4615.384615
2,uttarahalli,3.0,1440.0,2.0,3.0,62.0,4305.555556
3,lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0,6245.890861
4,kothanur,2.0,1200.0,2.0,1.0,51.0,4250.0


In [88]:
max_thresold = df6.price_per_sqft.quantile(0.995)
min_thresold = df6.price_per_sqft.quantile(0.005)

In [89]:
df6[(df6.price_per_sqft<min_thresold)|(df6.price_per_sqft>max_thresold)].shape[0]

117

In [90]:
df7 = df6[(df6.price_per_sqft>min_thresold) & (df6.price_per_sqft<max_thresold)]

In [91]:
df7["zscore"] = (df7.price_per_sqft-df7.price_per_sqft.mean())/df7.price_per_sqft.std()

In [92]:
df8 = df7[(df7.zscore>-3)&(df7.zscore<+3)]
df8.shape

(11862, 8)

In [93]:
df8 = df8[~(df8.total_sqft<500)]

In [94]:
df8.total_sqft.describe()

count    11785.000000
mean      1542.664806
std        903.196854
min        500.000000
25%       1119.000000
50%       1300.000000
75%       1680.000000
max      30400.000000
Name: total_sqft, dtype: float64

In [95]:
def remove_pps_outlier(df):
    df_out = pd.DataFrame()
    for key,subdf in df.groupby("location"):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduce_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduce_df],ignore_index=True)
    return df_out

In [96]:
df9 = remove_pps_outlier(df8)

In [97]:
df8.shape[0]

11785

### Visualise Data for best understandign the Dataset

In [130]:
report = ProfileReport(df8, title= "profilling_report",explorative=True,dark_mode=False)

In [131]:
report.to_widgets()

Summarize dataset: 100%|██████████| 23/23 [00:09<00:00,  2.42it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.51s/it]


VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [99]:
df8.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,price_per_sqft,zscore
0,electronic city phase ii,2.0,1056.0,2.0,1.0,39.07,3699.810606,-0.76939
1,chikka tirupathi,4.0,2600.0,5.0,3.0,120.0,4615.384615,-0.485794
2,uttarahalli,3.0,1440.0,2.0,3.0,62.0,4305.555556,-0.581763
3,lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0,6245.890861,0.019249
4,kothanur,2.0,1200.0,2.0,1.0,51.0,4250.0,-0.598971


In [100]:
df8=df8[~(df8["size"]<df8.bath)]

In [101]:
df8.shape[0]

11084

In [102]:
df8 = df8[~(df8.bath>10)]


In [103]:
df10 = df8.drop(["price_per_sqft","zscore"],axis="columns")
df10.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,electronic city phase ii,2.0,1056.0,2.0,1.0,39.07
2,uttarahalli,3.0,1440.0,2.0,3.0,62.0
3,lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0
4,kothanur,2.0,1200.0,2.0,1.0,51.0
5,whitefield,2.0,1170.0,2.0,1.0,38.0


# One Hot Encoding

In [104]:
df10.location.unique().size

1110

In [105]:
location_state=df10.groupby("location")["location"].agg('count').sort_values(ascending = False)
location_state

location
whitefield                            459
sarjapur  road                        341
electronic city                       279
kanakpura road                        252
thanisandra                           205
                                     ... 
khb colony extension                    1
kirloskar layout, basaveshwarnagar      1
kodanda reddy layout                    1
kodipalya                               1
1 giri nagar                            1
Name: location, Length: 1110, dtype: int64

In [106]:
location_state_lessthan10 = location_state[location_state<=10]
location_state_lessthan10.shape[0]

905

In [107]:
def other_name(x):
    if x in location_state_lessthan10:
        return "other"
    else:
        return x

In [108]:
df10.location = df10['location'].apply(lambda x: 'other' if x in location_state_lessthan10 else x)

In [109]:
df10.shape[0]

11082

In [110]:
df10[df10.location=="other"]

Unnamed: 0,location,size,total_sqft,bath,balcony,price
19,other,2.0,1100.0,2.0,2.0,48.00
25,other,3.0,1250.0,3.0,2.0,56.00
42,other,1.0,600.0,1.0,0.0,38.00
49,other,2.0,869.0,2.0,1.0,36.00
50,other,2.0,1270.0,2.0,1.0,50.00
...,...,...,...,...,...,...
13285,other,2.0,1353.0,2.0,2.0,110.00
13291,other,1.0,812.0,1.0,0.0,26.00
13292,other,3.0,1440.0,2.0,2.0,63.93
13294,other,4.0,2200.0,3.0,3.0,80.00


In [111]:
df10.location.unique().size

206

In [112]:
dummy_vars = pd.get_dummies(df10.location)
dummy_vars_droped=dummy_vars.drop("other",axis="columns")
dummy_vars_droped.columns

Index(['1st phase jp nagar', '2nd phase judicial layout', '5th phase jp nagar',
       '6th phase jp nagar', '7th phase jp nagar', '8th phase jp nagar',
       '9th phase jp nagar', 'abbigere', 'aecs layout', 'akshaya nagar',
       ...
       'vasanthapura', 'vidyaranyapura', 'vijayanagar', 'vittasandra',
       'whitefield', 'yelachenahalli', 'yelahanka', 'yelahanka new town',
       'yelenahalli', 'yeshwanthpur'],
      dtype='object', length=205)

In [113]:
df11 = pd.concat([df10,dummy_vars,dummy_vars_droped],axis="columns")
df11.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,1st phase jp nagar,2nd phase judicial layout,5th phase jp nagar,6th phase jp nagar,...,vasanthapura,vidyaranyapura,vijayanagar,vittasandra,whitefield,yelachenahalli,yelahanka,yelahanka new town,yelenahalli,yeshwanthpur
0,electronic city phase ii,2.0,1056.0,2.0,1.0,39.07,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,uttarahalli,3.0,1440.0,2.0,3.0,62.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,kothanur,2.0,1200.0,2.0,1.0,51.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,whitefield,2.0,1170.0,2.0,1.0,38.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [114]:
df12 = df11.drop(["location"],axis="columns")
df12.head()

Unnamed: 0,size,total_sqft,bath,balcony,price,1st phase jp nagar,2nd phase judicial layout,5th phase jp nagar,6th phase jp nagar,7th phase jp nagar,...,vasanthapura,vidyaranyapura,vijayanagar,vittasandra,whitefield,yelachenahalli,yelahanka,yelahanka new town,yelenahalli,yeshwanthpur
0,2.0,1056.0,2.0,1.0,39.07,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,1440.0,2.0,3.0,62.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,1521.0,3.0,1.0,95.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,1200.0,2.0,1.0,51.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2.0,1170.0,2.0,1.0,38.0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


# Model Building

In [115]:
X = df12.drop("price",axis = "columns")
X

Unnamed: 0,size,total_sqft,bath,balcony,1st phase jp nagar,2nd phase judicial layout,5th phase jp nagar,6th phase jp nagar,7th phase jp nagar,8th phase jp nagar,...,vasanthapura,vidyaranyapura,vijayanagar,vittasandra,whitefield,yelachenahalli,yelahanka,yelahanka new town,yelenahalli,yeshwanthpur
0,2.0,1056.0,2.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,1440.0,2.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,1521.0,3.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,1200.0,2.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2.0,1170.0,2.0,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13314,3.0,1715.0,3.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13315,5.0,3453.0,4.0,0.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13317,2.0,1141.0,2.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13318,4.0,4689.0,4.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [116]:
y = df12.price
y

0         39.07
2         62.00
3         95.00
4         51.00
5         38.00
          ...  
13314    112.00
13315    231.00
13317     60.00
13318    488.00
13319     17.00
Name: price, Length: 11082, dtype: float64

### train test splitting

In [117]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 1)

## check LinearRegression model score

In [118]:
%time
from sklearn.linear_model import LinearRegression
model_classifier = LinearRegression()
model_classifier.fit(X_train,y_train)
model_classifier.score(X_test,y_test)

Wall time: 0 ns


0.7547457916274025

In [119]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=7,test_size=0.2,random_state=0)

cross_val_score(LinearRegression(),X,y,cv=cv)

array([0.74395314, 0.72827429, 0.71558242, 0.72778905, 0.77751595,
       0.69116499, 0.79170068])

### Apply GridSearchCV for finding best Model and Parameters

In [120]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [121]:
def finding_best_model_with_params(X,y):


    algos={
        
        'linear_regression' :{
            "model" : LinearRegression(),
            "params" : {
                'normalize' : [True,False]
            }
        },
        
        'lasso': {
            "model" : Lasso(),
            "params" : {
                'alpha' : [1,2],
                'selection' : ['random','cyclic']
            }
        },

        'decision_tree' :{
            "model" : DecisionTreeRegressor(),
            "params" : {
                'criterion' : ['mse','friedman_mse'],
                'splitter' : ['best','random']

            }
        }
    }

    scores = []

    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    
    for algo_name,config in algos.items():
        gs =GridSearchCV(config['model'],config['params'], cv=cv,return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model' : algo_name,
            "best_score" : gs.best_score_,
            'best_params' : gs.best_params_ 
        })

    
    return pd.DataFrame(scores,columns=["model","best_score","best_params"])

In [122]:
finding_best_model_with_params(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.738623,{'normalize': False}
1,lasso,0.699722,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.604585,"{'criterion': 'mse', 'splitter': 'random'}"


## Select LinearRegression as Best Model and check Prediction

In [123]:
X.columns

Index(['size', 'total_sqft', 'bath', 'balcony', '1st phase jp nagar',
       '2nd phase judicial layout', '5th phase jp nagar', '6th phase jp nagar',
       '7th phase jp nagar', '8th phase jp nagar',
       ...
       'vasanthapura', 'vidyaranyapura', 'vijayanagar', 'vittasandra',
       'whitefield', 'yelachenahalli', 'yelahanka', 'yelahanka new town',
       'yelenahalli', 'yeshwanthpur'],
      dtype='object', length=415)

In [124]:
def predict(location,size,total_sqft,bath,balcony):
    location_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = size
    x[1] = total_sqft
    x[2] = bath
    x[3] = balcony
    if location_index >= 0:
        x[location_index] = 1
    return model_classifier.predict([x])[0]

In [125]:
predict("yelachenahalli",4,2000,1,2)

121.40666497185026

#                                     THANKS
