### Import Library

In [166]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


### Load the DataSet

In [167]:
df = pd.read_csv('Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


### Getting Some Insights from the Data and makign data suitable for modelling

In [168]:
df.shape
## means there are 13320 Rows and 9 Columns


(13320, 9)

In [169]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [170]:
df.info()
## there is conflict in our thinking ,that here data type of size columns and total_sqft columsni in object 
## but it need to in Integer format
## here we need to str the size string to get out bhk 
## and Second thing , there are only 7818 data entries in Society Column
## lot of missing values in Society Column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [171]:
## value counts of each colums
for i in df.columns:
    print(df[i].value_counts())
    print("---*---"*50)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
---*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*---
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
---*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*------*-----

#### Missing Values

In [172]:
df.isnull().sum()
## we need to drop society column , beacuse of lot of missing values
## and area_type , availability , balcony as there is no use
## and need to fix missing valies for the bath and size and location columsn

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [173]:
df.drop(columns=['area_type', 'availability', 'balcony', 'society'], inplace=True)

## Feature Engineering

#### Handling Missing Values

In [174]:
## missing value colums 
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [175]:
df['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [176]:
## there is only 1 missing values in Location column 
## filling the Mode values
df['location'] = df['location'].fillna('Sarjapur Road')

In [177]:
df['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: size, dtype: int64

In [178]:
## there are  17 missing valyuse
## filling it with most occured values
df['size'] = df['size'].fillna('2 BHK')

In [179]:
df['bath'].value_counts()

2.0     6908
3.0     3286
4.0     1226
1.0      788
5.0      524
6.0      273
7.0      102
8.0       64
9.0       43
10.0      13
12.0       7
13.0       3
11.0       3
16.0       2
27.0       1
40.0       1
15.0       1
14.0       1
18.0       1
Name: bath, dtype: int64

In [180]:
## there are 73 missil values 
## here we filling it with Median vlaus
df['bath'] = df['bath'].fillna(df['bath'].median())

In [181]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [182]:
## if we look closely to the dataset size and total_sqft is in String Formate and  , 
## from that size columsn we can retrieve  BHK data
df['bhk'] = df['size'].str.split().str.get(0).astype(int)


In [183]:
df['total_sqft'].unique()
## here some of the datapoint are given in the range 
## for that datapoint , given in range we will find median of range

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [184]:
def covertRange(x):
    temp = x.split('-')
    if len(temp) == 2 :
       return (float(temp[0]) + float(temp[1]))/2
    try:
      return float(x)
    except:
      return None

In [185]:
df['total_sqft'] = df['total_sqft'].apply(covertRange)

In [186]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


### Price Per Sqft

In [187]:
## we need to inTroudce price_per_sqft column in this dataset
df['price_per_sqft'] = df['price']*100000 / df['total_sqft']

In [188]:
df['location'].value_counts()
## if we look closely to the dataset , we observed that , there are some single values  which are not so that importnat

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: location, Length: 1306, dtype: int64

In [189]:
df['location'] = df['location'].apply(lambda x:x.strip())
location_count = df['location'].value_counts()


In [190]:
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

BTM 1st Stage                         10
Nagadevanahalli                       10
Basapura                              10
Sector 1 HSR Layout                   10
Dairy Circle                          10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: location, Length: 1054, dtype: int64

In [191]:
df['location'] = df['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [192]:
df['location'].value_counts()

other                 2886
Whitefield             541
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: location, Length: 242, dtype: int64

#### OutLier Detection and Removal

In [193]:
df[df['total_sqft']/df['bhk']<300]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.980810
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,other,3 Bedroom,500.0,3.0,100.0,3,20000.000000
...,...,...,...,...,...,...,...
13277,other,7 Bedroom,1400.0,7.0,218.0,7,15571.428571
13279,other,6 Bedroom,1200.0,5.0,130.0,6,10833.333333
13281,Margondanahalli,5 Bedroom,1375.0,5.0,125.0,5,9090.909091
13303,Vidyaranyapura,5 Bedroom,774.0,5.0,70.0,5,9043.927649


In [194]:
df1 = df[~(df['total_sqft']/df['bhk']<300)]

In [195]:
df1.shape

(12576, 7)

In [196]:
df1.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

There is outlier

I will keei that datpoint whcih lie between (mean - std ) & (mean + std)

In [197]:
def rem_pps_ol(df):
  df_res = pd.DataFrame()
  for key, subdf in df.groupby('location'):
    m = np.mean(subdf.price_per_sqft)
    sd = np.std(subdf.price_per_sqft)
    red_df = subdf[(subdf.price_per_sqft>(m-sd)) & (subdf.price_per_sqft<(m+sd))]
    df_res = pd.concat([df_res, red_df], ignore_index=True)
  return df_res

In [198]:
df2 = rem_pps_ol(df1)
df2.shape

(10301, 7)

In [199]:
def remove_bhk_outliers(df):
    exclude_indices=np.array([])
    for location,location_df in df.groupby('location'):
        bhk_stats={}
        for bhk,bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk]={
                'mean':np.mean(bhk_df.price_per_sqft),
                'std':np.std(bhk_df.price_per_sqft),
                'count':bhk_df.shape[0]
            }
        for bhk,bhk_df in location_df.groupby('bhk'):
            stats=bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices=np.append(exclude_indices,bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')
df2=remove_bhk_outliers(df2)
df2.shape

(7361, 7)

In [200]:
df3 = df2[df2.bath < df2.bhk+2]
df3.shape

(7282, 7)

In [201]:
df4 = df3.drop(['size','price_per_sqft'],axis='columns')
df4.head(15)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2
5,1st Block Jayanagar,2750.0,4.0,413.0,4
6,1st Block Jayanagar,2450.0,4.0,368.0,4
8,1st Phase JP Nagar,1875.0,3.0,167.0,3
9,1st Phase JP Nagar,1500.0,5.0,85.0,5
10,1st Phase JP Nagar,2065.0,4.0,210.0,3


In [202]:
# creating dummies for location
dum = pd.get_dummies(df4.location)

In [203]:
df5 = pd.concat([df4,dum.drop('other', axis=1)], axis=1)
df5.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,428.0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,194.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,235.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,130.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,148.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [204]:
df6 = df5.drop('location', axis=1)
df6.head()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,130.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,148.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model Building

assigning x and y values (feature and target)


In [205]:
x = df6.drop('price', axis=1)
y = df6.price

##### Splitting the DataSet Into Train and Test Split 

In [206]:
# splitting the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

#### Linear Regression

In [207]:
# building linear regression model and fitting it with the training dataset
from sklearn.linear_model import LinearRegression
z = LinearRegression()
z.fit(x_train, y_train)

In [208]:
z.score(x_test, y_test)

0.8821962827133929

In [209]:
y_pred = z.predict(x_test)

In [210]:
# using cross_val_score to find different sets of score values (K-Fold method)
from sklearn.model_selection import ShuffleSplit, cross_val_score

ss = ShuffleSplit(n_splits=5, test_size=0.3, random_state=5)

cross_val_score(z, x, y, cv=ss)

array([0.84383512, 0.82953609, 0.85680774, 0.83582098, 0.84083703])

###  Finding the best model using GridsearchCV By Doing Hyper Paremeter Tunning

In [211]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor


def bestmodel(x,y):

    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'copy_X' : [True, False],
                'fit_intercept' : [True, False],
                'n_jobs' : [1,2,3],
                'positive' : [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'ridge':{
            'model':Ridge(),
            'params':{'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]
                      }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    for name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=ss, return_train_score=False)
        gs.fit(x,y)
        scores.append({
            'model': name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
    print(scores)
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

from sklearn.linear_model import Ridge

RidgeRegression = Ridge()
hyperParameters = {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}

In [212]:
df10 = bestmodel(x,y)

[{'model': 'linear_regression', 'best_score': 0.8422031970744885, 'best_params': {'copy_X': True, 'fit_intercept': False, 'n_jobs': 1, 'positive': False}}, {'model': 'lasso', 'best_score': 0.7051535104918756, 'best_params': {'alpha': 1, 'selection': 'random'}}, {'model': 'ridge', 'best_score': 0.8413909509096582, 'best_params': {'alpha': 0.01}}, {'model': 'decision_tree', 'best_score': 0.7200638865748581, 'best_params': {'criterion': 'friedman_mse', 'splitter': 'best'}}]


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_co

In [213]:
df10.iloc[0][2]
## Best Paremeter to predict better Outpyt

{'copy_X': True, 'fit_intercept': False, 'n_jobs': 1, 'positive': False}

## Model Testing

In [214]:
df6.columns

Index(['total_sqft', 'bath', 'price', 'bhk', '1st Block Jayanagar',
       '1st Phase JP Nagar', '2nd Phase Judicial Layout',
       '2nd Stage Nagarbhavi', '5th Block Hbr Layout', '5th Phase JP Nagar',
       ...
       'Vijayanagar', 'Vishveshwarya Layout', 'Vishwapriya Layout',
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yelenahalli', 'Yeshwanthpur'],
      dtype='object', length=245)

##### Accuracy OF the Model

In [215]:
from sklearn import metrics as m
print("R2-Score : ",m.r2_score(y_test, y_pred))
print("Mean Absolute Error : ", m.mean_absolute_error(y_test, y_pred))

R2-Score :  0.8821962827133929
Mean Absolute Error :  18.018076288964018


In [216]:
temp = LinearRegression(copy_X=True, fit_intercept=False,n_jobs=1, positive=False)

In [217]:
temp = temp.fit(x_train, y_train)

In [218]:
print("R2-Score : ",m.r2_score(y_test, temp.predict(x_test)))

R2-Score :  0.8822994401667984


In [219]:
def price_pred(location,sqft,bath,bhk):
   l_ind = np.where(x.columns == location)[0][0]

   a = np.zeros(len(x.columns))
   a[0] = sqft
   a[1] = bath
   a[2] = bhk
   if l_ind >= 0:
     a[l_ind] = 1

   return temp.predict([a])[0]    

In [220]:
price_pred('1st Phase JP Nagar',1000, 2, 3)



83.09307436096769

### Pickling the File

In [221]:
import pickle
pickle.dump(temp, open('LinearModel.pkl', 'wb'))