#importing essential libraries for building a machine learning model

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("BHP.csv")
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


# EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [4]:
df.shape

(13320, 9)

In [5]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [6]:
df.size

119880

In [7]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [8]:
df.drop(columns = ['area_type','availability','society','balcony'],inplace = True)

In [9]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [10]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [11]:
df.dropna(inplace = True)

In [12]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [13]:
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))

In [14]:
df.tail()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
13315,Whitefield,5 Bedroom,3453,4.0,231.0,5
13316,Richards Town,4 BHK,3600,5.0,400.0,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.0,2
13318,Padmanabhanagar,4 BHK,4689,4.0,488.0,4
13319,Doddathoguru,1 BHK,550,1.0,17.0,1


In [15]:
df.drop(columns = ['size'], axis = 1,inplace = True)

In [16]:
df[df.bhk > 22]

Unnamed: 0,location,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,8000,27.0,230.0,27
4684,Munnekollal,2400,40.0,660.0,43


In [17]:
df.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [18]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [19]:
df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk
30,Yelahanka,2100 - 2850,4.0,186.0,4
122,Hebbal,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,43.49,2
188,KR Puram,1015 - 1540,2.0,56.8,2
410,Kengeri,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,1195 - 1440,2.0,63.77,2
648,Arekere,4125Perch,9.0,265.0,9
661,Yelahanka,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,3090 - 5002,4.0,445.0,4


In [20]:
def convert_sqft_into_number(x):
    token = x.split('-')
    if len(token) == 2:
        return (float(token[0]) + float(token[1])) / 2
    try:
        return float(x)
    except:
        return None

In [21]:
df1 = df.copy()
df1['total_sqft'] = df1['total_sqft'].apply(convert_sqft_into_number)

In [22]:
df1.loc[30]

location      Yelahanka
total_sqft       2475.0
bath                4.0
price             186.0
bhk                   4
Name: 30, dtype: object

In [23]:
df2 = df1.copy()

In [24]:
df2['price_per_sqft'] = df2['price']* 100000 / df2['total_sqft']
df2.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [25]:
df2['location'].value_counts()

Whitefield           534
Sarjapur  Road       392
Electronic City      302
Kanakpura Road       266
Thanisandra          233
                    ... 
Vidyapeeta             1
Maruthi Extension      1
Okalipura              1
Old Town               1
Abshot Layout          1
Name: location, Length: 1304, dtype: int64

In [26]:
df2['location'] = df2['location'].apply(lambda x: x.strip())
df2.location.value_counts()

Whitefield                        535
Sarjapur  Road                    392
Electronic City                   304
Kanakpura Road                    266
Thanisandra                       236
                                 ... 
Vasantapura main road               1
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
Abshot Layout                       1
Name: location, Length: 1293, dtype: int64

In [27]:
len(df2.location.unique())

1293

In [28]:
df2[ (df2.total_sqft / df2.bhk < 300)].head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
9,Gandhi Bazar,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,1407.0,4.0,150.0,6,10660.98081
68,Devarachikkanahalli,1350.0,7.0,85.0,8,6296.296296
70,Double Road,500.0,3.0,100.0,3,20000.0


In [29]:
df3 = df2[ ~(df2.total_sqft / df2.bhk < 300)]
df3.shape

(12502, 6)

In [30]:
df3.price_per_sqft.describe()

count     12456.000000
mean       6308.502826
std        4168.127339
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

# Handling Outliers

In [31]:
def remove_outliers_from_price_per_sqft(df):
    df_out = pd.DataFrame()
    for key,sub in df.groupby('location'):
        m = np.mean( sub.price_per_sqft )
        st = np.std( sub.price_per_sqft )
        reduce_df = sub[(sub.price_per_sqft>(m-st)) & (sub.price_per_sqft<=(m+st))]
        df_out = pd.concat( [df_out,reduce_df], ignore_index = True)
    return df_out

In [32]:
df4 = remove_outliers_from_price_per_sqft(df3)
df4.shape

(9267, 6)

In [33]:
df4.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,9267.0,9267.0,9267.0,9267.0,9267.0
mean,1504.603273,2.463149,94.140067,2.556707,5724.437494
std,894.614947,0.95202,110.642802,0.847092,2535.271814
min,300.0,1.0,10.0,1.0,1250.0
25%,1109.0,2.0,49.0,2.0,4259.259259
50%,1282.0,2.0,67.0,2.0,5185.185185
75%,1650.0,3.0,100.0,3.0,6404.356285
max,30400.0,14.0,2912.0,10.0,35000.0


In [34]:
df4.drop('location', axis = 'columns', inplace = True)
df4.head()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
0,1540.0,3.0,85.0,3,5519.480519
1,600.0,1.0,45.0,1,7500.0
2,3150.0,4.0,150.0,4,4761.904762
3,2300.0,3.0,80.0,3,3478.26087
4,1250.0,2.0,67.0,2,5360.0


In [35]:
X = df4.drop('price', axis = 1)
y = df4['price']

In [36]:
X.shape

(9267, 4)

In [37]:
y.shape

(9267,)

# Training Dataset

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)


In [40]:
X_train.shape

(7413, 4)

In [41]:
X_test.shape

(1854, 4)

# Linear Regression

In [42]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

In [43]:
model_train = model.fit(X_train,y_train)
model_train

In [44]:
model_pred = model.predict(X_test)
model_pred

array([81.76345708, -4.29448653, -3.44319934, ..., 78.32060723,
       40.64308264, 30.42897257])

# Model training and testing score

In [45]:
model_score1 = model.score(X_train,y_train)
model_score1

0.8386347940567738

In [46]:
model_score = model.score(X_test,y_test)
model_score

0.8986395504508443