In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("dataset.csv")

In [3]:
data.head()

Unnamed: 0,active,amenities,balconies,bathroom,combineDescription,completeStreetName,deposit,facing,facingDesc,floor,...,property_size,reactivationSource,rent_amount,sharedAccomodation,shortUrl,swimmingPool,totalFloor,type_bhk,waterSupply,weight
0,True,"{""LIFT"":true,""GYM"":false,""INTERNET"":false,""AC""...",3,3,,"Shreya carnation, Block I, NCB Enclave, Gachib...",90000,W,West,3,...,2200,USER_DASHBOARD,28000,False,http://nobr.kr/sv/9Kvqj,False,5,BHK3,CORP_BORE,
1,True,"{""LIFT"":false,""GYM"":false,""INTERNET"":false,""AC...",1,2,,"Inner Ring Rd, near RTO Bandlaguda South Zone",45000,E,East,2,...,1200,USER_DASHBOARD,15000,False,http://nobr.kr/sv/j2pD6,False,2,BHK3,BOREWELL,
2,True,"{""LIFT"":true,""GYM"":true,""INTERNET"":false,""AC"":...",3,3,,"Rd Number 2, Shirdi Sai Nagar, Manikonda, Hyde...",80000,E,East,0,...,1800,MISSED_CALL,16000,False,http://nobr.kr/sv/TXJmj,False,3,BHK3,CORP_BORE,
3,True,"{""LIFT"":false,""GYM"":false,""INTERNET"":false,""AC...",1,2,,"Plot No. 44, Road No. 1/A, kakatiya colony, LB...",18000,W,West,2,...,750,MISSED_CALL,9000,False,http://nobr.kr/sv/FnCsf,False,2,BHK2,CORP_BORE,
4,True,"{""LIFT"":true,""GYM"":false,""INTERNET"":false,""AC""...",2,2,,"Madhapur HUDA Techno Enclave, Near MaxCure Su...",80000,E,East,2,...,1250,,32500,False,http://nobr.kr/sv/BQFRb,False,5,BHK2,CORP_BORE,


In [4]:
data.shape

(19110, 36)

## Preprocessing

##### Extracting information out of amenities column as it is one of the important factors in deciding price values followed by getting rid of unwanted columns

In [5]:
amenities_columns = ['LIFT', 'GYM', 'INTERNET', 'AC', 'CLUB', 'INTERCOM', 'POOL', 'CPA',
                     'FS', 'SERVANT', 'SECURITY', 'SC', 'GP', 'PARK', 'RWH', 'STP',
                     'HK', 'PB', 'VP']

In [6]:
for column in amenities_columns:
    data[column] = data['amenities'].str.extract(f"({column}:(True|False))", expand=True)[1].astype(bool).astype('int64')

In [7]:
df = data.drop(['amenities','balconies','active','combineDescription','completeStreetName','facing','id','gym','lift','localityId','location','ownerName','parkingDesc','propertyTitle','reactivationSource','shortUrl','type_bhk','weight'], axis=1)

In [8]:
df = df.replace('None', pd.NA).dropna()

##### Making sure that the data types of respective columns are suitable for EDA

In [9]:
df['maintenanceAmount'] = df['maintenanceAmount'].astype('int64')

In [10]:
df['swimmingPool'] = df['swimmingPool'].astype('int64')

#### Since total rent is a sum of the maintenance and usual rent amount, they have been collectively taken as one entity

In [11]:
df['totalprice'] = df[['maintenanceAmount', 'rent_amount']].apply(lambda x: x[x.notnull()].sum(), axis=1)

In [12]:
dff = df.drop(['maintenanceAmount','deposit','isMaintenance','loanAvailable','rent_amount','propertyType','floor','sharedAccomodation','FS','PARK','POOL','SC','CPA','GP','RWH','STP','HK','PB','VP'],axis=1)

In [13]:
dff.head()

Unnamed: 0,bathroom,facingDesc,furnishingDesc,locality,parking,property_age,property_size,swimmingPool,totalFloor,waterSupply,LIFT,GYM,INTERNET,AC,CLUB,INTERCOM,SERVANT,SECURITY,totalprice
0,3,West,Semi,Gachibowli,BOTH,5,2200,0,5,CORP_BORE,1,1,1,1,1,1,1,1,30000
2,3,East,Semi,Manikonda,BOTH,0,1800,0,3,CORP_BORE,1,1,1,1,1,1,1,1,17000
3,2,West,Unfurnished,LB Nagar,BOTH,0,750,0,2,CORP_BORE,1,1,1,1,1,1,1,1,9500
4,2,East,Semi,HITEC City,BOTH,5,1250,0,5,CORP_BORE,1,1,1,1,1,1,1,1,34500
5,3,North,Semi,Kondapur,BOTH,1,1517,1,6,CORP_BORE,1,1,1,1,1,1,1,1,35700


#### Now, it's looking better

In [14]:
dff.shape

(5240, 19)

### Some data cleaning

#### Cleaning the locality column of delimiters

In [15]:
data['locality'] = data['locality'].str.replace(',', '')

In [16]:
locality_counts = dff['locality'].value_counts()

In [17]:
df3 = dff[dff['locality'].isin(locality_counts[locality_counts > 10].index)]

#### Renaming the columns for more clarity

In [18]:
df4 = df3.rename(columns = {'furnishingDesc' : 'furnishing', 'LIFT' : 'lift', 'GYM' : 'gym','INTERNET' : 'internet', 'CLUB' : 'club', 'SERVANT' : 'servant', 'SECURITY' : 'security'})