In [3]:
import pandas as pd
import numpy as np

# 1. Problem Statement

Real estate transactions can be opaque, and it may be difficult for a newcomer to know the fair price of a given home. For this reason, many real estate websites offer functionality to predict house prices from the features of a house. Such forecasting models help buyers identify a fair price for a home and also give sellers insight into how to build homes that fetch more money. Chennai house sale price data is shared here, and participants are expected to build a sale price prediction model that will help customers find a fair price for their homes and also help sellers understand which factors fetch more money for the houses.

Data : 'https://raw.githubusercontent.com/Praveen3ravi/PROJECT/main/train-chennai-sale.csv'

**Minimum Requirements:** It is not sufficient to just fit a model; the model must be analysed to find the important factors that contribute towards the price. It is also better to give a forecast range (a range of permissible values) rather than a single estimate, as this will help customers negotiate.

# 2. Data Gathering

In [4]:
# Load the Chennai house-sale records; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='train-chennai-sale.csv')
# Bare trailing expression so the notebook renders the frame as a table.
df

Unnamed: 0,PRT_ID,AREA,INT_SQFT,DATE_SALE,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,DATE_BUILD,...,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS,SALES_PRICE,INF_PRICE,1+0.04,"POWER(Z,Y)"
0,P09411,Anna Nagar,1986,19-12-2006,2.0,1.0,5,Ab Normal,No,22-12-1995,...,4.9,4.2,2.5,3.765,760122,304049,21717770,4.067698e+07,1.04,1.872981
1,P05346,Velachery,1855,13-03-2010,3.0,2.0,5,Family,No,18-03-1988,...,4.7,3.9,3.6,4.010,356321,77042,9630290,1.541840e+07,1.04,1.601032
2,P00219,Chrompet,1220,11-09-2014,2.0,1.0,4,Partial,No,12-09-2009,...,4.5,2.6,3.1,3.320,409027,198316,12394750,1.696307e+07,1.04,1.368569
3,P09105,Chrompet,1167,05-04-2007,1.0,1.0,3,Partial,No,12-04-1979,...,3.6,2.1,2.5,2.670,263152,33955,8488790,1.528783e+07,1.04,1.800944
4,P09679,Velachery,1847,13-03-2006,3.0,2.0,5,Family,No,15-03-1996,...,2.4,4.5,2.1,3.260,604809,235204,16800250,3.146655e+07,1.04,1.872981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7104,P05438,T Nagar,1733,24-02-2010,1.0,1.0,4,Ab Normal,Yes,02-03-1985,...,3.4,3.7,2.1,2.890,702058,312026,19501600,3.122269e+07,1.04,1.601032
7105,P05042,Karapakkam,666,11-05-2010,1.0,1.0,2,Adj Land,Yes,20-05-1974,...,3.2,4.4,2.5,3.280,273317,74541,6211750,9.945212e+06,1.04,1.601032
7106,P10000,Velachery,1897,08-04-2004,3.0,2.0,5,Family,Yes,11-04-1995,...,3.6,4.5,3.3,3.920,346191,205551,10818480,2.191626e+07,1.04,2.025817
7107,P06508,Karapakkam,787,03-08-2009,1.0,1.0,2,Partial,Yes,11-08-1977,...,4.6,3.8,4.1,4.160,425350,119098,8507000,1.416478e+07,1.04,1.665074


In [5]:
# Print a structural summary of the frame: index range, per-column non-null
# counts and dtypes (useful for spotting missing values and object columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7109 entries, 0 to 7108
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PRT_ID         7109 non-null   object 
 1   AREA           7109 non-null   object 
 2   INT_SQFT       7109 non-null   int64  
 3   DATE_SALE      7109 non-null   object 
 4   N_BEDROOM      7108 non-null   float64
 5   N_BATHROOM     7104 non-null   float64
 6   N_ROOM         7109 non-null   int64  
 7   SALE_COND      7109 non-null   object 
 8   PARK_FACIL     7109 non-null   object 
 9   DATE_BUILD     7109 non-null   object 
 10  BUILDTYPE      7109 non-null   object 
 11  UTILITY_AVAIL  7109 non-null   object 
 12  STREET         7109 non-null   object 
 13  MZZONE         7109 non-null   object 
 14  QS_ROOMS       7109 non-null   float64
 15  QS_BATHROOM    7109 non-null   float64
 16  QS_BEDROOM     7109 non-null   float64
 17  QS_OVERALL     7061 non-null   float64
 18  REG_FEE 

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the frame.
df.describe()

Unnamed: 0,INT_SQFT,N_BEDROOM,N_BATHROOM,N_ROOM,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS,SALES_PRICE,INF_PRICE,1+0.04,"POWER(Z,Y)"
count,7109.0,7108.0,7104.0,7109.0,7109.0,7109.0,7109.0,7061.0,7109.0,7109.0,7109.0,7109.0,7109.0,7109.0
mean,1382.073006,1.637029,1.21326,3.688704,3.517471,3.507244,3.4853,3.503254,376938.330708,141005.726544,10894910.0,17672010.0,1.04,1.621739
std,457.410902,0.802902,0.409639,1.019099,0.891972,0.897834,0.887266,0.527223,143070.66201,78768.093718,3768603.0,6310658.0,3.330903e-14,0.141455
min,500.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,71177.0,5055.0,2156875.0,3591355.0,1.04,1.315932
25%,993.0,1.0,1.0,3.0,2.7,2.7,2.7,3.13,272406.0,84219.0,8272100.0,13230230.0,1.04,1.539454
50%,1373.0,1.0,1.0,4.0,3.5,3.5,3.5,3.5,349486.0,127628.0,10335050.0,16875150.0,1.04,1.601032
75%,1744.0,2.0,1.0,4.0,4.3,4.3,4.3,3.89,451562.0,184506.0,12993900.0,20981670.0,1.04,1.731676
max,2500.0,4.0,2.0,6.0,5.0,5.0,5.0,4.97,983922.0,495405.0,23667340.0,43542300.0,1.04,2.025817


# 3. Exploratory Data Analysis

## 3.1 Checking Null Values

In [8]:
# Count the missing entries in every column: build the boolean NA mask, then
# add it up down the rows (axis=0) to get one total per column.
df.isna().sum(axis=0)

PRT_ID            0
AREA              0
INT_SQFT          0
DATE_SALE         0
N_BEDROOM         1
N_BATHROOM        5
N_ROOM            0
SALE_COND         0
PARK_FACIL        0
DATE_BUILD        0
BUILDTYPE         0
UTILITY_AVAIL     0
STREET            0
MZZONE            0
QS_ROOMS          0
QS_BATHROOM       0
QS_BEDROOM        0
QS_OVERALL       48
REG_FEE           0
COMMIS            0
SALES_PRICE       0
INF_PRICE         0
1+0.04            0
POWER(Z,Y)        0
dtype: int64