# Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


In [2]:
# display in tabular form
def pdisplay(df , columns = None):
    """Wrap ``df`` in a ``pd.DataFrame`` and render it with ``display``.

    Parameters
    ----------
    df : any input accepted by ``pd.DataFrame`` (Series, dict, DataFrame, ...)
        The data to show as a table.
    columns : list-like, optional
        Passed to ``pd.DataFrame(df, columns=columns)``. ``None`` or an empty
        list-like means "let pandas choose the columns".

    Notes
    -----
    ``display`` is not imported here; it is expected to be provided by the
    notebook environment.
    """
    # A bare `if columns:` raises ValueError ("truth value ... is ambiguous")
    # for pd.Index / multi-element ndarray inputs, so test for None and
    # emptiness explicitly instead of relying on truthiness.
    if columns is not None and len(columns) > 0:
        display(pd.DataFrame(df , columns = columns))
    else:
        display(pd.DataFrame(df))

# Import Data

In [3]:
# Read the housing CSV (relative path) into `data` and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='../data/Housing.csv')
data.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## X & y Split

In [4]:
# Features are every column except the target; the target is the 'price' column.
Xdata = data.drop(columns='price')
ydata = data['price']

# Preview both halves of the split.
display(Xdata.head())
pdisplay(ydata.head())

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


Unnamed: 0,price
0,13300000
1,12250000
2,12250000
3,12215000
4,11410000


In [5]:
# Xtrain , Xtest , ytrain , ytest = train_test_split(Xdata , ydata , test_size = 0.2 , random_state = 118)

# Analyze Data

In [6]:
# Tabulate the dtype of each feature column under a single "Xdata" header.
pdisplay(Xdata.dtypes, columns=["Xdata"])

Unnamed: 0,Xdata
area,int64
bedrooms,int64
bathrooms,int64
stories,int64
mainroad,object
guestroom,object
basement,object
hotwaterheating,object
airconditioning,object
parking,int64


In [7]:
# Promote the target Series to a one-column frame so its dtype can be tabulated.
pdisplay(ydata.to_frame().dtypes, columns=["ydata"])

Unnamed: 0,ydata
price,int64


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of Xdata as returned by describe().
Xdata.describe()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0
mean,5150.541284,2.965138,1.286239,1.805505,0.693578
std,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1650.0,1.0,1.0,1.0,0.0
25%,3600.0,2.0,1.0,1.0,0.0
50%,4600.0,3.0,1.0,2.0,0.0
75%,6360.0,3.0,2.0,2.0,1.0
max,16200.0,6.0,4.0,4.0,3.0


In [23]:
# Partition the feature column labels by whether their dtype is 'object'.
object_cols = Xdata.columns[Xdata.dtypes.eq('object')]
num_cols = Xdata.columns[Xdata.dtypes.ne('object')]

# Print each group of labels beneath its own heading.
print("object_cols =\n", object_cols)
print("num_cols =\n", num_cols)

object_cols =
 Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')
num_cols =
 Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking'], dtype='object')


In [22]:
# One frequency table per object-typed column: how often each distinct value occurs.
for col_inx in object_cols:
    level_counts = Xdata[col_inx].value_counts()
    pdisplay(level_counts)

Unnamed: 0_level_0,count
mainroad,Unnamed: 1_level_1
yes,468
no,77


Unnamed: 0_level_0,count
guestroom,Unnamed: 1_level_1
no,448
yes,97


Unnamed: 0_level_0,count
basement,Unnamed: 1_level_1
no,354
yes,191


Unnamed: 0_level_0,count
hotwaterheating,Unnamed: 1_level_1
no,520
yes,25


Unnamed: 0_level_0,count
airconditioning,Unnamed: 1_level_1
no,373
yes,172


Unnamed: 0_level_0,count
prefarea,Unnamed: 1_level_1
no,417
yes,128


Unnamed: 0_level_0,count
furnishingstatus,Unnamed: 1_level_1
semi-furnished,227
unfurnished,178
furnished,140


In [25]:
# Count the missing values in every column of the full dataset.
pdisplay(data.isna().sum(), columns=["null count"])

Unnamed: 0,null count
price,0
area,0
bedrooms,0
bathrooms,0
stories,0
mainroad,0
guestroom,0
basement,0
hotwaterheating,0
airconditioning,0


# Identify Issues in the Dataset

# Data visualization

# Removing Irrelevant Features

# Convert Categorical to Numerical

# Missing Data Handling