# Basic Data Exploration using Pandas

In [31]:
import pandas as pd

In [32]:
housing_data = pd.read_csv('Housing.csv')
housing_data.shape

(545, 13)

In [33]:
# knowing the variables
housing_data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [34]:
# accessing first few rows of the dataframe
housing_data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [35]:
# are there any missing values in my dataframe
housing_data.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [36]:
#summary statistics for my dataframe
housing_data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [37]:
# Divide our dataset into two parts target and features
y = housing_data.price # target
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [38]:
housing_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
X = housing_data[housing_features] # features
X.describe()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0
mean,5150.541284,2.965138,1.286239,1.805505,0.693578
std,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1650.0,1.0,1.0,1.0,0.0
25%,3600.0,2.0,1.0,1.0,0.0
50%,4600.0,3.0,1.0,2.0,0.0
75%,6360.0,3.0,2.0,2.0,1.0
max,16200.0,6.0,4.0,4.0,3.0


In [39]:
# Defining model using decision tree
from sklearn.tree import DecisionTreeRegressor
housing_model = DecisionTreeRegressor(random_state = 1)
housing_model.fit(X, y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1, splitter='best')

In [40]:
print(X.head())
print(housing_model.predict(X.head()))

   area  bedrooms  bathrooms  stories  parking
0  7420         4          2        3        2
1  8960         4          4        4        3
2  9960         3          2        2        2
3  7500         4          2        2        3
4  7420         4          1        2        2
[13300000. 12250000. 12250000. 12215000. 11410000.]


In [41]:
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [42]:
# Model Validation
# Here we will take subset of the training data called validation data
# mae(mean absolute error) is used to know the quality of our model

In [43]:
# split data into training and validation data, for both features and target
# The spilt is based on a random number generator, Supplying numeric value to
# the random_state argument guarantees we get the same split every time 
# we run the script
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [44]:
# Define model
housing_model = DecisionTreeRegressor()

In [45]:
# Fit model
housing_model.fit(train_X, train_y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [51]:
# make predictions
predicted_house_prices = housing_model.predict(X)

In [52]:
# get predicted prices on validation data
from sklearn.metrics import mean_absolute_error
# mae before splitting the data
mean_absolute_error(y, predicted_house_prices)
val_prediction = housing_model.predict(val_X)
print(mean_absolute_error(val_y, val_prediction))

1192608.3941605838


In [53]:
# mae after splitting the data between training and validation
mean_absolute_error(y, predicted_house_prices)
val_prediction = housing_model.predict(val_X)
print(mean_absolute_error(val_y, val_prediction))

1192608.3941605838


In [None]:
# mae is much more when we split our data between training and validation
# mae was very less when we dont split our data