# Importing the libraries

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [45]:
# Load the housing data from the CSV in the working directory, then preview
# the first five rows to check that the columns parsed as expected.
df_housing = pd.read_csv(filepath_or_buffer='Housing.csv')
df_housing.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [46]:
# Count missing values per column (`isnull` is the pandas alias of `isna`).
df_housing.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [47]:
# Print the index range, per-column non-null counts and dtypes, and memory
# usage; the object-dtype columns are the ones encoded in the next section.
df_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


# Encoding categorical data

In [48]:
# Label-encode furnishingstatus as furnished=0, semi-furnished=1, unfurnished=2.
# The codes are spelled out instead of being taken from `.cat.codes`, which
# numbers categories in sorted order of the values actually present and would
# therefore shift if one level were missing from the data.
# Values outside the mapping (e.g. the already-encoded integers when this cell
# is re-run) pass through unchanged, and the final int8 cast -- the dtype that
# `.cat.codes` produced -- raises on any unexpected label or missing value
# instead of silently assigning it a code.
furnishing_codes = {'furnished': 0, 'semi-furnished': 1, 'unfurnished': 2}
df_housing['furnishingstatus'] = (
    df_housing['furnishingstatus']
    .map(lambda status: furnishing_codes.get(status, status))
    .astype('int8')
)

In [49]:
# Encode the six yes/no flag columns as no=0, yes=1 in a single loop.
# An explicit mapping replaces `.cat.codes`, which numbers categories in sorted
# order of the values actually present: a column containing only 'yes' would
# otherwise be encoded as 0.
# Values outside the mapping (e.g. the already-encoded 0/1 when this cell is
# re-run) pass through unchanged, and the final int8 cast -- the dtype that
# `.cat.codes` produced -- raises on any unexpected label or missing value.
yes_no_codes = {'no': 0, 'yes': 1}
binary_columns = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]
for column in binary_columns:
    df_housing[column] = (
        df_housing[column]
        .map(lambda answer: yes_no_codes.get(answer, answer))
        .astype('int8')
    )

In [50]:
# Preview the first five rows to confirm every column is now numeric.
df_housing.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [51]:
# Separate the target (price) from the predictors by column name rather than
# by position, so a change in column order cannot silently swap the target.
# With the current layout (price is the first column) this selects exactly the
# same data as the positional slices iloc[:, 1:] / iloc[:, 0].
X = df_housing.drop(columns='price')
y = df_housing['price']

In [52]:
# Display the feature matrix (every column except price) as a sanity check.
X

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,1,0,0,0,1,2,1,0
1,8960,4,4,4,1,0,0,0,1,3,0,0
2,9960,3,2,2,1,0,1,0,0,2,1,1
3,7500,4,2,2,1,0,1,0,1,3,1,0
4,7420,4,1,2,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
540,3000,2,1,1,1,0,1,0,0,2,0,2
541,2400,3,1,1,0,0,0,0,0,0,0,1
542,3620,2,1,1,1,0,0,0,0,0,0,2
543,2910,3,1,1,0,0,0,0,0,0,0,0


# Feature Scaling

In [53]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature to zero mean and unit variance. `X` is rebound from
# a DataFrame to the NumPy array returned by the scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test-set statistics influence the scaling. Consider fitting
# on the training split only and applying `transform` to the test split.
s = StandardScaler().fit(X)
X = s.transform(X)

# Splitting the dataset into the Training set and Test set

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
train_test_parts = train_test_split(X, y, test_size=0.3, random_state=13)
x_train, x_test, y_train, y_test = train_test_parts

In [55]:
# Print the scaled training features (NumPy abbreviates large arrays with "...").
print(x_train)

[[ 0.34566751  1.40341936  1.42181174 ...  0.35597563 -0.55403469
   1.22296203]
 [ 1.7201117  -1.30886273 -0.57018671 ...  0.35597563 -0.55403469
  -0.09166185]
 [ 0.57627895  0.04727831 -0.57018671 ...  0.35597563  1.80494113
  -0.09166185]
 ...
 [-0.51220705  0.04727831 -0.57018671 ...  0.35597563 -0.55403469
  -1.40628573]
 [ 1.55407146  0.04727831 -0.57018671 ...  1.51769249 -0.55403469
  -1.40628573]
 [-0.632125   -1.30886273 -0.57018671 ... -0.80574124 -0.55403469
  -0.09166185]]


In [56]:
# Print the scaled training features.
# NOTE(review): this repeats the previous cell verbatim; one of the two can
# probably be removed (or this one was meant to show a different variable).
print(x_train)

[[ 0.34566751  1.40341936  1.42181174 ...  0.35597563 -0.55403469
   1.22296203]
 [ 1.7201117  -1.30886273 -0.57018671 ...  0.35597563 -0.55403469
  -0.09166185]
 [ 0.57627895  0.04727831 -0.57018671 ...  0.35597563  1.80494113
  -0.09166185]
 ...
 [-0.51220705  0.04727831 -0.57018671 ...  0.35597563 -0.55403469
  -1.40628573]
 [ 1.55407146  0.04727831 -0.57018671 ...  1.51769249 -0.55403469
  -1.40628573]
 [-0.632125   -1.30886273 -0.57018671 ... -0.80574124 -0.55403469
  -0.09166185]]


In [57]:
# Print the scaled test features (NumPy abbreviates large arrays with "...").
print(x_test)

[[ 1.04672629  1.40341936  1.42181174 ...  1.51769249  1.80494113
  -1.40628573]
 [ 0.58550341  0.04727831 -0.57018671 ... -0.80574124  1.80494113
  -1.40628573]
 [-0.29082007  0.04727831 -0.57018671 ... -0.80574124 -0.55403469
  -0.09166185]
 ...
 [-0.99187885 -1.30886273 -0.57018671 ...  0.35597563 -0.55403469
   1.22296203]
 [ 0.85301268  0.04727831 -0.57018671 ... -0.80574124 -0.55403469
   1.22296203]
 [-0.83045084  0.04727831 -0.57018671 ... -0.80574124 -0.55403469
   1.22296203]]


# Training the Decision Tree Classification model on the Training set

In [58]:
from sklearn.tree import DecisionTreeClassifier

# Build an entropy-criterion decision tree with a fixed seed and fit it on the
# training split; `fit` returns the estimator itself, so the fitted model is
# bound to `classifier` and then shown as the cell output.
# NOTE(review): `y` is the int64 `price` column, so every distinct price is
# treated as its own class -- confirm that a classifier (rather than a
# regressor) is really what is intended for this target.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)
classifier

In [62]:
# Predict the target for the held-out test features with the fitted tree.
y_pred = classifier.predict(x_test)