In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error


In [2]:
# Read the housing table from a CSV expected in the notebook's working
# directory, then preview its first five rows as the cell output.
dataset_path = './Housing.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [14]:
# Isolate the object-dtype (text) columns once: their names drive the encoding
# step later on, and the sub-frame itself is printed for inspection.
categorical_frame = df.select_dtypes(include=['object'])
categorical_cols = list(categorical_frame.columns)
print(categorical_frame)

    mainroad guestroom basement hotwaterheating airconditioning prefarea  \
0        yes        no       no              no             yes      yes   
1        yes        no       no              no             yes       no   
2        yes        no      yes              no              no      yes   
3        yes        no      yes              no             yes      yes   
4        yes       yes      yes              no             yes       no   
..       ...       ...      ...             ...             ...      ...   
540      yes        no      yes              no              no       no   
541       no        no       no              no              no       no   
542      yes        no       no              no              no       no   
543       no        no       no              no              no       no   
544      yes        no       no              no              no       no   

    furnishingstatus  
0          furnished  
1          furnished  
2     semi-furnish

In [11]:
# Turn every categorical (text) column into numeric codes so the whole table
# can be fed to the scaler and the regressors. With its default settings
# OrdinalEncoder numbers the categories of each column in sorted order and
# returns floats (hence the 0.0 / 1.0 / 2.0 values in the cell output).
ordinal_encoder = OrdinalEncoder()
encoded_categorical_cols = ordinal_encoder.fit_transform(df[categorical_cols])
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_cols,
    columns = categorical_cols,
    # Reuse df's row labels. pd.concat(axis=1) below aligns rows by index
    # label, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
    # whenever df carries a non-default index.
    index = df.index
)
# Everything that is not categorical is kept untouched.
numerical_df = df.drop(categorical_cols,axis=1)

# Numeric columns first, encoded categorical columns after them.
encoded_df = pd.concat([numerical_df,encoded_categorical_df],axis=1)
encoded_df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,13300000,7420,4,2,3,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,12250000,8960,4,4,4,3,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,12250000,9960,3,2,2,2,1.0,0.0,1.0,0.0,0.0,1.0,1.0
3,12215000,7500,4,2,2,3,1.0,0.0,1.0,0.0,1.0,1.0,0.0
4,11410000,7420,4,1,2,2,1.0,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,1.0,0.0,1.0,0.0,0.0,0.0,2.0
541,1767150,2400,3,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
542,1750000,3620,2,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
543,1750000,2910,3,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Standardize every column of the encoded table; the result is a NumPy array
# whose columns follow encoded_df's column order.
# NOTE(review): the scaler is fitted on all rows and on every column of
# encoded_df (the target included), and this happens before the
# train/validation split further down. Validation rows therefore contribute to
# the scaling statistics, and the metrics computed later are expressed in
# standardized units of the target. Consider splitting first and fitting the
# scaler on the training rows only.
normalizer = StandardScaler()
normalizer.fit(encoded_df)
dataset_arr = normalizer.transform(encoded_df)

In [22]:
# Separate the regression target from the features.
# dataset_arr was produced from encoded_df, so its columns follow
# encoded_df's column order. The target is located by its column name rather
# than by assuming it sits at position 0, so a change in column order cannot
# silently swap the target for a feature.
target_col = 'price'
target_idx = encoded_df.columns.get_loc(target_col)
feature_idx = [i for i in range(dataset_arr.shape[1]) if i != target_idx]
X,y = dataset_arr[:,feature_idx], dataset_arr[:,target_idx]

In [24]:
# Hold out 30% of the rows for validation. The fixed seed makes the shuffle
# (and therefore the split) repeatable; the same seed is reused by the models.
test_size = 0.3
random_state = 1
is_shuffle = True
split_options = dict(
    test_size=test_size,
    random_state=random_state,
    shuffle=is_shuffle,
)
X_train,X_val,y_train,y_val = train_test_split(X,y,**split_options)

In [26]:
# Random forest with library-default hyper-parameters; only the seed is set so
# that repeated runs build the same forest.
regressor = RandomForestRegressor(random_state=random_state)
regressor.fit(X=X_train, y=y_train)

In [35]:
# AdaBoost with library-default hyper-parameters, seeded like the other models.
Ada_regressor = AdaBoostRegressor(random_state=random_state)
Ada_regressor.fit(X=X_train, y=y_train)

In [34]:
# Gradient boosting with library-default hyper-parameters, seeded like the
# other models.
Gradient_regressor = GradientBoostingRegressor(random_state=random_state)
Gradient_regressor.fit(X=X_train, y=y_train)

In [36]:
# Predict the validation rows with each fitted model, in the order
# random forest, AdaBoost, gradient boosting.
RandomForest_y_pred, Ada_y_pred, Gradient_y_pred = (
    fitted_model.predict(X_val)
    for fitted_model in (regressor, Ada_regressor, Gradient_regressor)
)

In [37]:
# Score the three models on the validation set. y_val comes from the
# standardized array, so both metrics are in standardized target units.
def _mae_and_mse(y_true, y_pred):
    """Return (mean absolute error, mean squared error) for one prediction vector."""
    return mean_absolute_error(y_true,y_pred), mean_squared_error(y_true,y_pred)

# Random forest
regressor_mae, regressor_mse = _mae_and_mse(y_val,RandomForest_y_pred)
# AdaBoost
Ada_regressor_mae, Ada_regressor_mse = _mae_and_mse(y_val,Ada_y_pred)
# Gradient boosting
Gradient_regressor_mae, Gradient_regressor_mse = _mae_and_mse(y_val,Gradient_y_pred)

# One print call with a newline separator emits the report line by line.
print(
    "Evaluation results on validation set:\n Random Forest",
    f'Mean Absolute Error of Random Forest: {regressor_mae}',
    f'Mean Squared Error of Random Forest: {regressor_mse}',
    'Ada Boosting',
    f'Mean Absolute Error of Ada Boosting: {Ada_regressor_mae}',
    f'Mean Squared Error of Ada Boosting: {Ada_regressor_mse}',
    'Gradient Boost',
    f'Mean Absolute Error of Gradient Boost: {Gradient_regressor_mae}',
    f'Mean Squared Error of Gradient Boost: {Gradient_regressor_mse}',
    sep='\n',
)


Evaluation results on validation set:
 Random Forest
Mean Absolute Error of Random Forest: 0.46093873321571177
Mean Squared Error of Random Forest: 0.37944418523089524
Ada Boosting
Mean Absolute Error of Ada Boosting: 0.567680019897059
Mean Squared Error of Ada Boosting: 0.5739244030038942
Gradient Boost
Mean Absolute Error of Gradient Boost: 0.4516626127750995
Mean Squared Error of Gradient Boost: 0.39610445936979427
