splitting the dataset, training models, evaluating performance

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error



In [2]:
df = pd.read_csv("Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:

#mapping (yes - 1, no - 0) for the non numeric cathegories
df['mainroad'] = df['mainroad'].map({'yes': 1, 'no': 0})
print(df['mainroad'].head())

df['guestroom'] = df['guestroom'].map({'yes': 1, 'no': 0})
print(df['guestroom'].head())

df['basement'] = df['basement'].map({'yes': 1, 'no': 0})
print(df['basement'].head())

df['hotwaterheating'] = df['hotwaterheating'].map({'yes': 1, 'no': 0})
print(df['hotwaterheating'].head())

df['airconditioning'] = df['airconditioning'].map({'yes': 1, 'no': 0})
print(df['airconditioning'].head())

df['prefarea'] = df['prefarea'].map({'yes': 1, 'no': 0})
print(df['prefarea'].head())


0    1
1    1
2    1
3    1
4    1
Name: mainroad, dtype: int64
0    0
1    0
2    0
3    0
4    1
Name: guestroom, dtype: int64
0    0
1    0
2    1
3    1
4    1
Name: basement, dtype: int64
0    0
1    0
2    0
3    0
4    0
Name: hotwaterheating, dtype: int64
0    1
1    1
2    0
3    1
4    1
Name: airconditioning, dtype: int64
0    1
1    0
2    1
3    1
4    0
Name: prefarea, dtype: int64


In [4]:
# one-hot encoding for categorical columns with multiple categories 
# (in our situation furnishingstatusgetting three values: unfurnished, semi-furnished, furnished)
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

print(df.head())

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurnished  
0                            False                         False  
1                       

In [5]:
X = df.drop('price', axis = 1 )
Y = df['price']

In [6]:
#training and testing sets
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size=0.2, random_state = 42)

print(f'training test size: {X_train.shape}')
print(f'test set size {X_test.shape}')


training test size: (436, 13)
test set size (109, 13)


test_size=0.2 means 20% of the data will be used for testing, while 80% will be used for training
random_state=42 ensures that the data split is reproducible.

possible models 
Linear Regression 
Decision Tree Regressor
Random Forest Regressor

In [7]:
# linear regression 
#I choosed linear regression as it has the best precission of 65% in comparition with DecisionTreeRegressor and RandomForestRegressor
linear_regression = LinearRegression()
linear_regression.fit(X_train, Y_train)

Y_pred_lr = linear_regression.predict(X_test)



In [8]:
#evaluating the model
mse_lr = mean_squared_error(Y_test, Y_pred_lr)
r2_lr = r2_score(Y_test, Y_pred_lr)

print(f'linear regression mean square error: {mse_lr}, r^2: {r2_lr}')

linear regression mean square error: 1754318687330.664, r^2: 0.6529242642153184


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Y_pred =linear_regression.predict(X_test)


#DataFrame to compare actual and predicted values
comparison = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(comparison.head())


      Actual     Predicted
316  4060000  5.164654e+06
77   6650000  7.224722e+06
360  3710000  3.109863e+06
90   6440000  4.612075e+06
493  2800000  3.294646e+06


In [10]:
importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': linear_regression.coef_})
importance = importance.sort_values(by='Importance', ascending=False)
print(importance)


                            Feature    Importance
2                         bathrooms  1.094445e+06
8                   airconditioning  7.914267e+05
7                   hotwaterheating  6.846499e+05
10                         prefarea  6.298906e+05
3                           stories  4.074766e+05
6                          basement  3.902512e+05
4                          mainroad  3.679199e+05
5                         guestroom  2.316100e+05
9                           parking  2.248419e+05
1                          bedrooms  7.677870e+04
0                              area  2.359688e+02
11  furnishingstatus_semi-furnished -1.268818e+05
12     furnishingstatus_unfurnished -4.136451e+05


final analysis:

the model has a 65% precision rate.

most important features were bathrooms airconditioning pref area and storiesand hotwaterheating

moderatly imortant are the parking guestrooms mainroad and the presence of the basement

less importand were the area and the bedrooms 