Splitting the dataset, training models, and evaluating performance

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score



In [39]:
# Read the housing data from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="Housing.csv")
df.head(5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [40]:
# Encode the binary yes/no columns as integers (yes -> 1, no -> 0).
# Values are stripped and lower-cased first so variants such as " Yes" still match.
binary_columns = ['mainroad', 'guestroom', 'basement',
                  'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_map = {'yes': 1, 'no': 0}

for column in binary_columns:
    # A numeric column has already been encoded by an earlier run of this cell;
    # skipping it keeps the cell safe to re-run (.str raises on integer data).
    if not pd.api.types.is_numeric_dtype(df[column]):
        normalized = df[column].str.strip().str.lower()
        encoded = normalized.map(yes_no_map)

        # Series.map turns values missing from the mapping into NaN; fail loudly
        # here rather than letting those NaNs reach model training.
        unexpected = normalized[encoded.isna() & normalized.notna()].unique()
        if len(unexpected) > 0:
            raise ValueError(
                f"unexpected values in '{column}': {list(unexpected)}"
            )

        df[column] = encoded

    print(df[column].head())


0    1
1    1
2    1
3    1
4    1
Name: mainroad, dtype: int64
0    0
1    0
2    0
3    0
4    1
Name: guestroom, dtype: int64
0    0
1    0
2    1
3    1
4    1
Name: basement, dtype: int64
0    0
1    0
2    0
3    0
4    0
Name: hotwaterheating, dtype: int64
0    1
1    1
2    0
3    1
4    1
Name: airconditioning, dtype: int64
0    1
1    0
2    1
3    1
4    0
Name: prefarea, dtype: int64


In [41]:
# Apply One-Hot Encoding for categorical columns with multiple categories.
# drop_first=True omits the indicator for the first category, which is implied
# when all remaining indicators are False.
# The membership check keeps this cell re-runnable: after the first run the
# 'furnishingstatus' column is gone and get_dummies would raise KeyError.
if 'furnishingstatus' in df.columns:
    df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

print(df.head())

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurnished  
0                            False                         False  
1                       

In [42]:
# Separate the predictors (every column except the target) from the target.
target_column = 'price'
X = df.drop(columns=[target_column])
Y = df[target_column]

In [43]:
# Split features and target into training and testing sets.
# test_size=0.2 holds out 20% of the rows; random_state=42 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Report the (rows, columns) shape of each feature matrix.
print(f'training test size: {X_train.shape}')
print(f'test set size {X_test.shape}')


training test size: (436, 13)
test set size (109, 13)


`test_size=0.2` means 20% of the data is held out for testing, while the remaining 80% is used for training.
`random_state=42` fixes the random seed used for shuffling, so the split is reproducible.

Possible models:
- Linear Regression
- Decision Tree Regressor
- Random Forest Regressor

In [44]:
# Linear regression: fit on the training split, then predict prices
# for the held-out test split.
linear_regression = LinearRegression().fit(X_train, Y_train)
Y_pred_lr = linear_regression.predict(X_test)



In [45]:
# Score the linear-regression predictions against the held-out test targets.
r2_lr = r2_score(y_true=Y_test, y_pred=Y_pred_lr)
mse_lr = mean_squared_error(y_true=Y_test, y_pred=Y_pred_lr)

print(f'linear regression - mean square error: {mse_lr}, r^2: {r2_lr}')

linear regression - mean square error: 1754318687330.664, r^2: 0.6529242642153184


: 