### House predictions with linear regression

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from skimpy import skim
import utils

In [26]:
# Load the data using pandas
data = pd.read_csv('Hyderabad.csv')
data.head()

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,6968000,1340,Nizampet,2,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
1,29000000,3498,Hitech City,4,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,6590000,1318,Manikonda,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5739000,1295,Alwal,3,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5679000,1145,Kukatpally,2,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [27]:
# Use Skimpy to get a summary of the dataset
skim(data)

In [28]:
# Convert the Location column to categorical and assign it category codes
data["Location"] = data["Location"].astype('category')
data["Location"] = data["Location"].cat.codes
data

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,6968000,1340,162,2,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
1,29000000,3498,85,4,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,6590000,1318,132,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5739000,1295,9,3,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5679000,1145,118,2,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,11000000,1460,150,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2514,26000000,1314,132,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2515,13300000,2625,124,3,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2516,10800000,2050,85,3,0,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


In [29]:
# Remove the outlier row (maximum value in this case for Price). May help in modelling. 
data = data[data["Price"] != data["Price"].max()]
data

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,6968000,1340,162,2,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
1,29000000,3498,85,4,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,6590000,1318,132,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5739000,1295,9,3,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5679000,1145,118,2,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,11000000,1460,150,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2514,26000000,1314,132,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2515,13300000,2625,124,3,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2516,10800000,2050,85,3,0,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


In [30]:
X = data.drop(columns=['Price']).values  # All features except the target
y = data['Price'].values # Target

In [31]:
X

array([[1340,  162,    2, ...,    0,    0,    0],
       [3498,   85,    4, ...,    0,    0,    0],
       [1318,  132,    2, ...,    0,    0,    0],
       ...,
       [2625,  124,    3, ...,    9,    9,    9],
       [2050,   85,    3, ...,    9,    9,    9],
       [1805,  158,    3, ...,    9,    9,    9]], shape=(2517, 39))

In [32]:
# Step 1: First split into training+validation and test sets (e.g., 80% train_val, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Step 2: Now split training+validation into training and validation sets (e.g., 75% train, 25% val of the 80%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

In [34]:
# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

In [35]:
# Get the coefficients
coefficients = model.coef_
intercept = model.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [ 1.02673734e+04 -5.38985568e+03 -1.38564720e+06  1.80234340e+06
 -1.27582128e+06 -1.06409066e+06  5.13889551e+05  9.42252005e+05
 -6.42865219e+05 -9.03811350e+05  8.12078316e+05 -1.68357429e+05
 -2.96051027e+05 -6.20412535e+05  3.69545934e+05  1.15857537e+06
 -4.66761423e+05 -7.80186249e+05  6.22308913e+05  5.23113722e+05
 -7.36155868e+04  9.36149972e+05  7.64204578e+05  4.54427981e+05
  9.11423006e+05  7.75620961e+05  1.71373408e+06 -2.29946273e+02
  2.99915169e+04 -1.21848628e+05 -1.58948274e+06  2.13619983e+05
 -1.58981607e+06 -2.11934456e+06  1.83114618e+06  1.31123665e+05
 -7.08919968e+05 -3.08121028e+05 -1.82511448e+05]
Intercept: -3705266.4078253414


In [36]:
# Get the training error
predictions = model.predict(X_train)
mse = mean_squared_error(y_train, predictions)
mabs = mean_absolute_error(y_train, predictions)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2166681.975955541
Mean Squared Error: 13222938477051.74


In [37]:
# Get the validation error
predictions = model.predict(X_val)
mse = mean_squared_error(y_val, predictions)
mabs = mean_absolute_error(y_val, predictions)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2170365.04366526
Mean Squared Error: 15626792273062.53


In [38]:
# Get the testing error
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
mabs = mean_absolute_error(y_test, predictions)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2218415.350894788
Mean Squared Error: 14962390750139.14


In [39]:
rforest_model = RandomForestRegressor(random_state=42)
rforest_model.fit(X_train, y_train)
predictions_rforest1 = rforest_model.predict(X_train)
mabs = mean_absolute_error(y_train, predictions_rforest1)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 590290.6736999243


In [40]:
predictions_rforest2 = rforest_model.predict(X_val)
mabs = mean_absolute_error(y_val, predictions_rforest2)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 1410966.803124578


In [41]:
predictions_rforest3 = rforest_model.predict(X_test)
mabs = mean_absolute_error(y_test, predictions_rforest3)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 1635853.647225496


In [42]:
X_train

array([[1592,  118,    3, ...,    0,    0,    0],
       [ 500,  194,    2, ...,    0,    0,    0],
       [1025,  117,    2, ...,    0,    0,    0],
       ...,
       [1150,   87,    2, ...,    0,    0,    0],
       [1500,   16,    3, ...,    0,    0,    0],
       [1524,   16,    2, ...,    0,    0,    0]], shape=(1509, 39))

In [43]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

#scaler = StandardScaler()
scaler = MinMaxScaler()
# Fit the scaler on the training,validation,testing data
train_X_scaled = scaler.fit_transform(X_train)
val_X_scaled = scaler.fit_transform(X_val)
test_X_scaled = scaler.transform(X_test)
# Create and train the model
model = LinearRegression()
model.fit(train_X_scaled, y_train)
# Get the coefficients
coefficients = model.coef_
intercept = model.intercept_
print("Coefficients after scaling:", coefficients)
print("Intercept after scaling:", intercept)


Coefficients after scaling: [ 9.13796237e+07 -1.29895522e+06 -9.69953038e+06  1.80234340e+06
 -1.14823916e+07 -9.57681598e+06  4.62500596e+06  8.48026804e+06
 -5.78578697e+06 -8.13430215e+06  7.30870484e+06 -1.51521687e+06
 -2.66445924e+06 -5.58371281e+06  3.32591341e+06  1.04271784e+07
 -4.20085280e+06 -7.02167624e+06  5.60078022e+06  4.70802349e+06
 -6.62540281e+05  8.42534975e+06  6.87784120e+06  4.08985183e+06
  8.20280705e+06  6.98058865e+06  1.54236067e+07 -2.06951646e+03
  2.69923652e+05 -1.09663765e+06 -1.43053447e+07  1.92257984e+06
 -1.43083446e+07 -1.90741010e+07  1.64803156e+07  1.18011299e+06
 -6.38027971e+06 -2.77308925e+06 -1.64260303e+06]
Intercept after scaling: 37383.262639531866


In [44]:
train_X_scaled

array([[0.12269663, 0.48547718, 0.28571429, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.80082988, 0.14285714, ..., 0.        , 0.        ,
        0.        ],
       [0.05898876, 0.4813278 , 0.14285714, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.07303371, 0.35684647, 0.14285714, ..., 0.        , 0.        ,
        0.        ],
       [0.11235955, 0.06224066, 0.28571429, ..., 0.        , 0.        ,
        0.        ],
       [0.11505618, 0.06224066, 0.14285714, ..., 0.        , 0.        ,
        0.        ]], shape=(1509, 39))

In [45]:
# Evaluate the model
predictions1 = model.predict(train_X_scaled)
mse = mean_squared_error(y_train, predictions1)
mabs = mean_absolute_error(y_train, predictions1)  # Mean Absolute Error
print("Mean Absolute Error after scaling:", mabs)
print("Mean Squared Error after scaling:", mse)


Mean Absolute Error after scaling: 2166681.9759555543
Mean Squared Error after scaling: 13222938477051.74


In [46]:
# Evaluate the model
predictions2 = model.predict(val_X_scaled)
mse = mean_squared_error(y_val, predictions2)
mabs = mean_absolute_error(y_val, predictions2)  # Mean Absolute Error
print("Mean Absolute Error after scaling:", mabs)
print("Mean Squared Error after scaling:", mse)


Mean Absolute Error after scaling: 2240810.7435023496
Mean Squared Error after scaling: 16735064520212.078


In [47]:
# Evaluate the model
predictions3 = model.predict(test_X_scaled)
mse = mean_squared_error(y_test, predictions3)
mabs = mean_absolute_error(y_test, predictions3)  # Mean Absolute Error
print("Mean Absolute Error after scaling:", mabs)
print("Mean Squared Error after scaling:", mse)

Mean Absolute Error after scaling: 2288193.754575312
Mean Squared Error after scaling: 14726052147319.979


In [48]:
rforest_model = RandomForestRegressor(random_state=42)
rforest_model.fit(train_X_scaled, y_train)
predictions_rforest1 = rforest_model.predict(train_X_scaled)
mabs = mean_absolute_error(y_train, predictions_rforest1)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 590875.0145811711


In [49]:
predictions_rforest2 = rforest_model.predict(val_X_scaled)
mabs = mean_absolute_error(y_val, predictions_rforest2)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 2325057.533315124


In [50]:
predictions_rforest3 = rforest_model.predict(test_X_scaled)
mabs = mean_absolute_error(y_test, predictions_rforest3)  # Mean Absolute Error
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 2308406.747322072
