### House price prediction with linear regression and random forest

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from skimpy import skim
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import utils

In [None]:
# Read the Hyderabad housing listings from CSV and preview the first rows.
# NOTE(review): the rendered table shows the amenity flag columns holding 9 on
# the last rows -- possibly a "not recorded" sentinel rather than a real 0/1
# value; confirm against the dataset description before modelling.
csv_path = 'Hyderabad.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,6968000,1340,Nizampet,2,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
1,29000000,3498,Hitech City,4,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,6590000,1318,Manikonda,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5739000,1295,Alwal,3,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5679000,1145,Kukatpally,2,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,11000000,1460,Nacharam,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2514,26000000,1314,Manikonda,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2515,13300000,2625,Madhapur,3,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2516,10800000,2050,Hitech City,3,0,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


In [3]:
# Summarise the DataFrame with skimpy's `skim` (imported at the top of the notebook).
skim(data)

In [4]:
# Swap each location name for the integer code of its pandas category.
# NOTE(review): the codes are only labels derived from the category order, but
# the models below consume them as ordinary numeric values -- confirm this is intended.
location_codes = data["Location"].astype('category').cat.codes
data["Location"] = location_codes
data

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator
0,6968000,1340,162,2,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
1,29000000,3498,85,4,0,0,1,1,1,1,...,1,0,1,0,0,0,0,0,0,0
2,6590000,1318,132,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5739000,1295,9,3,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,5679000,1145,118,2,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,11000000,1460,150,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2514,26000000,1314,132,2,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2515,13300000,2625,124,3,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
2516,10800000,2050,85,3,0,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9


In [5]:
# Disabled experiment: drop the row(s) holding the maximum Price and display the result.
#data = data[data["Price"] != data["Price"].max()]
#data

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_frames = train_test_split(data, test_size=0.2, random_state=42)
train_set, test_set = split_frames

In [7]:
# Turn both splits into NumPy arrays: every column except the target becomes a
# predictor, and the target column becomes the response vector.
target = 'Price'
train_X_all, train_y_all = train_set.drop(target, axis=1).to_numpy(), train_set[target].to_numpy()
test_X_all, test_y_all = test_set.drop(target, axis=1).to_numpy(), test_set[target].to_numpy()

In [8]:
# Fit an ordinary least-squares regression on the training arrays.
# fit() returns the estimator itself, so the trailing expression still renders it.
model = LinearRegression().fit(train_X_all, train_y_all)
model

In [9]:
# Pull the fitted weights (one per feature column) and the bias term, then print both.
coefficients, intercept = model.coef_, model.intercept_
for label, value in (("Coefficients:", coefficients), ("Intercept:", intercept)):
    print(label, value)

Coefficients: [   10879.77508748    -7710.9143361  -1672850.66140091  1249990.7982999
 -1310876.5475125   -966979.00364397   373073.39905112   915309.95854307
  -356684.8742547   -544042.9706718    803542.53756988    -7356.77776457
   -62187.07955261  -778576.14224564   569412.70235645   828579.91984261
 -1177711.16473181  -853333.3597781    490615.51682828   279157.6181841
  -451423.76659749   931301.99138044   730963.1708562    839132.33565165
  1409220.71093427   551387.86184038  1009203.58839715   526927.37993209
    90527.80492302  -365644.66079355  1220634.98996934   161652.90644123
 -2125746.38164657 -1878741.3516368    714512.01753033    94323.64332207
  -902436.56598906  -533535.5634043   -357304.24611863]
Intercept: -3198046.8852842804


In [10]:
# Score the linear model on the held-out rows with MSE and MAE.
predictions = model.predict(test_X_all)
mse, mabs = (
    metric(test_y_all, predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)
print("Mean Absolute Error:", mabs)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2325532.3093254566
Mean Squared Error: 18009979696679.082


In [11]:
# Random forest regressor with default hyperparameters; the fixed seed makes the fit repeatable.
rforest_model = RandomForestRegressor(random_state=42)

In [12]:
# Train the forest on the current training arrays.
rforest_model.fit(train_X_all, train_y_all)

In [13]:
# Mean absolute error of the random forest on the held-out rows.
predictions_rforest = rforest_model.predict(test_X_all)
mabs = mean_absolute_error(y_true=test_y_all, y_pred=predictions_rforest)
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 1573417.1210090418


In [14]:
# Narrow the dataset to three predictors plus the target and redo the 80/20 split.
# NOTE: `data` is rebound to the narrowed frame here, so every later cell only
# sees these four columns.
reduced_features = ['Area', 'No. of Bedrooms', 'Location', "Price"]
data = data[reduced_features]
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Rebuild the NumPy design matrices / target vectors from the narrowed splits.
target = 'Price'
train_X_all, train_y_all = train_set.drop(target, axis=1).to_numpy(), train_set[target].to_numpy()
test_X_all, test_y_all = test_set.drop(target, axis=1).to_numpy(), test_set[target].to_numpy()

In [15]:
# Refit the linear regression on the current (reduced-feature) training arrays.
# fit() returns the estimator itself, so the trailing expression still renders it.
model = LinearRegression().fit(train_X_all, train_y_all)
model

In [16]:
# Extract the fitted weights and bias term of the current model and print them.
coefficients, intercept = model.coef_, model.intercept_
for label, value in (("Coefficients:", coefficients), ("Intercept:", intercept)):
    print(label, value)

Coefficients: [   11125.29947163 -1587632.21508566    -7149.59112964]
Intercept: -3468823.3663623016


In [17]:
# Score the current linear model on the held-out rows with MSE and MAE.
predictions = model.predict(test_X_all)
mse, mabs = (
    metric(test_y_all, predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)
print("Mean Absolute Error:", mabs)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2429931.377149689
Mean Squared Error: 19085032627139.145


In [18]:
# Train a fresh seeded random forest on the current training arrays and report
# its mean absolute error on the held-out rows.
rforest_model = RandomForestRegressor(random_state=42).fit(train_X_all, train_y_all)
predictions_rforest = rforest_model.predict(test_X_all)
mabs = mean_absolute_error(y_true=test_y_all, y_pred=predictions_rforest)
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 1846154.8360721942


In [19]:
# Standardise the features and refit the linear regression.
# (StandardScaler is imported in the notebook's top import cell.)
scaler = StandardScaler()
# Learn the scaling statistics from the training rows only, so nothing from the
# test rows leaks into the scaling.
train_X_scaled = scaler.fit_transform(train_X_all)
# Apply the training-set statistics to the held-out rows.
test_X_scaled = scaler.transform(test_X_all)
# Create and train the model on the standardised features.
# NOTE: this rebinds `model`, replacing the estimator fitted on unscaled features.
model = LinearRegression()
model.fit(train_X_scaled, train_y_all)
# Report the fitted weights and bias term of the scaled model.
coefficients = model.coef_
intercept = model.intercept_
print("Coefficients after scaling:", coefficients)
print("Intercept after scaling:", intercept)


Coefficients after scaling: [ 8310673.18180108 -1103599.21156273  -413950.09525238]
Intercept after scaling: 9843782.490069512


In [20]:
# Display the standardised training feature matrix.
train_X_scaled

array([[ 0.70179197,  0.53143725,  0.74672768],
       [-0.46954961, -0.90715767, -0.03049548],
       [ 0.53981216,  1.97003216, -0.514101  ],
       ...,
       [-0.35576214,  0.53143725,  0.29766541],
       [ 0.47421703,  0.53143725,  0.29766541],
       [-0.83768554, -2.34575258, -0.75590376]], shape=(2014, 3))

In [21]:
# Score the model trained on standardised features against the held-out rows.
predictions = model.predict(test_X_scaled)
mse, mabs = (
    metric(test_y_all, predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)
print("Mean Absolute Error after scaling:", mabs)
print("Mean Squared Error after scaling:", mse)


Mean Absolute Error after scaling: 2429931.37714969
Mean Squared Error after scaling: 19085032627139.15


In [22]:
def plot_actual_vs_predicted(actual, predicted, title):
    """Scatter predicted against actual values with a dashed red y = x reference line.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values (x axis); also sets the span of the reference line.
    predicted : array-like
        Model predictions aligned element-wise with ``actual`` (y axis).
    title : str
        Figure title.

    Returns
    -------
    The Plotly figure produced by ``px.scatter`` with the reference line added.
    """
    figure = px.scatter(x=actual, y=predicted,
                        labels={'x': 'Actual Price', 'y': 'Predicted Price'},
                        title=title)
    # Points on this diagonal are perfect predictions; compute the range once.
    lowest, highest = min(actual), max(actual)
    figure.add_shape(type='line', x0=lowest, y0=lowest, x1=highest, y1=highest,
                     line=dict(color='red', dash='dash'))
    return figure


# Visualize the results using Plotly (uses the `predictions` currently in scope)
fig = plot_actual_vs_predicted(test_y_all, predictions, 'Actual vs Predicted Prices')
fig.show()
# Visualize the results using Plotly for Random Forest
fig_rforest = plot_actual_vs_predicted(test_y_all, predictions_rforest,
                                       'Actual vs Predicted Prices (Random Forest)')
fig_rforest.show()
# Visualize feature importance for Random Forest
importance = rforest_model.feature_importances_
# Column order matches the feature matrix built by dropping 'Price' from train_set.
features = train_set.drop(columns=['Price']).columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})
fig_importance = px.bar(importance_df.sort_values(by='Importance', ascending=False),
                        x='Feature', y='Importance',
                        title='Feature Importance from Random Forest Model')
fig_importance.show()
# Visualize the distribution of prices
fig_price_distribution = px.histogram(data, x='Price', nbins=50, title='Distribution of Prices')
fig_price_distribution.show()
# Visualize the relationship between Area and Price
# NOTE(review): 'Location' holds integer category codes at this point, so the
# colouring reflects the codes rather than location names.
fig_area_price = px.scatter(data, x='Area', y='Price', color='Location',
                            title='Area vs Price by Location',
                            labels={'Area': 'Area (sq ft)', 'Price': 'Price (in INR)'})
fig_area_price.show()
# Visualize the relationship between No. of Bedrooms and Price
fig_bedrooms_price = px.box(data, x='No. of Bedrooms', y='Price',
                            title='Price Distribution by Number of Bedrooms',
                            labels={'No. of Bedrooms': 'Number of Bedrooms', 'Price': 'Price (in INR)'})
fig_bedrooms_price.show()
# Visualize the relationship between Location and Price
fig_location_price = px.box(data, x='Location', y='Price',
                            title='Price Distribution by Location',
                            labels={'Location': 'Location (Encoded)', 'Price': 'Price (in INR)'})
fig_location_price.show()
# Visualize the correlation matrix
correlation_matrix = data.corr()
fig_corr = px.imshow(correlation_matrix,
                     title='Correlation Matrix',
                     labels=dict(color='Correlation Coefficient'),
                     x=correlation_matrix.columns,
                     y=correlation_matrix.columns)
# Put the column labels above the heatmap.
fig_corr.update_xaxes(side="top")
fig_corr.show()