### House price predictions with linear regression and random forest

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from skimpy import skim
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import utils

In [None]:
# Read the Hyderabad housing CSV into a DataFrame and display it
data = pd.read_csv(filepath_or_buffer='Hyderabad.csv')
data

In [None]:
# Summarize the freshly loaded frame with skimpy's skim() before any cleaning is applied
skim(data)

In [None]:
# Replace the Location labels with their integer category codes so the column is numeric.
# NOTE(review): the codes are just identifiers, yet the models fitted below receive them
# as ordinary numbers -- confirm that treating Location as a numeric feature is intended.
data["Location"] = data["Location"].astype('category').cat.codes
data

In [37]:
# Remove every row whose Price equals the current maximum (presumably an outlier).
# NOTE(review): `data` is overwritten, so re-running this cell also removes the
# next-highest price; run it exactly once after loading.
highest_price = data["Price"].max()
data = data.loc[data["Price"] != highest_price]
data

Unnamed: 0,Area,No. of Bedrooms,Location,Price
0,1340,2,162,6968000
1,3498,4,85,29000000
2,1318,2,132,6590000
3,1295,3,9,5739000
4,1145,2,118,5679000
...,...,...,...,...
2513,1460,2,150,11000000
2514,1314,2,132,26000000
2515,2625,3,124,13300000
2516,2050,3,85,10800000


In [38]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Split each partition into a feature matrix (every column except Price)
# and a target vector (the Price column), both as NumPy arrays.
train_X_all, train_y_all = train_set.drop('Price', axis=1).values, train_set['Price'].values
test_X_all, test_y_all = test_set.drop('Price', axis=1).values, test_set['Price'].values

In [40]:
# Fit a linear regression on the training arrays; fit() returns the estimator itself,
# so the trailing expression still displays the fitted model.
model = LinearRegression().fit(train_X_all, train_y_all)
model

In [41]:
# Pull the fitted weights (one per feature column) and the intercept off the model, then report them
coefficients, intercept = model.coef_, model.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [   10343.8828322  -1128353.15822895    -6655.86252056]
Intercept: -3576477.610464275


In [42]:
# Score the linear model on the held-out rows
predictions = model.predict(test_X_all)
mabs = mean_absolute_error(y_true=test_y_all, y_pred=predictions)  # mean absolute error
mse = mean_squared_error(y_true=test_y_all, y_pred=predictions)    # mean squared error
print("Mean Absolute Error:", mabs)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2352385.124377389
Mean Squared Error: 15652420747083.66


In [43]:
# Random forest regressor with a fixed seed so repeated fits give the same trees
rforest_model = RandomForestRegressor(random_state=42)

In [44]:
# Train the forest on the same training arrays that the linear model was fitted on
rforest_model.fit(train_X_all, train_y_all)

In [45]:
# Score the random forest on the held-out rows (this overwrites the linear model's `mabs`)
predictions_rforest = rforest_model.predict(test_X_all)
mabs = mean_absolute_error(y_true=test_y_all, y_pred=predictions_rforest)
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 1630037.768419087


In [46]:
# Keep only the three predictors plus the target, then rebuild the split and the arrays.
reduced_features = ['Area', 'No. of Bedrooms', 'Location', "Price"]
data = data[reduced_features]
# `data` already holds exactly these columns, so it is passed to the splitter directly.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
# Feature matrices: every column except the target
train_X_all = train_set.drop(columns='Price').values
test_X_all = test_set.drop(columns='Price').values
# Target vectors
train_y_all = train_set['Price'].values
test_y_all = test_set['Price'].values

In [47]:
# Refit the linear regression on the reduced-feature training arrays;
# fit() returns the estimator, which the last line displays.
model = LinearRegression().fit(train_X_all, train_y_all)
model

In [48]:
# Report the refitted model's per-feature weights and intercept
coefficients, intercept = model.coef_, model.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [   10343.8828322  -1128353.15822895    -6655.86252056]
Intercept: -3576477.610464275


In [49]:
# Score the refitted linear model on the held-out rows
predictions = model.predict(test_X_all)
mabs = mean_absolute_error(y_true=test_y_all, y_pred=predictions)  # mean absolute error
mse = mean_squared_error(y_true=test_y_all, y_pred=predictions)    # mean squared error
print("Mean Absolute Error:", mabs)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2352385.124377389
Mean Squared Error: 15652420747083.66


In [50]:
# Refit the seeded random forest on the reduced-feature arrays and score it on the test rows
rforest_model = RandomForestRegressor(random_state=42).fit(train_X_all, train_y_all)
predictions_rforest = rforest_model.predict(test_X_all)
mabs = mean_absolute_error(y_true=test_y_all, y_pred=predictions_rforest)
print("Mean Absolute Error:", mabs)

Mean Absolute Error: 1630037.768419087


In [51]:
# Standardize the features and refit the linear model.
# For this unregularized linear regression, rescaling the inputs leaves the predictions
# unchanged; what it does change is the coefficients, which become comparable
# per-standard-deviation effects.
# StandardScaler is imported in the notebook's top import cell.
scaler = StandardScaler()
# Learn the scaling statistics from the training rows only, so no information
# from the test rows leaks into the transformation.
train_X_scaled = scaler.fit_transform(train_X_all)
# Apply the training-set statistics to the test rows.
test_X_scaled = scaler.transform(test_X_all)
# Create and train the model on the standardized features
model = LinearRegression()
model.fit(train_X_scaled, train_y_all)
# Report the coefficients, now expressed per standardized feature unit
coefficients = model.coef_
intercept = model.intercept_
print("Coefficients after scaling:", coefficients)
print("Intercept after scaling:", intercept)


Coefficients after scaling: [7808735.96353503 -784094.66927253 -385890.22372529]
Intercept after scaling: 9713268.722305017


In [52]:
# Inspect the standardized training feature matrix produced by scaler.fit_transform
train_X_scaled

array([[ 0.69475633,  0.53472978,  0.74139293],
       [-0.69613165, -0.90432242,  1.46581197],
       [ 0.53447305,  1.97378198, -0.51771636],
       ...,
       [-0.3517213 ,  0.53472978,  0.29294304],
       [ 0.46956494,  0.53472978,  0.29294304],
       [-0.82859717, -2.34337462, -0.75918937]], shape=(2013, 3))

In [53]:
# Score the model trained on standardized features, using the standardized test rows
predictions = model.predict(test_X_scaled)
mabs = mean_absolute_error(y_true=test_y_all, y_pred=predictions)  # mean absolute error
mse = mean_squared_error(y_true=test_y_all, y_pred=predictions)    # mean squared error
print("Mean Absolute Error after scaling:", mabs)
print("Mean Squared Error after scaling:", mse)


Mean Absolute Error after scaling: 2352385.124377388
Mean Squared Error after scaling: 15652420747083.639


In [54]:
def _actual_vs_predicted_figure(actual, predicted, title):
    """Build an actual-vs-predicted scatter plot with a dashed red y = x line.

    Args:
        actual: True target values, plotted on the x axis.
        predicted: Model predictions aligned element-wise with ``actual``,
            plotted on the y axis.
        title: Title shown above the figure.

    Returns:
        The Plotly figure. Points lying on the dashed line are exact predictions.
    """
    figure = px.scatter(x=actual, y=predicted,
                        labels={'x': 'Actual Price', 'y': 'Predicted Price'},
                        title=title)
    # The reference diagonal spans the range of the actual values on both axes;
    # compute the bounds once with NumPy instead of repeated Python-level scans.
    lowest, highest = np.min(actual), np.max(actual)
    figure.add_shape(type='line', x0=lowest, y0=lowest, x1=highest, y1=highest,
                     line=dict(color='red', dash='dash'))
    return figure


# Actual vs predicted prices for the most recently fitted linear model
fig = _actual_vs_predicted_figure(test_y_all, predictions, 'Actual vs Predicted Prices')
fig.show()
# Actual vs predicted prices for the random forest
fig_rforest = _actual_vs_predicted_figure(test_y_all, predictions_rforest,
                                          'Actual vs Predicted Prices (Random Forest)')
fig_rforest.show()
# Feature importance reported by the random forest, largest first
importance = rforest_model.feature_importances_
# Column names in the same order as the feature matrix the forest was trained on
features = train_set.drop(columns=['Price']).columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})
fig_importance = px.bar(importance_df.sort_values(by='Importance', ascending=False),
                        x='Feature', y='Importance',
                        title='Feature Importance from Random Forest Model')
fig_importance.show()
# Distribution of prices
fig_price_distribution = px.histogram(data, x='Price', nbins=50, title='Distribution of Prices')
fig_price_distribution.show()
# Relationship between Area and Price, colored by the encoded Location
fig_area_price = px.scatter(data, x='Area', y='Price', color='Location',
                            title='Area vs Price by Location',
                            labels={'Area': 'Area (sq ft)', 'Price': 'Price (in INR)'})
fig_area_price.show()
# Price distribution for each bedroom count
fig_bedrooms_price = px.box(data, x='No. of Bedrooms', y='Price',
                            title='Price Distribution by Number of Bedrooms',
                            labels={'No. of Bedrooms': 'Number of Bedrooms', 'Price': 'Price (in INR)'})
fig_bedrooms_price.show()
# Price distribution for each encoded Location
fig_location_price = px.box(data, x='Location', y='Price',
                            title='Price Distribution by Location',
                            labels={'Location': 'Location (Encoded)', 'Price': 'Price (in INR)'})
fig_location_price.show()
# Pairwise correlation matrix of the columns in `data`, drawn as a heatmap
correlation_matrix = data.corr()
fig_corr = px.imshow(correlation_matrix,
                     title='Correlation Matrix',
                     labels=dict(color='Correlation Coefficient'),
                     x=correlation_matrix.columns,
                     y=correlation_matrix.columns)
# Put the column labels above the heatmap
fig_corr.update_xaxes(side="top")
fig_corr.show()