## 0, Load Dependencies

In [9]:
import numpy as np
import pandas as pd
import plotly.express as px


## 1, Load Data

In [10]:
# Load the Boston Housing dataset from a CSV hosted on GitHub.
# pd.read_csv fetches the URL directly, so this cell needs network access.
data_url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(data_url)
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


This dataset contains information collected by the U.S. Census Service concerning housing in the area of Boston, Massachusetts. The dataset contains 506 observations and 14 attributes: 13 predictor variables plus the target `medv`, the median value of owner-occupied homes.

## 2, Prepare Data for modeling

In [11]:
# Split the frame into a feature matrix and a target column vector.
# X: every column except the target "medv", as a 2-D NumPy array.
X = df.drop(columns="medv").to_numpy()
# y: the "medv" column reshaped to (n_samples, 1) so it lines up with X row-wise.
y = df["medv"].to_numpy().reshape(-1, 1)


In [12]:
# Build the design matrix: a leading column of ones (the bias/intercept term)
# followed by every feature column.
#
# The matrix is rebuilt from `df` instead of from the current value of `X`, so
# this cell is idempotent: re-running it always yields exactly one bias column.
# Prepending to `X` in place would add another ones column on every re-run,
# which makes X^T X singular and silently corrupts the fit.
X = np.hstack([np.ones((len(df), 1)), df.drop("medv", axis=1).values])


## 3, Modeling
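Linear regression is a statistical method used to study the relationship between a dependent variable and one or more independent variables. The goal of linear regression is to find the line of best fit that represents the relationship between the variables. With a bias column in $X$, the least-squares parameters have the closed-form solution $\theta = (X^\top X)^{-1} X^\top y$ (the normal equation); the cell below computes these least-squares parameters.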

In [14]:
# Compute the ordinary-least-squares parameters, i.e. the theta that minimizes
# ||X @ theta - y||^2.
#
# np.linalg.lstsq is used instead of explicitly forming inv(X^T X) @ X^T @ y:
# inverting X^T X squares the condition number of the problem and raises (or
# returns garbage) when the columns of X are linearly dependent, whereas lstsq
# returns the least-squares solution directly.
# rcond=None selects NumPy's machine-precision-based cutoff for small singular values.
# Index [0] is the solution; theta has shape (n_columns, 1) because y is (n, 1).
theta = np.linalg.lstsq(X, y, rcond=None)[0]

# Predict using the fitted parameters; y_pred has shape (n_samples, 1)
y_pred = X.dot(theta)


In [19]:
# Peek at the first five predictions
y_pred[:5]

array([[30.00384338],
       [25.02556238],
       [30.56759672],
       [28.60703649],
       [27.94352423]])

## 4, Evaluation

In [20]:
# Residuals of the fit, reused by both metrics below
residuals = y - y_pred

# R-squared: 1 - (residual sum of squares / total sum of squares)
y_mean = y.mean()
ss_res = np.square(residuals).sum()
ss_tot = np.square(y - y_mean).sum()
r_squared = 1 - ss_res / ss_tot

# RMSE: square root of the mean squared residual
mse = np.square(residuals).mean()
rmse = np.sqrt(mse)


We plot the predicted values against the actual `y` values in a scatter plot. We also calculate the R-squared and RMSE values to evaluate the performance of the linear regression model.

In [39]:
# Scatter plot of predicted vs. actual target values, with the fit metrics
# (R-squared and RMSE, two decimals each) shown in the title.
plot_title = (
    "Linear Regression: real VS prediction on Boston Housing Dataset <br>"
    f"R-squared: {r_squared:.2f}, RMSE: {rmse:.2f}"
)

# Predictions go on the x-axis and observed values on the y-axis; both arrays
# are flattened from (n, 1) to 1-D before being handed to plotly.
fig = px.scatter(x=y_pred.flatten(), y=y.flatten())
fig.update_layout(
    title=plot_title,
    xaxis_title="Prediction",
    yaxis_title="Real Value",
)
fig.show()


### If we want to improve on this, we can use 5-fold cross-validation together with held-out validation and test sets, since the metrics above are computed on the same data the model was fit on.