<a href="https://colab.research.google.com/github/marykargozar/linear_regression/blob/main/Linear_Regression_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Location of the Boston housing CSV. The default is the Colab upload path this
# notebook was written against; change this one constant when the file lives elsewhere.
DATA_PATH = '/content/boston_housing (1).csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Count the missing entries in every column.
df.isnull().sum()

In [None]:
# Preview the first three rows of the data set.
df.iloc[:3]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
len(df.index)       # number of rows

506

In [None]:
len(df.columns)     # number of columns

14

In [None]:
# Distribution of the MEDV column with a kernel-density overlay; bars drawn without outlines.
sns.histplot(data=df, x='MEDV', kde=True, linewidth=0)


In [None]:
# Predictors: every column except the last one (a column slice, not a row slice).
features = df.iloc[:, :-1]

# Response: the last column of the frame, as a Series (presumably MEDV -- confirm against the CSV).
target = df.iloc[:, -1]

In [None]:
# Min-max scaling: (value - min) / (max - min), applied per feature column and to the target.
feature_min = features.min()
feature_span = features.max() - feature_min
features = (features - feature_min) / feature_span

target_min = target.min()
target = (target - target_min) / (target.max() - target_min)


In [None]:
# Scatter every (scaled) feature against the (scaled) target, two panels per row.
# The grid size is derived from the number of features instead of being hard-coded,
# and panels that have no feature to show are hidden rather than drawn empty.
n_features = len(features.columns)
n_cols = 2
n_rows = -(-n_features // n_cols)          # ceiling division
# 30 / 7 keeps the original per-row height (7 rows were 30 inches tall).
fig, axs = plt.subplots(n_rows, n_cols, figsize=(14, 30 * n_rows / 7), squeeze=False)
for index, feature in enumerate(features.columns):
    panel = axs[index // n_cols, index % n_cols]
    panel.scatter(x=features[feature], y=target)
    panel.set_xlabel(feature)
    panel.set_ylabel("Target")
# Hide the trailing panels left over when n_features is not a multiple of n_cols.
for panel in axs.ravel()[n_features:]:
    panel.set_visible(False)
plt.show()
plt.close(fig)

In [None]:
# Pairwise column correlations, rounded to two decimals so the cell annotations stay readable.
correlation_matrix = df.corr().round(decimals=2)
sns.heatmap(correlation_matrix, annot=True)
plt.show()

In [None]:
# Columns used as predictors and the name of the response column.
independent_variables = ['RM', 'LSTAT']
dependent_variable = ['MEDV']

# Design matrix (one column per selected feature) and a flat 1-D target vector, both as NumPy arrays.
X = features.loc[:, independent_variables].to_numpy()
y = target.to_numpy().ravel()

*Linear Regression using OLS (Ordinary Least Squares): a method for estimating the parameters (coefficients) of a linear regression model.*

In [None]:
# Append a column of ones so the intercept is estimated as the last weight.
X_aug = np.hstack([X, np.ones((X.shape[0], 1))])        # X.shape[0] => number of rows

# Least-squares solution of X_aug @ W ~= y. This minimises the same objective as the
# closed form W = (X^T X)^-1 X^T y, but avoids forming and inverting X^T X explicitly,
# which loses precision on ill-conditioned data and fails outright when X^T X is singular.
W = np.linalg.lstsq(X_aug, y, rcond=None)[0]
W = W.ravel()        # flat vector: [feature weights..., bias]
print(W)
print(f'Coefficients : {W[:-1]} , Bias: {W[-1]}')

[ 0.59088219 -0.51731258  0.2371775 ]
Coefficients : [ 0.59088219 -0.51731258] , Bias: 0.2371774951579


In [None]:
X_aug.shape   # => (rows, 3): the two selected feature columns followed by the bias column of ones

(506, 3)

In [None]:
# Check the container type of the augmented design matrix.
type(X_aug)

numpy.ndarray

 *MSE :*

In [None]:
# OLS predictions: feature weights (all entries of W but the last) applied to X, plus the bias W[-1].
y_pred = W[:-1] @ X.T + W[-1]
# Mean squared error: sum of squared residuals divided by the number of samples.
squared_residuals = np.square(y - y_pred)
MSE = squared_residuals.sum() / y.shape[0]
print(f"MSE: {MSE:.4f}")                  # four decimal places

MSE: 1.7733


In [None]:
# The two feature columns of X stacked side by side as an (n_samples, 2) array.
data = np.column_stack((X[:, 0], X[:, 1]))

# Regular 100 x 100 grid spanning the observed range of both features.
mn = data.min(axis=0)
mx = data.max(axis=0)
first_axis = np.linspace(mn[0], mx[0], 100)
second_axis = np.linspace(mn[1], mx[1], 100)
XX, YY = np.meshgrid(first_axis, second_axis)

# Height of the fitted OLS plane at every grid point.
Z = W[0] * XX + W[1] * YY + W[-1]

# Draw the plane and overlay the observations in red.
fig = plt.figure(figsize=(20, 12))
ax = fig.add_subplot(projection="3d")
ax.plot_surface(XX, YY, Z, alpha=0.7)
ax.scatter(data[:, 0], data[:, 1], y, c="r", s=50)
ax.set_xlabel("RM")
ax.set_ylabel("LSTAT")
ax.set_zlabel("MEDV")
plt.show()

*Linear Regression with scikit-learn*

In [None]:
%%time
from sklearn.linear_model import LinearRegression

# Build the estimator, then fit it on the selected features and the target.
Lr = LinearRegression()
Lr.fit(X, y)
# Report the learned feature weights and the intercept.
print(f'Coefficents= {Lr.coef_}  Bias= {Lr.intercept_}' )


Coefficents= [ 0.01539928  0.00799423  0.01600406 ...  0.00227875 -0.02139339
  0.02745186]  Bias= 9.994789047871407
CPU times: user 13.2 s, sys: 1.67 s, total: 14.9 s
Wall time: 10.1 s


*MSE :*

In [None]:
from sklearn.metrics import mean_squared_error

# In-sample predictions of the scikit-learn model and their mean squared error.
y_pred = Lr.predict(X)
MSE = mean_squared_error(y_true=y, y_pred=y_pred)
print(f'{MSE:.4f}')

1.7733


In [None]:
# The two feature columns of X side by side as an (n_samples, 2) array.
data = np.c_[X[:,0], X[:,1]]

# Create X, Y data to predict: a 100 x 100 grid over the observed range of both features.
mn = np.min(data, axis=0)
mx = np.max(data, axis=0)
XX, YY = np.meshgrid(np.linspace(mn[0], mx[0], 100), np.linspace(mn[1], mx[1], 100))

# One row per grid point: column 0 holds the x-coordinates of the grid,
# column 1 the y-coordinates -- the same column order the model was fitted on.
X_ = np.c_[XX.flatten(), YY.flatten()]

# calculate prediction
# The fitted estimator is bound to `Lr` (capital L); calling `lr.predict` raised NameError.
Z = Lr.predict(X_).reshape(XX.shape)
# plot the surface and overlay the observations in green
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(projection="3d")
ax.plot_surface(XX, YY, Z, alpha=0.7)
ax.scatter(data[:, 0], data[:, 1], y, c="g", s=50)
ax.set_xlabel("RM")
ax.set_ylabel("LSTAT")
ax.set_zlabel("MEDV")
plt.show()

***Linear Regression - Gradient Descent***

In [None]:
%%time
# Hyper-parameters of the batch gradient-descent loop.
learning_rate = 0.005
num_itterations = 1000

# Design matrix with a trailing column of ones; theta = [feature weights..., bias], started at zero.
X_aug = np.column_stack((X, np.ones(X.shape[0])))
theta = np.zeros(X_aug.shape[1])
n_samples = len(X_aug)
for i in range(num_itterations):
    # Current predictions and their residuals (prediction minus target).
    pred = X_aug @ theta
    error = pred - y
    # Step against X_aug^T . error, scaled by the learning rate and averaged over the samples.
    step = (X_aug.T @ error * learning_rate) / n_samples
    theta = theta - step
print(f"Coefficients: {theta[:-1]}, Bias: {theta[-1]}")

Coefficients: [ 0.02189741  0.45179329  0.31186596 ...  0.07862193 -0.09717098
 -0.012529  ], Bias: 9.546475876581988
CPU times: user 59.3 s, sys: 3.22 s, total: 1min 2s
Wall time: 37.5 s


***MSE :***

In [None]:
# Gradient-descent predictions: feature weights (all of theta but the last entry) applied to X, plus the bias.
y_pred = theta[:-1] @ X.T + theta[-1]
# Mean squared error: sum of squared residuals divided by the number of samples.
squared_residuals = np.square(y - y_pred)
MSE = squared_residuals.sum() / y.shape[0]
print(f"MSE: {MSE:.4f}")

MSE: 878.3065


In [None]:
# The two feature columns of X stacked side by side as an (n_samples, 2) array.
data = np.column_stack((X[:, 0], X[:, 1]))

# Regular 100 x 100 grid spanning the observed range of both features.
mn = data.min(axis=0)
mx = data.max(axis=0)
first_axis = np.linspace(mn[0], mx[0], 100)
second_axis = np.linspace(mn[1], mx[1], 100)
XX, YY = np.meshgrid(first_axis, second_axis)

# Height of the gradient-descent plane at every grid point.
Z = theta[0] * XX + theta[1] * YY + theta[-1]

# Draw the plane and overlay the observations in red.
fig = plt.figure(figsize=(20, 12))
ax = fig.add_subplot(projection="3d")
ax.plot_surface(XX, YY, Z, alpha=0.7)
ax.scatter(data[:, 0], data[:, 1], y, c="r", s=50)
ax.set_xlabel("RM")
ax.set_ylabel("LSTAT")
ax.set_zlabel("MEDV")
plt.show()

***Residual analysis***

In [None]:
# Residuals of the most recently computed `y_pred`. When the notebook is run top to
# bottom, that is the gradient-descent prediction from the MSE cell just before this section.
# `plt` is already imported in the notebook's first cell, so it is not re-imported here.
rsd = y - y_pred
plot = plt.hist(rsd, bins=16)      # bins = number of bins in hist
plt.xlabel("Residual (y - y_pred)")
plt.ylabel("Count")
plt.title("Distribution of residuals")
plt.show()

***Scatter plot: residuals versus targets***

In [None]:
                    # ==> plt.scatter(x=y , y=rsd)
# Pair each target with its residual, order the rows by target value, and plot residual against target.
rsd_analysis = pd.DataFrame({"y": y, "rsd": rsd})
analysis_sorted = rsd_analysis.sort_values(by="y")
plt.scatter(x=analysis_sorted["y"], y=analysis_sorted["rsd"])