# Linear Regression Implementation
TOC:
* [Setup](#setup)
* [Load and analyze data](#load-analyze)
* [Features Scaling](#feature-scaling)
* [Features Engineering](#FeatureEng)
* [Baseline](#baseline)
* [Linear regression](#linear-regression)
* [Cost function](#cost)
* [Gradient Descent](#gradien-descent)

## Set up <a id='setup'></a>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics

# Use plotly because its plots are interactive
import plotly.express as px
# sub plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go

## Load and Analyze data <a id='load-analyze'></a>

In [2]:
# Boston housing data shipped with Keras: shuffled with a fixed seed,
# then 20% of the examples are held out as the test split
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.boston_housing.load_data(
    path='boston_housing.npz',
    seed=113,
    test_split=0.2,
)

In [4]:
# Wrap the raw arrays in labelled DataFrames: 13 named feature columns and a single MEDV target column
X_train_df = pd.DataFrame(
    X_train,
    columns='CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT'.split(),
)
y_train_df, y_test_df = (
    pd.DataFrame(target, columns=['MEDV']) for target in (y_train, y_test)
)

In [5]:
def plot_relation(X: pd.DataFrame, y: pd.DataFrame, columns: int):
    '''
    Plot relation between X input and y target, one scatter subplot per feature

    Subplots are filled column by column: top to bottom, then the next column.

    Args:
        X (pd.DataFrame (m,n))  : Data, m examples, n features
        y (pd.DataFrame (m,1))  : target values, m values (only the first column is plotted)
        columns (int)           : number of desired subplot columns, must be >= 1

    Output
        Interactive graph (displayed with fig.show(), nothing is returned)

    Raises:
        ValueError: if columns < 1 or X has no feature column
    '''
    if columns < 1:
        raise ValueError('columns must be >= 1, got {}'.format(columns))

    n_features = X.shape[1]
    if n_features == 0:
        raise ValueError('X must contain at least one feature column')

    # Ceiling division: enough rows to hold every feature
    rows = (n_features + columns - 1) // columns

    # str() keeps the labels usable when the frame has default integer column labels
    target_name = str(y.columns[0])
    target = y.iloc[:, 0]

    fig = make_subplots(rows=rows, cols=columns)

    for i in range(n_features):
        # Column-major placement, 1-based as plotly expects
        col, row = divmod(i, rows)
        row += 1
        col += 1
        feature_name = str(X.columns[i])

        fig.add_trace(go.Scatter(
            x=X.iloc[:, i],
            y=target,
            mode='markers',
            name=feature_name,
            customdata=X.index.values,                                  # Row index of each point, shown on hover to ease analysis
            hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
        ), row=row, col=col)

        # Label the axes so each subplot can be read on its own
        fig.update_xaxes(title_text=feature_name, row=row, col=col)
        fig.update_yaxes(title_text=target_name, row=row, col=col)

    fig.update_layout(height=400 * rows, width=600 * columns, title_text='Relationship between All features / ' + target_name)
    fig.show()

In [10]:
plot_relation(X_train_df, y_train_df, 2)

## Features scaling <a id='feature-scaling'></a>

## Feature engineering <a class='anchor' id='FeatureEng'></a>

In [71]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

# poly = PolynomialFeatures(6)
# X_train = poly.fit_transform(X_train)
# # poly = PolynomialFeatures(interaction_only=True)
# # poly.fit_transform(X_train)
# X_test = poly.transform(X_test)

# Polynomial feature expansion feeding a ridge regression, fitted on the training split
degree, alpha = 2, 0.01
est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha)).fit(X_train, y_train)

# Predictions for both splits
y_train_pred, y_test_pred = (est.predict(features) for features in (X_train, X_test))

# Mean squared error of those predictions on each split
train_error, test_error = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

In [70]:
# Display the test-set mean squared error stored in test_error
test_error

9.937220171691292

In [72]:
# Model evaluation on the training set (the polynomial degree is whatever `degree` was when the pipeline was built)
r2_train = metrics.r2_score(y_train, y_train_pred)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
n_train, p_train = len(y_train), X_train.shape[1]

print('R^2:', r2_train)
# NOTE(review): p_train counts the raw input columns, not the expanded polynomial
# terms the pipeline actually fits — confirm this is the intended penalty
print('Adjusted R^2:', 1 - (1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_train_pred))
print('MSE:', mse_train)
print('RMSE:', np.sqrt(mse_train))

R^2: 0.9336257998521866
Adjusted R^2: 0.9314133265139262
MAE: 1.7350624981531888
MSE: 5.616734339017465
RMSE: 2.3699650501679272


In [73]:
# Model evaluation on the held-out test set (the polynomial degree is whatever `degree` was when the pipeline was built)
r2_test = metrics.r2_score(y_test, y_test_pred)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
n_test, p_test = len(y_test), X_test.shape[1]

print('R^2:', r2_test)
# NOTE(review): p_test counts the raw input columns, not the expanded polynomial
# terms the pipeline actually fits — confirm this is the intended penalty
print('Adjusted R^2:', 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_test - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', mse_test)
print('RMSE:', np.sqrt(mse_test))

R^2: 0.8806251711280785
Adjusted R^2: 0.8629902532265445
MAE: 2.380194444920558
MSE: 9.937220171691292
RMSE: 3.152335669260381


In [78]:
def plot_result(X: pd.DataFrame, y: pd.DataFrame, y_pred: pd.DataFrame, columns: int):
    '''
    Plot actual and predicted target values against every feature of X

    One subplot is drawn per feature; each subplot overlays two scatter traces
    (actual y and predicted y_pred) that share the feature values on the x axis.
    Subplots are filled column by column: top to bottom, then the next column.

    Args:
        X (pd.DataFrame (m,n))       : Data, m examples, n features
        y (pd.DataFrame (m,1))       : actual target values (only the first column is plotted)
        y_pred (pd.DataFrame (m,1))  : predicted target values (only the first column is plotted)
        columns (int)                : number of desired subplot columns, must be >= 1

    Output
        Interactive graph (displayed with fig.show(), nothing is returned)

    Raises:
        ValueError: if columns < 1 or X has no feature column
    '''
    if columns < 1:
        raise ValueError('columns must be >= 1, got {}'.format(columns))

    n_features = X.shape[1]
    if n_features == 0:
        raise ValueError('X must contain at least one feature column')

    # Ceiling division: enough rows to hold every feature
    rows = (n_features + columns - 1) // columns

    # str() keeps the labels usable when the frames have default integer column labels
    target_name = str(y.columns[0])
    series_by_kind = (
        ('actual', y.iloc[:, 0]),
        ('predicted', y_pred.iloc[:, 0]),
    )

    fig = make_subplots(rows=rows, cols=columns)

    for i in range(n_features):
        # Column-major placement, 1-based as plotly expects
        col, row = divmod(i, rows)
        row += 1
        col += 1
        feature_name = str(X.columns[i])

        for kind, values in series_by_kind:
            fig.add_trace(go.Scatter(
                x=X.iloc[:, i],
                y=values,
                mode='markers',
                name='{} ({})'.format(feature_name, kind),              # Distinct legend entry for actual vs predicted
                customdata=X.index.values,                              # Row index of each point, shown on hover to ease analysis
                hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
            ), row=row, col=col)

        # Label the axes so each subplot can be read on its own
        fig.update_xaxes(title_text=feature_name, row=row, col=col)
        fig.update_yaxes(title_text=target_name, row=row, col=col)

    fig.update_layout(
        height=400 * rows,
        width=600 * columns,
        title_text='Relationship between All features / ' + target_name + ' (actual vs predicted)'
    )
    fig.show()

In [79]:
# Build labelled frames for the test split so legends show feature names instead of 0..12;
# the feature labels are reused from the training frame so both splits stay consistent
X_test_df = pd.DataFrame(X_test, columns=X_train_df.columns)
y_test_df = pd.DataFrame(y_test, columns=['MEDV'])
y_test_pred_df = pd.DataFrame(y_test_pred, columns=['MEDV_pred'])

# Overlay actual vs predicted MEDV for every feature, two subplots per row
plot_result(X_test_df, y_test_df, y_test_pred_df, 2)