# Linear Regression Implementation

In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Use plotly as it is an interaction plot
import plotly.express as px
# sub plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go

### Load data

In [5]:
# Load data from TF dataset
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.boston_housing.load_data(
    path='boston_housing.npz', test_split=0.2, seed=113
)

In [6]:
# We got X_train np.array with row:404, col:13
print(f"X Shape: {X_train.shape}, X Type:{type(X_train)})")
# We got y_train np.array with row:404
print(f"y Shape: {y_train.shape}, y Type:{type(y_train)})")

X Shape: (404, 13), X Type:<class 'numpy.ndarray'>)
y Shape: (404,), y Type:<class 'numpy.ndarray'>)


In [7]:
# To make sure you understand the data. Read dataspec first.

# Variables in order:

#  X_dataset
#  CRIM     per capita crime rate by town
#  ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
#  INDUS    proportion of non-retail business acres per town
#  CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#  NOX      nitric oxides concentration (parts per 10 million)
#  RM       average number of rooms per dwelling
#  AGE      proportion of owner-occupied units built prior to 1940
#  DIS      weighted distances to five Boston employment centres
#  RAD      index of accessibility to radial highways
#  TAX      full-value property-tax rate per $10,000
#  PTRATIO  pupil-teacher ratio by town
#  B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#  LSTAT    % lower status of the population

#  y_dataset
#  MEDV     Median value of owner-occupied homes in $1000's

Create pd.Dataframe from np.array for radability

In [8]:
X_df = pd.DataFrame(X_train, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])
y_df = pd.DataFrame(y_train, columns=['MEDV'])

### Take a look at dataset

In [9]:
X_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1.23247,0.0,8.14,0.0,0.5380,6.142,91.7,3.9769,4.0,307.0,21.0,396.90,18.72
1,0.02177,82.5,2.03,0.0,0.4150,7.610,15.7,6.2700,2.0,348.0,14.7,395.38,3.11
2,4.89822,0.0,18.10,0.0,0.6310,4.970,100.0,1.3325,24.0,666.0,20.2,375.52,3.26
3,0.03961,0.0,5.19,0.0,0.5150,6.037,34.5,5.9853,5.0,224.0,20.2,396.90,8.01
4,3.69311,0.0,18.10,0.0,0.7130,6.376,88.4,2.5671,24.0,666.0,20.2,391.43,14.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.21977,0.0,6.91,0.0,0.4480,5.602,62.0,6.0877,3.0,233.0,17.9,396.90,16.20
400,0.16211,20.0,6.96,0.0,0.4640,6.240,16.3,4.4290,3.0,223.0,18.6,396.90,6.59
401,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83
402,2.14918,0.0,19.58,0.0,0.8710,5.709,98.5,1.6232,5.0,403.0,14.7,261.95,15.79


In [10]:
y_df

Unnamed: 0,MEDV
0,15.2
1,42.3
2,50.0
3,21.1
4,17.7
...,...
399,19.4
400,25.2
401,19.4
402,19.4


In [11]:
# Plot and see relationship
def plot_relation(X: pd.DataFrame, y: pd.DataFrame, columns):
    '''
    Plot relation between X input and y target
    
    Args:
        X (pd.DataFrame (m,n))  : Data, m examples, n features
        y (pd.DataFrame (m,1))  : target values, m values
        columns (int)             : number of desired subplot column
        
    Output
        Interation graph
    
    '''
    
    m = X.shape[1]
    rows = m // columns     # Get row
    frac = m % columns      # Get fractual
    row = 0
    col = 1

    if frac > 0:
        rows += 1
            
    fig = make_subplots(rows=rows, cols=columns)

    for i in range(m):
            
        if row >= rows:
            row = 1
            col += 1
        else:
            row += 1
        
        fig.add_trace(go.Scatter(
            x=X.iloc[:,i],
            y=y[y.columns[0]],
            mode='markers',
            name=X.columns[i],
            customdata=X.index.values,                                  # Add customdata for data's row index for more convinient to analysis
            hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
        ), row=row, col=col)

    fig.update_layout(height=400 * rows, width=600 * columns, title_text='Relationship between All features / ' + y.columns.values[0])
    fig.show()
    
plot_relation(X_df, y_df, 2)

### Check the datatype and null values

In [12]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     404 non-null    float64
 1   ZN       404 non-null    float64
 2   INDUS    404 non-null    float64
 3   CHAS     404 non-null    float64
 4   NOX      404 non-null    float64
 5   RM       404 non-null    float64
 6   AGE      404 non-null    float64
 7   DIS      404 non-null    float64
 8   RAD      404 non-null    float64
 9   TAX      404 non-null    float64
 10  PTRATIO  404 non-null    float64
 11  B        404 non-null    float64
 12  LSTAT    404 non-null    float64
dtypes: float64(13)
memory usage: 41.2 KB


There are all numeric (no catagorial) and no null data.

So no need to preprocessing the null and encoding

### Check min max for feature scaling

In [13]:
X_df.agg(['min', 'max'])

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,188.0,12.6,0.32,1.73
max,88.9762,100.0,27.74,1.0,0.871,8.725,100.0,10.7103,24.0,711.0,22.0,396.9,37.97


Min in  0 and max is 396

So Scaling is needed.

In this case we will use z-score normalization

### Z-score normalization

$ x^{(i)}_j = \frac{x^{(i)}_j - \mu_j }{ \sigma_j}$

$ \mu_j = \frac{1}{m} \sum_{i=0}^{m-1} x^{(i)}_j $

$ \sigma^2 = \frac{1}{m} \sum_{i=0}^{m-1} (x^{(i)}_j - \mu_j)^2 $

In [14]:
# Z-score normalization loop
def zscore_normalize_features(X):
    
    m = X.shape[0]
    n = X.shape[1]
    
    mu = np.zeros(n)
    sigma = np.zeros(n)
    X_norm = np.zeros((m,n))
    

    
    for j in range(n):
        
        x_j_sum = 0
        sigma_j_sum = 0
        
        for i in range(m):
            x_j_sum += X[i][j]
        mu[j] = x_j_sum / m
        
        for i in range(m):
            sigma_j_sum += (X[i][j] - mu[j]) ** 2
        sigma[j] = (sigma_j_sum / m) ** (1/2)

        for i in range(m):
            X_norm[i][j] = (X[i][j] - mu[j]) / sigma[j]
            
    return (X_norm, mu, sigma)

In [15]:
# Z-score normalization np
def zscore_normalize_features(X: np.array):
    '''
    Feature scaling: Z-score normalize
    Args:
        X       (np.array (m,n))    : Data, m,n examples
    Returns
        X_norm  (np.array (m,n))    : Data with z-score normallized, m,n examples
    '''
    
    # Mean
    mu = np.mean(X, axis=0)
    # Standard deviation
    sigma = np.std(X, axis=0)
    # Z-score normalize
    X_norm = (X - mu) / sigma   
            
    return X_norm

Compute and check if min and max

In [16]:
X_zscore = zscore_normalize_features(X_train)
armin, armax = np.min(X_zscore[0]), np.max(X_zscore[0])
print(f'min: {armin}, max: {armax}')

min: -0.6262490526587586, max: 1.1485004386235735


In [17]:
X_zscore_df  = pd.DataFrame(X_zscore, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])

Let's plot and see X scale of all features

In [18]:
# Plot and see relationship
def plot_relation(X: pd.DataFrame, y: pd.DataFrame, columns):
    '''
    Plot relation between X input and y target
    
    Args:
        X (pd.DataFrame (m,n))  : Data, m examples, n features
        y (pd.DataFrame (m,1))  : target values, m values
        columns (int)             : number of desired subplot column
        
    Output
        Interation graph
    
    '''
    
    m = X.shape[1]
    rows = m // columns     # Get row
    frac = m % columns      # Get fractual
    row = 0
    col = 1

    if frac > 0:
        rows += 1
            
    fig = make_subplots(rows=rows, cols=columns)

    for i in range(m):
            
        if row >= rows:
            row = 1
            col += 1
        else:
            row += 1
        
        fig.add_trace(go.Scatter(
            x=X.iloc[:,i],
            y=y[y.columns[0]],
            mode='markers',
            name=X.columns[i],
            customdata=X.index.values,                                  # Add customdata for data's row index for more convinient to analysis
            hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
        ), row=row, col=col)

    fig.update_layout(height=400 * rows, width=600 * columns, title_text='Relationship between All features / ' + y.columns.values[0])
    fig.show()
    
plot_relation(X_zscore_df, y_df, 2)

You will see that X scale is between -4 to 10

much better than 0 to 700

# Linear regression

### Formula

$ f_{w,b}(x^{(i)}) = wx^{(i)} + b $

In [20]:
# Vectorized implementation
def compute_linear_regression_v(X, w, b):
    f_wb = X.dot(w.T) + b
    return f_wb

In [21]:
w_init = np.ones(X_zscore.shape[1])
b_init = 1
f_wb = compute_linear_regression_v(X=X_zscore, w=w_init , b=b_init)

# Cost function

$ J(w, b) = \frac{1}{2m} \sum_{i=1}^{m} (f_{w,b}(x^{(i)}) - y^{(i)})^2 $

In [23]:
def compute_cost_v(X, y, w, b):
    f_wb = compute_linear_regression_v(X, w, b)
    cost = ((f_wb - y) ** 2).mean() / 2
    return cost

In [24]:
compute_cost_v(X_zscore, y_train, w=w_init, b=b_init)

297.91773611355814

Gradeint Descent

$ \{ $
    
$ w^{(i)}_j := w^{(i)}_j - \alpha \frac{\sigma}{\sigma w}J(w, b)x^{(i)}_j $

$ b^{(i)} := b^{(i)} - \alpha \frac{\sigma}{\sigma w}J(w, b) $

$ \} {stimulous update} $


$ \frac{\sigma}{\sigma w}J(w, b) = \frac{1}{m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)}) $

In [25]:
def gradient_function_v(X, y, w, b):
    
    m = X.shape[0]
    
    dj_dw = 0
    dj_db = 0

    f_wb = compute_linear_regression_v(X, w, b)
    error = f_wb - y
    dj_dw = error.T.dot(X)
    dj_db = sum(error)
    dj_dw = dj_dw / m
    dj_db = dj_db / m
        
    return dj_dw, dj_db

In [26]:
dj_dw, dj_db = gradient_function_v(X_zscore, y_train, w=w_init, b=b_init)

In [41]:
def gradient_descent(X, y, w, b, alpha, num_iters, cost_function, gradient_function):
    
    j_hist = []
    p_hist = []

    for i in range(num_iters):
        
        dj_dw, dj_db = gradient_function(X, y, w, b)
        
        w = w - np.dot(alpha, dj_dw)
        b = b - np.dot(alpha, dj_db)
    
        j_hist.append(cost_function(X, y, w, b))
        p_hist.append([w,b])
        
    return w, b, j_hist, p_hist

In [42]:
w_out, b_out, j_hist, p_hist = gradient_descent(X_zscore, y_train, w=w_init, b=b_init,alpha=0.3, num_iters=10000, cost_function=compute_cost_v, gradient_function=gradient_function_v)

Let's check is decrease gradually

In [43]:
j_hist

[162.4424484628074,
 93.61480122111978,
 57.90689435043225,
 38.88548838643343,
 28.41710432824258,
 22.429776021063496,
 18.855542717959462,
 16.62476889279441,
 15.171112616186964,
 14.185885606873507,
 13.494946313867947,
 12.996205737683121,
 12.627365925842561,
 12.34890030958685,
 12.13482736503636,
 11.967543535647945,
 11.834821273003307,
 11.72799621064244,
 11.64083018205859,
 11.568773049800182,
 11.50846906945383,
 11.45741875274458,
 11.413742855126364,
 11.376015263985925,
 11.343143376961686,
 11.314281753961723,
 11.288769370488321,
 11.266083765429826,
 11.245807365805266,
 11.227602634974028,
 11.211193641919929,
 11.196352320711682,
 11.18288816766154,
 11.170640466819044,
 11.159472381696196,
 11.149266429936823,
 11.139920987314142,
 11.131347561679016,
 11.123468646098729,
 11.116216010474703,
 11.109529327495855,
 11.103355055551503,
 11.097645520859919,
 11.092358155499976,
 11.087454858669123,
 11.08290145635263,
 11.078667240420398,
 11.074724572510258,
 11.07

In [44]:
print(f'result cost:{min(j_hist)} with parameter w:{w_out} b:{b_out}')

result cost:11.00240041917407 with parameter w:[-1.10749194  1.35308963  0.02710126  0.9943535  -2.40156557  2.3962877
  0.2111952  -3.47121628  2.90814885 -1.95741889 -1.98298545  0.81955092
 -4.02739073] b:22.39504950495051


Test with gradient descent output w, b parameters

In [45]:
compute_cost_v(X_zscore, y_train, w=w_out, b=b_out)

11.002400419174071

Previously we test with known data (Training dataset)

Now let's check with unknown data (Test dataset)

In [46]:
X_test_zscore = zscore_normalize_features(X_test)
X_test_zscore_df  = pd.DataFrame(X_test_zscore, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])
y_test_df = pd.DataFrame(y_test, columns=['MEDV'])

In [47]:
compute_cost_v(X_test_zscore, y_test, w=w_out, b=b_out)

10.491472595351286

Wow! better than training dataset

#### Let's plot Cost function

In [48]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(
    go.Scatter(
        x=[i for i in range(len(j_hist))],
        y=j_hist,
        mode='lines+markers',
        name=X_df.columns[5]
    ))

# include shapes in layout
fig.update_layout(height=400, width=600, title_text="Cost Function")
fig.show()

Cost not decrease significantly after 300 iteration

Maybe we should consider to set threshold to interupt iteration after reach certain value ($ \epsilon $)

In [49]:
print(f'result cost:{min(j_hist)} with parameter w:{w_out} b:{b_out}')

result cost:11.00240041917407 with parameter w:[-1.10749194  1.35308963  0.02710126  0.9943535  -2.40156557  2.3962877
  0.2111952  -3.47121628  2.90814885 -1.95741889 -1.98298545  0.81955092
 -4.02739073] b:22.39504950495051


Let's check our result with parameter w,b

and plot to see how the model fit the targets

In [50]:
# Plot and see relationship
def plot_relation(X: pd.DataFrame, y: pd.DataFrame, columns):
    '''
    Plot relation between X input and y target
    
    Args:
        X (pd.DataFrame (m,n))  : Data, m examples, n features
        y (pd.DataFrame (m,1))  : target values, m values
        columns (int)             : number of desired subplot column
        
    Output
        Interation graph
    
    '''
    
    m = X.shape[1]
    rows = m // columns     # Get row
    frac = m % columns      # Get fractual
    row = 0
    col = 1

    if frac > 0:
        rows += 1
            
    fig = make_subplots(rows=rows, cols=columns)

    for i in range(m):
            
        if row >= rows:
            row = 1
            col += 1
        else:
            row += 1
        
        fig.add_trace(
            go.Scatter(
                x=X.iloc[:,i],
                y=y[y.columns[0]],
                mode='markers',
                name=X.columns[i],
                customdata=X.index.values,                                  # Add customdata for data's row index for more convinient to analysis
                hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
            ), 
            row=row, col=col)
        
        fig.add_trace(
            go.Scatter(
                x=X.iloc[:,i],
                y=np.dot(X_zscore, w_out) + b_out,
                mode='markers',
                name=f'{X.columns[i]} prediction',
                customdata=X.index.values,                                  # Add customdata for data's row index for more convinient to analysis
                hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
            ),
            row=row, col=col)

    fig.update_layout(height=400 * rows, width=600 * columns, title_text='Relationship between All features / ' + y.columns.values[0])
    fig.show()
    
plot_relation(X_zscore_df, y_df, 2)

Plot again with test set

In [51]:
# Plot and see relationship
def plot_relation(X: pd.DataFrame, y: pd.DataFrame, columns):
    '''
    Plot relation between X input and y target
    
    Args:
        X (pd.DataFrame (m,n))  : Data, m examples, n features
        y (pd.DataFrame (m,1))  : target values, m values
        columns (int)             : number of desired subplot column
        
    Output
        Interation graph
    
    '''
    
    m = X.shape[1]
    rows = m // columns     # Get row
    frac = m % columns      # Get fractual
    row = 0
    col = 1

    if frac > 0:
        rows += 1
            
    fig = make_subplots(rows=rows, cols=columns)

    for i in range(m):
            
        if row >= rows:
            row = 1
            col += 1
        else:
            row += 1
        
        fig.add_trace(
            go.Scatter(
                x=X.iloc[:,i],
                y=y[y.columns[0]],
                mode='markers',
                name=X.columns[i],
                customdata=X.index.values,                                  # Add customdata for data's row index for more convinient to analysis
                hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
            ), 
            row=row, col=col)
        
        fig.add_trace(
            go.Scatter(
                x=X.iloc[:,i],
                y=np.dot(X_zscore, w_out) + b_out,
                mode='markers',
                name=f'{X.columns[i]} prediction',
                customdata=X.index.values,                                  # Add customdata for data's row index for more convinient to analysis
                hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
            ),
            row=row, col=col)

    fig.update_layout(height=400 * rows, width=600 * columns, title_text='Relationship between All features / ' + y.columns.values[0])
    fig.show()
    
plot_relation(X_test_zscore_df, y_test_df, 2)