# Linear Regression Implementation (Single variable)

In [5]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Use plotly as it is an interaction plot
import plotly.express as px
# sub plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go

### Load data

In [6]:
# Load data from TF dataset
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.boston_housing.load_data(
    path='boston_housing.npz', test_split=0.2, seed=113
)

In [7]:
# We got X_train np.array with row:404, col:13
print(f"X Shape: {X_train.shape}, X Type:{type(X_train)})")
# We got y_train np.array with row:404
print(f"y Shape: {y_train.shape}, y Type:{type(y_train)})")


X Shape: (404, 13), X Type:<class 'numpy.ndarray'>)
y Shape: (404,), y Type:<class 'numpy.ndarray'>)


In [8]:
# To make sure you understand the data. Read dataspec first.

# Variables in order:

#  X_dataset
#  CRIM     per capita crime rate by town
#  ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
#  INDUS    proportion of non-retail business acres per town
#  CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#  NOX      nitric oxides concentration (parts per 10 million)
#  RM       average number of rooms per dwelling
#  AGE      proportion of owner-occupied units built prior to 1940
#  DIS      weighted distances to five Boston employment centres
#  RAD      index of accessibility to radial highways
#  TAX      full-value property-tax rate per $10,000
#  PTRATIO  pupil-teacher ratio by town
#  B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#  LSTAT    % lower status of the population

#  y_dataset
#  MEDV     Median value of owner-occupied homes in $1000's

In [9]:
# Construct np.array to pd.Dataframe for easier handling as it is more human readable
X_df = pd.DataFrame(X_train, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])
y_df = pd.DataFrame(y_train, columns=['MEDV'])

### Take a look at dataset

In [10]:
X_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1.23247,0.0,8.14,0.0,0.5380,6.142,91.7,3.9769,4.0,307.0,21.0,396.90,18.72
1,0.02177,82.5,2.03,0.0,0.4150,7.610,15.7,6.2700,2.0,348.0,14.7,395.38,3.11
2,4.89822,0.0,18.10,0.0,0.6310,4.970,100.0,1.3325,24.0,666.0,20.2,375.52,3.26
3,0.03961,0.0,5.19,0.0,0.5150,6.037,34.5,5.9853,5.0,224.0,20.2,396.90,8.01
4,3.69311,0.0,18.10,0.0,0.7130,6.376,88.4,2.5671,24.0,666.0,20.2,391.43,14.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.21977,0.0,6.91,0.0,0.4480,5.602,62.0,6.0877,3.0,233.0,17.9,396.90,16.20
400,0.16211,20.0,6.96,0.0,0.4640,6.240,16.3,4.4290,3.0,223.0,18.6,396.90,6.59
401,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83
402,2.14918,0.0,19.58,0.0,0.8710,5.709,98.5,1.6232,5.0,403.0,14.7,261.95,15.79


In [11]:
y_df

Unnamed: 0,MEDV
0,15.2
1,42.3
2,50.0
3,21.1
4,17.7
...,...
399,19.4
400,25.2
401,19.4
402,19.4


In [12]:
# Plot and see relationship

def plot_relation(X: pd.DataFrame, y: pd.DataFrame, columns):
    '''
    Plot relation between X input and y target
    
    Args:
        X (pd.DataFrame (m,n))  : Data, m examples, n features
        y (pd.DataFrame (m,1))  : target values, m values
        columns (int)             : number of desired subplot column
        
    Output
        Interation graph
    
    '''
    
    m = X.shape[1]
    rows = m // columns     # Get row
    frac = m % columns      # Get fractual
    row = 0
    col = 1

    if frac > 0:
        rows += 1
            
    fig = make_subplots(rows=rows, cols=columns)

    for i in range(m):
            
        if row >= rows:
            row = 1
            col += 1
        else:
            row += 1
        
        fig.add_trace(go.Scatter(
            x=X.iloc[:,i],
            y=y[y.columns[0]],
            mode='markers',
            name=X.columns[i],
            customdata=X.index.values,                                  # Add customdata for data's row index for more convinient to analysis
            hovertemplate="index:%{customdata} (X: %{x}, y: %{y})"
        ), row=row, col=col)

    fig.update_layout(height=400 * rows, width=600 * columns, title_text='Relationship between All features / ' + y.columns.values[0])
    fig.show()
    
plot_relation(X_df, y_df, 2)

You will get some ideas for example.
Prices tend to be higer as crime rate(CRIM) lower

# Linear regression

Formular

$ f_{w,b}(x^{(i)}) = wx^{(i)} + b $

In [13]:
# Slice to smaller dataset for simplify explanation
X_train_s = X_train[[395, 332, 317], 5]
y_train_s = y_train[[395, 332, 317]]

In [14]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(
    go.Scatter(
        x=X_train_s,
        y=y_train_s,
        mode='markers',
        name=X_df.columns[5]
    ))

# include shapes in layout
fig.update_layout(height=400, width=600, title_text="Target")
fig.show()

In [15]:
# Loop implementation
def compute_linear_regression(X, w, b):
    """
    Computes the prediction of a linear model
    Args:
      x (ndarray (m,)): Data, m examples 
      w,b (scalar)    : model parameters  
    Returns
      y (ndarray (m,)): target values
    """
    m = X.shape[0]
    f_wb = np.zeros(m)
    for i in range(m):
        f_wb[i] = w * X[i] + b
        
    return f_wb

In [29]:
# Vectorized implementation
def compute_linear_regression_v(X, w, b):
    f_wb = w * X + b
    return f_wb

compute_linear_regression_v(X=X_train_s, w=w_init , b=b_init)

array([ 7.4495, 28.796 , 51.6125])

In [16]:
w_init = 10.5
b_init = -40
f_wb = compute_linear_regression(X=X_train_s, w=w_init, b=b_init)
f_wb

array([ 7.4495, 28.796 , 51.6125])

In [17]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(
    go.Scatter(
        x=X_train_s,
        y=y_train_s,
        mode='markers',
        name=X_df.columns[5]
    ))

fig.add_trace(
    go.Line(
        x=X_train_s,
        y=f_wb,
        name='Predict'
    ))

shapes = []

m = X_train_s.shape[0]

for i in range(m):
    shapes.append(
        go.layout.Shape(
            type="line",
            x0=X_train_s[i],
            y0=f_wb[i],
            x1=X_train_s[i],
            y1=y_train_s[i],
            line=dict(
                #color=np.random.choice(colors,1)[0],
                color = 'black',
                width=1),
            opacity=1,
            layer='above'
            )
        )

# include shapes in layout
fig.update_layout(shapes=shapes)
fig.update_layout(height=400, width=600, title_text="Linear regression")
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




# Cost function

$ J(w, b) = \frac{1}{2m} \sum_{i=1}^{m} (f_{w,b}(x^{(i)}) - y^{(i)})^2 $

In [18]:
def compute_cost(X, y, w, b):
    '''
    '''
    m = X.shape[0]    

    f_wb = compute_linear_regression(X, w, b)
    
    error_sum = 0
    
    for i in range(m):
        error_sum += (f_wb[i] - y[i]) ** 2
        cost = error_sum / (2 * m)
    return cost

In [19]:
compute_cost(X_train_s, y_train_s, w=w_init, b=b_init)

1.685637083333339

Gradeint Descent

$ \{ $
    
$ w^{(i)}_j := w^{(i)}_j - \alpha \frac{\sigma}{\sigma w}J(w, b)x^{(i)}_j $

$ b^{(i)} := b^{(i)} - \alpha \frac{\sigma}{\sigma w}J(w, b) $

$ \} {stimulous update} $


$ \frac{\sigma}{\sigma w}J(w, b) = \frac{1}{m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)}) $

In [20]:
def gradient_descent(X, y, w, b, alpha, iteration):
    
    m = X.shape[0]
    
    f_wb = compute_linear_regression(X, w, b)
    
    error_sum = 0
    deri_J = 0
    iterate = []
    cost = []

    for x in range(iteration):
        for i in range(m):
            error_sum += (f_wb[i] - y[i])
        deri_J = error_sum / m
        w = w - alpha * deri_J * X[i]
        b = b - alpha * deri_J
        ccost = compute_cost(X, y, w, b)
        
        if(x % 1000 == 0): 
            print(ccost)
            iterate.append(x)
            cost.append(ccost)
        
    return w, b, iterate, cost
    

In [21]:
w, b, iterate, cost = gradient_descent(X_train_s, y_train_s, w=10, b=-40 ,alpha=0.0000000001, iteration=30000)

7.766499925856458
7.7293736627376335
7.618895459472447
7.437097751766898
7.187366578860846
6.874441583527857
6.504416012075612
6.084736714345213
5.624204143711947
5.13297235708455
4.622549014905766
4.105795381152164
3.5969263233339372
3.111510312495394
2.6664694232143886
2.280079333602667
1.9719693253059152
1.7631222835034812
1.6758746969086928
1.733916657768562
1.9622918618639755
2.3873976085096005
3.0369848005539946
3.9401579443793686
5.127375149901926
6.630448130571611
8.48254220337208
10.71817628882078
13.373222910968956
16.48490819740171


In [22]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(
    go.Scatter(
        x=iterate,
        y=cost,
        mode='lines+markers',
        name=X_df.columns[5]
    ))

# include shapes in layout
fig.update_layout(height=400, width=600, title_text="Cost Function")
fig.show()

In [23]:
print(f'w: {w} b: {b}')

w: 11.37946848075005 b: -39.84189472999998
