In [1]:
import sympy
import numpy as np
import math
from sympy import symbols, diff, simplify
from sympy import Function, Derivative

# MLS - My Notes C1 - A - ML Calculus Gradient Descent

## Basic Mathematics

In [2]:
# Plain symbols used throughout the derivations below.
x, c, a, w, n, m, b, y = symbols('x c a w n m b y')

# Undefined function symbols (same objects as Function('f'), Function('g'), ...).
f, g, F, J = symbols('f g F J', cls=Function)

$$x^1 = x$$
$$x^0 = 1$$

In [3]:
x**1

x

In [4]:
x**0

1

In [5]:
f = x**1
f

x

In [6]:
f = x**0
f

1

## Derivative Rules

$$f(x) = x^n$$
$$f'(x) = \frac {d}{dx} \left[ x^n \right] = nx^{n-1} \tag A$$ 

In [7]:
f = x**n
f

x**n

In [8]:
d_dx = diff(f,x)
simplify(d_dx) 

n*x**(n - 1)

E.g. 
$$f(x) = x^2$$
$$f'(x) = \frac{d}{dx} \left[ x^2 \right] = 2x$$

In [9]:
diff(x**2)

2*x

E.g. 
$$f(x) = x^3$$
$$f'(x) = \frac{d}{dx} \left[ x^3 \right] = 3x^2$$

In [10]:
diff(x**3)

3*x**2

In [11]:
d_dx.subs([(n,4)])

4*x**3

In [12]:
d_dx.subs([(n,8)])

8*x**7

In [13]:
diff(x**43)

43*x**42

$$f(x) = x$$
$$f'(x) = \frac {d}{dx} \left[ x \right] = 1 \tag B$$ 
Working:
$$\frac {d}{dx} \left[ x \right] = 1 \cdot x^{1-1} = 1 \cdot x^0 =1$$

In [14]:
f = x
f

x

In [15]:
d_dx = diff(f,x)
d_dx

1

In [16]:
diff(x)

1

In [17]:
d_dx.subs([(x,4)])

1

In [18]:
d_dx.subs([(x,231)])

1

$$f(x) = c$$
Where $c$ is a constant number
$$f'(x) = \frac {d}{dx} \left[ c \right] = 0 \tag C$$ 

In [19]:
f = c
f

c

In [20]:
d_dx = diff(f,x)
d_dx

0

In [21]:
diff(1)

0

In [22]:
d_dx.subs([(c,827)])

0

$$f(x) = cx$$
Where $c$ is a constant number
$$f'(x) = \frac {d}{dx} \left[ cx \right] = c \tag D$$ 
Working:
$$\frac {d}{dx} \left[ cx \right] = c \cdot 1 \cdot x^{1-1} = c \cdot 1 \cdot x^0 = c \cdot 1 \cdot 1 = c$$

In [23]:
f = c * x
f

c*x

In [24]:
d_dx = diff(f,x)
d_dx

c

In [25]:
diff(1*x)

1

In [26]:
diff(123*x)

123

In [27]:
d_dx.subs([(c,8123)])

8123

$$f(x) = c + x$$
Where $c$ is a constant number
$$f'(x) = \frac {d}{dx} \left[ c + x \right] = 1 \tag E$$ 
Working:
$$\frac {d}{dx} \left[ c + x \right ]= \frac {d}{dx} \left[ c \right ] + \frac {d}{dx} \left[ x \right ] = 0 + 1 = 1$$

In [28]:
f = c + x
f

c + x

In [29]:
d_dx = diff(f,x)
d_dx

1

In [30]:
diff(1 + x)

1

In [31]:
d_dx.subs([(c,7232)])

1

$$f(x) = a + cx$$
Where $a$ and $c$ are constant numbers
$$f'(x) = \frac {d}{dx} \left[ a + cx \right] = c \tag F$$ 
Working:
$$\frac {d}{dx} \left[ a + cx \right ]= \frac {d}{dx} \left[ a \right ] + \frac {d}{dx} \left[ cx \right ] = 0 + c = c$$

In [32]:
f = a + c*x
f

a + c*x

In [33]:
d_dx = diff(f,x)
d_dx

c

In [34]:
diff(4321 + (123 * x))

123

In [35]:
d_dx.subs([(a,12),(c,873)])

873

In [36]:
d_dx.subs([(a,12422),(c,83443)])

83443

$$f(x) = x^{-n} = \frac {1}{x^n}\tag G$$
Working
$$f(x) = x^{-2} = \frac {1}{x^2}$$

In [37]:
f = x**-n
f

x**(-n)

In [38]:
f.subs([(n,763)])

x**(-763)

In [39]:
x**-1

1/x

In [40]:
x**-2

x**(-2)

In [41]:
x**-45

x**(-45)

$$f(x) = x^{-n} = \frac {1}{x^n}$$
$$f'(x) = \frac {d}{dx} \left[ x^{-n} \right] = -nx^{-n-1} = - \frac {n}{x^{n+1}} \tag H$$ 
Working:
$$f'(x) = \frac {d}{dx} \left[ x^{-1} \right] = -1 \cdot x^{-1-1} = -1 \cdot x^{-2} = -1 \cdot \frac {1}{x^2} = - \frac {1}{x^2}$$
$$f'(x) = \frac {d}{dx} \left[ x^{-2} \right] = -2 \cdot x^{-2-1} = -2 \cdot x^{-3} = -2 \cdot \frac {1}{x^3} = - \frac {2}{x^3}$$

In [42]:
diff(x**-1)

-1/x**2

In [43]:
diff(x**-2)

-2/x**3

In [44]:
diff(x**-3)

-3/x**4

In [45]:
diff(x**-42)

-42/x**43

In [46]:
f = x**-n
f

x**(-n)

In [47]:
d_dx = diff(f,x)
simplify(d_dx)

-n*x**(-n - 1)

In [48]:
d_dx.subs([(n,4)])

-4/x**5

## Chain Rule

We have two functions, $f(x)$ and $g(x)$. Their composition (a function of a function) is:
$$$$
$$F(x) = f(g(x))$$
Then
$$F'(x) = \frac {d}{dx} \left[ f(g(x)) \right] = f'(g(x)) \cdot g'(x)$$
This can also be explained as:
$$y = f(u)$$
$$u = g(x)$$
$$$$
$$ y' = \frac {dy}{du}$$
$$u'=\frac {du}{dx}$$ 
$$$$
$$\frac {dy}{dx} = \frac {dy}{du} \cdot \frac {du}{dx}$$

Another rule of thumb for **Chain Rule** : 
$$$$
$\frac {dy}{dx} = \left ( \text { derivatives of outside function } \right ) \cdot \left ( \text { derivatives of inside function } \right )$

In [49]:
f = (3*x + 1) ** 7
f

(3*x + 1)**7

In [50]:
d_dx = diff(f,x)
d_dx

21*(3*x + 1)**6

$$f(x) = (3x + 1)^7$$
$$f'(x) = \frac {d}{dx} \left[  (3x + 1)^7 \right ] = 7(3x + 1)^6 \cdot (3 + 0)$$
$$f'(x) = \frac {d}{dx} \left[  (3x + 1)^7 \right ] = 7(3x + 1)^6 \cdot 3 = 21(3x + 1)^6$$

Another Example:

In [51]:
f = (x**2 + 1) ** 5
f

(x**2 + 1)**5

In [52]:
d_dx = diff(f,x)
d_dx

10*x*(x**2 + 1)**4

$$f(x) = (x^2 + 1)^5$$
$$f'(x) = \frac {d}{dx} \left[  (x^2 + 1)^5 \right ] = 5(x^2 + 1)^4 \cdot (2x + 0)$$
$$f'(x) = \frac {d}{dx} \left[  (x^2 + 1)^5 \right ] = 5(x^2 + 1)^4 \cdot 2x = 10x(x^2 + 1)^4$$

## Calculus of Linear Regression

**Linear Regression**
$$f_{w,b}(x) = b + wx$$

**Cost Function**
$$J(w,b) = \frac {1}{2m} \sum \limits_{i=1}^m \left( f_{w,b}(x^{(i)}) - y^{(i)} \right)^2$$

### Gradient Descent Algorithm

**Gradient Descent Algorithm** for $f_{w,b}(x) = b + wx$
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b_j & = b_j - \alpha \frac {d} {db_j} J(w_j,b_j)\\
    w_j & = w_j - \alpha \frac {d} {dw_j} J(w_j,b_j)
\end{aligned}
\end{equation}
}

### Gradient Descent Algorithm (Simplified)

**Assume**
- one example, $m=1$

Then
$$f_{w}(x) = b + wx$$
$$$$
**Gradient Descent Algorithm** for $f_{w}(x) = b + wx$
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    w_j & = w_j - \alpha \frac {d} {dw_j} J(w_j,b_j)
\end{aligned}
\end{equation}
}

### Partial Derivative with respect to $w$

$$f_{w}(x) = b + wx$$
Cost Function:
$$J(w) = \frac {1}{2m} \left( f_{w}(x) - y \right)^2$$
$$J(w) = \frac {1}{2m} \left( (b + wx) - y \right)^2$$

$$J'(w) = f'(g(w)) \cdot g'(w)$$
$$g(w) = ((b + wx) - y)$$
$$f(w) = \frac {1}{2m} (g(w))^2$$

$$f(g(w)) = \frac {1}{2m} (g(w))^2$$
$$f'(g(w)) = \frac {1}{2m} \cdot 2 \cdot (g(w))^1$$
$$f'(g(w)) = \frac {1}{m}  \cdot (g(w))$$
$$f'(g(w)) = \frac {1}{m}  \cdot ((b + wx) -y)$$

In [53]:
gw = symbols('gw')

In [54]:
g = (1/(2*m))*gw**2
g

gw**2/(2*m)

In [55]:
dg = diff(g,gw)
dg

gw/m

In [56]:
gw = (b + w*x) -y
gw

b + w*x - y

$$g(w) = ((b + wx) - y)$$
$$g'(w) =  \frac{d}{dw} \left [ b \right ] + \frac{d}{dw} \left [ wx \right ] - \frac{d}{dw} \left [ y \right ]$$

In [57]:
f = b
diff(f,w)

0

In [58]:
f = (w * x)

In [59]:
d_dw = diff(f,w)
d_dw

x

In [60]:
f = y
d_dwy = diff(f,w)
d_dwy

0

In [61]:
f = (b + (w*x)) - y

In [62]:
d_dw = diff(f,w)
d_dw

x

$$g(w) = ((b+wx) - y)$$
$$g'(w) =  \frac{d}{dw} \left [ b \right ] + \frac{d}{dw} \left [ wx \right ] - \frac{d}{dw} \left [ y \right ]$$
$$g'(w) =  0 + x - 0$$
$$g'(w) =  x$$

$$J'(w) = f'(g(w)) \cdot g'(w)$$
$$f'(g(w)) = \frac {1}{m} \cdot ((b+wx) -y)$$
$$g'(w) =  x$$
$$J'(w) = \frac {1}{m} \cdot ((b+ wx) -y) \cdot x$$

In [63]:
gw = (b + (w*x)) - y
gw

b + w*x - y

In [64]:
J = (1/(2*m))*gw**2
J

(b + w*x - y)**2/(2*m)

In [65]:
d_dw = diff(J,w)
d_dw

x*(b + w*x - y)/m

### Applying Gradient Descent

Apply gradient descent for one example:
$$$$
$$\frac {d} {dw} J(w) = \frac {1}{m}  \cdot ((b + wx) -y) \cdot x$$
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    w & = w - \alpha \frac {d} {dw} J(w)
\end{aligned}
\end{equation}
}
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    w & = w - \alpha  \cdot \frac {1}{m} \cdot ((b + wx) -y) \cdot x
\end{aligned}
\end{equation}
}

With more examples:
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    w & = w - \alpha \sum \limits_{i=1}^{m} \frac {1}{m}  \cdot (b + wx^{(i)} -y^{(i)}) \cdot x^{(i)}
\end{aligned}
\end{equation}
} 

With more weights $j$:
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    w_j & = w_j - \alpha \sum \limits_{i=1}^{m} \frac {1}{m}  \cdot ((b_j + w_jx^{(i)}) -y^{(i)}) \cdot x^{(i)}
\end{aligned}
\end{equation}
} 

### Partial Derivative with respect to $b$

**Assume**
- w = 1
- one example, $m=1$

Then
$$f_{w}(x) = b + wx$$
$$$$
**Gradient Descent Algorithm** for $f_{w}(x) = b + wx$
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b_j & = b_j - \alpha \frac {d} {db_j} J(w_j,b_j)
\end{aligned}
\end{equation}
}

$$f_{b}(x) = b + wx$$
Cost Function:
$$J(b) = \frac {1}{2m} \left( f_{w}(x) - y \right)^2$$
$$J(b) = \frac {1}{2m} \left( (b + wx) - y \right)^2$$

$$J'(b) = f'(g(b)) \cdot g'(b)$$
$$g(b) = ((b + wx) - y)$$
$$f(b) = \frac {1}{2m} (g(b))^2$$

$$f(g(b)) = \frac {1}{2m} (g(b))^2$$
$$f'(g(b)) = \frac {1}{2m} \cdot 2 \cdot (g(b))^1$$
$$f'(g(b)) = \frac {1}{m}  \cdot (g(b))$$
$$f'(g(b)) = \frac {1}{m}  \cdot ((b + wx) -y)$$

$$g(b) = ((b+wx) - y)$$
$$g'(b) =  \frac{d}{db} \left [ b + wx \right ] - \frac{d}{db} \left [ y \right ]$$
$$g'(b) =  \frac{d}{db} \left [ b \right ] + \frac{d}{db} \left [wx \right ] - \frac{d}{db} \left [ y \right ]$$
$$g'(b) =  1 + 0 - 0$$
$$g'(b) =  \frac{d}{db} \left [ b \right ]$$
$$g'(b) =  1$$

In [66]:
# Placeholder symbol for g(b); the name `gb` is rebound to the full expression
# b + w*x - y in a later cell before this symbol is used anywhere.
gb = symbols('gb')

In [67]:
f = b

In [68]:
diff(f,b)

1

In [69]:
f = w*x
diff(f,b)

0

In [70]:
f = y
diff(f,b)

0

In [71]:
gb = b + (w*x) -y

In [72]:
d_db = diff(gb, b)
d_db

1

$$J'(b) = f'(g(b)) \cdot g'(b)$$
$$f'(g(b)) = \frac {1}{m} \cdot ((b + wx) -y)$$
$$g'(b) =  1$$
$$J'(b) = \frac {1}{m} \cdot ((b + wx) -y) \cdot 1$$

### Applying Gradient Descent


Apply gradient descent for one example:
$$$$
$$\frac {d} {db} J(b) = \frac {1}{m}  \cdot ((b + wx) -y) \cdot 1$$
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b & = b - \alpha \frac {d} {db} J(b)
\end{aligned}
\end{equation}
}
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b & = b - \alpha  \cdot \frac {1}{m} \cdot ((b + wx) -y) \cdot 1
\end{aligned}
\end{equation}
}

With more examples:
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b & = b - \alpha \sum \limits_{i=1}^{m} \frac {1}{m}  \cdot ((b + wx^{(i)}) -y^{(i)})
\end{aligned}
\end{equation}
} 

With more weights $j$:
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b_j & = b_j - \alpha \sum \limits_{i=1}^{m} \frac {1}{m}  \cdot ((b_j + w_jx^{(i)}) -y^{(i)})
\end{aligned}
\end{equation}
} 

### Full Gradient Descent

**Gradient Descent Algorithm** for $f_{w,b}(x) = b + wx$
$$$$
**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b_j & = b_j - \alpha \frac {d} {db_j} J(w_j,b_j)\\
    w_j & = w_j - \alpha \frac {d} {dw_j} J(w_j,b_j)
\end{aligned}
\end{equation}
}

**Repeat Until Convergence** {
\begin{equation}
\begin{aligned}
    b_j & = b_j - \alpha \sum \limits_{i=1}^{m} \frac {1}{m}  \cdot ((b_j + w_jx^{(i)}) -y^{(i)})\\
    w_j & = w_j - \alpha \sum \limits_{i=1}^{m} \frac {1}{m}  \cdot ((b_j + w_jx^{(i)}) -y^{(i)}) \cdot x^{(i)}
\end{aligned}
\end{equation}
}

In [73]:
f = b + (w * x)

In [74]:
F = (1/(2*m)) * (f - y)**2

In [75]:
d_dw = diff(F,w)
simplify(d_dw)

x*(b + w*x - y)/m

In [76]:
d_db = diff(F,b)
simplify(d_db)

(b + w*x - y)/m

## Apply Normal Equation

**Cost Function**
$$J(w,b) = \frac {1}{2m} \sum \limits_{i=1}^m \left( f_{w,b}(x^{(i)}) - y^{(i)} \right)^2$$

**Derivative in Respect to Intercept**
$$J'(b) = \frac {1}{m} \cdot ((b + wx) -y) \cdot 1$$
$$J'(b) = \frac {1}{m} \cdot ((b + wx) -y)$$
$$0 = \frac {1}{m} \cdot ((b + wx) -y)$$
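$$0 = \frac {(b + wx)}{m} - \frac {y}{m}$$
$$\frac {(b + wx)}{m} = \frac {y}{m}$$
$$b = y - wx$$
Summing over all $m$ examples, $\sum \limits_{i=1}^m (b + wx^{(i)} - y^{(i)}) = 0$ gives $mb = \sum{y} - w\sum{x}$, so:
$$intercept = \frac {\sum {y}}{m} - w\frac {\sum{x}}{m}$$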
$$$$
$$intercept = \text{(mean of y) } - \text{slope } \cdot \text{(mean of x)}$$

In [78]:
f = b + (w * x)
f

b + w*x

In [79]:
F = (1/(2*m)) * (f - y)**2
F

(b + w*x - y)**2/(2*m)

In [80]:
d_db = diff(F,b)
simplify(d_db)

(b + w*x - y)/m

In [81]:
def intercept(x, y, slope):
    """Return the intercept of a line as ``mean(y) - slope * mean(x)``.

    Parameters
    ----------
    x, y : array-like
        Feature and target values. Any sequence accepted by ``np.mean``
        works (lists, tuples, NumPy arrays), not only ndarrays.
    slope : float
        Slope ``w`` of the line.

    Returns
    -------
    NumPy float
        The intercept ``b``.
    """
    # np.mean, unlike the ndarray-only .mean() method, also accepts plain
    # Python sequences, so callers need not convert their data first.
    return np.mean(y) - (slope * np.mean(x))

In [82]:
# NOTE: this rebinds `x` and `y` from the SymPy symbols created at the top of
# the notebook to NumPy arrays. Symbolic cells re-run after this point will
# see arrays, not symbols, unless the symbols are created again.
# Here y equals x element-wise.
x = np.array([1,2,3,4,5])
y = np.array([1,2,3,4,5])

In [83]:
x.mean()

3.0

In [84]:
y.mean()

3.0

In [85]:
intercept(x,y,1)

0.0

In [86]:
x = np.array([1,2,3,4,5])
y = np.array([3,4,5,6,7])

In [87]:
intercept(x,y,1)

2.0

In [88]:
x = np.array([1,2,3,4,5])
y = np.array([101,102,103,104,105])

In [89]:
intercept(x,y,1)

100.0

$$J'(w) = \frac {1}{m} \cdot ((b+ wx) -y) \cdot x$$

$$J'(w) = \frac {1}{m} \cdot ((b+ wx) -y) \cdot x$$
$$0 = \frac {1}{m} \cdot ((b+ wx) -y) \cdot x$$
$$0 = \frac {(b + wx)}{m} \cdot x - \frac {y}{m} \cdot x$$
$$0 = \frac {x(b + wx)}{m} - \frac {xy}{m}$$
$$\frac {x(b + wx)}{m} = \frac {xy}{m}$$
$$\frac {bx + wx^2}{m} = \frac {xy}{m}$$

$$\frac {wx^2}{m} = \frac {xy-bx}{m}$$


$$\frac {wx^2}{m} = \frac {xy-bx}{m}$$
$$\frac {wx^2}{m} = \frac {xy}{m} - \frac {bx}{m}$$
$$\frac {wx^2}{m} = \frac {xy}{m} - b \frac {x}{m}$$
$$b = \overline{y}-w\overline{x}$$

$$\frac {wx^2}{m} = \frac {xy}{m} - (\overline{y}-w\overline{x}) \cdot \frac {x}{m}$$

$$\frac {wx^2}{1} = \frac {xy}{1} - \frac {y-wx}{1} \cdot \frac {x}{1}$$
$$\frac {wx^2}{1} = \frac {xy}{1} - \frac {xy-wx^2}{1} $$
$$\frac {w}{1} = \frac {xy}{x^2} - \frac {xy-wx^2}{x^2} $$
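$$\frac {w}{1} = \frac {xy-xy+wx^2}{x^2} $$
With a single example ($m=1$, so $\overline{x}=x$ and $\overline{y}=y$) this collapses to $w = w$: one point cannot determine the slope. Keeping the sums over all $m$ examples, $\sum x(b + wx - y) = 0$ with $b = \overline{y}-w\overline{x}$ gives
$$w = \frac {\sum{xy} - m\overline{x}\,\overline{y}}{\sum{x^2} - m\overline{x}^2}$$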

In [None]:
x = np.array([1,2,3,4,5])
y = np.array([100,200,300,400,500])