In [1]:
# Python imports
import numpy as np
import pandas as pd
import plotly.express as px

# Week 1:

Conventions:  
$x^{(i)}$  :denotes input variables  
$y^{(i)}$  :denotes output variables  
$(x^{(i)}, y^{(i)})$  :training example  

<div>
    <img src="attachment:image-2.png" width="500" align="left"/>
</div>


# Week 2:

## Linear regression cost function

Cost Function (squared error function, mean squared error). Goal is to minimize J.  
$J(θ)=(\frac{1}{2m}) \sum_{i=0}^{m}(\hat{y^{(i)}} - y^{(i)})^2$   
     $=(\frac{1}{2m}) \sum_{i=0}^{m}(h_\theta(x^{(i)}) - y^{(i)})^2$ 

where $\hat{y^{(i)}} = h_\theta(x^{(i)})= \theta^Tx = \theta_0 + \theta_1x^{(i)}$  (for linear regression)  
$h_\theta(x^{(i)})$ is the predicted (hypothesis) value and $y^{(i)}$ is the actual value.

Gradient Descent:  
$\theta_j≔ \theta_j−α \frac{d}{dθ_j} J(θ)$  
$\theta_j := \theta_j - \alpha \frac{1}{m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)}$  (simultaneously update theta_j for all j)

Temp0 := theta_(j=0)  
Temp1 := theta_(j=1)  
Theta_(j=0) = Temp0  
Theta_(j=1) = Temp1  

## Linear algebra review

In [2]:
A = np.matrix("1, 2, 3; 4, 5, 6; 7, 8, 9; 10, 11, 12")
print("A:\n", A)

test = A[1,1] + A[0,1]
print("test: ", test)

# initialize a vector
v = np.matrix("1;2;3")
print("v:\n", v)

# Get the dimension of the matrix A where m = rows and n = columns
dim_A = A.shape 
print("dim_A:", dim_A) #dim_A has format (rows, columns)

# Get the dimension of the vector v 
dim_v = v.shape
print("dim_v:", dim_v)

A:
 [[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
test:  7
v:
 [[1]
 [2]
 [3]]
dim_A: (4, 3)
dim_v: (3, 1)


## Week 2 Assignment

### 2.1 Plotting the Data

In [115]:
ex1data1DF = pd.read_csv("week1/ex1data1.txt", names=["X","y"])

print("ex1data1 info:")
ex1data1DF.info()
print("")

print("ex1data1DF preview:\n", ex1data1DF)

X=ex1data1DF["X"]
#print("X: ", X.values)
y=ex1data1DF["y"]
#print("y: ", y.values)
m=len(y)
print("m:",m)

ex1data1 info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       97 non-null     float64
 1   y       97 non-null     float64
dtypes: float64(2)
memory usage: 1.6 KB

ex1data1DF preview:
           X         y
0    6.1101  17.59200
1    5.5277   9.13020
2    8.5186  13.66200
3    7.0032  11.85400
4    5.8598   6.82330
..      ...       ...
92   5.8707   7.20290
93   5.3054   1.98690
94   8.2934   0.14454
95  13.3940   9.05510
96   5.4369   0.61705

[97 rows x 2 columns]
m: 97


In [131]:
fig = px.scatter(x=X, y=y, 
    labels=dict(x="Population of City in 10k's", y="Profit in $10k's")
)
fig.update_layout(title_text='Profit vs Population', title_x=0.5)
fig.show()

### 2.2 Gradient Descent

In [43]:
# 2.2.2 Implementation
XwithOnesCol = np.matrix([np.ones(len(X)), ex1data1DF["X"].values]).transpose()
XwithOnesCol = pd.DataFrame(XwithOnesCol)
print("XwithOnesCol:\n", XwithOnesCol[:10])

theta = pd.DataFrame(np.zeros((2,1)))
print("theta:\n", theta)

# Some gradient descent settings
iterations = 1500
alpha = 0.01

XwithOnesCol:
      0       1
0  1.0  6.1101
1  1.0  5.5277
2  1.0  8.5186
3  1.0  7.0032
4  1.0  5.8598
5  1.0  8.3829
6  1.0  7.4764
7  1.0  8.5781
8  1.0  6.4862
9  1.0  5.0546
theta:
      0
0  0.0
1  0.0


In [106]:
""" 2.2.3 Computing the cost J(theta)
J_theta cost function. assume theta_1 = theta and that theta_0 = 0"""
def computeCost(XwithOnesCol, y, theta):
    X = XwithOnesCol
    #print("X:\n", X)
    #print("y:\n", y)
    
    h_theta = X.dot(theta)
    #print("h_theta:\n", h_theta)
    
    sqDiffArray = (h_theta[0] - y)**2
    #print("sqDiffArray:\n", sqDiffArray)
    
    sqError = sum(sqDiffArray)
    #print("sqError: ", sqError)
    
    J_theta = float(1/(2*len(sqDiffArray))) * sqError
    return J_theta

def test_computeCost():
    J_theta = computeCost(XwithOnesCol, y, theta)
    print("J_theta: ", J_theta)

#test_computeCost()

In [107]:
# """
# compute the new theta (for the next iteration), starting from an intial theta of
# theta:
#  [[0.]
#  [0.]]
# """
def computeThetaStep(currTheta, alpha, X, y, m):
    #print("--- computeThetaStep ---")
    h_theta = X[:m].dot(currTheta) # hypothesis of theta
    #print("h_theta:\n", h_theta)
    #print("y:\n", y[:5])
    a = h_theta[0] - y
    #print("a:\n", a)
    #print("X:\n", X)
    b = a * X[:m].transpose()
    #print("b:\n", b)
    c = b.sum(axis=1) # do a row-wise sum
    #print("c:\n", c)
    theta = currTheta[0] - (alpha/m) * c
    #print("theta:\n", theta)
    return theta

def test_computeThetaStep():
    currTheta = pd.DataFrame(np.zeros((2,1))) # test
    #print("initialTheta:\n", initialTheta)
    newTheta = computeThetaStep(currTheta, alpha, XwithOnesCol, y, m)
    print("newTheta:\n",newTheta)
    
#test_computeThetaStep()

In [124]:
""" 2.2.4 Gradient Descent
theta = gradientDescent(...) updates theta by taking num_iters gradient steps with learning rate alpha.
"""
def gradientDescent(X, y, theta, alpha, num_iters):
    m = len(y)
    J_history = np.zeros(num_iters)
    
    for j in range(num_iters):
        # Perform a single gradient step on the parameter vector theta
        theta = computeThetaStep(theta, alpha, X, y, m)
        #print("theta: ", theta)
        J_history[j] = computeCost(X,y,theta)
        #print("J_history[j={}]: {}".format(j, J_history[j]))

    return [theta, J_history]

def test_gradientDescent():
    initialTheta = pd.DataFrame(np.zeros((2,1))) # test
    num_iters = 100
    endTheta,J_history = gradientDescent(XwithOnesCol, y, initialTheta, alpha, num_iters)
    print("endTheta:\n{}".format(endTheta))
    print("J_history all:\n{}".format(J_history[:20]))
    return endTheta

endTheta = test_gradientDescent()
endTheta_0 = endTheta.iloc[0]
endTheta_1 = endTheta.iloc[1]
print("endTheta_1:\n{}".format(endTheta_1))

endTheta:
0    0.664871
1    0.843317
dtype: float64
J_history all:
[16.62539854 20.91999115 18.07560588 18.71327826 17.9418119  17.8455578
 17.4976665  17.29745736 17.06624519 16.87891692 16.69981128 16.54191913
 16.39719861 16.2670339  16.1488885  16.04212462 15.94543992 15.85797307
 15.77880581 15.70716786]
endTheta_1:
0.8433167812939487


In [132]:
""" Replot with linreg line: y=theta_1 * x
y_predicted will be = theta_1 * x_for_prediction
"""
print("X:\n{}".format(X))

# need to construct a linspace of m elements from min value of x to max value of x
x_for_prediction = np.linspace(X.min(), X.max()+1, num=m)
print("x_for_prediction: ", x_for_prediction)

y_predicted = []
for i in range(len(x_for_prediction)):
    x_for_prediction_val = x_for_prediction[i]
    y_predicted.append(endTheta_1 * x_for_prediction_val)

print("y_predicted: ", y_predicted)


fig = px.scatter(x=X, y=y, 
    labels=dict(x="Population of City in 10k's", y="Profit in $10k's")
)
fig.update_layout(title_text='Profit vs Population (with linreg line)', title_x=0.5)
fig.show()

X:
0      6.1101
1      5.5277
2      8.5186
3      7.0032
4      5.8598
       ...   
92     5.8707
93     5.3054
94     8.2934
95    13.3940
96     5.4369
Name: X, Length: 97, dtype: float64
x_for_prediction:  [ 5.0269      5.21623437  5.40556875  5.59490313  5.7842375   5.97357188
  6.16290625  6.35224063  6.541575    6.73090938  6.92024375  7.10957813
  7.2989125   7.48824688  7.67758125  7.86691563  8.05625     8.24558437
  8.43491875  8.62425313  8.8135875   9.00292188  9.19225625  9.38159063
  9.570925    9.76025938  9.94959375 10.13892813 10.3282625  10.51759688
 10.70693125 10.89626563 11.0856     11.27493438 11.46426875 11.65360313
 11.8429375  12.03227188 12.22160625 12.41094063 12.600275   12.78960938
 12.97894375 13.16827813 13.3576125  13.54694688 13.73628125 13.92561563
 14.11495    14.30428438 14.49361875 14.68295313 14.8722875  15.06162188
 15.25095625 15.44029063 15.629625   15.81895938 16.00829375 16.19762813
 16.3869625  16.57629688 16.76563125 16.95496563 17.1443  

# Week 3

## Logistic regression cost function

$Cost(h_\theta(x),y) = \begin{cases}
-log(h_\theta(x)) &\text{if y=1} \\
-log(1-h_\theta(x)) &\text{if y=0} \\ 
\end{cases}
$

<div>
    <img src="attachment:image.png" width="500" align="left"/>
    <img src="attachment:image-2.png" width="500" align="left"/>
</div>

Alternate form that satisfies both piecewise conditions:  
$Cost(h_\theta(x),y) = -ylog(h_\theta(x)) - (1-y)log(1-h_\theta(x))$

$J(θ)=(\frac{1}{m}) \sum_{i=0}^{m} Cost(h_\theta(x^{(i)}), y^{(i)})$  
$= -(\frac{1}{m})[\sum_{i=0}^{m} y^{(i)}log(h_\theta(x^{(i)})) + (1-y^{(i)})log(1-h_\theta(x^{(i)}))]$  

J is the overall cost function. $Cost(h_\theta(x^{(i)}), y^{(i)})$ is the cost of making different predictions on different labels of y^(i).  

To fit parameters theta, need to minize J(theta) to make a prediction given new x:

$h_\theta(x) = \frac{1}{1 + \exp(-\theta^T x)}$  (for logistic regression)  

To minimize J(theta):  (alpha is learning rate)  
repeat {  
    $\theta_j := \theta_j - \alpha \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)}$  
}  

The above formula is exactly the same as for linear regression, but the difference is that h_theta, the hypothesis, is a logistic function rather than a linear function.

<div>
    <img src="attachment:image.png" width="500" align="left"/>
    <img src="attachment:image-2.png" width="300" align="left"/>
</div>


## Advanced Optimization

Optimization Algorithms:
- Gradient descent
- Conjugate gradient
- BFGS
- L-BFGS

Latter 3:
- Advantages:
    - no need to manually pick alpha
    - often faster than gradient descent
- Disadvantages:
    - more complex
    
![image.png](attachment:image.png)

In [7]:
def costFunction(theta):
    # code to compute J(theta)
    jVal = "..."
    
    # code to compute partial derivative d/(d theta_0) J(theta)
    gradient0 = "..."
    # code to compute partial derivative d/(d theta_1) J(theta)
    gradient1 = "..."
    
    return [jVal, gradient]

## The problem of overfitting

### Regularization

$J(\theta) = (\frac{1}{2m})[\sum_{i=0}^{m}(h_\theta(x^{(i)}) - y^{(i)})^2 + \lambda \sum_{j=1}^{n} \theta_j^2]$

In $\lambda \sum_{j=1}^{n} \theta_j^2$, lambda is the regularization parameter: controls a tradeoff between 2 goals
1. Fit training data well. 
2. Keep parameters small.

If lambda is too large, the model will end up underfitting. We say that the "hypothesis has too strong of a preconception".

### Regularized Linear Regression

Gradient descent for regularized linear regression.


