In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns   # Why sns?  It's a reference to The West Wing
import matplotlib.pyplot as plt  # seaborn is based on matplotlib
sns.set(color_codes=True) # adds a nice background to the graphs
%matplotlib inline

In [2]:
# Read the regression data set and pull out the two predictors and the response.
df = pd.read_csv('topic3_regression_data.csv')
x1, x2, y = df['x1'], df['x2'], df['y']
ndata = len(df)  # number of rows (observations)
print(ndata)

100


In [3]:
# Closed-form least-squares fit of y on x1 and x2; used as the reference
# solution that the iterative optimisers are compared against.
reg = LinearRegression()
reg.fit(df[['x1', 'x2']], df['y'])

# Pack the estimates as [intercept, coefficient on x1, coefficient on x2].
beta_ols = np.append(reg.intercept_, reg.coef_)
print(beta_ols)

[ 1.15736763 -0.48388328  0.85474407]


# How to calculate the gradient


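With the linear model $ \hat{y}_i = \beta_0 + \beta_1 x_{1i} + \beta_2 x_{2i} $, the objective is the mean squared error:

$ obj = \frac{1}{n} \sum_{i=1}^n (\hat{y}_i - y_i)^2 $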

$ \frac{\partial obj}{\partial \beta_0} = \frac{1}{n} \sum_{i=1}^n 2(\hat{y}_i - y_i)$

$ \frac{\partial obj}{\partial \beta_1} = \frac{1}{n} \sum_{i=1}^n 2(\hat{y}_i - y_i)x_{1i}$

$ \frac{\partial obj}{\partial \beta_2} = \frac{1}{n} \sum_{i=1}^n 2(\hat{y}_i - y_i)x_{2i}$

$ \nabla obj = \left( \frac{\partial obj}{\partial \beta_0}, \frac{\partial obj}{\partial \beta_1}, \frac{\partial obj}{\partial \beta_2}\right) $

# SGD with Momentum

In [4]:
# Optimiser settings used by the training loops in this notebook.
learnRate = 0.001                 # step size applied to every parameter update
epochs, batches = 200, 10         # passes over the data; mini-batches per pass
dat_per_bat = ndata // batches    # rows per mini-batch (integer division)
theta1 = 0.9                      # decay rate of the gradient moving average

In [5]:
# SGD with momentum
#
# Mini-batch gradient descent on the mean-squared-error objective.  Each
# update follows a bias-corrected exponential moving average of the gradient
# instead of the raw mini-batch gradient.
# Reads learnRate, epochs, batches, dat_per_bat and theta1 from the
# hyper-parameter cell, and x1, x2, y and ndata from the data cell.

# A dedicated, seeded generator makes the shuffles -- and therefore the
# numbers printed at the end of this cell -- reproducible under
# Restart & Run All, without touching NumPy's global random state.
rng = np.random.default_rng(0)

# Plain arrays, so the shuffled positions select rows by position.  Indexing
# a pandas Series with an integer array looks rows up by index *label*, which
# only coincides with position while the frame has a default RangeIndex.
x1_np, x2_np, y_np = np.asarray(x1), np.asarray(x2), np.asarray(y)

SGDMOMError = np.zeros(epochs)  # full-data MSE recorded after every epoch
beta = np.zeros(3)              # [intercept, coefficient on x1, coefficient on x2]
k = 1                           # 1-based update counter used by the bias correction
m = np.zeros(3)                 # exponential moving average of the gradient
for ep in range(epochs):
    shuf = rng.permutation(ndata)  # fresh random ordering of the rows each epoch
    for bat in range(batches):
        # Rows of this mini-batch.  If ndata is not a multiple of batches the
        # last ndata % batches positions of shuf are not used in this epoch.
        this_bat = shuf[(bat*dat_per_bat):((bat+1)*dat_per_bat)]
        yhat = beta[0] + beta[1]*x1_np[this_bat] + beta[2]*x2_np[this_bat]
        resid = yhat - y_np[this_bat]

        # Partial derivatives of the mini-batch MSE w.r.t. beta_0, beta_1, beta_2.
        grad0 = 2.0*np.mean(resid)
        grad1 = 2.0*np.mean(resid*x1_np[this_bat])
        grad2 = 2.0*np.mean(resid*x2_np[this_bat])
        grad = np.array([grad0,grad1,grad2])

        # m starts at zero, so early averages are shrunk towards zero;
        # dividing by (1 - theta1**k) undoes that shrinkage.
        m = theta1*m + (1-theta1)*grad
        mhat = m/(1-theta1**k)

        beta -= learnRate*mhat
        k += 1

    # Track the objective on the full data set once per epoch.
    yhat = beta[0] + beta[1]*x1 + beta[2]*x2
    SGDMOMError[ep] = np.mean((yhat-y)**2)

print(SGDMOMError[epochs-1])
print(beta)
print(beta_ols)

0.35044887441848593
[ 1.1422902  -0.4851039   0.84476918]
[ 1.15736763 -0.48388328  0.85474407]


# ADAM

In [6]:
# Adam
#
# Mini-batch gradient descent on the mean-squared-error objective where each
# parameter's step is the bias-corrected moving average of its gradient
# divided by the square root of the bias-corrected moving average of its
# squared gradient.
# Reads learnRate, epochs, batches, dat_per_bat and theta1 from the
# hyper-parameter cell, and x1, x2, y and ndata from the data cell.

theta2 = 0.999  # decay rate of the squared-gradient moving average
e=1e-12         # added to the denominator so the update never divides by zero

# A dedicated, seeded generator makes the shuffles -- and therefore the
# numbers printed at the end of this cell -- reproducible under
# Restart & Run All, without touching NumPy's global random state.
rng = np.random.default_rng(0)

# Plain arrays, so the shuffled positions select rows by position.  Indexing
# a pandas Series with an integer array looks rows up by index *label*, which
# only coincides with position while the frame has a default RangeIndex.
x1_np, x2_np, y_np = np.asarray(x1), np.asarray(x2), np.asarray(y)

ADAMError = np.zeros(epochs)  # full-data MSE recorded after every epoch
beta = np.zeros(3)            # [intercept, coefficient on x1, coefficient on x2]
k = 1                         # 1-based update counter used by the bias corrections
m = np.zeros(3)               # exponential moving average of the gradient
v = np.zeros(3)               # exponential moving average of the squared gradient
for ep in range(epochs):
    shuf = rng.permutation(ndata)  # fresh random ordering of the rows each epoch
    for bat in range(batches):
        # Rows of this mini-batch.  If ndata is not a multiple of batches the
        # last ndata % batches positions of shuf are not used in this epoch.
        this_bat = shuf[(bat*dat_per_bat):((bat+1)*dat_per_bat)]
        yhat = beta[0] + beta[1]*x1_np[this_bat] + beta[2]*x2_np[this_bat]
        resid = yhat - y_np[this_bat]

        # Partial derivatives of the mini-batch MSE w.r.t. beta_0, beta_1, beta_2.
        grad0 = 2.0*np.mean(resid)
        grad1 = 2.0*np.mean(resid*x1_np[this_bat])
        grad2 = 2.0*np.mean(resid*x2_np[this_bat])
        grad = np.array([grad0,grad1,grad2])

        # Both averages start at zero, so early values are shrunk towards
        # zero; dividing by (1 - theta**k) undoes that shrinkage.
        m = theta1*m + (1-theta1)*grad
        mhat = m/(1-theta1**k)

        v = theta2*v + (1-theta2)*(grad**2)
        vhat = v/(1-theta2**k)

        beta -= learnRate*mhat/(np.sqrt(vhat)+e)
        k += 1

    # Track the objective on the full data set once per epoch.
    yhat = beta[0] + beta[1]*x1 + beta[2]*x2
    ADAMError[ep] = np.mean((yhat-y)**2)

# Index by epochs-1, as the momentum cell does, instead of relying on the
# loop variable that leaks out of the for statement.
print(ADAMError[epochs-1])
print(beta)
print(beta_ols)

0.3560130485551429
[ 1.08463773 -0.49107559  0.83351088]
[ 1.15736763 -0.48388328  0.85474407]
