In [2]:
import numpy as np
import csv
import matplotlib.pyplot as plt
%matplotlib inline  
inv = np.linalg.inv
plt.style.use('ggplot')

In [3]:
# import data
with open('online_news_popularity.csv', 'r') as f:
    reader = csv.reader(f)
    a = list(reader)
    labels = a[0]
    data = np.array(a[1:])
data.shape

(39644, 61)

In [4]:
# convert the data into a convenient array
data = data[:,1:]
arrToFloat = np.vectorize(float)
data = arrToFloat(data)
shares = np.log(data[:,-1])

In [5]:
# split data into training and test data
dataRatio = .75
trainingrows = int(data.shape[0]*3/4)
trdata = data[0:trainingrows, :]
trshares = shares[0:trainingrows]
tstdata = data[trainingrows:, :]
tstshares = shares[trainingrows:]

In [109]:
# compare gradient descent to closed form solution

trdata = trdata[0:1000,:]
trshares = trshares[0:1000]
trshares.reshape(trshares.shape[0],1)

# initialize weights randomly
b = np.random.randn(1)
weights = np.random.randn(trdata.shape[1])

# define learning parameters
lrnrate = .000000000001
normconst = 1.0
iterations = 10

# norm constant matrix, Gamma^T Gamma
gamma = normconst * np.identity(trdata.shape[1])

# reduce redundant calculations
normterm = trdata.T @ trdata + gamma
onecol = np.ones((trdata.shape[0],1))
yterm = sum(trshares)

overNterm = trdata.T @ onecol @ onecol.T @ trdata / trdata.shape[0]

# calculate closed form solution
identTerm = trdata.T @ (np.eye(trdata.shape[0]) - 1.0/trdata.shape[0])
ident = np.identity(trdata.shape[0])

toinv = (trdata.T @ trdata  + gamma.T @ gamma - overNterm)

otherterm = trdata.T @ trshares - trdata.T @ onecol @ onecol.T @ trshares / trdata.shape[0]

xstar = np.linalg.solve(toinv, otherterm)

bstar = trshares.T @ onecol -  (xstar.T @ trdata.T @ onecol)/trdata.shape[0]

def dist(w,b): # weights, b, real weights, real b
    wdif = w.T - xstar.T
    bdif = b - bstar
    return (wdif.T @ wdif  + bdif**2)**.5

# run gradient descent
for i in range(iterations):
    # calculate gradient w.r.t. weights and deriv w.r.t b

    gradw = normterm @ weights + trdata.T @ (b * np.ones(trshares.shape[0]) - trshares)
    derivb = weights.T @ trdata.T @ onecol - yterm + b * trdata.shape[0]
    
    # update weights and b
    weights = weights - 2.5e-12 * gradw
    b = b - .2 * derivb
    
    # check dist
    print(dist(weights,b))
    
    error = np.dot(tstdata, weights) - tstshares
    print([1/tstdata.shape[0] * np.dot(error,error)**.5]) # calc. RMSE



[ 2005716.81019188]
[8731.6585540781616]
[  4.06182440e+08]
[1879343.1461200621]
[  8.25631275e+10]
[381989457.60615361]
[  1.67820738e+13]
[77644522955.957443]
[  3.41118398e+15]
[15782301744895.068]
[  6.93369380e+17]
[3207966752392662.0]
[  1.40936725e+20]
[6.5206272512299238e+17]
[  2.86472997e+22]
[1.3254058732924699e+20]
[  5.82295196e+24]
[2.6940670909026935e+22]
[  1.18359391e+27]
[5.476056532219192e+24]


(60, 60)
(60, 1000)


In [19]:
1/7

0.14285714285714285

In [41]:
x = np.ones(10)
x.reshape(10,1)

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.]])

In [71]:
trdata.T @ onecol

array([[  7.22633000e+05],
       [  9.97900000e+03],
       [  4.76496000e+05],
       [  5.63953934e+02],
       [  9.97999994e+02],
       [  7.09122799e+02],
       [  9.57800000e+03],
       [  3.92200000e+03],
       [  3.87700000e+03],
       [  7.40000000e+02],
       [  4.65259780e+03],
       [  7.17600000e+03],
       [  1.02000000e+02],
       [  1.48000000e+02],
       [  1.55000000e+02],
       [  6.70000000e+01],
       [  2.34000000e+02],
       [  1.50000000e+02],
       [  1.99857000e+05],
       [  1.06184900e+06],
       [  4.71064229e+05],
       [  1.02898400e+06],
       [  3.75227000e+07],
       [  1.52853184e+07],
       [  2.85083611e+05],
       [  3.49809367e+06],
       [  1.81848221e+06],
       [  1.95632800e+06],
       [  4.58849000e+06],
       [  2.78880361e+06],
       [  2.23000000e+02],
       [  1.65000000e+02],
       [  2.49000000e+02],
       [  1.72000000e+02],
       [  1.07000000e+02],
       [  3.70000000e+01],
       [  4.70000000e+01],
 