In [14]:
X = 2 * rand(100, 1)
X_b = hcat(ones(100), X)

100×2 Array{Float64,2}:
 1.0  1.85474  
 1.0  0.299743 
 1.0  1.11612  
 1.0  0.741247 
 1.0  0.952258 
 1.0  0.6      
 1.0  0.231138 
 1.0  0.384389 
 1.0  0.431405 
 1.0  0.200832 
 1.0  1.79621  
 1.0  1.68678  
 1.0  1.09458  
 ⋮             
 1.0  0.107971 
 1.0  0.406447 
 1.0  0.691274 
 1.0  1.0265   
 1.0  1.89014  
 1.0  1.94844  
 1.0  1.84592  
 1.0  1.12895  
 1.0  0.0611248
 1.0  0.738629 
 1.0  1.86552  
 1.0  1.55379  

In [15]:
y = 4 + 3 * X + randn(100, 1)

100×1 Array{Float64,2}:
 11.0973 
  4.3729 
  7.84691
  5.31347
  6.2205 
  5.98568
  5.91409
  4.11933
  4.5978 
  4.72279
  9.51808
  7.79841
  8.0641 
  ⋮      
  4.0139 
  4.85471
  5.49445
  7.39412
  9.73597
  8.65986
  9.93085
  7.91649
  4.04658
  6.20148
  9.47201
  9.28205

In [45]:
function batch_gradient_descent(X, y, learning_rate=0.1, n_iterations=1000, m=100)
    theta = randn(2, 1)
    
    for iteration in 1:n_iterations
        gradients = (2/m) .* (X' * (X * theta - y))
        theta -= learning_rate .* gradients
    end
    
    return theta
    
end

batch_gradient_descent (generic function with 6 methods)

In [46]:
methods(batch_gradient_descent)

In [59]:
@time batch_gradient_descent(X_b, y, 0.0001)

  0.001588 seconds (6.00 k allocations: 2.075 MiB)


2×1 Array{Float64,2}:
 2.3947  
 0.240146

In [56]:
@time batch_gradient_descent(X_b, y, 0.02)

  0.001416 seconds (6.08 k allocations: 2.081 MiB)


2×1 Array{Float64,2}:
 3.81355
 3.10539

In [57]:
@time batch_gradient_descent(X_b, y, 0.1)

  0.001718 seconds (6.00 k allocations: 2.075 MiB)


2×1 Array{Float64,2}:
 3.81528
 3.10382

In [58]:
@time batch_gradient_descent(X_b, y, 0.5)

  0.001317 seconds (6.00 k allocations: 2.075 MiB)


2×1 Array{Float64,2}:
 -2.9726e10 
 -3.28898e10

In [66]:
function simple_schedule(t, t0=5, t1=50)
    return t0/(t+t1)
end

simple_schedule (generic function with 3 methods)

In [127]:
function SGD(X, y, learning_schedule=simple_schedule, n_epochs=50, m=100)
    theta = randn(2, 1)
    
    for epoch in 1:n_epochs
        for i in 1:m
            random_index = rand(1:m)
            x_i = X[random_index:random_index, :]
            y_i = y[random_index, :]
            
            gradients = (2/m) .* (x_i' * (x_i * theta - y_i))
            
            learning_rate = learning_schedule(epoch * m + i)
            
            theta -= learning_rate .* gradients
        end
    end

    return theta
end

SGD (generic function with 4 methods)

In [128]:
methods(SGD)

In [139]:
@time SGD(X_b, y)

  0.005432 seconds (55.01 k allocations: 3.968 MiB)


2×1 Array{Float64,2}:
 1.65373
 1.48308

In [156]:
theta_path_mgd = []

n_iterations = 50
minibatch_size = 20

theta = randn(2,1)  # random initialization

t0, t1 = 200, 1000

t, m = 0, 100

for epoch in 1:n_iterations
    shuffled_indices = randperm(m)
    
    X_b_shuffled = X_b[shuffled_indices, :]
    y_shuffled = y[shuffled_indices, :]
    
    for i in 1:minibatch_size:m
        t += 1
        
        xi = X_b_shuffled[i:i+minibatch_size-1, :]
        yi = y_shuffled[i:i+minibatch_size-1, :]
        
        gradients = (2/minibatch_size) .* (xi' * (xi * theta - yi))
        
        eta = simple_schedule(t, t0, t1)
        
        theta -= eta * gradients
        push!(theta_path_mgd, theta)
    end
end        

In [157]:
theta_path_mgd

250-element Array{Any,1}:
 [2.54918; 2.35904]
 [3.43309; 3.18573]
 [3.67141; 3.36518]
 [3.52323; 3.18562]
 [3.58384; 3.20245]
 [3.65825; 3.2555] 
 [3.54647; 3.15845]
 [3.69373; 3.36666]
 [3.64436; 3.23186]
 [3.71939; 3.26405]
 [3.60781; 3.1049] 
 [3.75187; 3.21616]
 [3.72784; 3.19755]
 ⋮                 
 [3.86762; 3.18434]
 [3.88339; 3.17573]
 [3.79371; 3.11992]
 [3.7993; 3.08705] 
 [3.75894; 3.05476]
 [3.81746; 3.11527]
 [3.88121; 3.17005]
 [3.76567; 3.03846]
 [3.90441; 3.12017]
 [3.85792; 3.09789]
 [3.7955; 3.09836] 
 [3.83998; 3.13671]