# Homework 3
## Minjie Fan, 998585352

## Exercise 1

In [390]:
# only for version 0.4.0
using Toms566

### Define Backtracking Function

In [454]:
function backtrack( obj, grd, x, d, rho=0.9, alpha=1.0, c=1e-4 )
    # Find step size by backtracking
    gxp = (grd(x)'*d)[1]
    while obj(x+alpha*d)>obj(x)+c*alpha*gxp
        alpha *= rho
    end
    return alpha
end

backtrack (generic function with 4 methods)

### Define the Function of Newton's Method

In [475]:
# Use the template provided
function newtmin( obj, grd, hes, x, maxIts=100, optTol=1e-6, modify=1 )
    # the sqrt of the machine precision
    delta = 1e-8
    n = length(x)
    t = NaN
    d = NaN
    for t = 1:maxIts
        f = obj(x)
        g = grd(x)
        H = hes(x)
        # check condition for breaking
        if norm(g)<optTol
            break
        end
        # try chol decomp first
        try
            F = chol(H)
            d = -R\(R'\g)
        # o.w., use eigenvalue modification
        catch
            F = eigfact(H)
            # method 1
            if modify==1
                Lambda_inv = 1./max(abs(F[:values]), delta)
            # method 2
            else
                Lambda_inv = 1./max(F[:values], delta)
            end
            d = -F[:vectors]*Diagonal(Lambda_inv)*F[:vectors]'*g
        end
        # find alpha
        alpha = backtrack( obj, grd, x, d )
        # update x
        x = x+alpha*d
    end
    return (x, t-1)
end

newtmin (generic function with 4 methods)

### Define the Function of Quasi-Newton's Method (BFGS)

In [511]:
# Use the template provided
function newtminBFGS( obj, grd, hes, x, maxIts=100, optTol=1e-6 )
    # Minimize a function f using Newton’s method.
    n = length(x)
    # the init value of H
    H_approx = eye(n)
    t = NaN
    f_pre = NaN
    for t = 1:maxIts
        f = obj(x)
        g = grd(x)
        H = hes(x)
        # check two conditions for breaking
        if norm(g)<optTol || (t>1 && abs((f_pre-f)/f_pre)<optTol)
            break
        end
        # get dir
        d = -H_approx*g
        # find alpha
        alpha = backtrack( obj, grd, x, d )
        # update x
        x_new = x+alpha*d
        # get s
        s = x_new-x
        # get y
        y = grd(x_new)-g
        rho_k = 1/(y'*s)[1]
        # update H
        H_approx = (eye(n)-rho_k*s*y')*H_approx*(eye(n)-rho_k*y*s')+rho_k*s*s'
        
        # update x and f_pre
        x = x_new
        f_pre = f
    end
    return (x, t-1)
end

newtminBFGS (generic function with 4 methods)

## Naive Test

In [460]:
myobj(x) = (x^2)[1]
mygrd(x) = 2.0*x
myhes(x) = 2.0

myhes (generic function with 1 method)

In [461]:
res = newtmin( myobj, mygrd, myhes, 10.0 )

(
1x1 Array{Float64,2}:
 0.0,

1)

In [462]:
res2 = newtminBFGS( myobj, mygrd, myhes, 10.0)

(
1x1 Array{Float64,2}:
 0.0,

2)

For this naive example, the Netwon's method converges to the true minimum in one step, while the Quasi-Newton's method (BFGS) converges to the true minimum in two steps.

## Test My Newton Function using Toms566

In [493]:
function test_Newton()
    @printf("%3s %12s %12s %6s %6s %10s\n",
    "No.", "f(x*)", "|∇f(x*)|", "Modify", "Iters", "Converged")
    modify = ones(18)
    modify[1] = 2
    for i = 1:18
        p = Problem(i);
        x = p.x0
        res = newtmin( p.obj, p.grd, p.hes, x, 1e3, 1e-8, modify[i] )
        x_star = res[1]
        norm_grd = norm(p.grd(x_star))
        if norm_grd<1e-8
            conv = "true"
        else
            conv = "false"
        end
        @printf("%3s %12e %12e %6i %6i %10s\n",
        i, p.obj(x_star), norm_grd, modify[i], res[2], conv)
    end
end

test_Newton (generic function with 1 method)

In [494]:
test_Newton()

No.        f(x*)     |∇f(x*)| Modify  Iters  Converged
  1 7.141922e-34 1.770799e-16      2     14       true
  2 1.669195e-03 3.070969e-03      1    999      false
  3 1.127933e-08 9.701954e-11      1      2       true
  4 7.694100e-31 1.754301e-15      1     85       true
  5 1.307394e-21 2.709262e-11      1     26       true
  6 1.621755e+01 2.059218e+01      1    999      false
  7 4.320501e-04 2.807938e-02      1    999      false
  8 5.250351e-04 8.434708e-11      1     20       true
  9 8.803193e+01 6.736629e-03      1    999      false
 10 1.262177e-29 7.105427e-09      1      8       true
 11 8.582220e+04 2.220736e-08      1    999      false
 12 5.742068e-29 8.715036e-14      1     14       true
 13 5.068991e-17 9.961953e-09      1    386       true
 14 5.963014e+01 1.283421e+01      1    999      false
 15 4.610843e-03 8.055350e-02      1    999      false
 16 2.959860e-19 9.647698e-10      1      8       true
 17 1.778157e-24 1.076259e-11      1     38       true
 18 6.6490

From the above table, we can see that the Newton's method (with eigenvalue modification) converges successfully for problems 1, 3, 4, 5, 8, 10, 12, 13, 16, 17.

## Test My BFGS Function using Toms566

In [489]:
using Optim

In [505]:
function test_BFGS()
    @printf("%3s %12s %12s %6s %6s %10s\n",
    "No.", "f(x*)", "Benchmark", "|∇f(x*)|", "Iters", "Converged")
    for i = 1:18
        p = Problem(i);
        x = p.x0
        res = newtminBFGS( p.obj, p.grd, p.hes, x, 1e3, 1e-8 )
        res_bench = optimize( p.obj, p.grd!, p.hes!, x, method = :bfgs )
        x_star = res[1]
        steps = res[2]
        if steps<999
            conv = "true"
        else
            conv = "false"
        end
        @printf("%3s %12e %12e %6e %6i %10s\n",
        i, p.obj(x_star), res_bench.f_minimum, norm(p.grd(x_star)), steps, conv)
    end
end

test_BFGS (generic function with 1 method)

In [512]:
test_BFGS()

No.        f(x*)    Benchmark |∇f(x*)|  Iters  Converged
  1 2.837687e-23 5.943187e-20 1.892953e-10     30       true
  2 1.498395e-02 5.655650e-03 1.969714e-01     22       true
  3 1.127933e-08 1.127933e-08 4.345071e-11      5       true
  4 7.661898e-29 4.780622e-24 1.374931e-09    239       true
  5 6.613458e-19 8.242706e-23 2.499883e-09     33       true
  6 8.319941e-25 2.233991e-21 2.109242e-11     29       true
  7 1.399760e-06 1.399760e-06 3.605054e-10     68       true
  8 6.786564e-04 5.250351e-04 7.237335e-04     20       true
  9 8.806565e+01 8.803248e+01 1.231225e+00     85       true
 10 0.000000e+00 1.972152e-31 0.000000e+00     19       true
 11 8.582220e+04 8.582220e+04 3.950739e-02     24       true
 12 3.039644e+00 9.688373e-24 8.187503e+00     11       true
 13 3.946270e-06 3.946270e-06 8.060639e-08     53       true
 14 6.310869e-19 4.163020e-20 7.927464e-09    168       true
 15 3.419221e-12 1.570424e-12 9.009860e-09    265       true
 16 1.102962e-18 2.877199e-2

In the above table, the benchmark minimial value of the function is computed by package *Optim*. The difficulty of implementing the BFGS method is the linesearch such that the Wolfe conditions are satisfied. However, the backtracking algorithm can only guarantee the satisficaiton of sufficent decrease but not curvature condition. In this case, *NaN* can happen. In order to alleviate this issue, I added another exit condition, which is 
$$\frac{|f(x)-f(x')|}{|f(x)|}< \epsilon.$$ According to my numerical experiments, it can partly solve the issue, but occasionaly, it may cause premature convergence.

Compared with the benchmark, all the problems work well except Prob. 12, which converges prematurely. This indicates the necessity of more sophisticated linesearch algorithms.

Since Prob. 12 has been successfully solved by the Newton's method, all the problems can be solved by combining the Newton's and BFGS methods.

## Exercise 2

In [513]:
using DataFrames

In [514]:
dt = readtable("binary.csv");

In [515]:
# get y and X
y = dt[:admit];
X = dt[[:gre, :gpa]];
m = size(X, 2)
std_vec = [std(X[i]) for i in 1:m]
# do feature rescaling
for i = 1:m
    X[i] = X[i]/std_vec[i]
end
n = length(y)
y = convert(Array, y);
X_full = convert(Matrix, [X ones(n)]);

In [516]:
# define the negative log-likelihood
function negloglik(a, y, X_full)
    value = 0.0;
    tmp = X_full*a;
    for i = 1:length(y)
        value = value-y[i]*tmp[i]+log(1+exp(tmp[i]));
    end
    return value;
end;

In [517]:
# define the gradient
function negloglik_grad(a, y, X_full)
    value = zeros(1, size(X_full, 2))
    n = length(y);
    tmp = exp(X_full*a);
    tmp = tmp./(1.+tmp).-y;
    for i = 1:n
        value = value+tmp[i]*X_full[i, :]
    end
    return value';
end;

In [518]:
# define the Hessian matrix
function negloglik_Hess(a, y, X_full)
    m = size(X_full, 2);
    n = length(y);
    tmp = exp(X_full*a);
    tmp = tmp./(1.+tmp).^2;
    value = zeros(m, m);
    for j = 1:m
        for k = j:m
            for i = 1:n
                value[j, k] = value[j, k]+tmp[i]*X_full[i, j]*X_full[i, k];
            end
            if k>j
                value[k, j] = value[j, k];
            end
        end
    end
    return value;
end;

In [519]:
obj(a) = negloglik(a, y, X_full);
grd(a) = negloglik_grad(a, y, X_full);
hes(a) = negloglik_Hess(a, y, X_full);

In [521]:
a0 = [1;1;1]
res = newtmin( obj, grd, hes, a0, 1e3, 1e-8, 2 )

(
3x1 Array{Float64,2}:
  0.310818
  0.287209
 -4.94938 ,

32.0)

In the previous homework, the gradient descent converges at the 107425-th iteration, while the modified Newton's method converges at the 32-th iteration. 

Generally speaking, among the gradient descent, Newton's and BFGS methods, the Newton's converges the fastest, then the BFGS and the gradient descent. But in some cases, the Newton's method does not work very well (due to the ill-conditioned Hessian matrix), and it takes longer time to solve for the descent direction. Comparatively speaking, BFGS is more robust in this sense. The gradient descent can be used when the Hessian matrix is not available.

In [522]:
beta = res[1]./[std_vec; 1]

3x1 Array{Any,2}:
  0.00269068
  0.754687  
 -4.94938   