In [25]:
from tqdm import tqdm, trange
from libsvm.svmutil import svm_read_problem # https://blog.csdn.net/u013630349/article/details/47323883
from time import time

import cvxpy as cp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import scipy as sp
from scipy.linalg import hessenberg
def read_data(path, drop_empty_rows=False):
    """Load a LIBSVM-format problem into dense numpy arrays.

    Args:
        path: File passed to ``svm_read_problem``.
        drop_empty_rows: When True, samples whose feature vector is entirely
            zero are removed. Defaults to False, which keeps every sample
            (the previous filter was permanently disabled by ``True or ...``).

    Returns:
        ``(b, A)`` where ``b`` is the 1-D label array and ``A`` is the dense
        ``(samples, features)`` matrix.
    """
    b, A = svm_read_problem(path)
    rows = len(b)   # number of samples
    # Largest feature index seen; LIBSVM indices are 1-based, so this is also
    # the number of columns. ``default=0`` keeps an empty file from crashing.
    cols = max((max(row) if row else 0 for row in A), default=0)
    b = np.array(b)
    A_np = np.zeros((rows, cols))
    for r, row in enumerate(A):
        for c, value in row.items():
            # LIBSVM files are 1-indexed, numpy is 0-indexed.
            A_np[r, c - 1] = value
    if not drop_empty_rows:
        return b, A_np
    keep = np.any(A_np != 0, axis=1)
    return b[keep], A_np[keep]


def solve_tridiagonal_system(diag: np.ndarray, subdiag: np.ndarray, tau: float, b: np.ndarray) -> np.ndarray:
    """Solve ``(T + tau * I) x = b`` for a symmetric tridiagonal ``T``.

    Uses forward elimination followed by back substitution (no pivoting).

    Args:
        diag: Main diagonal of ``T``, length ``n``.
        subdiag: Off-diagonal of ``T``, length ``n - 1``; it is used both as
            the sub- and the super-diagonal.
        tau: Scalar shift added to every diagonal entry.
        b: Right-hand side, length ``n``.

    Returns:
        The solution vector of length ``n``.
    """
    n = diag.shape[0]
    d = np.zeros(n)
    if n == 0:
        return d
    if n == 1:
        # No off-diagonal entries exist; the system is a single scalar equation.
        d[0] = b[0] / (diag[0] + tau)
        return d

    c = np.zeros(n - 1)
    c[0] = subdiag[0] / (diag[0] + tau)
    d[0] = b[0] / (diag[0] + tau)
    for i in range(1, n - 1):
        w = diag[i] + tau - subdiag[i - 1] * c[i - 1]
        c[i] = subdiag[i] / w
        d[i] = (b[i] - subdiag[i - 1] * d[i - 1]) / w
    d[n - 1] = (b[n - 1] - subdiag[n - 2] * d[n - 2]) / (diag[n - 1] + tau - subdiag[n - 2] * c[n - 2])
    # Back substitution.
    for i in range(n - 2, -1, -1):
        d[i] -= c[i] * d[i + 1]

    return d
# Regularization scale read by f, f_grad and f_hessian: their L2 term is weighted by 1/(lamda*m).
lamda = 100
def phi(x, D):
    """Log-barrier value ``-log(D/2 - ||x||_2)`` for the ball of diameter ``D``."""
    radius = D / 2
    slack = radius - np.linalg.norm(x, ord=2)
    return -np.log(slack)

def phi_grad(x, D):
    """Gradient of the log-barrier ``phi``: ``x / (||x|| * (D/2 - ||x||))``."""
    norm_x = np.linalg.norm(x, ord=2)
    denominator = norm_x * (D / 2 - norm_x)
    return x / denominator
 
def phi_hessian(x, D):
    """Hessian of the log-barrier ``phi`` at ``x``.

    The result is a scaled identity plus a rank-one term in ``x x^T``.
    """
    r = np.linalg.norm(x, ord=2)
    identity_part = np.eye(x.size) / (r * (D / 2 - r))
    rank_one_coeff = (2 * r - D / 2) / (r ** 3 * (D / 2 - r) ** 2)
    return identity_part + rank_one_coeff * np.outer(x, x)
def f(x, params):
    """Regularized logistic loss.

    Computes ``mean(log(1 + exp(-b * (A @ x)))) + (x @ x) / (lamda * m)`` with
    ``A = params['A_o']``, ``b = params['b']`` and ``m`` the number of rows
    of ``A``.
    """
    b = params['b']
    A = params['A_o']
    m = A.shape[0]
    margins = b * (A @ x)
    # logaddexp(0, -z) == log(1 + exp(-z)) without overflow for very negative z
    # and without losing the tail for very positive z.
    return np.logaddexp(0.0, -margins).mean() + (x @ x) / (lamda * m)

def f_grad(x, params):
    """Gradient of the regularized logistic loss ``f`` at ``x``."""
    b = params['b']
    A = params['A_o']
    m = A.shape[0]
    margins = b * (A @ x)
    # -b / (1 + exp(z)) written as -b * exp(-log(1 + exp(z))) so that large
    # margins cannot overflow inside exp.
    weights = -b * np.exp(-np.logaddexp(0.0, margins))
    # weights @ A sums the weighted rows directly instead of materialising an
    # m-by-n scaled copy of A first.
    return (weights @ A) / m + 2 / (lamda * m) * x

def f_hessian(x, params):
    """Hessian of the regularized logistic loss ``f`` at ``x``."""
    b = params['b']
    A = params['A_o']
    m = A.shape[0]
    margins = b * (A @ x)
    # exp(z) / (1 + exp(z))**2 == sigmoid(z) * sigmoid(-z). Evaluating it in
    # log space avoids the inf / inf = nan the direct formula produces once
    # exp(z) overflows.
    curvature = b * b * np.exp(-(np.logaddexp(0.0, margins) + np.logaddexp(0.0, -margins)))
    return (A.T @ (curvature[:, None] * A)) / m + 2 / (lamda * m) * np.eye(x.size)
def f_int(x, params, x_k, gamma_k):
    """Quadratic model around ``x_k``: gradient term plus ``gamma_k/2`` times the Hessian form."""
    step = x - x_k
    linear_term = f_grad(x_k, params).dot(step)
    quadratic_term = gamma_k / 2 * (f_hessian(x_k, params) @ step).dot(step)
    return linear_term + quadratic_term
def f_int_grad(x, params, x_k, gamma_k):
    """Gradient of the quadratic model ``f_int`` with respect to ``x``."""
    anchor_grad = f_grad(x_k, params)
    scaled_hessian = gamma_k * f_hessian(x_k, params)
    return anchor_grad + scaled_hessian @ (x - x_k)
def f_int_hess(x, params, x_k, gamma_k):
    """Hessian of the quadratic model ``f_int``; constant in ``x`` (``x`` is unused)."""
    hessian_at_anchor = f_hessian(x_k, params)
    return gamma_k * hessian_at_anchor
def barrier_method(t_init, f, f_grad, f_hessian, phi, phi_grad, phi_hessian, A, b, x0, D, num_constraints, mu,
                        method='newton', epsilon=1e-6, maxIter=20):
    """Barrier (interior-point) outer loop.

    Each outer pass minimises ``t * f + phi(., D)`` with ``solve_central``,
    then multiplies ``t`` by ``mu`` until ``num_constraints / t`` drops below
    ``epsilon`` or ``maxIter`` passes have run. ``A`` and ``b`` are accepted
    but not used here.

    Returns:
        ``(x, elapsed_seconds, duality_gaps, objective_values)`` where the two
        arrays hold one entry per inner step.
    """
    gap_trace = []
    objective_trace = []
    x_center = x0
    t = t_init
    started = time()
    for _ in range(maxIter):
        def centering_value(x):
            return t * f(x) + phi(x, D)

        def centering_grad(x):
            return t * f_grad(x) + phi_grad(x, D)

        def centering_hessian(x):
            return t * f_hessian(x) + phi_hessian(x, D)

        # The inner solve runs with a tolerance 1e3 times looser than the outer one.
        x_center, inner_steps, inner_values = solve_central(
            objective=f,
            f=centering_value,
            f_grad=centering_grad,
            f_hessian=centering_hessian,
            x0=x_center, D=D, method=method, epsilon=epsilon*1e3)
        gap_trace.extend([num_constraints/t]*inner_steps)
        objective_trace.extend(inner_values)
        if num_constraints/t < epsilon:
            break
        t *= mu
    finished = time()
    return x_center, finished-started, np.array(gap_trace), np.array(objective_trace)
def armijo_search(f, f_grad, xk, t_hat, alpha, beta, D, isNewton=False, dk=None):
    """Backtracking line search with the Armijo sufficient-decrease test.

    Args:
        f, f_grad: Objective and its gradient.
        xk: Current point.
        t_hat: Initial trial step.
        alpha: Sufficient-decrease constant.
        beta: Multiplicative shrink factor applied after each rejected step.
        D: Ball diameter; with ``isNewton=True`` a trial point must also
            satisfy ``||x|| <= D/2``.
        isNewton: If True, search along ``dk``; otherwise along ``-f_grad(xk)``
            (no feasibility test is applied in that case).
        dk: Search direction, required when ``isNewton`` is True.

    Returns:
        The accepted step. ``0.0`` is returned if the step underflows to zero
        without being accepted (for example when the objective is NaN), where
        the search previously never terminated.

    Raises:
        ValueError: If ``isNewton`` is True and ``dk`` is None.
    """
    if isNewton and dk is None:
        raise ValueError("dk must be provided when isNewton=True")
    grad = f_grad(xk)
    direction = dk if isNewton else -grad
    # Loop invariants: evaluate once instead of on every backtrack.
    f_current = f(xk)
    slope = grad @ direction
    tk = t_hat*1
    while True:
        candidate = xk + tk*direction
        feasible = (not isNewton) or np.linalg.norm(candidate, ord=2) <= D/2
        if feasible and f(candidate) <= f_current + alpha*tk*slope:
            return tk
        if tk == 0:
            # The step cannot shrink any further; give up instead of spinning.
            return tk
        tk *= beta

def solve_central(objective, f, f_grad, f_hessian, x0, D, method='newton', epsilon=1e-6, max_iter=50):
    """Dispatch one centering solve to the selected inner solver.

    Args:
        method: ``'newton'`` (damped Newton) or ``'bfgs'``.

    Returns:
        Whatever the chosen solver returns: ``(x, iteration_count, objective_values)``.

    Raises:
        ValueError: If ``method`` is not one of the supported names (previously
            this fell through and returned None, failing later at unpacking).
    """
    if method == 'newton':
        return damped_newton(objective, f=f, f_grad=f_grad, f_hessian=f_hessian, x0=x0, D=D, epsilon=epsilon, max_iter=max_iter)
    if method == 'bfgs':
        return bfgs(objective, f=f, f_grad=f_grad, f_hessian=f_hessian, x0=x0, D=D, epsilon=epsilon, max_iter=max_iter)
    raise ValueError(f"unknown method {method!r}; expected 'newton' or 'bfgs'")
#* Damped Newton
def damped_newton(objective, f, f_grad, f_hessian, x0, D, epsilon=1e-6, max_iter=50):
    """Damped Newton iteration with Armijo backtracking.

    Args:
        objective: Function whose value is recorded at every iterate.
        f, f_grad, f_hessian: Function being minimised and its derivatives.
        x0: Starting point; it is copied, so the caller's array is not modified.
        D: Ball diameter forwarded to the line search.
        epsilon: Stop once half the squared Newton decrement is at most this.
        max_iter: Maximum number of iterations.

    Returns:
        ``(x, iteration_count, objective_values)``.
    """
    # Work on a float copy: the in-place update below must not leak into x0.
    xk = np.array(x0, dtype=float)
    iter_cnt = 0
    fvals = []
    for _ in range(max_iter):
        iter_cnt += 1
        fvals.append(objective(xk))
        grad = f_grad(xk)
        hessian = f_hessian(xk)
        # Solve the Newton system directly rather than forming the inverse.
        dk = -np.linalg.solve(hessian, grad)
        decrement_sq = -(grad @ dk)
        # A negative value means dk is not a descent direction; do not treat
        # that as convergence.
        if decrement_sq >= 0 and decrement_sq/2 <= epsilon:
            return xk, iter_cnt, fvals
        tk = armijo_search(f, f_grad, xk, t_hat=1, alpha=0.1, beta=0.5, D=D, isNewton=True, dk=dk)
        xk += tk*dk
    return xk, iter_cnt, fvals
def update_approximation_bfgs(mat, sk, yk, mat_type='H'):
    """Apply one BFGS update to ``mat``.

    With ``mat_type='H'`` the matrix is updated as an inverse-Hessian
    approximation; any other value updates it as a Hessian approximation.
    ``sk`` is the step and ``yk`` the gradient difference.
    """
    inv_curvature = 1/(yk@sk)
    if mat_type != 'H':
        B_s = mat@sk
        s_B_s = sk@B_s
        return mat - np.outer(B_s, B_s)/s_B_s + np.outer(yk, yk)*inv_curvature
    H_y = mat@yk
    y_H_y = yk@H_y
    cross = np.outer(H_y, sk)
    step_outer = np.outer(sk, sk)
    return mat + inv_curvature*((inv_curvature*y_H_y+1)*step_outer - cross - cross.T)

#* Quasi-Newton method - step-size selection
def wolfe_condition(f, f_grad, xk, pk, D, c1=1e-4, c2=0.9, multiplier=1.2, t0=0, tmax=2):
    """Line search for a step satisfying the strong Wolfe conditions.

    Args:
        f, f_grad: Objective and gradient.
        xk, pk: Current point and search direction.
        D: Ball diameter; ``tmax`` is halved until ``||xk + tmax*pk|| < D/2``.
        c1, c2: Sufficient-decrease and curvature constants.
        multiplier: Growth factor for the trial step.
        t0: Lower end of the initial bracket.
        tmax: Largest step that may be tried.

    Returns:
        The accepted step, or ``-1`` when no feasible step of at least 1e-6
        exists or when ``zoom`` fails.
    """
    # Shrink the upper bound until the far end of the segment is strictly
    # inside the ball.
    while (np.linalg.norm(xk+tmax*pk)>=D/2):
        tmax /= 2
        if tmax<1e-6:
            return -1
    ti = tmax/2
    tprev = t0
    i = 1
    fval_cur = f(xk)
    grad_cur = f_grad(xk)
    slope_cur = grad_cur@pk
    while True:
        xk_next = xk+ti*pk
        fval_next = f(xk_next)
        if (fval_next > fval_cur + c1*ti*slope_cur) or (fval_next >= fval_cur and i>1):
            return zoom(f, f_grad, xk, pk, fval_cur, grad_cur, c1, c2, tprev, ti)
        slope_next = f_grad(xk_next)@pk
        if np.abs(slope_next) <= -c2*slope_cur:
            return ti
        if slope_next >= 0:
            return zoom(f, f_grad, xk, pk, fval_cur, grad_cur, c1, c2, ti, tprev)
        if ti >= tmax:
            # Already at the feasibility bound with sufficient decrease and a
            # still-negative slope: accept it rather than stepping outside
            # the ball (and possibly expanding forever).
            return ti
        tprev = ti
        ti = min(tprev*multiplier, tmax)
        i += 1
def zoom(f, f_grad, xk, pk, fval, grad, c1, c2, t_lo, t_hi, max_iter=100):
    """Bisection refinement of a bracket for the strong Wolfe line search.

    Args:
        f, f_grad: Objective and gradient.
        xk, pk: Current point and search direction.
        fval, grad: Objective value and gradient at ``xk``.
        c1, c2: Sufficient-decrease and curvature constants.
        t_lo, t_hi: Bracket ends (``t_hi`` may be smaller than ``t_lo``).
        max_iter: Upper bound on bisection steps.

    Returns:
        A step satisfying both conditions, or ``-1`` if the bracket collapses
        or ``max_iter`` is exhausted.
    """
    slope = grad@pk
    # Cache the value at t_lo; it only changes when t_lo moves.
    f_lo = f(xk+t_lo*pk)
    for _ in range(max_iter):
        t = (t_lo+t_hi)/2
        if t == t_lo or t == t_hi:
            # The midpoint rounded onto an endpoint: the bracket can no longer
            # shrink, so stop instead of looping forever.
            return -1
        xk_next = xk + t*pk
        fval_next = f(xk_next)
        if fval_next > fval + c1*t*slope or fval_next >= f_lo:
            t_hi = t
        else:
            slope_next = f_grad(xk_next)@pk
            if np.abs(slope_next) <= -c2*slope:
                return t
            if slope_next*(t_hi-t_lo)>=0:
                t_hi = t_lo
            t_lo = t
            f_lo = fval_next
        if t_lo == t_hi:
            return -1
    return -1
#* Quasi-Newton (BFGS)
def bfgs(objective, f, f_grad, f_hessian, x0, D, alpha=0.1, beta=0.5, epsilon=1e-6, max_iter=500):
    """BFGS iteration with a strong-Wolfe line search.

    The inverse-Hessian approximation is initialised from the exact Hessian at
    ``x0``. ``alpha`` and ``beta`` are accepted but not used.

    Args:
        objective: Function whose value is recorded at each accepted iterate.
        f, f_grad, f_hessian: Function being minimised and its derivatives.
        x0: Starting point.
        D: Ball diameter; iteration stops once ``||x|| >= D/2 - 1e-2``.
        epsilon: Gradient-norm tolerance.
        max_iter: Maximum number of iterations.

    Returns:
        ``(x, iteration_count, objective_values)``. If the line search fails
        the current point is returned with the count of completed iterations.
    """
    xk = x0
    mat_k = np.linalg.inv(f_hessian(x0))
    iter_cnt = 0
    fvals = []
    grad_k = f_grad(xk)
    for _ in range(max_iter):
        iter_cnt += 1
        dk = -mat_k@grad_k
        tk = wolfe_condition(f, f_grad, xk, dk, D, c1=1e-4, c2=0.9)
        if tk<0:
            return xk, iter_cnt-1, fvals
        fvals.append(objective(xk))
        sk = tk*dk
        xk_next = xk + sk
        grad_next = f_grad(xk_next)
        grad_norm = np.linalg.norm(grad_next, ord=2)
        x_norm = np.linalg.norm(xk_next)
        print(f'Iteration {iter_cnt} - grad_norm: {grad_norm}, tk: {tk}, x_norm:{x_norm}')
        if grad_norm <= epsilon or x_norm>=D/2-1e-2:
            return xk_next, iter_cnt, fvals
        mat_k = update_approximation_bfgs(mat=mat_k, sk=sk, yk=grad_next-grad_k)
        xk = xk_next
        # The gradient at the new point is next iteration's starting gradient.
        grad_k = grad_next
    # xk equals the last accepted iterate, and is still defined when
    # max_iter == 0 (xk_next would not be).
    return xk, iter_cnt, fvals


def minimize_quadratic_on_l2_ball(g: np.ndarray, H: np.ndarray, R: float, inner_eps: float, params: dict,x_k: np.ndarray,gamma_k: float) -> np.ndarray:
    """Minimise the quadratic model around ``x_k`` over the ball ``||x|| <= R``.

    The model is ``g @ (x - x_k) + gamma_k/2 * (x - x_k) @ H @ (x - x_k)`` and
    is minimised with ``barrier_method`` using the BFGS inner solver.

    Args:
        g: Gradient of the objective at ``x_k``.
        H: Hessian of the objective at ``x_k``.
        R: Ball radius (the barrier receives the diameter ``2 * R``).
        inner_eps: Tolerance handed to the barrier method.
        params: Supplies ``'t_init'``, plus ``'A_o'`` and ``'b'`` which are
            forwarded to ``barrier_method``.
        x_k: Expansion point, also used as the starting point.
        gamma_k: Weight of the quadratic term.

    Returns:
        The minimiser found by the barrier method.
    """
    # The model is built from the g and H supplied by the caller, so the full
    # gradient and Hessian are not recomputed on every inner evaluation.
    scaled_hessian = gamma_k * H

    def model(x):
        step = x - x_k
        return g @ step + 0.5 * (scaled_hessian @ step) @ step

    def model_grad(x):
        return g + scaled_hessian @ (x - x_k)

    def model_hess(x):
        return scaled_hessian

    x_opt, _elapsed, _gaps, _fvals = barrier_method(
        t_init=params['t_init'], f=model, f_grad=model_grad, f_hessian=model_hess,
        phi=phi, phi_grad=phi_grad, phi_hessian=phi_hessian,
        A=params['A_o'], b=params['b'], x0=x_k, D=2*R, num_constraints=1,
        method='bfgs', mu=10, epsilon=inner_eps, maxIter=20)
    return x_opt


def contracting_newton(params, c_0, decrease_gamma, history):
    """Run the contracting Newton iteration on the regularized logistic loss.

    Each iteration builds the gradient and Hessian at ``x_k`` from
    ``params['A']``, minimises the quadratic model over the ball of radius
    ``params['R']`` and moves ``x_k`` towards that minimiser by ``gamma_k``.

    Args:
        params: Dict with keys ``'A'``, ``'A_o'``, ``'b'``, ``'x_0'``,
            ``'lambda'``, ``'R'``, ``'inner_eps'``, ``'outer_eps'`` and
            ``'n_iters'`` (plus whatever ``minimize_quadratic_on_l2_ball`` reads).
        c_0: Base contraction coefficient.
        decrease_gamma: If True use ``gamma_k = c_0 / (3 + k)``, else ``c_0``.
        history: Unused; history logging is not implemented here.

    Returns:
        The final iterate ``x_k`` (previously the result was discarded).
    """
    A_signed = params['A']
    m, n = A_signed.shape
    inv_m = 1.0 / m
    reg = 2 * params['lambda']

    x_k = params['x_0'].copy()
    Ax = A_signed @ x_k
    g_k = np.zeros(n)

    gamma_str = f"gamma_k = {c_0}"
    if decrease_gamma:
        gamma_str += " / (3 + k)"
    print(f"Contracting Newton Method, {gamma_str}")
    pbar = tqdm(range(params['n_iters']))
    for k in pbar:
        # g_k is the gradient from the previous iteration, i.e. taken before
        # the most recent update of x_k.
        if k >= 1 and np.linalg.norm(g_k) < params['outer_eps']:
            break

        gamma_k = c_0 / (3.0 + k) if decrease_gamma else c_0

        # Evaluate the sigmoid once and reuse it for gradient and Hessian.
        sigma = 1 / (1 + np.exp(-Ax))
        g_k = inv_m * (A_signed.T.dot(sigma) + reg * x_k)
        H_k = inv_m * (A_signed.T.dot((sigma * (1 - sigma))[:, np.newaxis] * A_signed)) + inv_m * reg * np.eye(x_k.size)

        v_k = minimize_quadratic_on_l2_ball(g_k, H_k, params['R'], params['inner_eps'], params, x_k, gamma_k)

        x_k += gamma_k * (v_k - x_k)
        Ax = A_signed.dot(x_k)
        pbar.set_description('Function value: %.8f / Grad norm: %.8f'%(np.average(np.log(1+np.exp(-params['b']*(params['A_o']@x_k))))+inv_m*params['lambda']*np.linalg.norm(x_k)**2,np.linalg.norm(g_k)))
    print("Done.")
    return x_k


  0%|          | 0/50 [01:21<?, ?it/s]
  0%|          | 0/50 [01:04<?, ?it/s]
  0%|          | 0/50 [00:59<?, ?it/s]
  0%|          | 0/50 [00:58<?, ?it/s]
  0%|          | 0/50 [00:56<?, ?it/s]
  0%|          | 0/50 [00:55<?, ?it/s]
  0%|          | 0/50 [00:53<?, ?it/s]
  0%|          | 0/50 [00:51<?, ?it/s]
  0%|          | 0/50 [00:49<?, ?it/s]
  0%|          | 0/50 [00:44<?, ?it/s]
  0%|          | 0/50 [00:39<?, ?it/s]


In [27]:
# Load the dataset; the alternatives are kept for quick switching.
b, A = read_data('w8a')
# b, A = read_data('ijcnn1.test')
# b, A = read_data('a9a.test')
# b, A = read_data('CINA.test')
m, n = A.shape
print(m, n)

# 'A' holds the rows of A scaled by -b; 'A_o' keeps the original matrix.
params = {
    'A': -np.multiply(b, A.T).T,
    'A_o': A,
    'b': b,
}
# Number of entries that are unchanged by the -b scaling.
print(np.sum(A == params['A']))

params.update({
    'x_0': np.zeros(n) + 0.005,
    't_init': 1,
    'R': 10,
    'inner_eps': 1e-6,
    'outer_eps': 1e-4,
    'n_iters': 10000,
    'lambda': 0.01,
})
c_0 = 3.0
history = None
decrease_gamma = True
contracting_newton(params, c_0, decrease_gamma, history)

  0%|          | 0/10000 [00:00<?, ?it/s]

49749 300
14910962
Contracting Newton Method, gamma_k = 3.0 / (3 + k)
Iteration 1 - grad_norm: 0.22929800830210337, tk: 1.0, x_norm:1.9281132026839376
Iteration 2 - grad_norm: 0.1259083540499028, tk: 1.0, x_norm:1.2208181153545434
Iteration 3 - grad_norm: 0.15153596655678273, tk: 1.0, x_norm:0.618938493164876
Iteration 4 - grad_norm: 0.11446979685446487, tk: 1.0, x_norm:0.61643300379609
Iteration 5 - grad_norm: 0.0247493789669156, tk: 1.0, x_norm:0.8134796017482979
Iteration 6 - grad_norm: 0.020076785584696014, tk: 1.0, x_norm:0.827897387793117
Iteration 7 - grad_norm: 0.012280949732820423, tk: 1.0, x_norm:0.8519762166108459
Iteration 8 - grad_norm: 0.005443269954289677, tk: 1.0, x_norm:0.8676745661768795
Iteration 9 - grad_norm: 0.0020284932019427053, tk: 1.0, x_norm:0.8717157819025817
Iteration 10 - grad_norm: 0.0014664498213446213, tk: 1.0, x_norm:0.8707840861089593
Iteration 11 - grad_norm: 0.0011727823095993648, tk: 1.0, x_norm:0.8701104871419868
Iteration 1 - grad_norm: 0.0428707

Function value: 0.31470173 / Grad norm: 0.59389792:   0%|          | 1/10000 [01:26<239:05:46, 86.08s/it]

Iteration 1 - grad_norm: 70.72772818638094, tk: 1.0, x_norm:9.98586498202115
Iteration 2 - grad_norm: 47.17037097436463, tk: 1.0, x_norm:9.978808559283614
Iteration 3 - grad_norm: 28.324695208812763, tk: 1.0, x_norm:9.964717995241402
Iteration 4 - grad_norm: 17.72470141207779, tk: 1.0, x_norm:9.943639354879387
Iteration 5 - grad_norm: 10.931615731824408, tk: 1.0, x_norm:9.908671412237467
Iteration 6 - grad_norm: 6.7945988501286605, tk: 1.0, x_norm:9.853198748772765
Iteration 7 - grad_norm: 4.230624912941775, tk: 1.0, x_norm:9.764538920132294
Iteration 8 - grad_norm: 2.6605585042888253, tk: 1.0, x_norm:9.626137621501474
Iteration 9 - grad_norm: 1.7097277395208146, tk: 1.0, x_norm:9.41796955926857
Iteration 10 - grad_norm: 1.1595886843693761, tk: 1.0, x_norm:9.13121774242734
Iteration 11 - grad_norm: 0.8822327752972542, tk: 1.0, x_norm:8.8003412457135
Iteration 12 - grad_norm: 0.7801932999113491, tk: 1.0, x_norm:8.516931350027818
Iteration 13 - grad_norm: 0.751488641603401, tk: 1.0, x_no

Function value: 0.21920524 / Grad norm: 0.16492101:   0%|          | 2/10000 [04:18<379:18:46, 136.58s/it]

Iteration 1 - grad_norm: 2.098293950661454, tk: 1.0, x_norm:9.526489016691775
Iteration 2 - grad_norm: 1.399541572026269, tk: 1.0, x_norm:9.292318182360964
Iteration 3 - grad_norm: 0.8407593714835254, tk: 1.0, x_norm:8.829359592187693
Iteration 4 - grad_norm: 0.5268941812369141, tk: 1.0, x_norm:8.149576945219003
Iteration 5 - grad_norm: 0.32672893786662754, tk: 1.0, x_norm:7.06249064047081
Iteration 6 - grad_norm: 0.20726332824913696, tk: 1.0, x_norm:5.4783210909967375
Iteration 7 - grad_norm: 0.14037084575490064, tk: 1.0, x_norm:3.5447654605630836
Iteration 8 - grad_norm: 0.11614675785681412, tk: 1.0, x_norm:2.633815160057773
Iteration 9 - grad_norm: 0.09920651484249922, tk: 1.0, x_norm:2.379704941439295
Iteration 10 - grad_norm: 0.03425130328439572, tk: 1.0, x_norm:0.8828484477406766
Iteration 11 - grad_norm: 0.028535680263074976, tk: 1.0, x_norm:0.7862864402981002
Iteration 12 - grad_norm: 0.006134525488445054, tk: 1.0, x_norm:0.6250192223617279
Iteration 13 - grad_norm: 0.002069886

Function value: 0.17321519 / Grad norm: 0.06716901:   0%|          | 3/10000 [06:04<340:39:49, 122.68s/it]

Iteration 1 - grad_norm: 1.4393415355526982, tk: 1.0, x_norm:9.309930119342328
Iteration 2 - grad_norm: 0.958298308849638, tk: 1.0, x_norm:8.967309897861819
Iteration 3 - grad_norm: 0.5733042220833567, tk: 1.0, x_norm:8.287456849843128
Iteration 4 - grad_norm: 0.3564438647399909, tk: 1.0, x_norm:7.283340096742866
Iteration 5 - grad_norm: 0.21668325674607988, tk: 1.0, x_norm:5.665403054251642
Iteration 6 - grad_norm: 0.12869361730047166, tk: 1.0, x_norm:3.331887220681918
Iteration 7 - grad_norm: 0.09033060998749176, tk: 1.0, x_norm:1.6828492601339768
Iteration 8 - grad_norm: 0.07802552534678095, tk: 1.0, x_norm:1.4558705927008473
Iteration 9 - grad_norm: 0.0664590000942518, tk: 1.0, x_norm:1.035539426689138
Iteration 10 - grad_norm: 0.06825260229174632, tk: 0.4375, x_norm:0.16361785605739368
Iteration 11 - grad_norm: 0.07996718283763576, tk: 0.0625, x_norm:0.07115054928690225
Iteration 12 - grad_norm: 0.08499427932076346, tk: 0.03125, x_norm:0.028149528658074918
Iteration 13 - grad_norm

Function value: 0.15310140 / Grad norm: 0.02741626:   0%|          | 4/10000 [10:54<524:00:05, 188.72s/it]

Iteration 1 - grad_norm: 1.083190121011557, tk: 1.0, x_norm:9.08261275614477
Iteration 2 - grad_norm: 0.7208138292642399, tk: 1.0, x_norm:8.626383281467824
Iteration 3 - grad_norm: 0.430662076597366, tk: 1.0, x_norm:7.719638082453804
Iteration 4 - grad_norm: 0.267031889146807, tk: 1.0, x_norm:6.376436262290956
Iteration 5 - grad_norm: 0.16120864566622745, tk: 1.0, x_norm:4.198295812616076
Iteration 6 - grad_norm: 0.09100443044418517, tk: 1.0, x_norm:1.0949468486764866
Iteration 7 - grad_norm: 0.08506483114414676, tk: 0.25, x_norm:0.6642443844554293
Iteration 8 - grad_norm: 0.10190348730062619, tk: 1.0, x_norm:0.3264057935473885
Iteration 9 - grad_norm: 0.11380079406100164, tk: 0.5, x_norm:0.09042419427552668
Iteration 10 - grad_norm: 0.1255403014998628, tk: 0.03125, x_norm:0.031000871465936195
Iteration 11 - grad_norm: 0.08682974798397416, tk: 0.25, x_norm:0.03543205903947659
Iteration 12 - grad_norm: 0.0839750414933206, tk: 1.0, x_norm:0.027293337003389585
Iteration 13 - grad_norm: 0.

Function value: 0.14704785 / Grad norm: 0.01106030:   0%|          | 5/10000 [15:58<639:22:01, 230.29s/it]

Iteration 1 - grad_norm: 1.1326235983713813, tk: 1.0, x_norm:9.121026096419655
Iteration 2 - grad_norm: 0.7540470405884389, tk: 1.0, x_norm:8.683122877307794
Iteration 3 - grad_norm: 0.45098483826303815, tk: 1.0, x_norm:7.810980637327997
Iteration 4 - grad_norm: 0.28019958557743097, tk: 1.0, x_norm:6.513416203309818
Iteration 5 - grad_norm: 0.17018141204418052, tk: 1.0, x_norm:4.38612156030248
Iteration 6 - grad_norm: 0.10108207046468434, tk: 1.0, x_norm:1.1244893825620323
Iteration 7 - grad_norm: 0.09541841244254609, tk: 0.25, x_norm:0.2799549664630795
Iteration 8 - grad_norm: 0.09580212341318725, tk: 1.0, x_norm:0.06737041947334788
Iteration 9 - grad_norm: 0.10701983647663872, tk: 0.125, x_norm:0.04329887028333995
Iteration 10 - grad_norm: 0.11314007740625641, tk: 0.125, x_norm:0.020106675971924018
Iteration 11 - grad_norm: 0.09231861937476223, tk: 0.5, x_norm:0.011092821659578067
Iteration 12 - grad_norm: 0.08791096880362552, tk: 0.0625, x_norm:0.010794582341382223
Iteration 13 - gr

Function value: 0.14704785 / Grad norm: 0.01106030:   0%|          | 5/10000 [18:12<606:31:52, 218.46s/it]


KeyboardInterrupt: 