In [213]:
from tqdm import tqdm, trange
from libsvm.svmutil import svm_read_problem # https://blog.csdn.net/u013630349/article/details/47323883
from time import time

import cvxpy as cp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
norm=np.linalg.norm
# import scipy as sp
from scipy.linalg import hessenberg
def ReadDataSparse(filename):
    try:
        input = open(filename)
    except:
        print("E: Unable to open file " + filename)
        return False
    m = 0  # Number of training examples.
    n = 0  # Number of features.
    nnz = 0  # Number of nonzero elements.
    data = []
    buffer = input.readline()
    labels = []
    while buffer:
        m += 1
        data.append([])
        ss = buffer.strip().split(" ")
        label = float(ss[0])
        labels.append(int(label))
        for i in range(1, len(ss)):
            key_value = ss[i].split(":")
            key = int(key_value[0])
            value = float(key_value[1])
            data[-1].append((key - 1, value))
            n = max(n, key)
            nnz += 1
        buffer = input.readline()
    input.close()
    return data

def read_data(path):
    b, A = svm_read_problem(path)
    rows = len(b)   # 矩阵行数, i.e. sample 数
    cols = max([max(row.keys()) if len(row)>0 else 0 for row in A])  # 矩阵列数, i.e. feature 数
    b = np.array(b)
    A_np = np.zeros((rows,cols))
    for r in range(rows):
        for c in A[r].keys():
            # MatLab 是 1-index, python 则是 0-index
            A_np[r,c-1] = A[r][c]
    # 清楚全 0 features
    effective_row_ids = []
    for idx, row in enumerate(A_np):
        if   True or np.sum(row) > 1e-3:
            effective_row_ids.append(idx)
    return b[effective_row_ids], A_np[effective_row_ids]


def solve_tridiagonal_system(diag: np.ndarray, subdiag: np.ndarray, tau: float, b: np.ndarray) -> np.ndarray:
    n = diag.shape[0]
    c = np.zeros(n - 1)
    d = np.zeros(n)

    c[0] = subdiag[0] / (diag[0] + tau)
    d[0] = b[0] / (diag[0] + tau)
    for i in range(1, n - 1):
        w = diag[i] + tau - subdiag[i - 1] * c[i - 1]
        c[i] = subdiag[i] / w
        d[i] = (b[i] - subdiag[i - 1] * d[i - 1]) / w
    d[n - 1] = (b[n - 1] - subdiag[n - 2] * d[n - 2]) / (diag[n - 1] + tau - subdiag[n - 2] * c[n - 2])
    for i in range(n - 2, -1, -1):
        d[i] -= c[i] * d[i + 1]

    return d

2 * np.finfo(float).eps
def ss(A, jj):
    """Subfunction for h_trid."""
    return np.sqrt(np.sum(A[jj + 1:, jj] ** 2))

def h_trid(A):
    """
    H_TRID(A) uses Householder method to form a tridiagonal matrix from A.
    Must have a SQUARE SYMMETRIC matrix as the input.
    """
    M, N = A.shape
    if M != N or (A != A.T).any():  # This just screens matrices that can't work.
        raise ValueError("Matrix must be square symmetric only, see help.")

    lngth = len(A)  # Preallocations.
    v = np.zeros(lngth)
    I = np.eye(lngth)
    Aold = A
    finalP=np.eye(lngth)
    for jj in range(lngth - 2):  # Build each vector j and run the whole procedure.
        v[:jj+1] = 0
        S = ss(Aold, jj)
        v[jj + 1] = np.sqrt(0.5 * (1 + abs(Aold[jj + 1, jj]) / (S+2 * np.finfo(float).eps)))
        v[jj + 2:] = Aold[jj + 2:, jj] * np.sign(Aold[jj + 1, jj]) / (2 * v[jj + 1] * S+2 * np.finfo(float).eps )
        P = I - 2 * np.outer(v, v)
        Anew = P @ Aold @ P
        Aold = Anew
        finalP=finalP@P
    # Anew[abs(Anew) < 5e-14] = 0  # Tolerance.

    return Anew,finalP



def minimize_quadratic_on_l2_ball(g: np.ndarray, H: np.ndarray, R: float, inner_eps: float) -> np.ndarray:
    n = g.shape[0]
    # np.savetxt('hess.txt',H)
    # print(np.linalg.norm(H))
    H_tridiag, Q = hessenberg(H,calc_q=True)
    diag = np.diag(H_tridiag)
    subdiag = np.diag(H_tridiag, k=-1)
    # print("Other:",np.sum(H_tridiag)-np.sum(diag)-np.sum(subdiag)*2,flush=True)
    g_ = Q.T.dot(g)

    tau = 1.0
    S_tau = np.zeros(n)
    S_tau_norm = 0.0
    phi_tau = 0.0
    # print(np.linalg.norm(H_tridiag))
    # print(np.linalg.norm(diag),"\t",np.linalg.norm(subdiag),"\t",np.linalg.norm(g_))
    N_LINE_SEARCH_ITERS = 100
    for i in range(N_LINE_SEARCH_ITERS + 1):
        if i == N_LINE_SEARCH_ITERS:
            print("W: Preliminaty line search iterations exceeded in MinimizeQuadraticOnL2Ball")
            break

        S_tau = solve_tridiagonal_system(diag, subdiag, tau, g_)
        S_tau_norm = np.linalg.norm(S_tau)
        phi_tau = 1.0 / S_tau_norm - 1.0 / R
        if phi_tau < inner_eps or tau < inner_eps:
            break
        tau *= 0.5
    # print("phi_tau:",phi_tau)
    if phi_tau < -inner_eps:
        S_tau_grad = np.zeros(n)
        for i in range(N_LINE_SEARCH_ITERS + 1):
            if i == N_LINE_SEARCH_ITERS:
                print("W: 1-D Newton iterations exceeded in MinimizeQuadraticOnL2Ball")
                break

            S_tau_grad = solve_tridiagonal_system(diag, subdiag, tau, S_tau)
            phi_tau_prime = (1.0 / S_tau_norm**3) * (S_tau.T.dot(S_tau_grad))
            tau -= phi_tau / phi_tau_prime

            S_tau = solve_tridiagonal_system(diag, subdiag, tau, g_)
            S_tau_norm = np.linalg.norm(S_tau)
            phi_tau = 1.0 / S_tau_norm - 1.0 / R

            if abs(phi_tau) < inner_eps or abs(phi_tau_prime) < inner_eps:
                break

    return -Q.dot(S_tau)

In [214]:


def Sigmoid(t):
    # if t>0:
    return 1.0 / (1 + np.exp(-t))
    # else:
    #     return np.exp(t) / (1 + np.exp(t))
def Log_one_exp(inner):
    if (inner > 0):
        return inner + np.log(1 + np.exp(-inner))
    else:
        return np.log(1 + np.exp(inner))

def Pi(k: int) -> int:
    k |= (k >> 1)
    k |= (k >> 2)
    k |= (k >> 4)
    k |= (k >> 8)
    k |= (k >> 16)
    return (k + 1) >> 1

def StochasticContractingNewton(params: dict, c_0: float, decrease_gamma: bool, variance_reduction: bool, hessian_variance_reduction: bool, history = None):
    params['AT']=params['A'].T
    n = params['A'].shape[1]
    m = params['A'].shape[0]
    inv_m = 1.0 / m
    data_accesses = 0

    x_k = params['x_0']

    A_batch = None
    AT_batch = None
    A_batch_triplets = []
    AT_batch_triplets = []

    g_k = np.zeros(n)
    H_k = np.zeros((n, n))
    v_k = np.zeros(n)

    sigma_prime = None
    Ax = None
    full_grad = None
    full_Hess = None
    z_k = None

    # np.random.seed(31415) # 原代码的随机数，但是由于编程语言差异，在Python中效果并不相同
    obj_indices = np.arange(m)
    batch = []
    norm_g=0
    method_name = "Stochastic Newton"
    if variance_reduction:
        method_name += " (HVR)" if hessian_variance_reduction else " (VR)"
    gamma_str = "gamma_k = " + str(c_0)
    if decrease_gamma:
        gamma_str += " / (3 + k)"
    pbar=tqdm(range(params['n_iters'] ))
    # InitDisplay(params, method_name + ", " + gamma_str)
    print('Iteration 0 - Function value: %.8f / Grad norm: %.8f'%(np.average(np.array([Log_one_exp(ax) for ax in params['A']@x_k]))+inv_m*params['lambda']*np.linalg.norm(x_k)**2,norm_g))
    for k in range(params['n_iters'] ):
        to_finish = None
        # UpdateHistory(
        #     params, start_time, k, data_accesses,
        #     lambda: (x_k.norm() > params.R_ + REGION_TOLERERANCE) ? INF : inv_m * ((*params.A_) * x_k).unaryExpr(&Log_one_exp).sum(),
        #     &last_logging_time,
        #     &last_display_time,
        #     history,
        #     &to_finish)
        if to_finish or (k>=1 and norm_g<params['outer_eps']):
            break

        if variance_reduction and Pi(k) == k:
            Ax = np.matmul(params['A'], x_k)
            full_grad = inv_m * (params['AT']@(np.array([Sigmoid(ax) for ax in Ax]))) 
            if hessian_variance_reduction:
                full_Hess = (inv_m) * (params['AT']@((np.array([Sigmoid(ax) for ax in Ax]) * (1 - np.array([Sigmoid(ax) for ax in Ax])))[:, np.newaxis] * params['A']))
            z_k = x_k.copy()
            data_accesses += m

        k_sqr = (k + 1) * (k + 1)
        batch_size = k_sqr if k_sqr < m else m
        if batch_size == m:
            print("W: batch_size equals m")
        np.random.shuffle(obj_indices)
        # if k!=0:
        batch = obj_indices[:batch_size].copy()
        # else:
        #     batch=[19539] // 原作者版本的第一个随机数

        if decrease_gamma:
            gamma_k = c_0 / (3 + k)

        if variance_reduction:
            g_k = full_grad
            if hessian_variance_reduction:
                H_k = gamma_k*full_Hess
            else:
                H_k=np.zeros((n,n))
        else:
            g_k=np.zeros(n)
            H_k=np.zeros((n,n))
                
        A_batch=np.zeros((batch_size,n))
        sigma_prime = np.zeros(batch_size)
        
        for batch_i in range(batch_size):
            i = batch[batch_i]
            sigma = Sigmoid(params['A'][i, :].dot(x_k))
            g_k += (sigma / batch_size) * params['A'][i, :]
            data_i = params['data'][i]
            for index_value in data_i:
                A_batch[batch_i, index_value[0]]=index_value[1]
            sigma_prime[batch_i] = sigma * (1 - sigma) * gamma_k / batch_size
            if variance_reduction:
                sigma_z = Sigmoid(params['A'][i, :].dot(z_k))
                g_k -= (sigma_z / batch_size) * params['A'][i, :]
                # print(sigma_z,"\t",norm(x_k),"\t",np.sum(params['A'][i, :]))
                if hessian_variance_reduction:
                    sigma_prime[batch_i] -= sigma_z * (1 - sigma_z) * gamma_k / batch_size
        # print(np.linalg.norm(sigma_prime),"\t",(np.average(np.log(1+np.exp(-params['b']*(params['A_o']@x_k))))))
        
        data_accesses += batch_size
        g_k+=2*inv_m*params['lambda']*x_k
        AT_batch = A_batch.T
        H_k += AT_batch @ np.diag(sigma_prime) @ A_batch  + inv_m*2*params['lambda']*np.diag([1.0]*x_k.size)
        # print(np.diag(sigma_prime))
        norm_g=np.linalg.norm(g_k)
        g_k -= H_k @ x_k
        v_k = minimize_quadratic_on_l2_ball(g_k, H_k, params['R'], params['inner_eps'])
        x_k += gamma_k * (v_k - x_k)
        # x_k=-x_k
        print(f'Iteration {k+1} - Function value: %.8f / Grad norm: %.8f/ Data access:{data_accesses}'%(np.average(np.array([Log_one_exp(ax) for ax in params['A']@x_k]))+inv_m*params['lambda']*np.linalg.norm(x_k)**2,norm_g))
        # print(x_k[:5],"\t",np.max((params['A']@x_k)))
        # print()
        # pbar.set_description('Function value: %.8f / Grad norm: %.8f'%(np.average(np.log(1+np.exp(-params['b']*(params['A_o']@x_k)))),norm_g)) # +inv_m*params['lambda']*np.linalg.norm(x_k)**2
    print("Done.")


In [215]:
np.exp(930)

  np.exp(930)


inf

In [216]:
b, A= read_data('w8a')
data = ReadDataSparse('w8a')
# b, A = read_data('ijcnn1.test')
# b, A = read_data('a9a.test')
# b, A = read_data('CINA.test')
m,n = A.shape
print(m,n)
b=-b #原论文将第一个label作为1，反之为-1
# b=np.expand_dims(b, axis=1)
params=dict()
params['data']=data
params['A']=-np.multiply(b,A.T).T
params['A_o']=A
params['b']=b
# print(params['A'][0,:])
print(np.sum(A==params['A']))
params['x_0']=np.zeros(n)
c_0 = 3.0
params['R']=100
params['inner_eps']=1e-9
params['outer_eps']=1e-4
params['n_iters']=100
params['lambda']=0.01
history=None
decrease_gamma=True
vr =True # Variance reduction
hessvr = False # Hessian Variance Reduction
StochasticContractingNewton(params, c_0, decrease_gamma,vr,hessvr, history)

  0%|          | 0/100 [18:57<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

49749 300
14358852
Iteration 0 - Function value: 0.69314718 / Grad norm: 0.00000000
[[0.25]]
Iteration 1 - Function value: 1.76080351 / Grad norm: 0.56253632/ Data access:49750
[[1.30891007e-81 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.23749757e-07 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 4.68750000e-02 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 6.21378586e-54]]


  return 1.0 / (1 + np.exp(-t))


Iteration 2 - Function value: 31.07446237 / Grad norm: 0.02513149/ Data access:99503
[[1.66666667e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 9.71075072e-15 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.48029737e-17 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.31495396e-03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.66666667e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 

KeyboardInterrupt: 