# Part I: Libraries

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
from joblib import Parallel, delayed
import math
import random
import os
import time

def seed_everything(seed: int):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the RNGs once up front; the shuffles and random subsamples drawn by the
# algorithms below all come from NumPy's global generator.
seed_everything(0)
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this presumably also hides NumPy overflow / invalid-value
# RuntimeWarnings raised inside the exp/sqrt computations -- confirm that is intended.
warnings.filterwarnings('ignore')

# Part II: Adversarial Debiasing Class

In [8]:
class Adversarial_Debiasing:
    """First- and second-order oracle for an adversarial-debiasing saddle objective.

    The derivatives implemented here are those of

        f(x, y) = (1/m) * sum_i log(1 + exp(-b_i * a_i^T x))
                  - beta * (1/m) * sum_i log(1 + exp(-c_i * y * a_i^T x))
                  + lamb * ||x||^2 - gamma * y^2

    where ``a_i`` is the i-th row of ``A``, ``x`` has ``dim`` entries and ``y``
    is a scalar. Every method takes the stacked point ``z = [x; y]`` as a
    ``(dim + 1, 1)`` column.

    NOTE(review): the broadcasting below assumes ``b`` and ``c`` are ``(m, 1)``
    columns -- confirm against the data-loading code.
    """

    def __init__(self, A, b, c, lamb, gamma, beta):
        self.A = A
        self.b = b
        self.c = c
        self.lamb = lamb
        self.gamma = gamma
        self.beta = beta
        self.m, self.dim = A.shape
        # Length of the stacked variable z = [x; y].
        self.d = self.dim + 1
        # Same value as self.m; kept because the sketching subclass reads it.
        self.n = A.shape[0]

    def _split(self, z):
        """Return ``(x, y)``: the first ``dim`` rows of ``z`` and the scalar last entry."""
        return z[:self.dim], z[self.dim][0]

    def _p(self, x):
        """Return 1 / (1 + exp(b_i * a_i^T x)) for every row, as an ``(m, 1)`` column."""
        # `*` and `@` share precedence, so this evaluates (b * A) @ x.
        return 1 / (1 + np.exp(self.b * self.A @ x))

    def _q(self, x, y):
        """Return 1 / (1 + exp(c_i * y * a_i^T x)) for every row, as an ``(m, 1)`` column."""
        return 1 / (1 + np.exp(self.c * y * self.A @ x))

    def grad(self, z):
        """Return the stacked gradient ``[df/dx; df/dy]`` at ``z`` as a ``(d, 1)`` column."""
        x, y = self._split(z)
        p = self._p(x)
        q = self._q(x, y)
        gx = self.A.T @ (self.beta * y * q * self.c - p * self.b) / self.m + 2 * self.lamb * x
        gy = np.sum(self.beta * self.c * q * (self.A @ x)) / self.m - 2 * self.gamma * y
        g = np.zeros(self.d)
        g[:self.dim] = gx.flatten()
        g[-1] = gy
        return g.reshape(-1, 1)

    def Hxx(self, z):
        """Return the ``(dim, dim)`` Hessian block d^2 f / dx^2 at ``z``."""
        x, y = self._split(z)
        p = self._p(x)
        q = self._q(x, y)
        # Per-row curvature weight; the adversarial term enters with a minus
        # sign, so individual weights may be negative.
        weights = self.b**2 * (p - p**2) - self.beta * self.c**2 * y**2 * (q - q**2)
        return (self.A.T @ (weights * self.A)) / self.m + 2 * self.lamb * np.eye(self.dim)

    def Hxy(self, z):
        """Return the ``(dim, 1)`` mixed block d^2 f / (dx dy) at ``z``."""
        x, y = self._split(z)
        # Only q is needed here: the first logistic term does not depend on y.
        q = self._q(x, y)
        return self.beta * self.A.T @ (self.c * q) / self.m - ((self.beta * y * (q - q**2) * self.c**2) * self.A).T @ (self.A @ x) / self.m

    def Hyy(self, z):
        """Return the ``(1, 1)`` block d^2 f / dy^2 at ``z``."""
        x, y = self._split(z)
        # Only q is needed here: the first logistic term does not depend on y.
        q = self._q(x, y)
        return np.array([np.sum(-self.beta * self.c**2 * (self.A @ x)**2 * (q - q**2)) / self.m - 2 * self.gamma]).reshape(-1, 1)

In [34]:
class Adversarial_Debiasing_Sketch(Adversarial_Debiasing):
    """Variant of ``Adversarial_Debiasing`` whose ``Hxx`` is a randomized sketch.

    ``method`` selects the sketch:

    * ``'uniform'``  -- average the per-row Hessian terms over ``sketch_size``
      rows drawn without replacement.
    * ``'gaussian'`` -- project the rows with a ``(sketch_size, n)`` Gaussian
      matrix ``S`` whose entries have variance ``1 / sketch_size``.

    ``pct`` is the sketch size as a fraction of the number of rows.
    ``grad``, ``Hxy`` and ``Hyy`` are inherited unchanged.
    """

    def __init__(self, A, b, c, lamb, gamma, beta, method='uniform', pct=0.1):
        super().__init__(A, b, c, lamb, gamma, beta)
        self.method = method
        self.pct = pct
        # Keep at least one row: an empty sketch would make Hxx divide by zero.
        self.sketch_size = max(1, int(self.pct * self.A.shape[0]))

    def Hxx(self, z):
        """Return a random ``(dim, dim)`` estimate of d^2 f / dx^2 at ``z``.

        Raises:
            ValueError: if ``self.method`` is neither ``'uniform'`` nor ``'gaussian'``.
        """
        x = z[:self.dim]
        y = z[self.dim][0]
        p = 1 / (1 + np.exp(self.b * self.A @ x))
        q = 1 / (1 + np.exp(self.c * y * self.A @ x))

        # Per-row curvature weights. The adversarial term is subtracted, so a
        # weight can be negative; taking sqrt of the signed weights (as the
        # previous version did) turns those rows into NaN.
        w = self.b ** 2 * (p - p ** 2) - self.beta * self.c ** 2 * y ** 2 * (q - q ** 2)
        ridge = 2 * self.lamb * np.identity(self.dim)

        if self.method == 'uniform':
            selected = np.random.choice(self.n, self.sketch_size, replace=False)
            A_sel = self.A[selected]
            # Sample mean of w_i * a_i a_i^T; no square root is needed.
            return A_sel.T @ (w[selected] * A_sel) / self.sketch_size + ridge

        if self.method == 'gaussian':
            S = np.random.normal(loc=0.0, scale=1/np.sqrt(self.sketch_size), size=(self.sketch_size, self.n))
            # Sketch the positive and negative parts of the weights separately
            # so that indefinite weights are handled without NaNs.
            SA_pos = S @ (np.sqrt(np.maximum(w, 0.0)) * self.A)
            sketched = SA_pos.T @ SA_pos
            w_neg = np.maximum(-w, 0.0)
            if np.any(w_neg):
                SA_neg = S @ (np.sqrt(w_neg) * self.A)
                sketched = sketched - SA_neg.T @ SA_neg
            # E[S^T S] = I, so `sketched` estimates the SUM over all m rows;
            # the exact Hxx averages over m, hence divide by m (not sketch_size).
            return sketched / self.m + ridge

        raise ValueError(f"unknown sketch method {self.method!r}; expected 'uniform' or 'gaussian'")

# Part III: PAN Framework

In [3]:
def PAN(params, A, b, c, lamb, gamma, beta, ratio=0.1, epochs=100, lr=1.0):
    """Run Newton-type saddle-point steps with a subsampled ``Hxx`` block.

    The gradient, ``Hxy`` and ``Hyy`` come from the full data. ``Hxx`` comes
    from one fixed random subset of ``int(ratio * A.shape[0])`` rows (at least
    one), drawn once before the first epoch. The step is the block-inverse
    (Schur complement) solution of the 2x2 block Hessian system.

    Args:
        params: initial stacked point ``[x; y]`` as a ``(dim + 1, 1)`` column.
            It is copied; the caller's array is left untouched.
        A, b, c, lamb, gamma, beta: forwarded to ``Adversarial_Debiasing``.
        ratio: fraction of rows used for the ``Hxx`` estimate.
        epochs: number of iterations.
        lr: step size applied to the direction.

    Returns:
        ``(params, gradients, cpu_time)``: the final point, the full-data
        gradient norm before the first epoch and after every epoch, and the
        ``time.time()`` duration of every epoch (with a leading ``0.0``).
    """
    # Private float copy: the in-place update below must not leak into the
    # caller's array.
    params = np.array(params, dtype=float)

    gradients = []
    obj = Adversarial_Debiasing(A, b, c, lamb, gamma, beta)
    init_grad = np.linalg.norm(obj.grad(params), ord=2)
    cpu_time = [0.0]
    gradients.append(init_grad)
    print(f'epoch {0}, gradient norm: {init_grad}')

    # At least one row, otherwise the local objective divides by m == 0.
    n_sampled = max(1, int(ratio * A.shape[0]))
    sampled = np.random.choice(A.shape[0], size=n_sampled, replace=False)
    local_obj = Adversarial_Debiasing(A[sampled], b[sampled], c[sampled], lamb, gamma, beta)

    for num_epoch in range(1, epochs+1):
        start_time = time.time()
        gz = obj.grad(params)
        gx, gy = gz[:-1].reshape(-1, 1), gz[-1].reshape(-1, 1)
        Hxy = obj.Hxy(params)
        Hyy = obj.Hyy(params)
        Hxx_inv = np.linalg.inv(local_obj.Hxx(params))
        Hyy_inv = np.linalg.inv(Hyy)

        # G2 is the Schur complement of Hxx; G1_inv is the top-left block of
        # the inverse of [[Hxx, Hxy], [Hxy^T, Hyy]].
        G2_inv = np.linalg.inv(Hyy - Hxy.T @ Hxx_inv @ Hxy)
        Hxx_inv_Hxy = Hxx_inv @ Hxy
        G1_inv = Hxx_inv + Hxx_inv_Hxy @ G2_inv @ Hxx_inv_Hxy.T

        dx = G1_inv @ (gx - Hxy @ Hyy_inv @ gy)
        dy = G2_inv @ gy - Hyy_inv @ Hxy.T @ G1_inv @ gx

        params -= lr * np.vstack([dx, dy])
        cpu_time.append(time.time() - start_time)

        grad_norm = np.linalg.norm(obj.grad(params), ord=2)
        gradients.append(grad_norm)
        if num_epoch % 5 == 0:
            print(f'epoch {num_epoch}, gradient norm: {grad_norm}')

    return params, gradients, cpu_time

# Part IV: PANDA Algorithm

In [6]:
def PANDA(params, A, b, c, lamb, gamma, beta, n_workers=8, epochs=100, lr=1.0):
    """Run the PANDA iteration on row-partitioned data and record its progress.

    Rows of ``(A, b, c)`` are shuffled and split into ``n_workers`` local
    objectives. Each epoch runs two worker rounds through joblib
    (``n_jobs=1``, i.e. one after another):

    1. every worker reports its gradient, ``Hxy`` and ``Hyy``, and the server
       averages them;
    2. every worker applies the inverse of its local ``Hxx`` to the averaged
       ``g_x`` and ``H_xy``, and the server averages the results and assembles
       the step with the Schur-complement formulas.

    Worker rounds are charged to the epoch's time as elapsed / ``n_workers``;
    server work is charged in full.

    Returns:
        ``(params, gradients, cpu_time)``: final point, averaged-gradient norm
        before the first epoch and after every epoch, and the per-epoch time
        (with a leading ``0.0``).
    """
    shuffled_rows = np.arange(A.shape[0])
    np.random.shuffle(shuffled_rows)
    local_obj = [Adversarial_Debiasing(A[rows], b[rows], c[rows], lamb, gamma, beta)
                 for rows in np.array_split(shuffled_rows, n_workers)]

    def averaged_grad_norm(point):
        local_grads = [worker.grad(point) for worker in local_obj]
        return np.linalg.norm(np.mean(local_grads, axis=0), ord=2)

    def first_round(worker_idx, point):
        # Local gradient pieces plus the two Hessian blocks that are averaged.
        worker = local_obj[worker_idx]
        gz = worker.grad(np.copy(point))
        return (gz[:-1].reshape(-1, 1), gz[-1].reshape(-1, 1),
                worker.Hxy(np.copy(point)), worker.Hyy(np.copy(point)))

    def second_round(worker_idx, point, avg_gx, avg_Hxy):
        # Local Hxx^{-1} applied to the averaged quantities.
        local_inv = np.linalg.inv(local_obj[worker_idx].Hxx(np.copy(point)))
        return local_inv @ avg_gx, local_inv @ avg_Hxy

    init_grad = averaged_grad_norm(params)
    gradients = [init_grad]
    cpu_time = [0.0]
    print(f'epoch {0}, gradient norm: {init_grad}')

    for num_epoch in range(1, epochs + 1):
        time_count = 0.0

        # ---- worker round 1
        tick = time.time()
        reports = Parallel(n_jobs=1, backend='threading')(
            delayed(first_round)(i, params) for i in range(n_workers))
        time_count += (time.time() - tick) / n_workers

        # ---- server: average the reports
        tick = time.time()
        gx_parts, gy_parts, Hxy_parts, Hyy_parts = zip(*reports)
        g_x = np.mean(gx_parts, axis=0).reshape(-1, 1)
        g_y = np.mean(gy_parts, axis=0).reshape(-1, 1)
        H_xy = np.mean(Hxy_parts, axis=0).reshape(-1, 1)
        H_yy = np.mean(Hyy_parts, axis=0).reshape(-1, 1)
        time_count += (time.time() - tick)

        # ---- worker round 2
        tick = time.time()
        solves = Parallel(n_jobs=1, backend='threading')(
            delayed(second_round)(i, params, g_x, H_xy) for i in range(n_workers))
        time_count += (time.time() - tick) / n_workers

        # ---- server: assemble the step
        tick = time.time()
        px_parts, pxy_parts = zip(*solves)
        p_xy = np.mean(pxy_parts, axis=0)
        p_x = np.mean(px_parts, axis=0)

        H_yy_inv = np.linalg.inv(H_yy)
        C2_inv = np.linalg.inv(H_yy - H_xy.T @ p_xy)
        C1_inv_g_x = p_x + p_xy @ C2_inv @ H_xy.T @ p_x
        C1_inv_H_xy_H_yy_inv = p_xy @ H_yy_inv + p_xy @ C2_inv @ H_xy.T @ p_xy @ H_yy_inv

        d_x = C1_inv_g_x - C1_inv_H_xy_H_yy_inv @ g_y
        d_y = C2_inv @ g_y - H_yy_inv @ H_xy.T @ C1_inv_g_x

        params = params - lr * np.vstack([d_x, d_y])
        time_count += (time.time() - tick)

        cpu_time.append(time_count)

        grad_norm = averaged_grad_norm(params)
        gradients.append(grad_norm)
        if num_epoch % 5 == 0:
            print(f'epoch {num_epoch}, gradient norm: {grad_norm}')

    return params, gradients, cpu_time

# Part V: Extra Gradient & ProxSkip-VI Algorithms

In [7]:
def parallel_extra(params, A, b, c, lamb, gamma, beta, n_workers=8, epochs=100, lr=1.0):
    """Run the extragradient method on row-partitioned data.

    Each epoch averages the workers' gradients at the current point, takes a
    trial step (descent in ``x``, ascent in ``y``), averages the workers'
    gradients at the trial point, and applies that second gradient to the
    current point.

    Args:
        params: initial stacked point ``[x; y]`` as a ``(dim + 1, 1)`` column.
            It is copied; the caller's array is left untouched.
        A, b, c, lamb, gamma, beta: forwarded to ``Adversarial_Debiasing``.
        n_workers: number of row blocks / simulated workers.
        epochs: number of iterations.
        lr: step size.

    Returns:
        ``(params, gradients, cpu_time)``: final point, averaged-gradient norm
        before the first epoch and after every epoch, and the per-epoch time
        (worker rounds are charged as elapsed / ``n_workers``).
    """
    # Private float copy: the in-place slice updates below must not leak into
    # the caller's array.
    params = np.array(params, dtype=float)

    gradients = []
    cpu_time = [0.0]

    indices = np.arange(A.shape[0])
    np.random.shuffle(indices)
    local_obj = [Adversarial_Debiasing(A[blk], b[blk], c[blk], lamb, gamma, beta)
                 for blk in np.array_split(indices, n_workers)]

    def averaged_grad_norm(point):
        return np.linalg.norm(np.mean([obj.grad(point) for obj in local_obj], axis=0), ord=2)

    def local_grad(worker_idx, point):
        # One worker's gradient, split into its x-block and y-entry.
        gz = local_obj[worker_idx].grad(point)
        return gz[:-1].reshape(-1, 1), gz[-1].reshape(-1, 1)

    def worker_round(point):
        # Results come back in worker order through Parallel's return value.
        parts = Parallel(n_jobs=1, backend='threading')(
            delayed(local_grad)(i, point) for i in range(n_workers))
        gx_parts, gy_parts = zip(*parts)
        return gx_parts, gy_parts

    init_grad = averaged_grad_norm(params)
    gradients.append(init_grad)
    print(f'epoch {0}, gradient norm: {init_grad}')

    for num_epoch in range(1, epochs+1):
        time_count = 0.0

        # Gradients at the current point.
        local_start1 = time.time()
        gx_parts, gy_parts = worker_round(params)
        time_count += (time.time() - local_start1) / n_workers

        server_start1 = time.time()
        g_x = np.mean(gx_parts, axis=0)
        g_y = np.mean(gy_parts, axis=0)
        time_count += (time.time() - server_start1)

        # Trial (extrapolation) point: descend in x, ascend in y.
        server_start2 = time.time()
        params_temp = np.copy(params)
        params_temp[:-1] -= lr * g_x
        params_temp[-1] += lr * g_y.flatten()
        time_count += (time.time() - server_start2)

        # Gradients at the trial point.
        local_start2 = time.time()
        gx_parts, gy_parts = worker_round(params_temp)
        time_count += (time.time() - local_start2) / n_workers

        # Actual update uses the trial-point gradient.
        server_start3 = time.time()
        g_x = np.mean(gx_parts, axis=0)
        g_y = np.mean(gy_parts, axis=0)
        params[:-1] -= lr * g_x
        params[-1] += lr * g_y.flatten()
        time_count += (time.time() - server_start3)

        cpu_time.append(time_count)
        grad_norm = averaged_grad_norm(params)
        gradients.append(grad_norm)
        if num_epoch % 20 == 0:
            print(f'epoch {num_epoch}, gradient norm: {grad_norm}')

    return params, gradients, cpu_time

In [8]:
def proxskip(params, control, A, b, c, lamb, gamma, beta, n_workers=8, epochs=100, lr=1.0, prob=0.4):
    """Run ProxSkip-style local steps with randomly skipped averaging rounds.

    Every worker keeps its own copy of the iterate and a control variate.
    Each epoch a worker takes one local step (descent in ``x``, ascent in
    ``y``) corrected by its control variate. With probability ``prob`` (drawn
    up front for all epochs) the epoch is a communication round: the
    control-shifted iterates are averaged and broadcast back. Control variates
    are then moved by ``(prob / lr) * (new iterate - local step result)``,
    with the sign flipped for the ``y`` entry.

    Args:
        params: initial stacked point ``[x; y]`` as a ``(dim + 1, 1)`` column.
        control: initial control variate, same shape as ``params``.
        A, b, c, lamb, gamma, beta: forwarded to ``Adversarial_Debiasing``.
        n_workers: number of row blocks / simulated workers.
        epochs: number of local steps.
        lr: local step size.
        prob: probability that an epoch is a communication round.

    Returns:
        ``(avg_params, gradients, cpu_time)``: the most recent averaged point
        (a copy of ``params`` if no communication round happened), and one
        gradient-norm / time entry for the start plus one per communication
        round. Each time entry covers the work since the previous round.
    """
    gradients = []
    cpu_time = [0.0]
    indices = np.arange(A.shape[0])
    np.random.shuffle(indices)
    local_obj = [Adversarial_Debiasing(A[blk], b[blk], c[blk], lamb, gamma, beta)
                 for blk in np.array_split(indices, n_workers)]

    def averaged_grad_norm(point):
        return np.linalg.norm(np.mean([obj.grad(point) for obj in local_obj], axis=0), ord=2)

    init_grad = averaged_grad_norm(params)
    gradients.append(init_grad)
    print(f'epoch {0}, gradient norm: {init_grad}')

    # One column per worker; float dtype so the in-place updates never truncate.
    local_params = np.repeat(np.asarray(params, dtype=float), n_workers, axis=1)
    local_params_hat = np.copy(local_params)
    local_params_prime = np.copy(local_params)
    local_controls = np.repeat(np.asarray(control, dtype=float), n_workers, axis=1)
    updates = np.random.binomial(1, prob, size=epochs)
    lr2 = lr / prob
    lr3 = 1 / lr2

    # Defined up front so the return value exists even when `updates` holds no 1.
    avg_params = np.array(params, dtype=float)
    # Time accumulated since the last communication round.
    time_count = 0.0

    def local_step(worker_idx, communicate):
        col = [worker_idx]
        gz = local_obj[worker_idx].grad(local_params[:, col])
        gx, gy = gz[:-1].reshape(-1, 1), gz[-1].reshape(-1, 1)
        local_params_hat[:-1, col] = local_params[:-1, col] - lr * (gx - local_controls[:-1, col])
        local_params_hat[-1, col] = local_params[-1, col] + lr * (gy.flatten() - local_controls[-1, col])
        if communicate:
            # Point that is sent to the server for averaging.
            local_params_prime[:-1, col] = local_params_hat[:-1, col] - lr2 * local_controls[:-1, col]
            local_params_prime[-1, col] = local_params_hat[-1, col] + lr2 * local_controls[-1, col]
        else:
            # Skipped round: the local step result becomes the new iterate.
            local_params[:, col] = local_params_hat[:, col]

    def control_step(worker_idx):
        col = [worker_idx]
        # On skipped rounds local_params == local_params_hat, so this is a no-op.
        local_controls[:-1, col] += lr3 * (local_params[:-1, col] - local_params_hat[:-1, col])
        local_controls[-1, col] += lr3 * (-local_params[-1, col] + local_params_hat[-1, col])

    for num_epoch in range(1, epochs+1):
        communicate = updates[num_epoch-1] == 1

        local_start1 = time.time()
        Parallel(n_jobs=1, backend='threading')(delayed(local_step)(i, communicate)
                                              for i in range(n_workers))
        time_count += (time.time() - local_start1) / n_workers

        if communicate:
            server_start1 = time.time()
            avg_params = np.mean(local_params_prime, axis=1, keepdims=True)
            local_params[:] = avg_params  # broadcast the average to every worker
            server_end1 = time.time()
            # Previously `server_end1 - server_end1`, which always added 0.
            time_count += (server_end1 - server_start1)
            cpu_time.append(time_count)
            # Start a fresh accounting period; the control step below is
            # charged to the next period instead of being discarded.
            time_count = 0.0

            grad_norm = averaged_grad_norm(avg_params)
            gradients.append(grad_norm)
            print(f'epoch {num_epoch}, gradient norm: {grad_norm}')

        local_start2 = time.time()
        Parallel(n_jobs=1, backend='threading')(delayed(control_step)(i)
                                              for i in range(n_workers))
        time_count += (time.time() - local_start2) / n_workers

    return avg_params, gradients, cpu_time

# Part VI: GIANT-PANDA Algorithm

In [37]:
def GIANT_PANDA(params, A, b, c, lamb, gamma, beta, n_workers=8, epochs=100, lr=1.0, method='uniform', pct=0.1):
    """Run the PANDA iteration with sketched local ``Hxx`` blocks.

    Rows of ``(A, b, c)`` are shuffled and split into ``n_workers`` local
    ``Adversarial_Debiasing_Sketch`` objectives. Each epoch runs two worker
    rounds through joblib (``n_jobs=1``, i.e. one after another):

    1. every worker reports its gradient, ``Hxy`` and ``Hyy``, and the server
       averages them;
    2. every worker inverts a freshly drawn sketch of its local ``Hxx`` and
       applies it to the averaged ``g_x`` and ``H_xy``; the server averages
       the results and assembles the step with the Schur-complement formulas.

    Args:
        params: initial stacked point ``[x; y]`` as a ``(dim + 1, 1)`` column.
        A, b, c, lamb, gamma, beta: forwarded to the local objectives.
        n_workers: number of row blocks / simulated workers.
        epochs: number of iterations.
        lr: step size applied to the direction.
        method, pct: sketch type and sketch fraction, forwarded to
            ``Adversarial_Debiasing_Sketch``.

    Returns:
        ``(params, gradients, cpu_time)``: final point, averaged-gradient norm
        before the first epoch and after every epoch, and the per-epoch time
        (worker rounds are charged as elapsed / ``n_workers``).
    """
    gradients = []
    cpu_time = [0.0]
    indices = np.arange(A.shape[0])
    np.random.shuffle(indices)
    # The sketch objects hold no per-epoch state (the randomness is drawn
    # inside Hxx), so they are built once instead of once per epoch. `grad`,
    # `Hxy` and `Hyy` are inherited unchanged from the exact objective.
    workers = [Adversarial_Debiasing_Sketch(A[blk], b[blk], c[blk],
                                            lamb, gamma, beta, method, pct)
               for blk in np.array_split(indices, n_workers)]

    def averaged_grad_norm(point):
        return np.linalg.norm(np.mean([w.grad(point) for w in workers], axis=0), ord=2)

    def first_round(worker_idx, point):
        # Local gradient pieces plus the two Hessian blocks that are averaged.
        obj = workers[worker_idx]
        gz = obj.grad(point)
        return (gz[:-1].reshape(-1, 1), gz[-1].reshape(-1, 1),
                obj.Hxy(point), obj.Hyy(point))

    def second_round(worker_idx, point, avg_gx, avg_Hxy):
        # Inverse of the sketched local Hxx applied to the averaged quantities.
        Hxx_inv = np.linalg.inv(workers[worker_idx].Hxx(point))
        return Hxx_inv @ avg_gx, Hxx_inv @ avg_Hxy

    init_grad = averaged_grad_norm(params)
    gradients.append(init_grad)
    print(f'epoch {0}, gradient norm: {init_grad}')

    for num_epoch in range(1, epochs+1):
        time_count = 0.0

        local_start1 = time.time()
        reports = Parallel(n_jobs=1, backend='threading')(
            delayed(first_round)(i, params) for i in range(n_workers))
        time_count += (time.time() - local_start1) / n_workers

        server_start1 = time.time()
        gx_parts, gy_parts, Hxy_parts, Hyy_parts = zip(*reports)
        g_x = np.mean(gx_parts, axis=0).reshape(-1, 1)
        g_y = np.mean(gy_parts, axis=0).reshape(-1, 1)
        H_xy = np.mean(Hxy_parts, axis=0).reshape(-1, 1)
        H_yy = np.mean(Hyy_parts, axis=0).reshape(-1, 1)
        time_count += (time.time() - server_start1)

        local_start2 = time.time()
        solves = Parallel(n_jobs=1, backend='threading')(
            delayed(second_round)(i, params, g_x, H_xy) for i in range(n_workers))
        time_count += (time.time() - local_start2) / n_workers

        server_start2 = time.time()
        px_parts, pxy_parts = zip(*solves)
        p_xy = np.mean(pxy_parts, axis=0)
        p_x = np.mean(px_parts, axis=0)

        # H_yy is inverted once and reused in the three places that need it.
        H_yy_inv = np.linalg.inv(H_yy)
        C2_inv = np.linalg.inv(H_yy - H_xy.T @ p_xy)
        C1_inv_g_x = p_x + p_xy @ C2_inv @ H_xy.T @ p_x
        C1_inv_H_xy_H_yy_inv = p_xy @ H_yy_inv + p_xy @ C2_inv @ H_xy.T @ p_xy @ H_yy_inv

        d_x = C1_inv_g_x - C1_inv_H_xy_H_yy_inv @ g_y
        d_y = C2_inv @ g_y - H_yy_inv @ H_xy.T @ C1_inv_g_x

        params = params - lr * np.vstack([d_x, d_y])
        time_count += (time.time() - server_start2)

        cpu_time.append(time_count)
        grad_norm = averaged_grad_norm(params)
        gradients.append(grad_norm)
        if num_epoch % 5 == 0:
            print(f'epoch {num_epoch}, gradient norm: {grad_norm}')

    return params, gradients, cpu_time

# Part VII: a9a Dataset

In [18]:
# Load a9a: column 71 of the raw matrix is split off as the vector c (zeros
# recoded to -1); the remaining columns form the feature matrix A.
m = loadmat('a9a.mat')
A0 = np.array(m['A']).astype(float)
b = np.array(m['b']).astype(float)
A = np.concatenate([A0[:, :71], A0[:, 72:]], axis=1)
c = A0[:, 71]
c[c == 0] = -1
c = c.reshape(-1, 1)

In [19]:
# Display the (rows, columns) shape of the feature matrix after column 71 was removed.
A.shape

(32561, 122)

## Part VII-I: PAN Solution

In [8]:
# PAN(5%): Hxx is estimated from a 5% subsample of the rows
params = np.zeros((123, 1))
params_pan_5, gradients_pan_5, time_pan_5 = PAN(
    params, A, b, c, lamb=0.0001, gamma=0.0001, beta=0.5,
    ratio=0.05, epochs=500, lr=1.0)

epoch 0, gradient norm: 0.6612688562040967
epoch 5, gradient norm: 0.0038578195215161993
epoch 10, gradient norm: 0.004460884435641053
epoch 15, gradient norm: 0.00569981672822834
epoch 20, gradient norm: 0.006573575837128978
epoch 25, gradient norm: 0.00787040458533466
epoch 30, gradient norm: 0.00613820195356965
epoch 35, gradient norm: 0.0052462465460702025
epoch 40, gradient norm: 0.0067910005132097604


KeyboardInterrupt: 

In [13]:
# PAN(10%): Hxx is estimated from a 10% subsample of the rows
params = np.zeros((123, 1))
params_pan_10, gradients_pan_10, time_pan_10 = PAN(
    params, A, b, c, lamb=0.0001, gamma=0.0001, beta=0.5,
    ratio=0.1, epochs=100, lr=1.0)

epoch 0, gradient norm: 0.6612688562040967
epoch 5, gradient norm: 0.00030131253099683643
epoch 10, gradient norm: 2.1605141205726847e-07
epoch 15, gradient norm: 1.5772656516389222e-10
epoch 20, gradient norm: 2.1596908023432706e-13
epoch 25, gradient norm: 4.991624131890611e-16
epoch 30, gradient norm: 3.900739362107925e-17
epoch 35, gradient norm: 2.6574713700545633e-17
epoch 40, gradient norm: 4.390351855649948e-17
epoch 45, gradient norm: 1.600670497107787e-17
epoch 50, gradient norm: 1.6963002420597952e-17
epoch 55, gradient norm: 4.7462332622374665e-17
epoch 60, gradient norm: 2.881896137816099e-17
epoch 65, gradient norm: 5.209762794118781e-17
epoch 70, gradient norm: 1.0286465342079387e-17
epoch 75, gradient norm: 5.0056633470325984e-17
epoch 80, gradient norm: 2.481607803826819e-17
epoch 85, gradient norm: 1.6836986029809348e-17
epoch 90, gradient norm: 5.895530175165075e-17
epoch 95, gradient norm: 2.829266603242507e-17
epoch 100, gradient norm: 1.9630255551569386e-17


In [14]:
# PAN(20%): Hxx is estimated from a 20% subsample of the rows
params = np.zeros((123, 1))
params_pan_20, gradients_pan_20, time_pan_20 = PAN(
    params, A, b, c, lamb=0.0001, gamma=0.0001, beta=0.5,
    ratio=0.2, epochs=100, lr=1.0)

epoch 0, gradient norm: 0.6612688562040967
epoch 5, gradient norm: 0.00012436806666263895
epoch 10, gradient norm: 1.668092669449058e-08
epoch 15, gradient norm: 4.1657296475093354e-12
epoch 20, gradient norm: 6.212637282172057e-16
epoch 25, gradient norm: 2.443334448538217e-17
epoch 30, gradient norm: 1.3178690460058379e-17
epoch 35, gradient norm: 4.3820704132122786e-17
epoch 40, gradient norm: 4.8037586314873385e-17
epoch 45, gradient norm: 1.9795860934922297e-17
epoch 50, gradient norm: 7.908902900274948e-18
epoch 55, gradient norm: 1.6671195984889157e-17
epoch 60, gradient norm: 3.265834250476961e-17
epoch 65, gradient norm: 4.632224849146657e-17
epoch 70, gradient norm: 1.4491100227772756e-17
epoch 75, gradient norm: 5.162673933201004e-17
epoch 80, gradient norm: 4.293717207376464e-17
epoch 85, gradient norm: 5.622530045551061e-17
epoch 90, gradient norm: 4.457201512469702e-17
epoch 95, gradient norm: 8.28811378752027e-18
epoch 100, gradient norm: 2.6817927947173176e-17


## Part VII-II: PANDA Solution

In [8]:
# PANDA(4): 4 workers.
# The call previously went to `parallel_nt`, which is not defined in this
# notebook; the function with this signature is `PANDA`.
params = np.zeros(123).reshape(-1, 1)
params_nt_4, gradients_nt_4, time_nt_4 = PANDA(params, A, b, c, 0.0001, 0.0001, 0.5, n_workers=4, epochs=100, lr=1.0)

epoch 0, gradient norm: 0.6612686076732777
epoch 5, gradient norm: 0.00021057779872244665
epoch 10, gradient norm: 1.1729996942228272e-12
epoch 15, gradient norm: 2.5043635483832864e-17
epoch 20, gradient norm: 2.105692745693514e-17
epoch 25, gradient norm: 4.780405289641596e-17
epoch 30, gradient norm: 2.2136919326595576e-17
epoch 35, gradient norm: 1.3613653592846576e-17
epoch 40, gradient norm: 3.001989424046837e-17
epoch 45, gradient norm: 3.7973999840935435e-17
epoch 50, gradient norm: 1.3749898273782841e-17
epoch 55, gradient norm: 2.121069172019686e-17
epoch 60, gradient norm: 2.809000906481296e-17
epoch 65, gradient norm: 3.4271964460016653e-17
epoch 70, gradient norm: 3.058439519101444e-17
epoch 75, gradient norm: 3.491350418206587e-17
epoch 80, gradient norm: 3.917147386847454e-17
epoch 85, gradient norm: 8.425494302060812e-17
epoch 90, gradient norm: 3.8897559419236617e-17
epoch 95, gradient norm: 2.91999944310401e-17
epoch 100, gradient norm: 6.076714309408818e-18


In [9]:
# PANDA(8): 8 workers.
# The call previously went to `parallel_nt`, which is not defined in this
# notebook; the function with this signature is `PANDA`.
params = np.zeros(123).reshape(-1, 1)
params_nt_8, gradients_nt_8, time_nt_8 = PANDA(params, A, b, c, 0.0001, 0.0001, 0.5, n_workers=8, epochs=100, lr=1.0)

epoch 0, gradient norm: 0.6612689971582043
epoch 5, gradient norm: 0.00018676300083661284
epoch 10, gradient norm: 1.7100279312953336e-11
epoch 15, gradient norm: 5.818890614954898e-16
epoch 20, gradient norm: 2.6154850464658624e-17
epoch 25, gradient norm: 2.6615935141568778e-17
epoch 30, gradient norm: 1.2287077276572544e-17
epoch 35, gradient norm: 1.5027884407615935e-17
epoch 40, gradient norm: 3.8276397985369775e-17
epoch 45, gradient norm: 1.8646575449458593e-17
epoch 50, gradient norm: 7.627406992629827e-17
epoch 55, gradient norm: 6.271705124460136e-17
epoch 60, gradient norm: 1.1575375487948042e-17
epoch 65, gradient norm: 1.6618179517286713e-17
epoch 70, gradient norm: 2.533600819017849e-17
epoch 75, gradient norm: 7.688505247258e-17
epoch 80, gradient norm: 1.730099036441236e-17
epoch 85, gradient norm: 5.760975642944773e-17
epoch 90, gradient norm: 3.13606557915681e-17
epoch 95, gradient norm: 1.2596236563388429e-17
epoch 100, gradient norm: 2.568593445890809e-17


In [10]:
# PANDA(16): 16 workers.
# The call previously went to `parallel_nt`, which is not defined in this
# notebook; the function with this signature is `PANDA`.
params = np.zeros(123).reshape(-1, 1)
params_nt_16, gradients_nt_16, time_nt_16 = PANDA(params, A, b, c, 0.0001, 0.0001, 0.5, n_workers=16, epochs=100, lr=1.0)

epoch 0, gradient norm: 0.6612695932533036
epoch 5, gradient norm: 0.00014505849461540624
epoch 10, gradient norm: 5.1096338932256496e-09
epoch 15, gradient norm: 6.369118145364914e-12
epoch 20, gradient norm: 9.491613828362548e-15
epoch 25, gradient norm: 3.590104018618651e-17
epoch 30, gradient norm: 8.704187481892118e-17
epoch 35, gradient norm: 3.541584418010261e-17
epoch 40, gradient norm: 2.0357451851308164e-17
epoch 45, gradient norm: 4.3308081839623054e-17
epoch 50, gradient norm: 5.58181734343633e-17
epoch 55, gradient norm: 2.948935905068323e-17
epoch 60, gradient norm: 1.2892043691421485e-17
epoch 65, gradient norm: 2.672642894477224e-17
epoch 70, gradient norm: 7.174457866539177e-18
epoch 75, gradient norm: 5.85109139447481e-18
epoch 80, gradient norm: 2.1208274594064347e-17
epoch 85, gradient norm: 3.628799530153978e-17
epoch 90, gradient norm: 1.3450754038021956e-17
epoch 95, gradient norm: 4.579826409259996e-17
epoch 100, gradient norm: 1.798987686999606e-17


In [11]:
# PANDA(32): 32 workers.
# The call previously went to `parallel_nt`, which is not defined in this
# notebook; the function with this signature is `PANDA`.
params = np.zeros(123).reshape(-1, 1)
params_nt_32, gradients_nt_32, time_nt_32 = PANDA(params, A, b, c, 0.0001, 0.0001, 0.5, n_workers=32, epochs=100, lr=1.0)

epoch 0, gradient norm: 0.6612651821458368
epoch 5, gradient norm: 5.6447797235583e-05
epoch 10, gradient norm: 1.3363274599890471e-06
epoch 15, gradient norm: 3.875979525149503e-08
epoch 20, gradient norm: 1.1286425038001568e-09
epoch 25, gradient norm: 3.2958512739129765e-11
epoch 30, gradient norm: 9.638008450794912e-13
epoch 35, gradient norm: 2.8242287212383336e-14
epoch 40, gradient norm: 8.041724397137658e-16
epoch 45, gradient norm: 4.5455410162172173e-17
epoch 50, gradient norm: 7.317604487066302e-17
epoch 55, gradient norm: 3.3426319347669457e-17
epoch 60, gradient norm: 1.1569183612725705e-17
epoch 65, gradient norm: 7.739537409843066e-18
epoch 70, gradient norm: 2.6642424341172376e-17
epoch 75, gradient norm: 2.6419977650792923e-17
epoch 80, gradient norm: 1.4463697146098785e-17
epoch 85, gradient norm: 1.1554642726343487e-17
epoch 90, gradient norm: 8.361184490880083e-17
epoch 95, gradient norm: 8.64982897390796e-18
epoch 100, gradient norm: 9.242451632283113e-18


## Part VII-III: ProxSkip-VI-FL Solution

In [None]:
# ProxSkip (8): 8 workers, communication probability 0.4, zero initial control variate
params = np.zeros((123, 1))
control = np.zeros((123, 1))
params_ps_8, gradients_ps_8, time_ps_8 = proxskip(
    params, control, A, b, c, lamb=0.0001, gamma=0.0001, beta=0.5,
    n_workers=8, epochs=8000, lr=1.0, prob=0.4)

## Part VII-IV: EG Solution

In [17]:
# EG(8): extragradient with 8 workers and step size 0.7
params = np.zeros((123, 1))
params_extragrad_8, gradients_extragrad_8, time_extragrad_8 = parallel_extra(
    params, A, b, c, lamb=0.0001, gamma=0.0001, beta=0.5,
    n_workers=8, epochs=12000, lr=0.7)

epoch 0, gradient norm: 0.6612699149892407
epoch 20, gradient norm: 0.06043776576116855
epoch 40, gradient norm: 0.030868327703461873
epoch 60, gradient norm: 0.022301360274171542
epoch 80, gradient norm: 0.017528030821039032
epoch 100, gradient norm: 0.014387427516963655
epoch 120, gradient norm: 0.012154497145571236
epoch 140, gradient norm: 0.01049110614354109
epoch 160, gradient norm: 0.009209073254236281
epoch 180, gradient norm: 0.008193627213011135
epoch 200, gradient norm: 0.007370863008992806
epoch 220, gradient norm: 0.006691304808027721
epoch 240, gradient norm: 0.006120767503650975
epoch 260, gradient norm: 0.005634978728421488
epoch 280, gradient norm: 0.00521628616066542
epoch 300, gradient norm: 0.004851576428594963
epoch 320, gradient norm: 0.004530922767401527
epoch 340, gradient norm: 0.004246683875301692
epoch 360, gradient norm: 0.0039928894326311205
epoch 380, gradient norm: 0.003764812126328067
epoch 400, gradient norm: 0.0035586637288780375
epoch 420, gradient no

epoch 3380, gradient norm: 0.0002894712526121209
epoch 3400, gradient norm: 0.00028749553059106053
epoch 3420, gradient norm: 0.0002855413986570231
epoch 3440, gradient norm: 0.00028360848516852626
epoch 3460, gradient norm: 0.00028169642743074743
epoch 3480, gradient norm: 0.00027980487140974496
epoch 3500, gradient norm: 0.0002779334714583347
epoch 3520, gradient norm: 0.0002760818900530569
epoch 3540, gradient norm: 0.00027424979754168727
epoch 3560, gradient norm: 0.0002724368719008012
epoch 3580, gradient norm: 0.0002706427985028981
epoch 3600, gradient norm: 0.0002688672698926408
epoch 3620, gradient norm: 0.00026710998557177585
epoch 3640, gradient norm: 0.00026537065179233453
epoch 3660, gradient norm: 0.0002636489813577207
epoch 3680, gradient norm: 0.00026194469343132626
epoch 3700, gradient norm: 0.00026025751335232946
epoch 3720, gradient norm: 0.00025858717245834454
epoch 3740, gradient norm: 0.0002569334079146123
epoch 3760, gradient norm: 0.00025529596254944244
epoch 378

epoch 6680, gradient norm: 0.000115968540786496
epoch 6700, gradient norm: 0.00011541035787160691
epoch 6720, gradient norm: 0.00011485538471662983
epoch 6740, gradient norm: 0.00011430359640840502
epoch 6760, gradient norm: 0.00011375496831566537
epoch 6780, gradient norm: 0.00011320947608470274
epoch 6800, gradient norm: 0.00011266709563511822
epoch 6820, gradient norm: 0.00011212780315564493
epoch 6840, gradient norm: 0.00011159157510005141
epoch 6860, gradient norm: 0.00011105838818311801
epoch 6880, gradient norm: 0.00011052821937668598
epoch 6900, gradient norm: 0.00011000104590577811
epoch 6920, gradient norm: 0.00010947684524479139
epoch 6940, gradient norm: 0.00010895559511375594
epoch 6960, gradient norm: 0.00010843727347466448
epoch 6980, gradient norm: 0.00010792185852786547
epoch 7000, gradient norm: 0.00010740932870852061
epoch 7020, gradient norm: 0.00010689966268312945
epoch 7040, gradient norm: 0.00010639283934611324
epoch 7060, gradient norm: 0.00010588883781645846
ep

epoch 10020, gradient norm: 5.412730083186154e-05
epoch 10040, gradient norm: 5.389105029180338e-05
epoch 10060, gradient norm: 5.365591936515211e-05
epoch 10080, gradient norm: 5.342190202650775e-05
epoch 10100, gradient norm: 5.318899228986924e-05
epoch 10120, gradient norm: 5.295718420829539e-05
epoch 10140, gradient norm: 5.27264718735702e-05
epoch 10160, gradient norm: 5.2496849415872675e-05
epoch 10180, gradient norm: 5.2268311003448514e-05
epoch 10200, gradient norm: 5.204085084228887e-05
epoch 10220, gradient norm: 5.18144631758098e-05
epoch 10240, gradient norm: 5.158914228453719e-05
epoch 10260, gradient norm: 5.136488248579406e-05
epoch 10280, gradient norm: 5.114167813339242e-05
epoch 10300, gradient norm: 5.091952361732841e-05
epoch 10320, gradient norm: 5.069841336348022e-05
epoch 10340, gradient norm: 5.04783418333111e-05
epoch 10360, gradient norm: 5.025930352357276e-05
epoch 10380, gradient norm: 5.004129296601531e-05
epoch 10400, gradient norm: 4.9824304727099046e-05


# Part VIII: Student Dataset

In [19]:
# Load the student data and recode zeros in b and c to -1.
m = loadmat('LSTUDENT_DATA1.mat')
A = np.array(m['A']).astype(float)
b = np.array(m['b']).astype(float)
c = np.array(m['c']).astype(float)
for labels in (b, c):
    # Row indices of the zero entries; each of those rows is overwritten with -1.
    zero_rows = np.where(labels == 0)[0]
    labels[zero_rows] = -1

In [20]:
# Display the (rows, columns) shape of the loaded feature matrix.
A.shape

(20427, 379)

## Part VIII-I: PAN Solution

In [22]:
# PAN(5%): Hxx is estimated from a 5% subsample of the rows
params = np.zeros((380, 1))
params_pan_5, gradients_pan_5, time_pan_5 = PAN(
    params, A, b, c, lamb=0.0001, gamma=0.0001, beta=1.0,
    ratio=0.05, epochs=100, lr=0.5)

epoch 0, gradient norm: 0.5935822488611634
epoch 5, gradient norm: 0.5108317031099475
epoch 10, gradient norm: 0.014149767129513071
epoch 15, gradient norm: 0.0003742593832350471
epoch 20, gradient norm: 1.177985161439146e-05
epoch 25, gradient norm: 3.67366790564919e-07
epoch 30, gradient norm: 1.1609598110680183e-08
epoch 35, gradient norm: 3.6696414274797555e-10
epoch 40, gradient norm: 1.2231357063515421e-11
epoch 45, gradient norm: 3.9901668851487933e-13
epoch 50, gradient norm: 2.0123357811166924e-14
epoch 55, gradient norm: 5.487038458799802e-16
epoch 60, gradient norm: 4.730207025092572e-17
epoch 65, gradient norm: 2.0441486367014674e-17
epoch 70, gradient norm: 1.4401917081876896e-17
epoch 75, gradient norm: 1.947115643183212e-17
epoch 80, gradient norm: 1.9851168074515574e-17
epoch 85, gradient norm: 1.4164860138435436e-17
epoch 90, gradient norm: 7.274822667110992e-18
epoch 95, gradient norm: 1.9642877251695356e-17
epoch 100, gradient norm: 9.016111765771344e-18


In [23]:
# PAN(10%)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) as in Adversarial_Debiasing.__init__ — confirm against PAN.
params_pan_10, gradients_pan_10, time_pan_10 = PAN(params, A, b, c, 0.0001, 0.0001, 1.0, ratio=0.1, epochs=100, lr=0.5)

epoch 0, gradient norm: 0.5935822488611634
epoch 5, gradient norm: 0.035409966950739405
epoch 10, gradient norm: 0.0017696266871763567
epoch 15, gradient norm: 5.2750675472587594e-05
epoch 20, gradient norm: 1.4769711284142494e-06
epoch 25, gradient norm: 3.8856403967741747e-08
epoch 30, gradient norm: 1.1559898918201973e-09
epoch 35, gradient norm: 3.415549923968517e-11
epoch 40, gradient norm: 1.0152753135957461e-12
epoch 45, gradient norm: 3.156295792971648e-14
epoch 50, gradient norm: 9.63506363060443e-16
epoch 55, gradient norm: 3.651578864688437e-17
epoch 60, gradient norm: 1.4551529562032138e-17
epoch 65, gradient norm: 2.5788779106619992e-17
epoch 70, gradient norm: 2.0245594336328162e-17
epoch 75, gradient norm: 3.1486558650340365e-17
epoch 80, gradient norm: 1.4413837409048225e-17
epoch 85, gradient norm: 2.5495829396252613e-17
epoch 90, gradient norm: 9.029085806647482e-18
epoch 95, gradient norm: 1.3703712329164486e-17
epoch 100, gradient norm: 3.127525860780564e-17


In [24]:
# PAN(20%)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) as in Adversarial_Debiasing.__init__ — confirm against PAN.
params_pan_20, gradients_pan_20, time_pan_20 = PAN(params, A, b, c, 0.0001, 0.0001, 1.0, ratio=0.2, epochs=100, lr=0.5)

epoch 0, gradient norm: 0.5935822488611634
epoch 5, gradient norm: 0.03537192259475998
epoch 10, gradient norm: 0.0020151613479540446
epoch 15, gradient norm: 6.832563775933332e-05
epoch 20, gradient norm: 2.0095937974151372e-06
epoch 25, gradient norm: 5.916914552767154e-08
epoch 30, gradient norm: 1.7805533716255802e-09
epoch 35, gradient norm: 5.331616450375534e-11
epoch 40, gradient norm: 1.53788625291074e-12
epoch 45, gradient norm: 4.795669970494562e-14
epoch 50, gradient norm: 1.4617503814518958e-15
epoch 55, gradient norm: 4.718807100966165e-17
epoch 60, gradient norm: 8.931625511821976e-18
epoch 65, gradient norm: 3.1329679223736474e-17
epoch 70, gradient norm: 2.0096148454900752e-17
epoch 75, gradient norm: 1.9271449357413552e-17
epoch 80, gradient norm: 1.4724771772639162e-17
epoch 85, gradient norm: 1.00156544020726e-17
epoch 90, gradient norm: 5.337452900348058e-18
epoch 95, gradient norm: 1.533638518706881e-17
epoch 100, gradient norm: 2.559266867633418e-17


## Part VIII-II: PANDA Solution

In [62]:
# PANDA(4)
# Start from the zero vector. Its length is derived from A instead of being
# hard-coded: one entry per column of A plus one for the scalar y component
# (Adversarial_Debiasing sets d = dim + 1).
params = np.zeros((A.shape[1] + 1, 1))
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against parallel_nt's signature.
params_nt_4, gradients_nt_4, time_nt_4 = parallel_nt(params, A, b, c, 0.0001, 0.0001, 1.0, n_workers=4, epochs=200, lr=0.5)

epoch 0, gradient norm: 0.5935822198749362
epoch 5, gradient norm: 0.035476823414181534
epoch 10, gradient norm: 0.0020981297895746116
epoch 15, gradient norm: 7.02164437425284e-05
epoch 20, gradient norm: 2.1272287778385603e-06
epoch 25, gradient norm: 6.446736695533603e-08
epoch 30, gradient norm: 1.96037152460725e-09
epoch 35, gradient norm: 5.979153423549333e-11
epoch 40, gradient norm: 1.8283558712805793e-12
epoch 45, gradient norm: 5.602856271051124e-14
epoch 50, gradient norm: 1.7106863759507431e-15
epoch 55, gradient norm: 5.552766877276874e-17
epoch 60, gradient norm: 6.7643167276336596e-18
epoch 65, gradient norm: 7.002781481831443e-18
epoch 70, gradient norm: 8.271900497559508e-18
epoch 75, gradient norm: 1.0738496248743002e-17
epoch 80, gradient norm: 1.0682711731059565e-17
epoch 85, gradient norm: 4.472814180418104e-18
epoch 90, gradient norm: 1.2457723721359556e-17
epoch 95, gradient norm: 1.1860084598069992e-17
epoch 100, gradient norm: 1.4963970744164947e-17
epoch 105, 

In [63]:
# PANDA(8)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against parallel_nt's signature.
params_nt_8, gradients_nt_8, time_nt_8 = parallel_nt(params, A, b, c, 0.0001, 0.0001, 1.0, n_workers=8, epochs=200, lr=0.5)

epoch 0, gradient norm: 0.5935818200463516
epoch 5, gradient norm: 0.03329249023724169
epoch 10, gradient norm: 0.001791474027415085
epoch 15, gradient norm: 5.574213822049723e-05
epoch 20, gradient norm: 1.624300914435437e-06
epoch 25, gradient norm: 4.807045118776536e-08
epoch 30, gradient norm: 1.4423107707563097e-09
epoch 35, gradient norm: 4.3728207012822e-11
epoch 40, gradient norm: 1.3361396366588752e-12
epoch 45, gradient norm: 4.1061969141013364e-14
epoch 50, gradient norm: 1.2514841142351045e-15
epoch 55, gradient norm: 3.857142719503152e-17
epoch 60, gradient norm: 6.275797657998654e-18
epoch 65, gradient norm: 1.3823564464287907e-17
epoch 70, gradient norm: 8.920865016496103e-18
epoch 75, gradient norm: 8.990173391998502e-18
epoch 80, gradient norm: 7.083791614571127e-18
epoch 85, gradient norm: 1.276513196826515e-17
epoch 90, gradient norm: 1.114902297325981e-17
epoch 95, gradient norm: 1.2763663397768102e-17
epoch 100, gradient norm: 8.892644889617711e-18
epoch 105, gradi

In [64]:
# PANDA(16)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against parallel_nt's signature.
params_nt_16, gradients_nt_16, time_nt_16 = parallel_nt(params, A, b, c, 0.0001, 0.0001, 1.0, n_workers=16, epochs=200, lr=0.5)

epoch 0, gradient norm: 0.59358158388404
epoch 5, gradient norm: 0.030265523803204526
epoch 10, gradient norm: 0.0013718853772173563
epoch 15, gradient norm: 3.8306767464638494e-05
epoch 20, gradient norm: 1.090120043529439e-06
epoch 25, gradient norm: 3.251840413394253e-08
epoch 30, gradient norm: 9.934944767986596e-10
epoch 35, gradient norm: 3.0702867279203664e-11
epoch 40, gradient norm: 9.540827150679302e-13
epoch 45, gradient norm: 2.972621337723156e-14
epoch 50, gradient norm: 9.535542663149847e-16
epoch 55, gradient norm: 4.405873735851813e-17
epoch 60, gradient norm: 2.875454741072235e-17
epoch 65, gradient norm: 2.093862550517535e-17
epoch 70, gradient norm: 2.176556250528426e-17
epoch 75, gradient norm: 2.976799344648293e-17
epoch 80, gradient norm: 1.7506196224139726e-17
epoch 85, gradient norm: 2.0018756814422735e-17
epoch 90, gradient norm: 1.5065784466991873e-17
epoch 95, gradient norm: 3.003636334780766e-17
epoch 100, gradient norm: 1.4972610529274337e-17
epoch 105, gra

In [65]:
# PANDA(32)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# This run uses lr=0.4, whereas the 4-, 8- and 16-worker runs use lr=0.5.
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against parallel_nt's signature.
params_nt_32, gradients_nt_32, time_nt_32 = parallel_nt(params, A, b, c, 0.0001, 0.0001, 1.0, n_workers=32, epochs=200, lr=0.4)

epoch 0, gradient norm: 0.5935810832392593
epoch 5, gradient norm: 0.03818939791204123
epoch 10, gradient norm: 0.0030242184890511025
epoch 15, gradient norm: 0.00020642754704253282
epoch 20, gradient norm: 1.4849144934682047e-05
epoch 25, gradient norm: 1.1230935819222308e-06
epoch 30, gradient norm: 8.646321176963655e-08
epoch 35, gradient norm: 6.697695175691572e-09
epoch 40, gradient norm: 5.200074748417337e-10
epoch 45, gradient norm: 4.0409307861997594e-11
epoch 50, gradient norm: 3.141337884549996e-12
epoch 55, gradient norm: 2.442281046846603e-13
epoch 60, gradient norm: 1.8982050785381036e-14
epoch 65, gradient norm: 1.4764571643001524e-15
epoch 70, gradient norm: 1.1898062939210272e-16
epoch 75, gradient norm: 3.7328607419946856e-17
epoch 80, gradient norm: 3.3020998884574836e-17
epoch 85, gradient norm: 2.6965904959547528e-17
epoch 90, gradient norm: 3.140439202105453e-17
epoch 95, gradient norm: 3.3035943912007704e-17
epoch 100, gradient norm: 3.824591370811746e-17
epoch 10

In [60]:
# PANDA(64)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# This run uses lr=0.2, whereas the 4-, 8- and 16-worker runs use lr=0.5.
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against parallel_nt's signature.
params_nt_64, gradients_nt_64, time_nt_64 = parallel_nt(params, A, b, c, 0.0001, 0.0001, 1.0, n_workers=64, epochs=200, lr=0.2)

epoch 0, gradient norm: 0.5935813733684949
epoch 5, gradient norm: 0.07081152870850359
epoch 10, gradient norm: 0.011875206822518079
epoch 15, gradient norm: 0.0023554467494712183
epoch 20, gradient norm: 0.00048675310460731644
epoch 25, gradient norm: 0.00010568594712874284
epoch 30, gradient norm: 2.4152922292259893e-05
epoch 35, gradient norm: 5.764127071344272e-06
epoch 40, gradient norm: 1.4273221387000957e-06
epoch 45, gradient norm: 3.6558678057043556e-07
epoch 50, gradient norm: 9.677654931429782e-08
epoch 55, gradient norm: 2.647211804502572e-08
epoch 60, gradient norm: 7.47609566244863e-09
epoch 65, gradient norm: 2.1748400230878437e-09
epoch 70, gradient norm: 6.492790146120809e-10
epoch 75, gradient norm: 1.980299828803943e-10
epoch 80, gradient norm: 6.142698840864675e-11
epoch 85, gradient norm: 1.930077259179825e-11
epoch 90, gradient norm: 6.12300534481156e-12
epoch 95, gradient norm: 1.956318590796504e-12
epoch 100, gradient norm: 6.283363489720532e-13
epoch 105, gradi

In [66]:
# PANDA(128)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# This run uses lr=0.1, whereas the 4-, 8- and 16-worker runs use lr=0.5.
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against parallel_nt's signature.
params_nt_128, gradients_nt_128, time_nt_128 = parallel_nt(params, A, b, c, 0.0001, 0.0001, 1.0, n_workers=128, epochs=200, lr=0.1)

epoch 0, gradient norm: 0.5935888017611278
epoch 5, gradient norm: 0.14661738798428223
epoch 10, gradient norm: 0.0784335108554047
epoch 15, gradient norm: 0.043572958222726166
epoch 20, gradient norm: 0.02493680458817493
epoch 25, gradient norm: 0.014512303878888145
epoch 30, gradient norm: 0.008513772030894028
epoch 35, gradient norm: 0.005012553726086108
epoch 40, gradient norm: 0.0029558087743950267
epoch 45, gradient norm: 0.0017442074299987916
epoch 50, gradient norm: 0.0010295824600758817
epoch 55, gradient norm: 0.0006078443389621999
epoch 60, gradient norm: 0.00035888730089170327
epoch 65, gradient norm: 0.0002119054187691513
epoch 70, gradient norm: 0.00012512272452677424
epoch 75, gradient norm: 7.388159634441808e-05
epoch 80, gradient norm: 4.362545917905867e-05
epoch 85, gradient norm: 2.5760015305948207e-05
epoch 90, gradient norm: 1.5210861651078274e-05
epoch 95, gradient norm: 8.981784492805382e-06
epoch 100, gradient norm: 5.303618154932739e-06
epoch 105, gradient norm

## Part VIII-III: ProxSkip-VI-FL Solution

In [None]:
# ProxSkip(8)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# The control vector is a separate zero array with the same shape as params.
control = np.zeros_like(params)
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against proxskip's signature.
params_ps_8, gradients_ps_8, time_ps_8 = proxskip(params, control, A, b, c, 0.0001, 0.0001, 1.0, n_workers=8, epochs=20000, lr=0.4, prob=0.4)

## Part VIII-IV: EG Solution

In [29]:
# EG(8)
# Zero initial point sized from A (columns of A plus the scalar y component)
# rather than a hard-coded 380, so the cell still works if the dataset changes.
params = np.zeros((A.shape[1] + 1, 1))
# NOTE(review): the three positional constants presumably map to
# (lamb, gamma, beta) — confirm against parallel_extra's signature.
params_extragrad_8, gradients_extragrad_8, time_extragrad_8 = parallel_extra(params, A, b, c, 0.0001, 0.0001, 1.0, n_workers=8, epochs=20000, lr=0.2)

epoch 0, gradient norm: 0.5935814763700317
epoch 20, gradient norm: 0.16624564659849075
epoch 40, gradient norm: 0.08399087430711
epoch 60, gradient norm: 0.05277240077864303
epoch 80, gradient norm: 0.03807473852742676
epoch 100, gradient norm: 0.03011385225293302
epoch 120, gradient norm: 0.02540702098834911
epoch 140, gradient norm: 0.022418521245778384
epoch 160, gradient norm: 0.02038535221513388
epoch 180, gradient norm: 0.018905362850468656
epoch 200, gradient norm: 0.01775944138150664
epoch 220, gradient norm: 0.016825202311269247
epoch 240, gradient norm: 0.016032592390025614
epoch 260, gradient norm: 0.015340373217965239
epoch 280, gradient norm: 0.014723426941642337
epoch 300, gradient norm: 0.014165802169600965
epoch 320, gradient norm: 0.01365685620216314
epoch 340, gradient norm: 0.013189091515663541
epoch 360, gradient norm: 0.012756930489801798
epoch 380, gradient norm: 0.012356014549112893
epoch 400, gradient norm: 0.011982798294219176
epoch 420, gradient norm: 0.01163

epoch 3440, gradient norm: 0.0022056856715266574
epoch 3460, gradient norm: 0.002193643044427405
epoch 3480, gradient norm: 0.0021817314094033893
epoch 3500, gradient norm: 0.0021699486805751493
epoch 3520, gradient norm: 0.0021582928138683902
epoch 3540, gradient norm: 0.002146761805984574
epoch 3560, gradient norm: 0.002135353693402821
epoch 3580, gradient norm: 0.002124066551411978
epoch 3600, gradient norm: 0.0021128984931717363
epoch 3620, gradient norm: 0.002101847668801748
epoch 3640, gradient norm: 0.0020909122644977166
epoch 3660, gradient norm: 0.0020800905016735032
epoch 3680, gradient norm: 0.0020693806361283046
epoch 3700, gradient norm: 0.002058780957238021
epoch 3720, gradient norm: 0.0020482897871699525
epoch 3740, gradient norm: 0.0020379054801200128
epoch 3760, gradient norm: 0.0020276264215716637
epoch 3780, gradient norm: 0.002017451027575833
epoch 3800, gradient norm: 0.0020073777440510793
epoch 3820, gradient norm: 0.001997405046103314
epoch 3840, gradient norm: 0

epoch 6820, gradient norm: 0.0011395117348709754
epoch 6840, gradient norm: 0.0011361567423733077
epoch 6860, gradient norm: 0.001132819157111276
epoch 6880, gradient norm: 0.001129498829554827
epoch 6900, gradient norm: 0.0011261956119945084
epoch 6920, gradient norm: 0.00112290935851429
epoch 6940, gradient norm: 0.0011196399249648397
epoch 6960, gradient norm: 0.001116387168937253
epoch 6980, gradient norm: 0.0011131509497372186
epoch 7000, gradient norm: 0.0011099311283596207
epoch 7020, gradient norm: 0.0011067275674635617
epoch 7040, gradient norm: 0.0011035401313478047
epoch 7060, gradient norm: 0.0011003686859266258
epoch 7080, gradient norm: 0.0010972130987060656
epoch 7100, gradient norm: 0.0010940732387605776
epoch 7120, gradient norm: 0.0010909489767100642
epoch 7140, gradient norm: 0.0010878401846972933
epoch 7160, gradient norm: 0.0010847467363656839
epoch 7180, gradient norm: 0.0010816685068374679
epoch 7200, gradient norm: 0.001078605372692203
epoch 7220, gradient norm:

epoch 10180, gradient norm: 0.0007428517694380877
epoch 10200, gradient norm: 0.0007411839793005988
epoch 10220, gradient norm: 0.0007395219034318906
epoch 10240, gradient norm: 0.0007378655124016312
epoch 10260, gradient norm: 0.000736214776999104
epoch 10280, gradient norm: 0.0007345696682309815
epoch 10300, gradient norm: 0.000732930157319135
epoch 10320, gradient norm: 0.0007312962156984657
epoch 10340, gradient norm: 0.0007296678150147658
epoch 10360, gradient norm: 0.000728044927122606
epoch 10380, gradient norm: 0.0007264275240832516
epoch 10400, gradient norm: 0.0007248155781626003
epoch 10420, gradient norm: 0.0007232090618291487
epoch 10440, gradient norm: 0.0007216079477519857
epoch 10460, gradient norm: 0.0007200122087988048
epoch 10480, gradient norm: 0.0007184218180339496
epoch 10500, gradient norm: 0.0007168367487164754
epoch 10520, gradient norm: 0.0007152569742982399
epoch 10540, gradient norm: 0.0007136824684220161
epoch 10560, gradient norm: 0.0007121132049196268
epo

epoch 13480, gradient norm: 0.0005282905713517587
epoch 13500, gradient norm: 0.0005272819828244632
epoch 13520, gradient norm: 0.0005262761527218924
epoch 13540, gradient norm: 0.0005252730704796101
epoch 13560, gradient norm: 0.0005242727255881539
epoch 13580, gradient norm: 0.0005232751075926564
epoch 13600, gradient norm: 0.0005222802060924664
epoch 13620, gradient norm: 0.0005212880107407749
epoch 13640, gradient norm: 0.0005202985112442447
epoch 13660, gradient norm: 0.0005193116973626438
epoch 13680, gradient norm: 0.0005183275589084786
epoch 13700, gradient norm: 0.0005173460857466344
epoch 13720, gradient norm: 0.0005163672677940187
epoch 13740, gradient norm: 0.0005153910950192038
epoch 13760, gradient norm: 0.0005144175574420761
epoch 13780, gradient norm: 0.0005134466451334884
epoch 13800, gradient norm: 0.0005124783482149131
epoch 13820, gradient norm: 0.0005115126568580998
epoch 13840, gradient norm: 0.0005105495612847369
epoch 13860, gradient norm: 0.0005095890517661121


epoch 16740, gradient norm: 0.00039410399182432176
epoch 16760, gradient norm: 0.0003934360718159926
epoch 16780, gradient norm: 0.0003927697152773742
epoch 16800, gradient norm: 0.0003921049172351688
epoch 16820, gradient norm: 0.0003914416727366006
epoch 16840, gradient norm: 0.0003907799768493078
epoch 16860, gradient norm: 0.0003901198246612337
epoch 16880, gradient norm: 0.00038946121128052093
epoch 16900, gradient norm: 0.0003888041318354053
epoch 16920, gradient norm: 0.00038814858147410904
epoch 16940, gradient norm: 0.0003874945553647388
epoch 16960, gradient norm: 0.00038684204869517797
epoch 16980, gradient norm: 0.00038619105667298697
epoch 17000, gradient norm: 0.0003855415745252979
epoch 17020, gradient norm: 0.00038489359749871395
epoch 17040, gradient norm: 0.0003842471208592074
epoch 17060, gradient norm: 0.00038360213989202
epoch 17080, gradient norm: 0.00038295864990156104
epoch 17100, gradient norm: 0.00038231664621131054
epoch 17120, gradient norm: 0.00038167612416

epoch 20000, gradient norm: 0.0003028502092616674
