In [None]:
import pickle, os, time
from itertools import product

import numpy as np
import matplotlib.pyplot as plt

from nda import log
from nda.problems import *
from nda.optimizers import *
from nda.optimizers.utils import generate_mixing_matrix
from nda.experiment_utils import run_exp

from BEER import BEER
from MoTEF import MoTEF

In [None]:
def get_bits_per_round_per_agent(config, dim):
    """Return the bit count charged to one agent for one communication round.

    Parameters
    ----------
    config : dict
        Optimizer configuration. Only the optional keys ``'compressor_type'``
        and ``'compressor_param'`` are read.
    dim : int
        Second factor of the cost formula (the notebook passes ``p.dim``).

    Returns
    -------
    number
        * ``'random'`` or ``'top'``: ``compressor_param * 64``
        * ``'gsgd'``: ``compressor_param * dim``
        * any other compressor name, or no compressor: ``dim * 64``
    """
    if 'compressor_type' not in config:
        # No compressor configured: dim * 64.
        return dim * 64

    kind = config['compressor_type']
    if kind in ('random', 'top'):
        return config['compressor_param'] * 64
    if kind == 'gsgd':
        return config['compressor_param'] * dim

    # Unrecognised compressor names are charged like an uncompressed message.
    return dim * 64

In [None]:
def plot_exp(results, name, logx=False, logy=False, figsize=None, dpi=None, save=False, plot_norm=False, plot_bits=True, legends=None):
    """Draw convergence curves and test accuracy for a set of runs.

    A single row of panels is created: training metric vs. iterations, test
    accuracy vs. iterations and, when ``plot_bits`` is true, the same two
    quantities vs. communicated bits. The training metric is always drawn
    with a logarithmic y-axis; test accuracy is drawn on a linear y-axis
    unless ``logy`` is set.

    Parameters
    ----------
    results : sequence
        Each entry is indexed as ``entry[0]`` (legend label) and ``entry[1]``
        (a table with columns ``t``, ``test_accuracy``, ``f`` or
        ``grad_norm``, and ``bits`` when ``plot_bits`` is true).
    name : str
        Not used by this function; kept so existing calls keep working.
    logx, logy : bool
        Switch the x / y axis of every panel to log scale.
    figsize, dpi
        Forwarded to ``plt.subplots``.
    save : bool
        Not used by this function; kept so existing calls keep working.
    plot_norm : bool
        Plot column ``grad_norm`` instead of ``f`` as the training metric.
    plot_bits : bool
        Add the two "bits communicated" panels (4 panels instead of 2).
    legends : sequence of str, optional
        Legend labels; defaults to ``entry[0]`` of every result.

    Returns
    -------
    The figure object returned by ``plt.subplots``.

    Raises
    ------
    ValueError
        If ``results`` is empty.
    """
    if len(results) == 0:
        raise ValueError('plot_exp needs at least one result to plot')

    # Every curve is cut at the shortest run so all methods share one x-range.
    max_iters = min(entry[1].t.iloc[-1] for entry in results)
    # The `bits` column is only required when the bits panels are requested.
    max_bits = min(entry[1].bits.iloc[-1] for entry in results) * 1.1 if plot_bits else None

    fig, axs = plt.subplots(1, 4 if plot_bits else 2, figsize=figsize, dpi=dpi)

    line_styles = [color + style for style in ['-', '--', ':'] for color in ['k', 'r', 'g', 'b', 'c', 'm', 'y']]

    def _draw(index, data, style, x, y, limit, xlabel, ylabel, semilog):
        """Draw column `y` against column `x` on panel `index`, keeping rows with x <= limit."""
        ax = axs[index]
        mask = data[x].values <= limit
        draw = ax.semilogy if semilog else ax.plot
        draw(data[x].values[mask], data[y].values[mask], style)
        ax.set(xlabel=xlabel, ylabel=ylabel)
        if logy:
            ax.set_yscale('log')
        if logx:
            ax.set_xscale('log')

    if plot_norm:
        metric, metric_label = 'grad_norm', 'Training gradient norm'
    else:
        metric, metric_label = 'f', 'Training loss'

    for i, entry in enumerate(results):
        data = entry[1]
        # Cycle through the styles so more runs than styles do not raise IndexError.
        style = line_styles[i % len(line_styles)]

        _draw(0, data, style, 't', metric, max_iters, 'Iterations', metric_label, True)
        # Accuracy goes on a linear axis; pass logy=True to get a log axis instead.
        _draw(1, data, style, 't', 'test_accuracy', max_iters, 'Iterations', 'Testing accuracy', False)
        if plot_bits:
            _draw(2, data, style, 'bits', metric, max_bits, 'Bits communicated', metric_label, True)
            _draw(3, data, style, 'bits', 'test_accuracy', max_bits, 'Bits communicated', 'Testing accuracy', False)

    if legends is None:
        plt.legend([entry[0] for entry in results])
    else:
        plt.legend(legends)

    return fig

In [None]:
def save_exp(results, configs, name, **kwargs):
    """Annotate experiment results with communication cost and pickle them.

    For every ``(result, config)`` pair the table ``result[1]`` is modified in
    place: columns ``t`` and ``n_grads`` are cast to ``int`` and a ``bits``
    column is added, computed as
    ``get_bits_per_round_per_agent(config, p.dim) * p.n_agent * comm_rounds``.
    The keyword arguments plus a ``'results'`` entry are then pickled to
    ``data/{name}.pkl``.

    Parameters
    ----------
    results : sequence
        Entries whose element ``[1]`` is a table with columns ``t``,
        ``n_grads`` and ``comm_rounds``.
    configs : sequence of dict
        One configuration per result, in the same order.
    name : str
        Base name of the pickle file written under ``data/``.
    **kwargs
        Extra objects stored in the pickle. If ``p`` is given it supplies
        ``dim`` and ``n_agent``; otherwise the notebook-level ``p`` is used.

    Returns
    -------
    The ``results`` argument (with the tables updated in place).

    Raises
    ------
    ValueError
        If ``results`` and ``configs`` differ in length.
    """
    if len(results) != len(configs):
        raise ValueError(
            f'save_exp got {len(results)} results but {len(configs)} configs'
        )

    # Prefer the problem handed in explicitly; fall back to the notebook-level
    # `p` so calls that do not pass it keep working.
    problem = kwargs['p'] if 'p' in kwargs else p

    for res, config in zip(results, configs):
        data = res[1]
        data['t'] = data['t'].astype(int)
        data['n_grads'] = data['n_grads'].astype(int)
        data['bits'] = get_bits_per_round_per_agent(config, problem.dim) * problem.n_agent * data.comm_rounds

    kwargs['results'] = results

    path = f"data/{name}.pkl"
    # Create the output directory on first use instead of failing in open().
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(kwargs, f)

    return results

# Nonconvex logistic regression on unshuffled a9a dataset

## Optimization algorithms

In [None]:
# Seed NumPy's global RNG so the np.random draws in the following cells are repeatable.
np.random.seed(0)

In [None]:
# Problem setup for the a9a experiments: 100 agents on a cycle graph.
n_agent = 100
dataset = 'a9a'
graph_type = 'cycle'


# NOTE(review): sort=True presumably produces the "unshuffled" split named in the
# section title — confirm against nda.problems.
p = LogisticRegression(n_agent=n_agent, graph_type=graph_type, alpha=0.05, dataset=dataset, sort=True)

m = p.m
dim = p.dim

# Starting point shared by all optimizers: a (dim, n_agent) array of uniform [0, 1) draws.
x_0 = np.random.rand(dim, n_agent)
# Mixing matrix for p; the second return value is kept as `alpha`.
W, alpha = generate_mixing_matrix(p)

### Fine-tuning of MoTEF

In [None]:
# Label for the MoTEF tuning run: passed to run_exp and used by save_exp as the pickle file name.
name = f'MoTEF_logistic_regression_nonconvex_{dataset}_unshuffled_algorithms_{graph_type}'

In [None]:
n_iters = 6000
batch_size = 5

extra_metrics = ['test_accuracy', 'grad_norm']

# Full grid over eta x gamma x lmbd (3 * 4 * 4 = 48 configurations), in the
# same order as nested loops with eta outermost and lmbd innermost.
MoTEF_configs = [
    {
        'eta': eta,
        'compressor_param': 5,
        'gamma': gamma,
        'lmbd': lmbd,
        'compressor_type': 'gsgd',
        'extra_metrics': extra_metrics,
    }
    for eta, gamma, lmbd in product(
        [0.001, 0.01, 0.05],
        [0.1, 0.2, 0.5, 0.9],
        [0.005, 0.01, 0.05, 0.1],
    )
]

configs = MoTEF_configs
# One optimizer per configuration, all starting from the same x_0 and mixing matrix.
exps = [MoTEF(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, **config) for config in MoTEF_configs]

begin = time.time()
res_MoTEF = run_exp(exps, max_iter=n_iters, name=name, n_gpus=4, plot=False)
end = time.time()
log.info('Total %.2fs', end - begin)

# Adds the `bits` column to every result table and pickles the run under data/{name}.pkl.
results = save_exp(res_MoTEF, configs, name, p=p, x_0=x_0, W=W)

### Fine-tuning of BEER

In [None]:
# Label for the BEER tuning run (passed to run_exp and save_exp in the next cell).
# NOTE(review): the suffix is hard-coded to 'path' although graph_type is 'cycle' — confirm the intended file name.
name = f'BEER_logistic_regression_nonconvex_{dataset}_unshuffled_algorithms_path'

In [None]:
extra_metrics = ['test_accuracy', 'grad_norm']

# Grid over eta x gamma (3 * 4 = 12 configurations), eta varying slowest.
BEER_configs = [
    {
        'eta': eta,
        'compressor_param': 5,
        'gamma': gamma,
        'compressor_type': 'gsgd',
        'extra_metrics': extra_metrics,
    }
    for eta in [0.001, 0.01, 0.05]
    for gamma in [0.1, 0.2, 0.5, 0.9]
]

configs = BEER_configs
# n_iters and batch_size are the values set in the MoTEF tuning cell.
exps = [
    BEER(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, **config)
    for config in BEER_configs
]

begin = time.time()
res_BEER = run_exp(exps, max_iter=n_iters, name=name, n_gpus=4, plot=False)
end = time.time()
log.info('Total %.2fs', end - begin)

# Adds the `bits` column to every result table and pickles the run under data/{name}.pkl.
results = save_exp(res_BEER, configs, name, p=p, x_0=x_0, W=W)

### MoTEF against other methods

In [None]:
# Give the comparison its own label before anything runs. Without this, `name`
# still holds the BEER tuning label, and save_exp would overwrite that run's
# pickle with the comparison results.
name = f'MoTEF_BEER_CHOCO_DCGD_logistic_regression_nonconvex_{dataset}_unshuffled_algorithms_path'

n_agent = 100
n_iters = 6000
batch_size = 5

# Hyper-parameters shared by the three compressed methods.
eta = 0.1
gamma = 0.9
lmbd = 0.001

extra_metrics = ['test_accuracy', 'grad_norm']

MoTEF_configs = [
    {'compressor_param': 5, 'eta': eta, 'gamma': gamma, 'lmbd': lmbd, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics},
]

BEER_configs = [
    {'compressor_param': 5, 'eta': eta, 'gamma': gamma, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics},
]

CHOCO_SGD_configs = [
    {'eta': eta, 'compressor_param': 5, 'gamma': gamma, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics},
]

# Uncompressed baselines; they use their own step size (0.01).
baseline_exps = [
    DSGD(p, eta=0.01, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, extra_metrics=extra_metrics),
    D2(p, eta=0.01, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, extra_metrics=extra_metrics),
]

# `configs` must list one entry per experiment in the same order as `exps`,
# because save_exp pairs them up positionally. Baselines get an empty config.
configs = BEER_configs + MoTEF_configs + CHOCO_SGD_configs + len(baseline_exps) * [{}]
exps = (
    [BEER(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, **config) for config in BEER_configs]
    + [MoTEF(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, **config) for config in MoTEF_configs]
    # CHOCO_SGD is constructed with twice the iteration budget of the other methods.
    + [CHOCO_SGD(p, n_iters=int(n_iters * 2), batch_size=batch_size, x_0=x_0, W=W, **config) for config in CHOCO_SGD_configs]
    + baseline_exps
)

begin = time.time()
res = run_exp(exps, max_iter=n_iters, name=name, n_gpus=4, plot=False)
end = time.time()
log.info('Total %.2fs', end - begin)

# Adds the `bits` column to every result table and pickles the run under data/{name}.pkl.
results = save_exp(res, configs, name, p=p, x_0=x_0, W=W)

In [None]:
# Label for the method-comparison experiment.
# NOTE(review): 'DCGD' and 'path' do not match the baselines (DSGD, D2) and the
# graph_type ('cycle') used in this notebook — confirm the intended file name.
name = f'MoTEF_BEER_CHOCO_DCGD_logistic_regression_nonconvex_{dataset}_unshuffled_algorithms_path'

In [None]:
# Plot gradient norm and test accuracy for the comparison run (plot_norm=True; bits panels on by default).
_ = plot_exp(res, name, plot_norm=True, figsize=(16, 4))

## Network topologies

In [None]:
# Re-seed NumPy's global RNG so the topology experiments start from a repeatable state.
np.random.seed(0)

In [None]:
# Smaller network for the topology study: 40 agents. `graph_type` and `dataset`
# are the values set in the earlier setup cell.
n_agent = 40
p = LogisticRegression(n_agent=n_agent, graph_type=graph_type, alpha=0.05, dataset=dataset, sort=True)

m = p.m
dim = p.dim

# New starting point sized for the 40-agent problem: uniform [0, 1) draws, shape (dim, n_agent).
x_0 = np.random.rand(dim, n_agent)

log.info(f'n_agent = {n_agent}, m = {m}, dim = {dim}')

In [None]:
# NOTE(review): `name` is assigned again in the topology experiment cell before
# run_exp is called, so this value is never used as written — confirm whether it can be dropped.
name = f'logistic_regression_nonconvex_{dataset}_unshuffled_BEERM_topologies'

In [None]:
def _mixing_for_topology(label, *graph_args):
    """Regenerate p's graph from `graph_args`, log alpha under `label`, return (W, alpha)."""
    p.generate_graph(*graph_args)
    mixing, mixing_alpha = generate_mixing_matrix(p)
    log.info(f'{label} = {mixing_alpha}')
    return mixing, mixing_alpha


# The graphs are built in a fixed order: ring, star, grid, then the two E-R graphs.
W_ring, alpha_ring = _mixing_for_topology('Ring graph alpha', 'cycle')
W_star, alpha_star = _mixing_for_topology('Star graph alpha', 'star')
# The 8 x 5 grid matches n_agent = 40.
W_grid, alpha_grid = _mixing_for_topology('Grid graph alpha', 'grid', (8, 5))
W_er_1, alpha_er_1 = _mixing_for_topology('E-R graph alpha 1', 'er', 0.2)
W_er_2, alpha_er_2 = _mixing_for_topology('E-R graph alpha 2', 'er', 0.5)

In [None]:
n_iters = 6000
batch_size = 100

extra_metrics = ['test_accuracy', 'grad_norm']

# Identical hyper-parameters on every topology; only the mixing matrix W changes.
# Order: ring, star, grid, E-R (0.2), E-R (0.5).
configs = [
    {'eta': 0.05, 'compressor_param': 5, 'gamma': 0.5, 'lmbd': 0.01, 'compressor_type': 'gsgd', 'W': W_topology}
    for W_topology in (W_ring, W_star, W_grid, W_er_1, W_er_2)
]

# NOTE(review): this cell previously instantiated `BEER_Momentum`, a name that is
# not among the notebook's explicit imports. `MoTEF` (imported from MoTEF.py) is
# used instead, matching the experiment name below — confirm this is the intended optimizer.
exps = [
    MoTEF(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, early_stopping=False, extra_metrics=extra_metrics, **config)
    for config in configs
]

name = f'logistic_regression_nonconvex_{dataset}_unshuffled_MoTEF_topologies'

begin = time.time()
res_BEERM_topology = run_exp(exps, max_iter=n_iters, name=name, n_gpus=4, plot=False)
end = time.time()
log.info('Total %.2fs', end - begin)

# Adds the `bits` column to every result table and pickles the run under data/{name}.pkl.
save_exp(res_BEERM_topology, configs, name, p=p, x_0=x_0, W_er_1=W_er_1, W_er_2=W_er_2)

In [None]:
# Plot gradient norm and test accuracy for the five topologies (plot_norm=True; bits panels on by default).
_ = plot_exp(res_BEERM_topology, name, plot_norm=True, figsize=(16, 4))

# 1-hidden-layer NN on MNIST

In [None]:
# Re-seed NumPy's global RNG so the neural-network experiments start from a repeatable state.
np.random.seed(0)

In [None]:
# Setup for the neural-network experiment: 10 agents on a cycle graph, n_hidden=32.
experiment = 'nn'
graph_type = 'cycle'
n_agent = 10

p = NN(n_agent=n_agent, graph_type=graph_type, n_hidden=32, sort=True)
m = p.m
dim = p.dim

# Starting point: standard-normal draws scaled by 1/10, shape (dim, n_agent).
x_0 = np.random.randn(dim, n_agent) / 10

# Mixing matrix for p; the second return value is kept as `alpha` and logged.
W, alpha = generate_mixing_matrix(p)

log.info('alpha = ' + str(alpha))
# Mean of the starting point along axis 1 (averaged over the n_agent columns).
x_0_mean = x_0.mean(axis=1)

In [None]:
# Label for the neural-network comparison: passed to run_exp and used by save_exp as the pickle file name.
name = 'mnist_unshuffled_MoTEF_BEER_CHOCO_SGD_32_hidden'

In [None]:
n_iters = 2000
batch_size = 100
eta = 0.1

extra_metrics = ['test_accuracy', 'grad_norm']

# Settings common to BEER and CHOCO_SGD; MoTEF additionally takes `lmbd`.
compressed_settings = {'eta': eta, 'compressor_param': 20, 'gamma': 6e-1, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics}

MoTEF_configs = [
    {'eta': eta, 'compressor_param': 20, 'gamma': 6e-1, 'lmbd': 0.005, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics},
]
BEER_configs = [dict(compressed_settings)]
CHOCO_SGD_configs = [dict(compressed_settings)]

# Keyword arguments every optimizer in this cell receives.
shared_kwargs = dict(n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, early_stopping=False)
# CHOCO_SGD is constructed with twice the iteration budget of the other methods.
choco_kwargs = dict(shared_kwargs, n_iters=int(n_iters * 2))

baseline_exps = [
    DSGD(p, eta=eta, extra_metrics=extra_metrics, **shared_kwargs),
    D2(p, eta=eta, extra_metrics=extra_metrics, **shared_kwargs),
]

# `configs` lists one entry per experiment in the same order as `exps`;
# save_exp pairs them up positionally. Baselines get an empty config.
configs = MoTEF_configs + BEER_configs + CHOCO_SGD_configs + len(baseline_exps) * [{}]

exps = []
exps += [MoTEF(p, **shared_kwargs, **config) for config in MoTEF_configs]
exps += [BEER(p, **shared_kwargs, **config) for config in BEER_configs]
exps += [CHOCO_SGD(p, **choco_kwargs, **config) for config in CHOCO_SGD_configs]
exps += baseline_exps

begin = time.time()
res = run_exp(exps, max_iter=n_iters, name=name, n_gpus=4, plot=False, save=True)
end = time.time()
log.info('Total %.2fs', end - begin)

# Adds the `bits` column to every result table and pickles the run under data/{name}.pkl.
save_exp(res, configs, name, p=p, x_0=x_0, W=W)

In [None]:
# Plot gradient norm and test accuracy for the neural-network comparison at dpi=200.
_ = plot_exp(res, name, plot_norm=True, figsize=(16, 4), dpi=200)