In [1]:
import pickle, os, time
from itertools import product

import numpy as np
import matplotlib.pyplot as plt

from nda import log
from nda.problems import *
from nda.optimizers import *
from nda.optimizers.utils import generate_mixing_matrix
from nda.experiment_utils import run_exp

from BEER import BEER
#from MoTEF import BEER_Momentum

[32mINFO 10:55:01.3350 353135 utils.py:148] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[32mINFO 10:55:01.3361 353135 utils.py:160] NumExpr defaulting to 8 threads.[0m


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def get_bits_per_round_per_agent(config, dim):
    if 'compressor_type' in config:
        if config['compressor_type'] == 'random':
            return config['compressor_param'] * 64
        elif config['compressor_type'] == 'top':
            return config['compressor_param'] * 64
        elif config['compressor_type'] == 'gsgd':
            return config['compressor_param'] * dim
    return dim * 64

In [4]:
def plot_exp(results, name, logx=False, logy=False, figsize=None, dpi=None, save=False, plot_norm=False, plot_bits=True, legends=None):

    max_bits = min([_[1].bits.iloc[-1] for _ in results]) * 1.1
    max_iters = min([_[1].t.iloc[-1] for _ in results])

    if plot_bits:
        fig, axs = plt.subplots(1, 4, figsize=figsize, dpi=dpi)
    else:
        fig, axs = plt.subplots(1, 2, figsize=figsize, dpi=dpi)

    line_styles = [color + style for style in ['-', '--', ':'] for color in ['k', 'r', 'g', 'b', 'c', 'm', 'y']]

    for i in range(len(results)):

        data = results[i][1]
        style = line_styles[i]

        def _plot_semilog(index, x, y, n, xlabel='', ylabel=''):
            ax = axs[index]
            mask = data[x].values <= n
            ax.semilogy(
                data[x].values[mask],
                data[y].values[mask],
                style
            )
            ax.set(xlabel=xlabel, ylabel=ylabel)
            if logy:
                ax.set_yscale('log')
            if logx:
                ax.set_xscale('log')

        def _plot(index, x, y, n, xlabel='', ylabel=''):
            ax = axs[index]
            mask = data[x].values <= n
            ax.semilogy(
                data[x].values[mask],
                data[y].values[mask],
                style
            )
            ax.set(xlabel=xlabel, ylabel=ylabel)
            if logy:
                ax.set_yscale('log')
            if logx:
                ax.set_xscale('log')

        if not plot_norm:
            _plot_semilog(0, 't', 'f', max_iters, xlabel='Iterations', ylabel='Training loss')
            if plot_bits:
                _plot_semilog(2, 'bits', 'f', max_bits, xlabel='Bits communicated', ylabel='Training loss')
        else:
            _plot_semilog(0, 't', 'grad_norm', max_iters, xlabel='Iterations', ylabel='Training gradient norm')
            if plot_bits:
                _plot_semilog(2, 'bits', 'grad_norm', max_bits, xlabel='Bits communicated', ylabel='Training gradient norm')

        _plot(1, 't', 'test_accuracy', max_iters, xlabel='Iterations', ylabel='Testing accuracy')
        if plot_bits:
            _plot(3, 'bits', 'test_accuracy', max_bits, xlabel='Bits communicated', ylabel='Testing accuracy')

    if legends is None:
        plt.legend([_[0] for _ in results])
    else:
        plt.legend(legends)

    return fig

In [5]:
import os
os.makedirs("data", exist_ok=True)  # Creates "data" directory if it doesn't exist

In [6]:
def save_exp(results, configs, name, **kwargs):

    for res, config in zip(results, configs):
        data = res[1]
        data['t'] = data['t'].astype(int)
        data['n_grads'] = data['n_grads'].astype(int)
        data['bits'] = get_bits_per_round_per_agent(config, p.dim) * p.n_agent * data.comm_rounds

    kwargs['results'] = results
    with open(f"data/{name}.pkl", 'wb') as f:
        pickle.dump(kwargs, f)

# Nonconvex logistic regression on unshuffled a9a dataset

## Optimization algorithms

In [7]:
# np.random.seed(0)

In [8]:
# n_agent = 100
# dataset = 'a9a'
# graph_type = 'cycle'


# p = LogisticRegression(n_agent=n_agent, graph_type=graph_type, alpha=0.05, dataset=dataset, sort=True)

# m = p.m
# dim = p.dim

# x_0 = np.random.rand(dim, n_agent)
# W, alpha = generate_mixing_matrix(p)

### Fine-tuning of BEER

In [9]:
# n_iters = 6000
# batch_size = 5

In [10]:
# name = f'BEER_logistic_regression_nonconvex_{dataset}_unshuffled_algorithms_path'

In [11]:
# extra_metrics = ['test_accuracy', 'grad_norm']

# BEER_configs = []
# for eta in [0.001, 0.01, 0.05]:
#     for gamma in [0.1, 0.2, 0.5, 0.9]:
#             BEER_configs.append({'eta': eta, 'compressor_param': 5, 'gamma': gamma,'compressor_type': 'gsgd'})



# for _ in BEER_configs:
#     _['extra_metrics'] = extra_metrics

# configs = BEER_configs
# exps = [BEER(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, **config) for config in BEER_configs] 

# begin = time.time()
# res_BEER = run_exp(exps, max_iter=n_iters, name=name, n_gpus=4, plot=False)
# end = time.time()
# log.info('Total %.2fs', end - begin)


# results = save_exp(res_BEER, configs, name, p=p, x_0=x_0, W=W)

# 1-hidden-layer NN on MNIST or ResNET8

In [12]:
from nda.problems.resnet8 import ResNet8

In [13]:
np.random.seed(0)

In [14]:
experiment = 'nn'
graph_type = 'cycle'
n_agent = 10

p = NN(n_agent=n_agent, graph_type=graph_type, sort=True)
#p = ResNet8(n_agent=n_agent, graph_type=graph_type, sort=True)
m = p.m
dim = p.dim

x_0 = np.random.randn(dim, n_agent) / 10

W, alpha = generate_mixing_matrix(p)

print('alpha = ' + str(alpha))
x_0_mean = x_0.mean(axis=1)

[32mINFO 10:55:03.8957 353135 dataset.py:31] Loading MNIST dataset from cached file[0m


Initialization done
alpha = 0.8256645490249561


In [15]:
name = 'mnist_unshuffled__BEER_resnet8'

In [16]:
n_iters = 3
batch_size = 100
eta = 0.1

extra_metrics = ['test_accuracy', 'grad_norm']

# MoTEF_configs = [
#         {'eta': eta, 'compressor_param': 20, 'gamma': 6e-1, 'lmbd':0.005, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics},
# ]

BEER_configs = [
        {'eta': eta, 'compressor_param': 20, 'gamma': 6e-1, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics},
]

# CHOCO_SGD_configs = [
#         {'eta': eta, 'compressor_param': 20, 'gamma': 6e-1, 'compressor_type': 'gsgd', 'extra_metrics': extra_metrics},
# ]

# for _ in CHOCO_SGD_configs + BEER_configs + MoTEF_configs:
#     _['extra_metrics'] = extra_metrics

baseline_exps = [
        DSGD(p, eta=eta, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, early_stopping=False, extra_metrics=extra_metrics),
        D2(p, eta=eta, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, early_stopping=False, extra_metrics=extra_metrics),
    ]


#configs = MoTEF_configs + BEER_configs + CHOCO_SGD_configs + len(baseline_exps) * [{}]
configs = BEER_configs
# exps = [MoTEF(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, early_stopping=False, **config) for config in MoTEF_configs] \
#         + [BEER(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, early_stopping=False, **config) for config in BEER_configs] \
#         + [CHOCO_SGD(p, n_iters=int(n_iters * 2), batch_size=batch_size, x_0=x_0, W=W, early_stopping=False, **config) for config in CHOCO_SGD_configs] \
#         + baseline_exps

exps = [BEER(p, n_iters=n_iters, batch_size=batch_size, x_0=x_0, W=W, early_stopping=False, **config) for config in BEER_configs]

# Adding debug statements
print("Starting experiment setup")

begin = time.time()
try:
    res = run_exp(exps, max_iter=n_iters, name=name, n_gpus=1, plot=False, save=True)
except Exception as e:
    print(f"Error running experiments: {e}")
    res = []

end = time.time()
print('Total %.2fs' % (end - begin))

if res:
    save_exp(res, configs, name, p=p, x_0=x_0, W=W)
else:
    print("No results to save.")

[32mINFO 10:55:06.3672 353135 BEER.py:21] gamma = 0.600[0m


Starting experiment setup
Launching task 0 on device 0


[32mINFO 10:55:13.6708 353243 utils.py:148] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[32mINFO 10:55:13.6710 353243 utils.py:160] NumExpr defaulting to 8 threads.[0m
[32mINFO 10:55:16.2579 353243 utils.py:22] BEER started[0m


BEER started
grad_h called with w shape: (421642, 10), i: None, j: None
wdim calling:2
Forward-backward pass called with X shape: (6000, 785), Y shape: (6000, 10), w shape: (421642,)
Original shape of X: (6000, 785)
Reshaped shape of X: torch.Size([6000, 1, 28, 28])
Gradients computed with shape: torch.Size([421642])
Forward-backward pass called with X shape: (6000, 785), Y shape: (6000, 10), w shape: (421642,)
Original shape of X: (6000, 785)
Reshaped shape of X: torch.Size([6000, 1, 28, 28])
Gradients computed with shape: torch.Size([421642])
Forward-backward pass called with X shape: (6000, 785), Y shape: (6000, 10), w shape: (421642,)
Original shape of X: (6000, 785)
Reshaped shape of X: torch.Size([6000, 1, 28, 28])
Gradients computed with shape: torch.Size([421642])
Forward-backward pass called with X shape: (6000, 785), Y shape: (6000, 10), w shape: (421642,)
Original shape of X: (6000, 785)
Reshaped shape of X: torch.Size([6000, 1, 28, 28])
Gradients computed with shape: torch.

In [17]:
res

[]

In [18]:
 _ = plot_exp(res, name, plot_norm=True, figsize=(16, 4), dpi=200)

ValueError: min() iterable argument is empty