In [1]:
# Add below current directory to path
# Notebook cannot import any code without this line !!!!
import sys; sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import IntSlider, interact
from math import sqrt

from sklearn.preprocessing import scale

from tqdm import tqdm, trange # For progress bar
%matplotlib inline
%load_ext autoreload
%autoreload 2

from src.models.CostModel import LogisticModel, RMSEModel, build_several_cost_model

from src.machinery.GradientDescent import ArtemisDescent, FL_VanillaSGD
from src.machinery.GradientUpdateMethod import ArtemisUpdate
from src.machinery.Parameters import *
from src.machinery.PredefinedParameters import *

from src.utils.ErrorPlotter import *
from src.utils.Constants import *
from src.utils.DataClustering import *
from src.utils.DataPreparation import build_data_logistic, add_bias_term
from src.utils.Utilities import pickle_loader, pickle_saver
from src.utils.runner.RunnerUtilities import *
from src.utils.runner.ResultsOfSeveralDescents import ResultsOfSeveralDescents

filename = "rcv1"

nb_devices_for_the_run = 2

In [2]:
import numpy as np
import cyanure as cyan
import scipy.sparse
#load rcv1 dataset about 1Gb, n=781265, p=47152
data = np.load('/home/constantin/OneDrive/Documents/Etudes/Thèse/dataset/rcv1/rcv1.npz',allow_pickle=True); Y_data=data['y']; X_data=data['X']
X_data = scipy.sparse.csc_matrix(X_data.all()).T # n x p matrix, csr format
#normalize the rows of X in-place, without performing any copy
cyan.preprocess(X_data,normalize=True,columns=False)

In [3]:
%%time
#Transforming into torch.FloatTensor
X_merged = X_data[:5000]
Y_merged = torch.tensor(Y_data, dtype=torch.float64)[:5000]
number_of_items = X_merged.shape[0]
number_of_items_by_devices = number_of_items // nb_devices_for_the_run
print("Number of points by devices: ", number_of_items_by_devices)

X, Y = [], []
for i in tqdm(range(nb_devices_for_the_run)):
    X.append(scipy.sparse.csc_matrix(torch.tensor(
        X_merged[number_of_items_by_devices * i:number_of_items_by_devices * (i+1)].A, dtype=torch.float64
    )))
    Y_temp = Y_merged[number_of_items_by_devices * i:number_of_items_by_devices * (i+1)]
    Y.append(torch.stack([y[0] for y in Y_temp]))
print("There is {0} devices.".format(len(X)))

# Adding a columns of "1" to take into account a potential bias.
#X = add_bias_term(X)
dim_notebook = X[0].shape[1]
for x in X:
    print("Number of points on this device: {0}".format(x.shape))

  0%|          | 0/2 [00:00<?, ?it/s]

Number of points by devices:  2500


100%|██████████| 2/2 [00:02<00:00,  1.05s/it]

There is 2 devices.
Number of points on this device: (2500, 47152)
Number of points on this device: (2500, 47152)
CPU times: user 3.35 s, sys: 917 ms, total: 4.27 s
Wall time: 2.12 s





In [4]:
dim_notebook = X_data.shape[1]

In [5]:
# Checking that data is balanced over devices.
print("Diplaying the ratio between state 1 and state -1.")
print("If data is iid, the ratio should be close to 0.5")
for y in Y:
    print("ratio of state 1 on this device: {0}".format(abs((y == 1).sum().item() / abs(y).sum().item())))

Diplaying the ratio between state 1 and state -1.
If data is iid, the ratio should be close to 0.5
ratio of state 1 on this device: 0.4676
ratio of state 1 on this device: 0.468


In [6]:
# Creating cost models which will be used to computed cost/loss, gradients, L ...
cost_models = build_several_cost_model(LogisticModel, X, Y, nb_devices_for_the_run)

In [7]:
%%time
import gc
gc.collect()
obj_min_by_N = {}
obj_min_by_N_descent = FL_VanillaSGD(Parameters(n_dimensions = dim_notebook, 
                                                     nb_devices=nb_devices_for_the_run,
                                                     nb_epoch=500, 
                                                     quantization_param=0,
                                                     momentum = 0., 
                                                     verbose=True, 
                                                     cost_models=cost_models,
                                                     stochastic=False,
                                                     bidirectional=False
                                                    ))
#obj_min_by_N_descent.set_data(X,Y)
obj_min_by_N_descent.run(cost_models)
obj_min_by_N = obj_min_by_N_descent.losses[-1]
pickle_saver(obj_min_by_N, "{0}-iid-obj_min".format(filename))

   it    |   obj   
     100 | 6.9314e-01
     200 | 6.9314e-01
     300 | 6.9314e-01
     400 | 6.9314e-01
Lips time: 0.7055132389068604
Cost time: 0.25673699378967285
Grad time: 0
Lips time: 0.6803417205810547
Cost time: 0.2588202953338623
Grad time: 0
== Inside time 0.00010704994201660156
== Averaging time : 0.0005185604095458984
== Full time : 0.5685954093933105
=== Used memory : 1021.82912 Mbytes
Gradient Descent: execution time=0.569 seconds
Final loss : 0.69314

CPU times: user 1.25 s, sys: 6.76 ms, total: 1.25 s
Wall time: 639 ms


In [8]:
from src.utils.Utilities import check_memory_usage

check_memory_usage()



                object  number_of_objects    memory  mem_per_object
0                  str             133677  23375654      174.866686
11                dict              32879  14391976      437.725478
1                 type               5653   5142720      909.732885
18                code              35158   5093608      144.877638
82                list              28402   3342984      117.702415
4                tuple              34734   2636040       75.892209
45       numpy.ndarray                 78   1582267    20285.474359
229                set               1532    839520      547.989556
80             weakref               6960    612480       88.000000
78   getset_descriptor               6266    501280       80.000000
                                             object  number_of_objects  \
9299                matplotlib.colors._ColorMapping                  1   
17192    pytz.lazy.LazySet.__new__.<locals>.LazySet                  2   
45                            

In [9]:
def ratio_failure(index: int, guess: int = 1):
    x = X[index]
    y = Y[index]
    w = obj_min_by_N_descent.model_params[-1]
    mul =  torch.tensor(x.dot(w))
    inf_middle = 0
    failure = 0
    for i in range(len(mul)):
        if guess == 1:
            if torch.sigmoid(mul[i]) >= 0.5:
                if y[i] != guess:
                    failure +=1
                inf_middle +=1
        else:
            if torch.sigmoid(mul[i]) < 0.5:
                if y[i] != guess:
                    failure +=1
                inf_middle +=1
    print("Number of labels equal to " + str(guess) + ": ", inf_middle)
    print("Failures: {0}".format(failure))
    print("Percent of labels wrongly predicted to be state " + str(guess) +" for worker 0: " + 
          str(failure/inf_middle * 100) + "%")

index = 0
ratio_failure(index, 1)
ratio_failure(index, -1)

Number of labels equal to 1:  0
Failures: 0


ZeroDivisionError: division by zero

In [None]:
X[0]

In [12]:
%%time
import sys
import gc
all_descent = {}
for type_params in tqdm(KIND_COMPRESSION):
    multiple_sg_descent = multiple_run_descent(type_params, cost_models=cost_models, 
                                               use_averaging=False, streaming=True, 
                                               logs_file="{0}.txt".format(filename))
    all_descent[type_params.name()] = multiple_sg_descent
res = ResultsOfSeveralDescents(all_descent, nb_devices_for_the_run)
pickle_saver(res, "{0}-iid-descent".format(filename))


  0%|          | 0/5 [00:00<?, ?it/s]

SGD
Lips time: 0.7055132389068604
Cost time: 20.551859378814697
Grad time: 0
Lips time: 0.6803417205810547
Cost time: 20.422707319259644
Grad time: 0
== Inside time 1.448000192642212
== Averaging time : 0.0030617713928222656
== Full time : 4.411934852600098
=== Used memory : 2583.064576 Mbytes
* === * Time of one run : 4.466008186340332
* === * Size of workers : 104


 20%|██        | 1/5 [00:09<00:36,  9.10s/it]

Lips time: 0.7055132389068604
Cost time: 21.99114418029785
Grad time: 0
Lips time: 0.6803417205810547
Cost time: 21.737714529037476
Grad time: 0
== Inside time 1.534684658050537
== Averaging time : 0.0031299591064453125
== Full time : 4.527889251708984
=== Used memory : 4262.07232 Mbytes
* === * Time of one run : 4.561882257461548
* === * Size of workers : 104
QSGD
Lips time: 0.7055132389068604
Cost time: 23.750927448272705
Grad time: 1.2335772514343262
Lips time: 0.6803417205810547
Cost time: 23.299745082855225
Grad time: 1.2833600044250488
== Inside time 18.993475437164307
== Averaging time : 0.0041887760162353516
== Full time : 22.61982536315918
=== Used memory : 6084.399104 Mbytes
* === * Time of one run : 22.662686109542847
* === * Size of workers : 104


 40%|████      | 2/5 [00:56<01:02, 20.71s/it]

Lips time: 0.7055132389068604
Cost time: 25.66319513320923
Grad time: 2.6243550777435303
Lips time: 0.6803417205810547
Cost time: 25.032809495925903
Grad time: 2.711245059967041
== Inside time 21.000775575637817
== Averaging time : 0.0053861141204833984
== Full time : 24.998295545578003
=== Used memory : 7950.31552 Mbytes
* === * Time of one run : 25.047027826309204
* === * Size of workers : 104
Diana
Lips time: 0.7055132389068604
Cost time: 27.507596969604492
Grad time: 3.9552865028381348
Lips time: 0.6803417205810547
Cost time: 26.702539920806885
Grad time: 4.077139854431152
== Inside time 20.21388006210327
== Averaging time : 0.00456690788269043
== Full time : 24.05967402458191
=== Used memory : 9810.030592 Mbytes
* === * Time of one run : 24.104429244995117
* === * Size of workers : 104


 60%|██████    | 3/5 [01:46<00:58, 29.27s/it]

Lips time: 0.7055132389068604
Cost time: 29.420109510421753
Grad time: 5.325900554656982
Lips time: 0.6803417205810547
Cost time: 28.422613382339478
Grad time: 5.480698585510254
== Inside time 21.034231662750244
== Averaging time : 0.004900455474853516
== Full time : 25.006105184555054
=== Used memory : 11677.212672 Mbytes
* === * Time of one run : 25.048439502716064
* === * Size of workers : 104
BiQSGD
Lips time: 0.7055132389068604
Cost time: 31.279783725738525
Grad time: 6.663127660751343
Lips time: 0.6803417205810547
Cost time: 30.097740173339844
Grad time: 6.848909616470337
== Inside time 23.97263813018799
== Averaging time : 0.00421595573425293
== Full time : 27.842397451400757
=== Used memory : 13643.255808 Mbytes
* === * Time of one run : 27.89671754837036
* === * Size of workers : 104


 80%|████████  | 4/5 [02:41<00:37, 37.18s/it]

Lips time: 0.7055132389068604
Cost time: 33.13889408111572
Grad time: 7.971517562866211
Lips time: 0.6803417205810547
Cost time: 31.758519887924194
Grad time: 8.198947191238403
== Inside time 23.76045060157776
== Averaging time : 0.00450444221496582
== Full time : 27.606268167495728
=== Used memory : 15581.835264 Mbytes
* === * Time of one run : 27.65887713432312
* === * Size of workers : 104
Artemis
Lips time: 0.7055132389068604
Cost time: 34.961376428604126
Grad time: 9.250722646713257
Lips time: 0.6803417205810547
Cost time: 33.383704662323
Grad time: 9.523228406906128
== Inside time 23.67951989173889
== Averaging time : 0.0047070980072021484
== Full time : 27.450745105743408
=== Used memory : 17517.228032 Mbytes
* === * Time of one run : 27.503297805786133
* === * Size of workers : 104


100%|██████████| 5/5 [03:36<00:00, 43.21s/it]

Lips time: 0.7055132389068604
Cost time: 36.74556589126587
Grad time: 10.48262906074524
Lips time: 0.6803417205810547
Cost time: 34.96759748458862
Grad time: 10.803924083709717
== Inside time 22.968239545822144
== Averaging time : 0.004826784133911133
== Full time : 26.6485595703125
=== Used memory : 19455.660032 Mbytes
* === * Time of one run : 26.69069766998291
* === * Size of workers : 104
CPU times: user 14min 7s, sys: 13.4 s, total: 14min 21s
Wall time: 3min 36s





In [None]:
cost_models[0].X

In [None]:
%%time
import sys
import gc
all_descent = {}
X_number_of_bits = []
for type_params in tqdm(KIND_COMPRESSION):
    multiple_sg_descent = multiple_run_descent(type_params, cost_models=cost_models, 
                                               use_averaging=False, streaming=True, logs_file="rcv1.txt")
    all_descent[type_params.name()] = multiple_sg_descent
    del multiple_sg_descent
    gc.collect
res = ResultsOfSeveralDescents(all_descent, nb_devices_for_the_run)
pickle_saver(res, "{0}-iid-descent".format(filename))


In [None]:
from src.utils.Utilities import check_memory_usage

check_memory_usage()

### With Averaging

We don't used averaging as it makes the whole process much, much, much ..., much slower.

### Without Averaging

In [None]:
obj = pickle_loader(filename + "-iid-obj_min")
res = pickle_loader(filename + "-iid-descent")

plot_error_dist(res.get_loss(obj), res.names, res.nb_devices_for_the_run, dim_notebook,
                x_legend="Number of passes on data (iid)", all_error=res.get_std(obj)) 
plot_error_dist(res.get_loss(obj), res.names, res.nb_devices_for_the_run, dim_notebook, 
                x_points=res.X_number_of_bits, x_legend="Communicated bits (iid)", all_error=res.get_std(obj)) 