<a href="https://colab.research.google.com/github/montest/stochastic-methods-optimal-quantization/blob/pytorch_implentation_dim_1/Lloyd_with_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import sys
import time
import math
import torch
import itertools
import matplotlib

import numpy as np
import pandas as pd
import torch.nn.functional as F

from tqdm import trange

# Print numpy arrays and torch tensors in full (no summarization) on very wide lines,
# so that quantizers can be inspected or copied from the notebook output.
np.set_printoptions(threshold=np.inf, linewidth=10_000)
torch.set_printoptions(profile="full", linewidth=10_000)

### Optimized numpy implementation

In [15]:
def lloyd_method_dim_1(N: int, M: int, nbr_iter: int, seed: int = 0):
    """
    Apply `nbr_iter` iterations of the Randomized Lloyd algorithm in order to build an optimal quantizer of size `N`
    for a Gaussian random variable. This implementation is done using numpy.

    N: number of centroids
    M: number of samples to generate
    nbr_iter: number of iterations of fixed point search
    seed: numpy seed for reproducibility

    Returns: centroids (array of length N), probabilities associated to each centroid (array of length N)
    and distortion (float). If a cell receives no sample during an iteration, the search stops early and the
    returned centroids (and hence the distortion) contain nan.
    """
    np.random.seed(seed)  # Set seed in order to be able to reproduce the results

    # Draw M samples of gaussian variable
    xs = np.random.normal(0, 1, size=M)

    # Initialize the Voronoi Quantizer randomly and sort it
    centroids = np.random.normal(0, 1, size=N)
    centroids.sort(axis=0)

    with trange(nbr_iter, desc=f'Lloyd method - N: {N} - M: {M} - seed: {seed} (numpy)') as t:
        for step in t:
            # Compute the vertices that separate the centroids
            vertices = 0.5 * (centroids[:-1] + centroids[1:])

            # Find the index of the centroid that is closest to each sample:
            # it is the number of vertices lying at or below the sample
            index_closest_centroid = np.sum(xs[:, None] >= vertices[None, :], axis=1)

            # Compute the new quantization levels as the mean of the samples assigned to each level
            # (the mean of an empty cell is nan)
            centroids = np.array([np.mean(xs[index_closest_centroid == i], axis=0) for i in range(N)])

            # An empty cell cannot be updated any further: stop the fixed point search
            if np.isnan(centroids).any():
                break

    # Compute, for each sample, the distance to each centroid: array of shape (M, N)
    dist_centroids_points = np.abs(xs[:, None] - centroids[None, :])
    # Find the index of the centroid that is closest to each sample using the previously computed distances
    index_closest_centroid = dist_centroids_points.argmin(axis=1)
    # Compute the probability of each centroid; `minlength` keeps one entry per centroid
    # even when the cells with the highest indices received no sample
    probabilities = np.bincount(index_closest_centroid, minlength=N) / float(M)
    # Compute the final distortion between the samples and the quantizer
    distortion = np.mean(dist_centroids_points.min(axis=1) ** 2) * 0.5
    return centroids, probabilities, distortion

### PyTorch implementation

In [16]:
def lloyd_method_dim_1_pytorch(N: int, M: int, nbr_iter: int, device: str, seed: int = 0):
    """
    Apply `nbr_iter` iterations of the Randomized Lloyd algorithm in order to build an optimal quantizer of size `N`
    for a Gaussian random variable. This implementation is done using torch.

    N: number of centroids
    M: number of samples to generate
    nbr_iter: number of iterations of fixed point search
    device: device on which perform the computations: "cuda" or "cpu"
    seed: torch seed for reproducibility

    Returns: centroids (numpy array of length N), probabilities associated to each centroid (numpy array of
    length N) and distortion (float). If a cell receives no sample during an iteration, the search stops early
    and the returned centroids contain nan.
    """
    torch.manual_seed(seed=seed)  # Set seed in order to be able to reproduce the results

    with torch.no_grad():
        # Draw M samples of gaussian variable, then send them to the correct device
        xs = torch.randn(M).to(device)

        # Initialize the Voronoi Quantizer randomly, sort it and send it to the correct device
        centroids, _ = torch.randn(N).sort()
        centroids = centroids.to(device)

        with trange(nbr_iter, desc=f'Lloyd method - N: {N} - M: {M} - seed: {seed} (pytorch: {device})') as t:
            for step in t:
                # Compute the vertices that separate the centroids
                vertices = 0.5 * (centroids[:-1] + centroids[1:])

                # Find the index of the centroid that is closest to each sample:
                # it is the number of vertices lying at or below the sample
                index_closest_centroid = torch.sum(xs[:, None] >= vertices[None, :], dim=1).long()

                # Compute the new quantization levels as the mean of the samples assigned to each level.
                # Stacking the 0-dim means keeps the result on `device` instead of copying every
                # value back to the host to rebuild a tensor.
                centroids = torch.stack([torch.mean(xs[index_closest_centroid == i]) for i in range(N)])

                # The mean of an empty cell is nan: stop the fixed point search
                if torch.isnan(centroids).any():
                    break

        # Compute, for each sample, the distance to each centroid: tensor of shape (M, N)
        dist_centroids_points = torch.abs(xs[:, None] - centroids[None, :])
        # Find, for each sample, the distance to and the index of the closest centroid
        min_dist, index_closest_centroid = dist_centroids_points.min(dim=1)
        # Compute the probability of each centroid; `minlength` keeps one entry per centroid
        # even when the cells with the highest indices received no sample
        probabilities = torch.bincount(index_closest_centroid, minlength=N).to('cpu').numpy() / float(M)
        # Compute the final distortion between the samples and the quantizer
        distortion = torch.mean(min_dist ** 2).item() * 0.5
        return centroids.to('cpu').numpy(), probabilities, distortion

### Some useful functions for benchmarking

In [17]:
def check_existance(dict_of_values, df):
    """
    Tell whether `df` contains at least one row matching every `column: value` pair of `dict_of_values`.

    dict_of_values: dictionary mapping column names to the value expected in that column
    df: DataFrame of already computed results

    Returns: True if such a row exists, False otherwise. A key that is not a column of `df`
    cannot be matched by any row, hence False is returned in that case, as for an empty `df`.
    """
    if df.empty:
        return False

    # Start from "every row matches" and narrow the selection one column at a time
    matching_rows = pd.Series(True, index=df.index)
    for key, value in dict_of_values.items():
        if key not in df.columns:
            return False
        matching_rows = matching_rows & (df[key] == value)
    return bool(matching_rows.any())


def testing_method(fct_to_test, parameters_grid: dict, path_to_results: str):
    if os.path.exists(path_to_results) and os.path.getsize(path_to_results) > 0:
        df_results = pd.read_csv(path_to_results, index_col=0)
    else:
        df_results = pd.DataFrame()

    keys, values = zip(*parameters_grid.items())
    permutations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]

    for permutations in permutations_dicts:
        dict_result = permutations.copy()
        dict_result["method_name"] = fct_to_test.__name__
        if len(df_results) > 0 and check_existance(dict_result, df_results):
            print(f"Skipping {dict_result}")
            continue

        start_time = time.time()
        centroids, probabilities, distortion = fct_to_test(**permutations)
        if math.isnan(distortion):
            print(f"Results for following values {dict_result} were not saved "
                  f"because an nan was present in the centroids")
            continue
        elapsed_time = time.time() - start_time
        dict_result["elapsed_time"] = elapsed_time
        df_results = pd.concat(
            [df_results, pd.DataFrame(dict_result, index=[0])],
            ignore_index=True
        )
        df_results.to_csv(path_to_results)

### Testing methods

In [18]:
# parameters_grid = {
#         "N": [10, 20, 50, 100, 200, 500],
#         "M": [5000],
#         "nbr_iter": [100],
#         "seed": [0]
#     }
# path_to_results = "results_test.csv"

# testing_method(lloyd_method_dim_1, parameters_grid, path_to_results)

In [19]:
# Load the benchmark results and derive, for each run, a short method label and the time per iteration.
path_to_results = "results_latest.csv"
if not (os.path.exists(path_to_results) and os.path.getsize(path_to_results) > 0):
    # Fail here with a clear message rather than later with a NameError on `df_results`
    raise FileNotFoundError(f"No benchmark results found in '{path_to_results}'")
df_results = pd.read_csv(path_to_results, index_col=0)

# `lloyd_method_dim_1` takes no `device` argument, so its rows have no device value: treat them as cpu runs.
# Columns are reassigned instead of modified through chained `inplace=True` calls, which do not
# reliably update the DataFrame (in particular under pandas Copy-on-Write).
if "device" not in df_results.columns:
    df_results["device"] = "cpu"
df_results["device"] = df_results["device"].fillna("cpu")

# Build the label "<method_name>_<device>" and shorten it to the names used in the plots
df_results["method"] = (df_results["method_name"] + "_" + df_results["device"]).replace(
    {
        "lloyd_method_dim_1_cpu": "numpy_cpu",
        "lloyd_method_dim_1_pytorch_cpu": "pytorch_cpu",
        "lloyd_method_dim_1_pytorch_cuda": "pytorch_cuda",
    }
)
df_results["elapsed_time_by_iter"] = df_results["elapsed_time"] / df_results["nbr_iter"]

# Keep only the runs made of 100 iterations and drop the columns that are no longer needed
df_results = df_results[df_results["nbr_iter"] == 100].drop(
    columns=["method_name", "device", "nbr_iter", "elapsed_time"]
)

In [20]:
# Average the time per iteration over all runs sharing the same (N, M, method),
# turn the group keys back into regular columns and save the table.
df_grouped = (
    df_results
    .groupby(['N', 'M', 'method'])['elapsed_time_by_iter']
    .mean()
    .reset_index()
)
df_grouped.to_csv("grouped_results_latest.csv")


In [21]:
from bokeh.io import export_svg, export_png
from bokeh.models import ColumnDataSource
from bokeh.palettes import Viridis
from bokeh.plotting import figure, show
from bokeh.transform import dodge
import chromedriver_autoinstaller
chromedriver_autoinstaller.install() 

'/Users/thibautmontes/GitHub/stochastic-methods-optimal-quantization/venv/lib/python3.9/site-packages/chromedriver_autoinstaller/110/chromedriver'

In [22]:
def plot_results(df_grouped, M):
    """
    Plot, for each method, the time elapsed per iteration as a function of the grid size N, using the
    rows of `df_grouped` whose number of samples equals `M`, and export the figure as png and svg.

    df_grouped: DataFrame with columns "N", "M", "method" and "elapsed_time_by_iter"
    M: number of samples selecting the runs to plot

    Methods without any result for `M` are left out of the figure. Nothing is exported if no
    method has results for `M`.
    """
    grouped_by_values = df_grouped.groupby(["method", "M"]).agg(list).to_dict()
    elapsed_times_per_iter_per_method = grouped_by_values.get("elapsed_time_by_iter", {})
    Ns_per_method = grouped_by_values.get("N", {})

    # (method key, legend label, line color, fill color of the markers)
    series_styles = [
        ("numpy_cpu", "numpy", Viridis[3][1], None),
        ("pytorch_cpu", "pytorch (cpu)", Viridis[3][2], None),
        ("pytorch_cuda", "pytorch (cuda)", Viridis[3][0], Viridis[3][0]),
    ]
    general_font_size = '14pt'

    # NOTE(review): `plot_width`/`plot_height` are the keyword names of older Bokeh releases;
    # newer releases expect `width`/`height` - confirm against the installed version.
    plot = figure(plot_width=600, plot_height=500)

    plot.xaxis.axis_label = "Grid size (N)"
    plot.xaxis.axis_label_text_font_size = general_font_size

    plot.yaxis.axis_label = "Time elapsed per iter (in seconds)"
    plot.yaxis.axis_label_text_font_size = general_font_size

    nbr_plotted_methods = 0
    for method, label, color, fill_color in series_styles:
        Ns = Ns_per_method.get((method, M))
        elapsed_times = elapsed_times_per_iter_per_method.get((method, M))
        if Ns is None or elapsed_times is None:
            continue

        # One data source per method: the methods may have been benchmarked on different
        # lists of N, and all the columns of a single source must have the same length
        source = ColumnDataSource(data={"N": Ns, "elapsed_time_by_iter": elapsed_times})
        plot.circle(x="N", y="elapsed_time_by_iter", source=source,
                    fill_color=fill_color, line_color=color, legend_label=label)
        plot.line(x="N", y="elapsed_time_by_iter", source=source,
                  line_color=color, legend_label=label)
        nbr_plotted_methods += 1

    if nbr_plotted_methods == 0:
        print(f"Cannot plot results for M equals {M} because no result was found!!")
        return

    # show(plot)
    output_directory = "_output/gaussian/pytorch"
    os.makedirs(output_directory, exist_ok=True)  # the exports do not create missing directories
    export_png(plot, filename=f"{output_directory}/method_comparison_M_{M}.png")
    export_svg(plot, filename=f"{output_directory}/method_comparison_M_{M}.svg")


In [23]:
# Export the timing comparison of the methods for the runs made with M = 100000 samples
plot_results(df_grouped=df_grouped, M=100000)

In [24]:
def plot_ratios(df_grouped, M):
    """
    Plot, for each grid size N, the ratio between the time elapsed per iteration of each method and the
    one of PyTorch on cuda, using the rows of `df_grouped` whose number of samples equals `M`, and export
    the figure as png and svg.

    df_grouped: DataFrame with columns "N", "M", "method" and "elapsed_time_by_iter"
    M: number of samples selecting the runs to plot

    Nothing is plotted (a message is printed instead) when the three methods were not all benchmarked
    on the same list of N for this `M`.
    """
    color_numpy_cpu = Viridis[3][1]
    color_pytorch_cpu = Viridis[3][2]
    color_pytorch_cuda = Viridis[3][0]
    general_font_size = '14pt'

    grouped_by_values = df_grouped.groupby(["method", "M"]).agg(list).to_dict()
    elapsed_times_per_iter_per_method = grouped_by_values.get("elapsed_time_by_iter", {})
    Ns_per_method = grouped_by_values.get("N", {})

    methods = ("pytorch_cuda", "pytorch_cpu", "numpy_cpu")
    reference_Ns = Ns_per_method.get(("pytorch_cuda", M))
    # The ratios are computed element-wise, so every method needs results on exactly the same N values
    # (a method without any result for this M yields None and is rejected here as well)
    if reference_Ns is None or any(Ns_per_method.get((method, M)) != reference_Ns for method in methods):
        print(f"Cannot plot ratios for M equals {M} because N values does not match!!")
        return

    # Only the requested M is rescaled: results stored for other values of M may be
    # incomplete and must not prevent this plot from being built
    reference_times = np.array(elapsed_times_per_iter_per_method.get(("pytorch_cuda", M)))
    rescaled_comparisons = {
        method: np.array(elapsed_times_per_iter_per_method.get((method, M))) / reference_times
        for method in methods
    }
    # The x axis is categorical: one category per grid size
    Ns = [str(N) for N in reference_Ns]

    # NOTE(review): `plot_width`/`plot_height` are the keyword names of older Bokeh releases;
    # newer releases expect `width`/`height` - confirm against the installed version.
    plot = figure(x_range=Ns, plot_width=600, plot_height=500)

    plot.xaxis.axis_label = "Grid size (N)"
    plot.xaxis.axis_label_text_font_size = general_font_size

    plot.yaxis.axis_label = "Ratio Time elapsed per iter vs PyTorch cuda"
    plot.yaxis.axis_label_text_font_size = general_font_size

    source = ColumnDataSource(data={
        'Ns': Ns,
        'pytorch_cuda': rescaled_comparisons["pytorch_cuda"],
        'pytorch_cpu': rescaled_comparisons["pytorch_cpu"],
        'numpy_cpu': rescaled_comparisons["numpy_cpu"],
    })

    # Three bars side by side around each category
    plot.vbar(x=dodge('Ns', -0.25, range=plot.x_range), top='pytorch_cuda', source=source, width=0.2, color=color_pytorch_cuda, legend_label="pytorch (cuda)")
    plot.vbar(x=dodge('Ns',  0.0,  range=plot.x_range), top='pytorch_cpu', source=source, width=0.2, color=color_pytorch_cpu, legend_label="pytorch (cpu)")
    plot.vbar(x=dodge('Ns',  0.25, range=plot.x_range), top='numpy_cpu', source=source, width=0.2, color=color_numpy_cpu, legend_label="numpy")
    plot.legend.location = "top_left"

    # show(plot)
    output_directory = "_output/gaussian/pytorch"
    os.makedirs(output_directory, exist_ok=True)  # the exports do not create missing directories
    export_png(plot, filename=f"{output_directory}/ratio_comparison_M_{M}.png")
    export_svg(plot, filename=f"{output_directory}/ratio_comparison_M_{M}.svg")

In [25]:
# Earlier calls kept for reference; they omit the `df_grouped` argument that `plot_ratios` takes.
# plot_ratios(M=10000)
# plot_ratios(M=20000)
# plot_ratios(M=100000)
# plot_ratios(M=500000)
# Export the timing ratios relative to PyTorch on cuda for the runs made with M = 100000 samples
plot_ratios(df_grouped=df_grouped, M=100000)


In [26]:
# def plot_ratios(df_grouped, M):
#     color_numpy_cpu = Viridis[3][1]
#     color_pytorch_cpu = Viridis[3][2]
#     color_pytorch_cuda = Viridis[3][0]
#     general_font_size = '14pt'

#     grouped_by_values = df_grouped.groupby(["method", "M"]).agg(list).to_dict()
#     elapsed_times_per_iter_per_method = grouped_by_values.get("elapsed_time_by_iter")
#     Ns_per_method = grouped_by_values.get("N")

#     rescaled_comparisons = {
#         (method, M): np.array(elapsed_times_per_iter_per_method.get((method, M))) / np.array(elapsed_times_per_iter_per_method.get(("pytorch_cuda", M)))
#         for method, M in elapsed_times_per_iter_per_method
#         }
#     source = ColumnDataSource(
#         data=dict(
#                 Ns_numpy=[str(N) for N in Ns_per_method.get(("numpy_cpu",M))], numpy_cpu=rescaled_comparisons.get(("numpy_cpu",M)),
#                 Ns_pytorch_cpu=[str(N) for N in Ns_per_method.get(("pytorch_cpu",M))], pytorch_cpu=rescaled_comparisons.get(("pytorch_cpu",M)),
#                 Ns_pytorch_cuda=[str(N) for N in Ns_per_method.get(("pytorch_cuda",M))], pytorch_cuda=rescaled_comparisons.get(("pytorch_cuda",M))
#             )
#         )
    
#     plot = figure(x_range=[str(N) for N in Ns_per_method.get(("numpy_cpu",M))], plot_width=600, plot_height=500)

#     plot.xaxis.axis_label = "Grid size (N)"
#     plot.xaxis.axis_label_text_font_size = general_font_size

#     plot.yaxis.axis_label = "Ratio Time elapsed per iter vs PyTorch cuda"
#     plot.yaxis.axis_label_text_font_size = general_font_size
        
#     source = ColumnDataSource(data=data)
    
#     # plot.vbar(x=dodge('Ns_numpy', -0.25, range="Ns_numpy"), top='pytorch_cuda', source=source, width=0.2, color=color_pytorch_cuda, legend_label="pytorch (cuda)")
#     # plot.vbar(x=dodge('Ns_pytorch_cpu', 0.0, range="Ns_pytorch_cpu"), top='pytorch_cpu', source=source, width=0.2, color=color_pytorch_cpu, legend_label="pytorch (cpu)")
#     # plot.vbar(x=dodge('Ns_pytorch_cuda', 0.25, range="Ns_pytorch_cuda"), top='numpy_cpu', source=source, width=0.2, color=color_numpy_cpu, legend_label="numpy")
    
#     plot.vbar(x=dodge('Ns_numpy', -0.25, range=plot.x_range), top='pytorch_cuda', source=source, width=0.2, color=color_pytorch_cuda, legend_label="pytorch (cuda)")
#     plot.vbar(x=dodge('Ns_pytorch_cpu',  0.0,  range=plot.x_range), top='pytorch_cpu', source=source, width=0.2, color=color_pytorch_cpu, legend_label="pytorch (cpu)")
#     plot.vbar(x=dodge('Ns_pytorch_cuda',  0.25, range=plot.x_range), top='numpy_cpu', source=source, width=0.2, color=color_numpy_cpu, legend_label="numpy")
#     plot.legend.location = "top_left"
    
#     # show(plot)
#     export_png(plot, filename=f"_output/gaussian/pytorch/ratio_comparison_M_{M}.png")
#     export_svg(plot, filename=f"_output/gaussian/pytorch/ratio_comparison_M_{M}.svg")