In [27]:
import math
import torch
import pickle
import torch.cuda
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision.datasets as dsets
import os
from utils.BBBConvmodel import BBBAlexNet, BBBLeNet, BBB3Conv3FC
from utils.BBBlayers import GaussianVariationalInference
import numpy as np
from scipy.stats import norm
cuda = torch.cuda.is_available()
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.mixture import GaussianMixture
import random
import numpy as np

In [28]:
batch_size = 32
lr = 0.001
dataset = 'CIFAR-10'
if dataset == 'MNIST':
    model = torch.load("./results/lenet_withbias_b{}_lr{}_{}.pth".format(batch_size, lr, dataset))
elif dataset == 'CIFAR-10':
    model = BBBLeNet(outputs=10, inputs=3)
    model_name = 'lenet5'
    num_epochs = 50
    model.load_state_dict(torch.load('./model_with_bias/model{}_param_epoch{}_lr{}_bs{}.pkl'.format(model_name,num_epochs,lr,batch_size), map_location='cpu'))
net = BBBLeNet
num_samples = 10
beta_type = "Blundell"

In [29]:
# dimensions of input and output
if dataset == 'MNIST':    # train with MNIST
    outputs = 10
    inputs = 1
elif dataset == 'CIFAR-10':  # train with CIFAR-10
    outputs = 10
    inputs = 3
elif dataset == 'CIFAR-100':    # train with CIFAR-100
    outputs = 100
    inputs = 3

if net == BBBLeNet or BBB3Conv3FC:
    resize = 32
elif net == BBBAlexNet:
    resize = 227

In [30]:
'''
LOADING DATASET
'''

if dataset == 'MNIST':
    transform = transforms.Compose([transforms.Resize((resize, resize)), transforms.ToTensor(),
                                    transforms.Normalize((0.1307,), (0.3081,))])
    train_dataset = dsets.MNIST(root="data", download=True, transform=transform)
    val_dataset = dsets.MNIST(root="data", download=True, train=False, transform=transform)

elif dataset == 'CIFAR-100':
    transform = transforms.Compose([transforms.Resize((resize, resize)), transforms.ToTensor(),
                                    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
    train_dataset = dsets.CIFAR100(root="data", download=True, transform=transform)
    val_dataset = dsets.CIFAR100(root='data', download=True, train=False, transform=transform)

elif dataset == 'CIFAR-10':
    transform = transforms.Compose([transforms.Resize((resize, resize)), transforms.ToTensor(),
                                    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
    train_dataset = dsets.CIFAR10(root="data", download=True, transform=transform)
    val_dataset = dsets.CIFAR10(root='data', download=True, train=False, transform=transform)

Files already downloaded and verified
Files already downloaded and verified


In [31]:
loader_val = data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [32]:
model.state_dict().keys()

odict_keys(['conv1.qw_mean', 'conv1.qw_logvar', 'conv1.qb_mean', 'conv1.qb_logvar', 'conv1.conv_qw_mean', 'conv1.conv_qw_si', 'conv1.log_alpha', 'conv2.qw_mean', 'conv2.qw_logvar', 'conv2.qb_mean', 'conv2.qb_logvar', 'conv2.conv_qw_mean', 'conv2.conv_qw_si', 'conv2.log_alpha', 'fc1.qw_mean', 'fc1.qw_logvar', 'fc1.qb_mean', 'fc1.qb_logvar', 'fc1.fc_qw_mean', 'fc1.fc_qw_si', 'fc1.log_alpha', 'fc2.qw_mean', 'fc2.qw_logvar', 'fc2.qb_mean', 'fc2.qb_logvar', 'fc2.fc_qw_mean', 'fc2.fc_qw_si', 'fc2.log_alpha', 'fc3.qw_mean', 'fc3.qw_logvar', 'fc3.qb_mean', 'fc3.qb_logvar', 'fc3.fc_qw_mean', 'fc3.fc_qw_si', 'fc3.log_alpha', 'layers.0.qw_mean', 'layers.0.qw_logvar', 'layers.0.qb_mean', 'layers.0.qb_logvar', 'layers.0.conv_qw_mean', 'layers.0.conv_qw_si', 'layers.0.log_alpha', 'layers.3.qw_mean', 'layers.3.qw_logvar', 'layers.3.qb_mean', 'layers.3.qb_logvar', 'layers.3.conv_qw_mean', 'layers.3.conv_qw_si', 'layers.3.log_alpha', 'layers.7.qw_mean', 'layers.7.qw_logvar', 'layers.7.qb_mean', 'layers

# 画weights的直方图

In [33]:
w_name = ['layers.0.qw_', 'layers.3.qw_', 'layers.7.qw_','layers.9.qw_', 'layers.11.qw_']
b_name = ['layers.0.qb_', 'layers.3.qb_', 'layers.7.qb_','layers.9.qb_', 'layers.11.qb_']

In [34]:
whole_w = []
for (i, j) in zip(w_name, b_name):
    whole_w.append(model.state_dict()['{}mean'.format(i)].numpy().ravel())
    whole_w.append(model.state_dict()['{}mean'.format(j)].numpy().ravel())
whole_w = np.concatenate(whole_w)

In [35]:
len(whole_w)

62006

In [36]:
len(whole_w[np.abs(whole_w) <= 1e-2])

6263

# 神经网络精度计算函数

In [37]:
vi = GaussianVariationalInference(torch.nn.CrossEntropyLoss())

def run_epoch(loader, epoch, is_training=False):
    m = math.ceil(len(loader.dataset) / loader.batch_size)

    accuracies = []
    likelihoods = []
    kls = []
    losses = []

    for i, (images, labels) in enumerate(loader):
        # # Repeat samples (Casper's trick)
        # x = images.view(-1, inputs, resize, resize).repeat(num_samples, 1, 1, 1)
        # y = labels.repeat(num_samples)
        x = images.view(-1, inputs, resize, resize)
        y = labels
        if cuda:
            x = x.cuda()
            y = y.cuda()

        if beta_type == "Blundell":
            beta = 2 ** (m - (i + 1)) / (2 ** m - 1)
        elif beta_type == "Soenderby":
            beta = min(epoch / (num_epochs//4), 1)
        elif beta_type == "Standard":
            beta = 1 / m
        else:
            beta = 0

        logits, kl = model.probforward(x)
        loss = vi(logits, y, kl, beta)
        ll = -loss.data.mean() + beta*kl.data.mean()

        if is_training:
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

        _, predicted = logits.max(1)
        accuracy = (predicted.data.cpu() == y.cpu()).float().mean()

        accuracies.append(accuracy)
        losses.append(loss.data.mean())
        kls.append(beta*kl.data.mean())
        likelihoods.append(ll)

    diagnostics = {'loss': sum(losses)/len(losses),
                   'acc': sum(accuracies)/len(accuracies),
                   'kl': sum(kls)/len(kls),
                   'likelihood': sum(likelihoods)/len(likelihoods)}

    return diagnostics

In [38]:
model = model.cuda()
diagnostics_val = run_epoch(loader_val, epoch=1)

logits tensor([[-6.4739e+00, -5.2193e+00, -1.7640e+00,  2.6536e+00, -1.0912e+00,
          4.0515e+00, -1.1566e+00,  3.2845e+00, -3.7999e+00, -6.3062e+00],
        [ 2.0506e+00, -2.4812e-01, -4.1659e+00, -4.9868e+00, -7.0991e+00,
         -6.6611e+00, -7.4082e+00, -7.5586e+00,  4.5361e+00, -1.8279e+00],
        [ 1.4890e-01, -1.0008e-01, -2.4236e+00, -3.9342e+00, -3.8813e+00,
         -4.7016e+00, -3.6188e+00, -4.4365e+00,  6.1713e-01,  1.3208e+00],
        [ 4.4492e+00, -3.0875e+00, -1.8170e+00, -3.5469e+00,  1.0114e+00,
         -6.7393e+00, -9.3938e+00, -6.4026e+00,  5.8801e-01, -5.5603e+00],
        [-4.5092e+00, -5.5218e+00,  1.5222e+00,  2.4021e+00,  6.3286e+00,
         -1.5988e+00,  1.9339e+00, -3.2495e+00, -5.5714e+00, -6.8741e+00],
        [-4.5614e+00, -2.5400e+00, -1.1223e+00,  1.3693e+00,  7.8390e-01,
          6.9055e-01,  4.6277e+00, -2.6569e+00, -4.4204e+00, -4.5756e+00],
        [ 3.0248e-01,  2.8807e+00, -3.7455e+00, -3.7867e+00, -6.9227e+00,
         -3.9509e+00, -5.

In [39]:
def evaluate(loader, epoch=1):
    m = math.ceil(len(loader.dataset) / loader.batch_size)

    accuracies = []

    for i, (images, labels) in enumerate(loader):
        # # Repeat samples (Casper's trick)
        # x = images.view(-1, inputs, resize, resize).repeat(num_samples, 1, 1, 1)
        # y = labels.repeat(num_samples)

        x = images.view(-1, inputs, resize, resize)
        y = labels

        if cuda:
            x = x.cuda()
            y = y.cuda()

        logits = cpr_model(x)
        _, predicted = torch.max(logits.data, 1)
        accuracy = (predicted.data.cpu() == y.cpu()).float().mean()

        accuracies.append(accuracy)

    diagnostics = {'acc': sum(accuracies)/len(accuracies)}

    return diagnostics

# all layers放在一起压缩

In [40]:
# 设置随机种子
seed = 42
np.random.seed(seed)
random.seed(seed)

# 定义要尝试的n_components值
n_min = 2
n_components_range = range(n_min, 6)

# 初始化信息准则列表
aic_scores = []
bic_scores = []

# 计算每个n_components值上的信息准则值
for n_components in n_components_range:
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(whole_w.reshape(-1,1))
    aic_scores.append(gmm.aic(whole_w.reshape(-1,1)))
    # bic_scores.append(gmm.bic(whole_w.reshape(-1,1)))

# 选择具有最小信息准则值的n_components值
best_n_components_aic = np.argmin(aic_scores) + n_min
# best_n_components_bic = np.argmin(bic_scores) + n_min

print("Best n_components (AIC):", best_n_components_aic)
# print("Best n_components (BIC):", best_n_components_bic)

gmm = GaussianMixture(n_components=best_n_components_aic)
gmm.fit(whole_w.reshape(-1,1))

# 获取每个混合分量的参数（均值、标准差、权重）
means = gmm.means_.squeeze()
covs = gmm.covariances_.squeeze()  #注意covs是方差，不是标准差
pis = gmm.weights_.squeeze()

Best n_components (AIC): 5


In [41]:
print(means, covs, pis)

[ 0.08809907 -0.01111425 -0.20795845  0.16530471 -0.11824319] [0.00259748 0.00123671 0.01491951 0.01321627 0.0028724 ] [0.21830678 0.3723919  0.08431909 0.09455562 0.23042661]


In [42]:
import scipy.optimize as optimize

def F(x, w, u, s):
    return sum(w * norm.cdf(x, loc=u, scale=s))

def F_inv(p, w, u, s, br=(-1000, 1000)):
    G = lambda x: F(x, w, u, s) - p
    result = optimize.root_scalar(G, bracket=br)
    return result.root

In [43]:
def compress_coordinates(means, stds, beta, bitlengths):
    # N = len(means.ravel()) = len(stds.ravel())
    # C = len(codepoints)
    optima = np.empty_like(means)
    optima_lengths = np.empty_like(means, dtype=int)
    for i in range(0, 10000000, 100000):
        if i % 100000 == 0:
            print(i / 1000000)
        squared_errors = (codepoints[np.newaxis, :] - means.ravel()[i:i+100000, np.newaxis])**2
            # shape (N, C)
        weighted_penalties = (2 * beta) * stds.ravel()[i:i+100000, np.newaxis]**2 * bitlengths[np.newaxis, :]
            # shape (N, C)
        optima_idxs = np.argmin(squared_errors + weighted_penalties, axis=1)
        optima.ravel()[i:i+100000] = codepoints[optima_idxs]
        optima_lengths.ravel()[i:i+100000] = bitlengths[optima_idxs]
    return optima, optima_lengths

In [44]:
model = model.cpu()

In [45]:
results = [['full', 'full', diagnostics_val['acc'].numpy(), 'None']]
compressed_len_list = []
betas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
bitlengths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# betas = [0.01]
# bitlengths = [2]

for max_codepoint_length in bitlengths:
    for beta_ in betas:
        compressed_len = 0
        compressed_list = []
        for (i, j) in zip(w_name, b_name):
            codepoints_and_lengths = [
                (F_inv(codepoint_xi, pis, means, np.sqrt(covs)), length)
                for length in range(max_codepoint_length+1)
                for codepoint_xi in np.arange(0.5**(length+1), 1, 0.5**length)
            ]
            codepoints = np.array([codepoint for codepoint, _ in codepoints_and_lengths])
            lengths = np.array([length for _, length in codepoints_and_lengths])
            # compress w
            vecs_u1 = model.state_dict()['{}mean'.format(i)].numpy()
            stds_u1 = np.exp(model.state_dict()['{}logvar'.format(i)].numpy())
            compressed1, cpr_len1 = compress_coordinates(vecs_u1,stds_u1,beta_,lengths)
            compressed1 = torch.from_numpy(compressed1)
            compressed_list.append(compressed1)
            compressed_len += np.sum(cpr_len1)
            # compress b
            vecs_u2 = model.state_dict()['{}mean'.format(j)].numpy()
            stds_u2 = np.exp(model.state_dict()['{}logvar'.format(j)].numpy())
            compressed2, cpr_len2 = compress_coordinates(vecs_u2,stds_u2,beta_,lengths)
            compressed2 = torch.from_numpy(compressed2)
            compressed_list.append(compressed2)
            compressed_len += np.sum(cpr_len2)

        compressed_len_list.append([max_codepoint_length, beta_, compressed_len, 'global'])

        # lenet
        from model import lenet_b
        cpr_model=lenet_b(inputs)
        print(max_codepoint_length, beta_)
        name_cpr=cpr_model.state_dict().keys()
        cpr_state_dict = dict(zip(name_cpr, compressed_list))
        cpr_model.load_state_dict(cpr_state_dict)
        # 用压缩后的模型计算loss，acc等
        cpr_model = cpr_model.cuda()
        diagnostics_cpr_val = evaluate(loader_val)
        results.append([max_codepoint_length, beta_, diagnostics_cpr_val['acc'].numpy(), 'global'])

0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
1.1
1.2
1.3
1.4
1.5
1.6
1.7
1.8
1.9
2.0
2.1
2.2
2.3
2.4
2.5
2.6
2.7
2.8
2.9
3.0
3.1
3.2
3.3
3.4
3.5
3.6
3.7
3.8
3.9
4.0
4.1
4.2
4.3
4.4
4.5
4.6
4.7
4.8
4.9
5.0
5.1
5.2
5.3
5.4
5.5
5.6
5.7
5.8
5.9
6.0
6.1
6.2
6.3
6.4
6.5
6.6
6.7
6.8
6.9
7.0
7.1
7.2
7.3
7.4
7.5
7.6
7.7
7.8
7.9
8.0
8.1
8.2
8.3
8.4
8.5
8.6
8.7
8.8
8.9
9.0
9.1
9.2
9.3
9.4
9.5
9.6
9.7
9.8
9.9
0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
1.1
1.2
1.3
1.4
1.5
1.6
1.7
1.8
1.9
2.0
2.1
2.2
2.3
2.4
2.5
2.6
2.7
2.8
2.9
3.0
3.1
3.2
3.3
3.4
3.5
3.6
3.7
3.8
3.9
4.0
4.1
4.2
4.3
4.4
4.5
4.6
4.7
4.8
4.9
5.0
5.1
5.2
5.3
5.4
5.5
5.6
5.7
5.8
5.9
6.0
6.1
6.2
6.3
6.4
6.5
6.6
6.7
6.8
6.9
7.0
7.1
7.2
7.3
7.4
7.5
7.6
7.7
7.8
7.9
8.0
8.1
8.2
8.3
8.4
8.5
8.6
8.7
8.8
8.9
9.0
9.1
9.2
9.3
9.4
9.5
9.6
9.7
9.8
9.9
0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
1.1
1.2
1.3
1.4
1.5
1.6
1.7
1.8
1.9
2.0
2.1
2.2
2.3
2.4
2.5
2.6
2.7
2.8
2.9
3.0
3.1
3.2
3.3
3.4
3.5
3.6
3.7
3.8
3.9
4.0
4.1
4.2
4.3
4.4
4.5
4.6
4.7
4.8
4.9


# 每层用不同P(z)

In [46]:
for max_codepoint_length in bitlengths:
    for beta_ in betas:
        compressed_len = 0
        compressed_list = []
        for (i, j) in zip(w_name, b_name):
            # w
            vecs_u1 = model.state_dict()['{}mean'.format(i)].numpy()
            stds_u1 = np.exp(model.state_dict()['{}logvar'.format(i)].numpy())
            # b
            vecs_u2 = model.state_dict()['{}mean'.format(j)].numpy()
            stds_u2 = np.exp(model.state_dict()['{}logvar'.format(j)].numpy())
            # 先把w和b合并在一起再拟合gmm
            vecs_u = np.concatenate([vecs_u1.ravel(), vecs_u2.ravel()])
            # 拟合gmm
            # 定义要尝试的n_components值
            n_min = 2
            n_components_range = range(n_min, 6)

            # 初始化信息准则列表
            aic_scores = []

            # 计算每个n_components值上的信息准则值
            for n_components in n_components_range:
                gmm = GaussianMixture(n_components=n_components)
                gmm.fit(vecs_u.reshape(-1,1))
                aic_scores.append(gmm.aic(vecs_u.reshape(-1,1)))
                # bic_scores.append(gmm.bic(vecs_u.reshape(-1,1)))

            # 选择具有最小信息准则值的n_components值
            best_n_components_aic = np.argmin(aic_scores) + n_min
            # best_n_components_bic = np.argmin(bic_scores) + n_min

            print("Best n_components (AIC):", best_n_components_aic)
            # print("Best n_components (BIC):", best_n_components_bic)

            gmm = GaussianMixture(n_components=best_n_components_aic)
            gmm.fit(vecs_u.reshape(-1,1))

            # 获取每个混合分量的参数（均值、标准差、权重）
            means = gmm.means_.squeeze()
            covs = gmm.covariances_.squeeze()  #注意covs是方差，不是标准差
            pis = gmm.weights_.squeeze()

            codepoints_and_lengths = [
                (F_inv(codepoint_xi, pis, means, np.sqrt(covs)), length)
                for length in range(max_codepoint_length+1)
                for codepoint_xi in np.arange(0.5**(length+1), 1, 0.5**length)
            ]
            codepoints = np.array([codepoint for codepoint, _ in codepoints_and_lengths])
            lengths = np.array([length for _, length in codepoints_and_lengths])

            compressed1, cpr_len1 = compress_coordinates(vecs_u1,stds_u1,beta_,lengths)
            compressed1 = torch.from_numpy(compressed1)
            compressed_list.append(compressed1)
            compressed_len += np.sum(cpr_len1)
            # compress b
            compressed2, cpr_len2 = compress_coordinates(vecs_u2,stds_u2,beta_,lengths)
            compressed2 = torch.from_numpy(compressed2)
            compressed_list.append(compressed2)
            compressed_len += np.sum(cpr_len2)
            
        compressed_len_list.append([max_codepoint_length, beta_, compressed_len, 'layer-wise'])
        # lenet
        from model import lenet_b
        cpr_model=lenet_b(inputs)
        print(max_codepoint_length, beta_)
        print(compressed_list)
        name_cpr=cpr_model.state_dict().keys()
        cpr_state_dict = dict(zip(name_cpr, compressed_list))
        cpr_model.load_state_dict(cpr_state_dict)
        cpr_model = cpr_model.cuda()
        # 用压缩后的模型计算loss，acc等
        diagnostics_cpr_val = evaluate(loader_val)
        results.append([max_codepoint_length, beta_, diagnostics_cpr_val['acc'].numpy(), 'layer-wise'])

Best n_components (AIC): 3
0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
1.1
1.2
1.3
1.4
1.5
1.6
1.7
1.8
1.9
2.0
2.1
2.2
2.3
2.4
2.5
2.6
2.7
2.8
2.9
3.0
3.1
3.2
3.3
3.4
3.5
3.6
3.7
3.8
3.9
4.0
4.1
4.2
4.3
4.4
4.5
4.6
4.7
4.8
4.9
5.0
5.1
5.2
5.3
5.4
5.5
5.6
5.7
5.8
5.9
6.0
6.1
6.2
6.3
6.4
6.5
6.6
6.7
6.8
6.9
7.0
7.1
7.2
7.3
7.4
7.5
7.6
7.7
7.8
7.9
8.0
8.1
8.2
8.3
8.4
8.5
8.6
8.7
8.8
8.9
9.0
9.1
9.2
9.3
9.4
9.5
9.6
9.7
9.8
9.9
0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
1.1
1.2
1.3
1.4
1.5
1.6
1.7
1.8
1.9
2.0
2.1
2.2
2.3
2.4
2.5
2.6
2.7
2.8
2.9
3.0
3.1
3.2
3.3
3.4
3.5
3.6
3.7
3.8
3.9
4.0
4.1
4.2
4.3
4.4
4.5
4.6
4.7
4.8
4.9
5.0
5.1
5.2
5.3
5.4
5.5
5.6
5.7
5.8
5.9
6.0
6.1
6.2
6.3
6.4
6.5
6.6
6.7
6.8
6.9
7.0
7.1
7.2
7.3
7.4
7.5
7.6
7.7
7.8
7.9
8.0
8.1
8.2
8.3
8.4
8.5
8.6
8.7
8.8
8.9
9.0
9.1
9.2
9.3
9.4
9.5
9.6
9.7
9.8
9.9
Best n_components (AIC): 3
0.0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
1.1
1.2
1.3
1.4
1.5
1.6
1.7
1.8
1.9
2.0
2.1
2.2
2.3
2.4
2.5
2.6
2.7
2.8
2.9
3.0
3.1
3.2
3.3
3.4
3.5
3.

In [47]:
results = pd.DataFrame(results)
results.columns = ['bitlength', 'beta', 'acc_test', 'method']

In [48]:
results

Unnamed: 0,bitlength,beta,acc_test,method
0,full,full,0.6084265,
1,1,0.0001,0.14197284,global
2,1,0.001,0.14197284,global
3,1,0.01,0.14327076,global
4,1,0.1,0.13997604,global
...,...,...,...,...
136,10,0.01,0.5461262,layer-wise
137,10,0.1,0.54382986,layer-wise
138,10,1,0.5466254,layer-wise
139,10,10,0.5302516,layer-wise


In [49]:
results.to_csv("./results/lenet_withbias_b{}_lr{}_{}_gmm.csv".format(batch_size, lr, dataset), index=False)

In [50]:
cpr_len_df = pd.DataFrame(compressed_len_list)
cpr_len_df.columns = ['bitlength', 'beta', 'len', 'method']
cpr_len_df['len'] = cpr_len_df['len'] / whole_w.size

In [51]:
cpr_len_df

Unnamed: 0,bitlength,beta,len,method
0,1,0.0001,0.696916,global
1,1,0.0010,0.696900,global
2,1,0.0100,0.696836,global
3,1,0.1000,0.696336,global
4,1,1.0000,0.691014,global
...,...,...,...,...
135,10,0.0100,5.236364,layer-wise
136,10,0.1000,3.693046,layer-wise
137,10,1.0000,2.325727,layer-wise
138,10,10.0000,1.338951,layer-wise


In [52]:
cpr_len_df.to_csv("./results/bits_lenet_withbias_b{}_lr{}_{}_gmm.csv".format(batch_size, lr, dataset), index=False)