# Optimizing the NF model using optuna

In [1]:
import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from NF_utils import Latent_data
#custom imports
from utils import load_graph_dataset, train, evaluate, GraphDataset, get_graph_dataset_info
from models import GIN, HeteroGIN

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
2023-06-29 22:47:13.227729: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device cuda:0


In [2]:
import normflows as nf
from normflows import flows
## Standard libraries
import math
import time
import numpy as np

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0

import dgl #NOTE: for dgl.batch and dgl.unbatch
from dgl import save_graphs, load_graphs
from dgl.data import DGLDataset
from dgl.dataloading import GraphDataLoader
from dgl.data.utils import save_info, load_info, Subset

import umap
reducer = umap.UMAP();
from tqdm import tqdm

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
DEVICE = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0")
# DEVICE = torch.device("cpu")
print("Using DEVICE", DEVICE)

BATCHSIZE = 100
CLASSES = 2
EPOCHS = 10

Using DEVICE cuda:0


In [4]:
# Data and MC both have the same prefix
prefix = "/hpc/group/vossenlab/mfm45/.dgl/"

# MC inside Lambda_train_matched_jobs_outbending_cache_bg50nA_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized
MCdataset = "Lambda_train_matched_jobs_outbending_cache_bg50nA_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized"

# Data inside data_jobs_rga_fall2018_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized
DATAdataset = "data_jobs_rga_fall2018_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized"
max_events = 1e5
split = 0.1
nlayers = 2
nmlp = 3
hdim = 64
nclasses, nfeatures, nfeatures_edge = get_graph_dataset_info(dataset=MCdataset, prefix=prefix)
dropout = 0.8
learn_eps = False
batch = 256
indices = None
nworkers = 0
npooling = "max"
gpooling = "max"
torch.manual_seed(0)

#select model
extractor = GIN(nlayers, nmlp, nfeatures,
            hdim, nclasses, dropout, learn_eps, npooling, gpooling).to(DEVICE)
extractor.load_state_dict(torch.load("logs/model_weights",map_location=DEVICE))



DATA_max_events = 249090
MC_max_events = 141118

In [5]:
def create_latent_data(dataset_directory, extractor, prefix = "/hpc/group/vossenlab/mfm45/.dgl/", split = 0.8, max_events = 140000, num_samples = 250, mode = "default",shuffle = True):
    val_split = (1 - split) / 2
    if(mode == "test"):
        data_range = range(int(split*max_events),int((val_split + split)*max_events))
    elif(mode == "train"):
        data_range = range(0, int(split*max_events))
    elif(mode == "val"):
        data_range = range(int((val_split + split)*max_events),max_events)
    elif(mode == "default"):
        print(f"No mode given, defaulting to training\n")
        data_range = range(0, int(split*max_events))
    else:
        raise Exception("Invalid mode: {mode}\nPlease use either \"train,\" or \"test\" ", mode)
    dataset = GraphDataset(prefix+dataset_directory)
    dataset.load()
    if(shuffle):
        dataset.shuffle()
    dataset = Subset(dataset,data_range)
    dgl_batch = dgl.batch(dataset.dataset.graphs[dataset.indices.start:dataset.indices.stop])
    labels = dataset.dataset.labels[dataset.indices.start:dataset.indices.stop,0].clone().detach().float().view(-1, 1)
    mass = dataset.dataset.labels[dataset.indices.start:dataset.indices.stop,1].clone().detach().float()
    dgl_batch = dgl_batch.to(DEVICE)
    labels = labels.to(DEVICE)
    latent = extractor.get_latent_repr(dgl_batch).detach().cpu()
    latent_obj = Latent_data(latent,labels)
    latent_obj.set_batch_size(num_samples)
    latent_obj.set_mass(mass)
    return latent_obj

In [6]:
num_samples = 100
training_data_DATA = create_latent_data(DATAdataset, extractor,num_samples = num_samples, max_events = DATA_max_events)
training_data_MC = create_latent_data(MCdataset, extractor,num_samples = num_samples, max_events = MC_max_events)

testing_data_DATA = create_latent_data(DATAdataset, extractor, mode = "test",num_samples = num_samples, max_events = DATA_max_events)
testing_data_MC = create_latent_data(MCdataset, extractor, mode = "test",num_samples = num_samples, max_events = MC_max_events)

val_data_DATA = create_latent_data(DATAdataset, extractor, mode = "val",num_samples = num_samples, max_events = DATA_max_events)
val_data_MC = create_latent_data(MCdataset, extractor, mode = "val",num_samples = num_samples, max_events = MC_max_events)

No mode given, defaulting to training



  assert input.numel() == input.storage().size(), (


No mode given, defaulting to training



In [7]:
def NF_model_optimize(trial):
    num_layers = trial.suggest_int("num_layers", 24, 64)
    #mask
    b = torch.ones(71)
    for i in range(b.size()[0]):
        if i % 2 == 0:
            b[i] = 0
    masked_affine_flows = []
    for i in range(num_layers):
        s = nf.nets.MLP([71, 142, 142, 71])
        t = nf.nets.MLP([71, 142, 142, 71])
        if i % 2 == 0:
            masked_affine_flows += [nf.flows.MaskedAffineFlow(b, t, s)]
        else:
            masked_affine_flows += [nf.flows.MaskedAffineFlow(1 - b, t, s)]
    distribution = nf.distributions.DiagGaussian(training_data_DATA.latent_size, trainable = False)
    masked_affine_model = nf.NormalizingFlow(q0=distribution, flows=masked_affine_flows)
    return masked_affine_model


In [8]:
in_data = training_data_DATA
val_data = val_data_DATA
val_data.set_batch_size(int(np.floor(val_data.num_events / in_data.max_iter)))

def objective(trial):
    # Generate the optimizers.
    model = NF_model_optimize(trial).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
    for epoch in range(1):
        print(f"starting epoch #{epoch}")
        with tqdm(total=in_data.max_iter, position=0, leave=True) as pbar:
            for it in tqdm(range(in_data.max_iter), position = 0, leave=True):
                model.train()
                optimizer.zero_grad()
                #randomly sample the latent space
                samples = in_data.sample(iteration = it)
                samples = samples.to(DEVICE)
                loss = model.forward_kld(samples)
                # Do backprop and optimizer step
                if ~(torch.isnan(loss) | torch.isinf(loss)):
                    loss.backward()
                    optimizer.step()
        print(f"starting val epoch #{epoch}")
        model.eval()
        val_loss = 0
        with tqdm(total=in_data.max_iter, position=0, leave=True) as pbar:
            for it in tqdm(range(in_data.max_iter), position = 0, leave=True):
                val_samples = val_data.sample(iteration = it)
                val_samples = val_samples.to(DEVICE)
                val_loss += model.forward_kld(val_samples)
        avg_loss = val_loss / in_data.max_iter
        trial.report(avg_loss, epoch)
        
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return avg_loss         

In [9]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=3600)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-06-29 22:51:31,124] A new study created in memory with name: no-name-67eb7966-9b87-45b6-9de7-dea87dfc7f29


starting epoch #0


100%|██████████| 1992/1992 [02:00<00:00, 16.48it/s]
  0%|          | 0/1992 [02:00<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:36<00:00, 54.44it/s]
  0%|          | 0/1992 [00:36<?, ?it/s]
[I 2023-06-29 22:54:08,813] Trial 0 finished with value: -37.29963302612305 and parameters: {'num_layers': 27}. Best is trial 0 with value: -37.29963302612305.


starting epoch #0


100%|██████████| 1992/1992 [01:36<00:00, 20.71it/s]
  0%|          | 0/1992 [01:36<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:14<00:00, 26.61it/s]
  0%|          | 0/1992 [01:14<?, ?it/s]
[W 2023-06-29 22:57:02,364] Trial 1 failed with parameters: {'num_layers': 56} because of the following error: The value nan is not acceptable..
[W 2023-06-29 22:57:02,429] Trial 1 failed with value tensor(nan, device='cuda:0', grad_fn=<DivBackward0>).


starting epoch #0


100%|██████████| 1992/1992 [03:14<00:00, 10.24it/s]
  0%|          | 0/1992 [03:14<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:01<00:00, 32.14it/s]
  0%|          | 0/1992 [01:01<?, ?it/s]
[I 2023-06-29 23:01:23,750] Trial 2 finished with value: -37.61505126953125 and parameters: {'num_layers': 47}. Best is trial 2 with value: -37.61505126953125.


starting epoch #0


100%|██████████| 1992/1992 [02:53<00:00, 11.45it/s]
  0%|          | 0/1992 [02:53<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:50<00:00, 39.79it/s]
  0%|          | 0/1992 [00:50<?, ?it/s]
[I 2023-06-29 23:05:11,522] Trial 3 finished with value: -34.9852180480957 and parameters: {'num_layers': 38}. Best is trial 2 with value: -37.61505126953125.


starting epoch #0


100%|██████████| 1992/1992 [01:28<00:00, 22.41it/s]
  0%|          | 0/1992 [01:28<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:08<00:00, 29.25it/s]
  0%|          | 0/1992 [01:08<?, ?it/s]
[W 2023-06-29 23:07:52,065] Trial 4 failed with parameters: {'num_layers': 52} because of the following error: The value nan is not acceptable..
[W 2023-06-29 23:07:52,074] Trial 4 failed with value tensor(nan, device='cuda:0', grad_fn=<DivBackward0>).


starting epoch #0


100%|██████████| 1992/1992 [02:15<00:00, 14.69it/s]
  0%|          | 0/1992 [02:15<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:38<00:00, 52.17it/s]
  0%|          | 0/1992 [00:38<?, ?it/s]
[I 2023-06-29 23:10:50,207] Trial 5 finished with value: -36.91230010986328 and parameters: {'num_layers': 29}. Best is trial 2 with value: -37.61505126953125.


starting epoch #0


100%|██████████| 1992/1992 [04:06<00:00,  8.09it/s]
  0%|          | 0/1992 [04:06<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:09<00:00, 28.68it/s]
  0%|          | 0/1992 [01:09<?, ?it/s]
[I 2023-06-29 23:16:08,666] Trial 6 finished with value: -35.582237243652344 and parameters: {'num_layers': 61}. Best is trial 2 with value: -37.61505126953125.


starting epoch #0


100%|██████████| 1992/1992 [03:55<00:00,  8.47it/s]
  0%|          | 0/1992 [03:55<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:03<00:00, 31.30it/s]
  0%|          | 0/1992 [01:03<?, ?it/s]
[I 2023-06-29 23:21:12,648] Trial 7 finished with value: -39.44623565673828 and parameters: {'num_layers': 57}. Best is trial 7 with value: -39.44623565673828.


starting epoch #0


100%|██████████| 1992/1992 [02:52<00:00, 11.57it/s]
  0%|          | 0/1992 [02:52<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:43<00:00, 45.36it/s]
  0%|          | 0/1992 [00:43<?, ?it/s]
[I 2023-06-29 23:24:53,258] Trial 8 finished with value: -40.07331085205078 and parameters: {'num_layers': 39}. Best is trial 8 with value: -40.07331085205078.


starting epoch #0


100%|██████████| 1992/1992 [03:39<00:00,  9.07it/s]
  0%|          | 0/1992 [03:39<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:00<00:00, 32.91it/s]
  0%|          | 0/1992 [01:00<?, ?it/s]
[I 2023-06-29 23:29:36,655] Trial 9 finished with value: -42.10033416748047 and parameters: {'num_layers': 55}. Best is trial 9 with value: -42.10033416748047.


starting epoch #0


100%|██████████| 1992/1992 [01:32<00:00, 21.50it/s]
  0%|          | 0/1992 [01:32<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:06<00:00, 29.81it/s]
  0%|          | 0/1992 [01:06<?, ?it/s]
  return np.nanmin(values)
[I 2023-06-29 23:32:20,535] Trial 10 pruned. 


starting epoch #0


100%|██████████| 1992/1992 [01:16<00:00, 26.02it/s]
  0%|          | 0/1992 [01:16<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:57<00:00, 34.89it/s]
  0%|          | 0/1992 [00:57<?, ?it/s]
[I 2023-06-29 23:34:39,256] Trial 11 pruned. 


starting epoch #0


100%|██████████| 1992/1992 [02:46<00:00, 11.97it/s]
  0%|          | 0/1992 [02:46<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:59<00:00, 33.44it/s]
  0%|          | 0/1992 [00:59<?, ?it/s]
[I 2023-06-29 23:38:29,347] Trial 12 pruned. 


starting epoch #0


100%|██████████| 1992/1992 [02:32<00:00, 13.04it/s]
  0%|          | 0/1992 [02:32<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:43<00:00, 46.24it/s]
  0%|          | 0/1992 [00:43<?, ?it/s]
[I 2023-06-29 23:41:49,463] Trial 13 pruned. 


starting epoch #0


100%|██████████| 1992/1992 [02:47<00:00, 11.89it/s]
  0%|          | 0/1992 [02:47<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:38<00:00, 51.53it/s]
  0%|          | 0/1992 [00:38<?, ?it/s]
[I 2023-06-29 23:45:18,967] Trial 14 finished with value: -39.59315490722656 and parameters: {'num_layers': 40}. Best is trial 9 with value: -42.10033416748047.


starting epoch #0


100%|██████████| 1992/1992 [02:31<00:00, 13.11it/s]
  0%|          | 0/1992 [02:31<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:44<00:00, 44.59it/s]
  0%|          | 0/1992 [00:44<?, ?it/s]
[I 2023-06-29 23:48:38,835] Trial 15 pruned. 


starting epoch #0


100%|██████████| 1992/1992 [02:09<00:00, 15.37it/s]
  0%|          | 0/1992 [02:09<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:32<00:00, 62.19it/s]
  0%|          | 0/1992 [00:32<?, ?it/s]
[I 2023-06-29 23:51:24,180] Trial 16 pruned. 


starting epoch #0


100%|██████████| 1992/1992 [03:06<00:00, 10.66it/s]
  0%|          | 0/1992 [03:06<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [00:41<00:00, 47.92it/s]
  0%|          | 0/1992 [00:41<?, ?it/s]
[I 2023-06-29 23:55:15,502] Trial 17 finished with value: -41.64117431640625 and parameters: {'num_layers': 43}. Best is trial 9 with value: -42.10033416748047.


Study statistics: 
  Number of finished trials:  18
  Number of pruned trials:  6
  Number of complete trials:  10
Best trial:
  Value:  -42.10033416748047
  Params: 
    num_layers: 55
