# Optimizing the NF model using optuna

In [1]:
import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from NF_utils import Latent_data
#custom imports
from utils import load_graph_dataset, train, evaluate, GraphDataset, get_graph_dataset_info
from models import GIN, HeteroGIN

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
2023-07-03 11:17:54.125823: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device cuda:0


In [10]:
os.getpid()

822642

In [2]:
import normflows as nf
from normflows import flows
## Standard libraries
import math
import time
import numpy as np

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0

import dgl #NOTE: for dgl.batch and dgl.unbatch
from dgl import save_graphs, load_graphs
from dgl.data import DGLDataset
from dgl.dataloading import GraphDataLoader
from dgl.data.utils import save_info, load_info, Subset

import umap
reducer = umap.UMAP();
from tqdm import tqdm

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
DEVICE = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0")
# DEVICE = torch.device("cpu")
print("Using DEVICE", DEVICE)

BATCHSIZE = 100
CLASSES = 2
EPOCHS = 10

Using DEVICE cuda:0


In [4]:
# Data and MC both have the same prefix
prefix = "/hpc/group/vossenlab/mfm45/.dgl/"

# MC inside Lambda_train_matched_jobs_outbending_cache_bg50nA_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized
MCdataset = "Lambda_train_matched_jobs_outbending_cache_bg50nA_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized"

# Data inside data_jobs_rga_fall2018_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized
DATAdataset = "data_jobs_rga_fall2018_7_28_22__pT_phi_theta_beta_chi2_pid_status__Normalized"
max_events = 1e5
split = 0.1
nlayers = 2
nmlp = 3
hdim = 64
nclasses, nfeatures, nfeatures_edge = get_graph_dataset_info(dataset=MCdataset, prefix=prefix)
dropout = 0.8
learn_eps = False
batch = 256
indices = None
nworkers = 0
npooling = "max"
gpooling = "max"
torch.manual_seed(0)

#select model
extractor = GIN(nlayers, nmlp, nfeatures,
            hdim, nclasses, dropout, learn_eps, npooling, gpooling).to(DEVICE)
extractor.load_state_dict(torch.load("logs/model_weights",map_location=DEVICE))



DATA_max_events = 249090
MC_max_events = 141118

In [5]:
def create_latent_data(dataset_directory, extractor, prefix = "/hpc/group/vossenlab/mfm45/.dgl/", split = 0.8, max_events = 140000, num_samples = 250, mode = "default",shuffle = True):
    val_split = (1 - split) / 2
    if(mode == "test"):
        data_range = range(int(split*max_events),int((val_split + split)*max_events))
    elif(mode == "train"):
        data_range = range(0, int(split*max_events))
    elif(mode == "val"):
        data_range = range(int((val_split + split)*max_events),max_events)
    elif(mode == "default"):
        print(f"No mode given, defaulting to training\n")
        data_range = range(0, int(split*max_events))
    else:
        raise Exception("Invalid mode: {mode}\nPlease use either \"train,\" or \"test\" ", mode)
    dataset = GraphDataset(prefix+dataset_directory)
    dataset.load()
    if(shuffle):
        dataset.shuffle()
    dataset = Subset(dataset,data_range)
    dgl_batch = dgl.batch(dataset.dataset.graphs[dataset.indices.start:dataset.indices.stop])
    labels = dataset.dataset.labels[dataset.indices.start:dataset.indices.stop,0].clone().detach().float().view(-1, 1)
    mass = dataset.dataset.labels[dataset.indices.start:dataset.indices.stop,1].clone().detach().float()
    dgl_batch = dgl_batch.to(DEVICE)
    labels = labels.to(DEVICE)
    latent = extractor.get_latent_repr(dgl_batch).detach().cpu()
    latent_obj = Latent_data(latent,labels)
    latent_obj.set_batch_size(num_samples)
    latent_obj.set_mass(mass)
    return latent_obj

In [6]:
num_samples = 100
training_data_DATA = create_latent_data(DATAdataset, extractor,num_samples = num_samples, max_events = DATA_max_events)
training_data_MC = create_latent_data(MCdataset, extractor,num_samples = num_samples, max_events = MC_max_events)

testing_data_DATA = create_latent_data(DATAdataset, extractor, mode = "test",num_samples = num_samples, max_events = DATA_max_events)
testing_data_MC = create_latent_data(MCdataset, extractor, mode = "test",num_samples = num_samples, max_events = MC_max_events)

val_data_DATA = create_latent_data(DATAdataset, extractor, mode = "val",num_samples = num_samples, max_events = DATA_max_events)
val_data_MC = create_latent_data(MCdataset, extractor, mode = "val",num_samples = num_samples, max_events = MC_max_events)

No mode given, defaulting to training



  assert input.numel() == input.storage().size(), (


No mode given, defaulting to training



In [7]:
def NF_model_optimize(trial):
#     num_layers = trial.suggest_int("num_layers", 24, 64)
    num_layers = 52
    hidden_dim = 142
    #mask
    b = torch.ones(71)
    for i in range(b.size()[0]):
        if i % 2 == 0:
            b[i] = 0
    masked_affine_flows = []
    for i in range(num_layers):
        s = nf.nets.MLP([71, hidden_dim, hidden_dim, 71])
        t = nf.nets.MLP([71, hidden_dim, hidden_dim, 71])
        if i % 2 == 0:
            masked_affine_flows += [nf.flows.MaskedAffineFlow(b, t, s)]
        else:
            masked_affine_flows += [nf.flows.MaskedAffineFlow(1 - b, t, s)]
    num_modes = trial.suggest_int("num_modes", 2, 40)
    distribution = nf.distributions.GaussianMixture(num_modes, training_data_DATA.latent_size)
#     distribution = nf.distributions.DiagGaussian(training_data_DATA.latent_size, trainable = False)
    masked_affine_model = nf.NormalizingFlow(q0=distribution, flows=masked_affine_flows)
    return masked_affine_model


In [8]:
in_data = training_data_DATA
val_data = val_data_DATA
val_data.set_batch_size(int(np.floor(val_data.num_events / in_data.max_iter)))

def objective(trial):
    # Generate the optimizers.
    model = NF_model_optimize(trial).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
    for epoch in range(2):
        print(f"starting epoch #{epoch}")
        with tqdm(total=in_data.max_iter, position=0, leave=True) as pbar:
            for it in tqdm(range(in_data.max_iter), position = 0, leave=True):
                model.train()
                optimizer.zero_grad()
                #randomly sample the latent space
                samples = in_data.sample(iteration = it)
                samples = samples.to(DEVICE)
                loss = model.forward_kld(samples)
                # Do backprop and optimizer step
                if ~(torch.isnan(loss) | torch.isinf(loss)):
                    loss.backward()
                    optimizer.step()
        print(f"starting val epoch #{epoch}")
        model.eval()
        val_loss = 0
        with tqdm(total=in_data.max_iter, position=0, leave=True) as pbar:
            for it in tqdm(range(in_data.max_iter), position = 0, leave=True):
                val_samples = val_data.sample(iteration = it)
                val_samples = val_samples.to(DEVICE)
                val_loss += model.forward_kld(val_samples)
        avg_loss = val_loss / in_data.max_iter
        trial.report(avg_loss, epoch)
        
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return avg_loss         

In [9]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=7200)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-07-03 11:23:03,744] A new study created in memory with name: no-name-df98d9db-0a5b-46af-a0f8-d2e0a5db0a7a


starting epoch #0


100%|██████████| 1992/1992 [04:00<00:00,  8.29it/s]
  0%|          | 0/1992 [04:00<?, ?it/s]


starting val epoch #0


100%|██████████| 1992/1992 [01:12<00:00, 27.46it/s]
  0%|          | 0/1992 [01:12<?, ?it/s]


starting epoch #1


100%|██████████| 1992/1992 [03:40<00:00,  9.01it/s]
  0%|          | 0/1992 [03:40<?, ?it/s]


starting val epoch #1


 73%|███████▎  | 1457/1992 [00:54<00:19, 26.78it/s]
  0%|          | 0/1992 [00:54<?, ?it/s]
[W 2023-07-03 11:32:52,188] Trial 0 failed with parameters: {'num_modes': 30} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 23.69 GiB total capacity; 13.36 GiB already allocated; 3.75 MiB free; 13.36 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF').
Traceback (most recent call last):
  File "/hpc/group/vossenlab/rck32/miniconda3/envs/venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_822642/3125869348.py", line 30, in objective
    val_loss += model.forward_kld(val_samples)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/hpc/group/vossenlab/rck32/miniconda

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 23.69 GiB total capacity; 13.36 GiB already allocated; 3.75 MiB free; 13.36 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF