In [2]:
%cd /home/naodell/work/hgcal/hgcal_gravnet_model

import os, os.path as osp
from time import strftime

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from torch_geometric.data import DataLoader
from tqdm.notebook import tqdm

import scripts.objectcondensation as object_condensation
from scripts.lrscheduler import CyclicLRWithRestarts
from scripts.nadam import Nadam
from torch_cmspepr.dataset import TauDataset
from torch_cmspepr.gravnet_model import GravnetModel


/home/naodell/work/hgcal/hgcal_gravnet_model


OSError: /home/naodell/opt/miniconda3/envs/hgcal-ml/lib/python3.9/site-packages/torch_sparse/_version_cpu.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [None]:
# global options

# Switch off the module-level DEBUG flag of the object condensation code.
object_condensation.DEBUG = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device', device)

# Seed the random number generators so that weight initialisation and the
# shuffling done by the data loaders are repeatable after a kernel restart.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# When True, only a small fraction of the events is kept and no checkpoint
# files are written (both switches live in later cells).
reduced_data = False
batch_size = 32

In [None]:
# get the data
data_path = 'local_data/npzs_all'
dataset = TauDataset(data_path)

# Remove a bunch of bad events before the dataset is split.
bad_events = [
    '110_nanoML_98.npz',
    '113_nanoML_13.npz',
    '124_nanoML_77.npz',
    '128_nanoML_70.npz',
    '149_nanoML_90.npz',
    '153_nanoML_22.npz',
    '26_nanoML_93.npz',
    '32_nanoML_45.npz',
    '5_nanoML_51.npz',
    '86_nanoML_97.npz',
]
dataset.blacklist([f'{data_path}/{name}' for name in bad_events])

if reduced_data:
    keep = .01
    print(f'Keeping only {100.*keep:.1f}% of events for debugging')
    dataset, _ = dataset.split(keep)

shuffle = True
train_dataset, test_dataset = dataset.split(.8)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
# The test loader is deliberately NOT shuffled: the test loss is an average
# over batches, so every epoch must be evaluated on identical batches for the
# epoch-to-epoch comparison (and the "best checkpoint" choice) to be fair.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# define model and configure it

model = GravnetModel(
    input_dim=9,
    output_dim=4,
).to(device)

# Number of events in the training set; only the (currently disabled) cyclic
# learning-rate scheduler below takes it as an argument.
epoch_size = len(train_loader.dataset)

# AdamW is the optimizer in use.  Alternatives that were tried and are kept
# here for reference:
#   optimizer = Nadam(model.parameters(), lr=1e-5, weight_decay=1e-4)
#   scheduler = CyclicLRWithRestarts(optimizer, batch_size, epoch_size, restart_period=400, t_mult=1.1, policy="cosine")
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    weight_decay=1e-4,
)

# Constant added to the loss to prevent a negative loss from ever occurring.
loss_offset = 1.

In [None]:
# define the loss

def loss_fn(out, data, s_c=1., return_components=False):
    """Object-condensation loss for one batch of model output.

    Column 0 of ``out`` is passed through a sigmoid and used as the beta
    values; columns 1 to 3 are used as the cluster-space coordinates.  Both
    are handed to ``object_condensation.calc_LV_Lbeta`` together with
    ``data.y`` (cast to long) and ``data.batch``.

    Parameters
    ----------
    out
        Model output, indexed as ``out[:, k]``.
    data
        Batch object providing ``y`` and ``batch``.
    s_c
        Not used by the current implementation.  The disabled variant that
        also included a cluster-property term ``Lp`` returned
        ``Lp + s_c*(LV + Lbeta)``.
    return_components
        Forwarded to ``calc_LV_Lbeta``; when true, that function's result is
        returned unchanged.

    Returns
    -------
    The result of ``calc_LV_Lbeta`` if ``return_components`` is true,
    otherwise ``LV + Lbeta + loss_offset``.
    """
    betas = torch.sigmoid(out[:, 0])
    cluster_coords = out[:, 1:4]

    # All inputs to the loss are expected on the same device as the output.
    expected_device = out.device
    inputs = (betas, cluster_coords, data.y, data.batch)
    assert all(tensor.device == expected_device for tensor in inputs)

    oc_result = object_condensation.calc_LV_Lbeta(
        betas,
        cluster_coords,
        data.y.long(),
        data.batch,
        return_components=return_components,
    )

    if not return_components:
        loss_v, loss_beta = oc_result
        return loss_v + loss_beta + loss_offset
    return oc_result
                                                                                  


In [None]:
# define train and test methods

def train(epoch, train_loader, tb_writer):
    """Run one training epoch over ``train_loader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``device`` and
    ``loss_fn``.

    Parameters
    ----------
    epoch
        Index of the current epoch.  Together with the batch index it forms
        the TensorBoard step, so the step keeps increasing across epochs
        instead of restarting at 0 each time.
    train_loader
        Sized iterable of batches; each batch must offer ``to(device)``,
        ``x`` and ``batch``.
    tb_writer
        Object with ``add_scalar(tag, value, step)``; pass ``None`` to skip
        logging.

    Returns
    -------
    The loss of the last processed batch as returned by ``loss_fn``, or
    ``None`` when the loader yields no batches.

    Raises
    ------
    Any exception raised inside the loop is re-raised after the offending
    batch (and, when available, its npz file names) has been printed.
    """
    model.train()
    # scheduler.step()
    n_batches = len(train_loader)
    loss = None
    data = None  # lets the handler below tell whether a batch was ever loaded
    try:
        pbar = tqdm(train_loader, total=n_batches, leave=False)
        pbar.set_postfix({'loss': '?'})
        for i, data in enumerate(pbar):
            data = data.to(device)
            optimizer.zero_grad()
            result = model(data.x, data.batch)
            loss = loss_fn(result, data)

            loss.backward()
            optimizer.step()
            # scheduler.batch_step()

            loss_value = float(loss)
            pbar.set_postfix({'loss': loss_value})
            if tb_writer is not None:
                # Global step: unique and monotonically increasing over the run.
                tb_writer.add_scalar('training loss', loss_value, epoch * n_batches + i)
        return loss

    except Exception:
        # Only report batch details if a batch exists; otherwise the
        # diagnostic itself would fail and hide the original error.
        if data is not None:
            print('Exception encountered:', data, ', npzs:')
            inpz = getattr(data, 'inpz', None)
            npzs = getattr(getattr(train_loader, 'dataset', None), 'npzs', None)
            if inpz is not None and npzs is not None:
                print('  ' + '\n  '.join([npzs[int(j)] for j in inpz]))
        raise

def test(epoch):
    """Evaluate the model on ``test_loader`` and return the mean test loss.

    Relies on the notebook-level ``model``, ``test_loader``, ``device``,
    ``loss_fn`` and ``loss_offset``.  For every batch the loss components
    returned by ``loss_fn(..., return_components=True)`` are summed per key;
    the sums are then divided by the number of batches.

    Parameters
    ----------
    epoch
        Not used inside the function.

    Returns
    -------
    ``loss_offset`` plus the batch-averaged ``'L_V'`` and ``'L_beta'``
    components.
    """
    n_batches = len(test_loader)
    totals = {}

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, total=n_batches, leave=False):
            batch = batch.to(device)
            prediction = model(batch.x, batch.batch)
            components = loss_fn(prediction, batch, return_components=True)
            # Accumulate every component under its own key.
            for name, value in components.items():
                totals.setdefault(name, 0.)
                totals[name] += value

    # Turn the per-key sums into per-batch averages.
    for name in totals:
        totals[name] /= n_batches

    #print('test ' + object_condensation.formatted_loss_components_string(totals))
    return loss_offset + totals['L_V'] + totals['L_beta']

In [7]:
# additional useful utility functions

def write_checkpoint(checkpoint_number=None, best=False, ckpt_dir=None):
    """Save the state dict of the notebook-level ``model`` as a checkpoint.

    The default directory is ``checkpoints/gravnet_<timestamp>``.  The
    timestamp is taken on the first call and then reused, so that all
    checkpoints of one run (including ``ckpt_best``) end up in the same
    directory instead of a new one every minute.  Re-running this cell, or
    setting ``write_checkpoint.run_dir = None``, starts a fresh directory.

    Parameters
    ----------
    checkpoint_number
        Number used in the file name ``ckpt_<number>.pth.tar`` and in the
        "new best" message.
    best
        If true, write ``ckpt_best.pth.tar`` instead and print a message.
    ckpt_dir
        Optional explicit target directory; overrides the cached default.

    Returns
    -------
    The checkpoint path.  Nothing is written to disk when the notebook-level
    ``reduced_data`` flag is set.
    """
    if ckpt_dir is None:
        if getattr(write_checkpoint, 'run_dir', None) is None:
            write_checkpoint.run_dir = strftime('checkpoints/gravnet_%b%d_%H%M')
        ckpt_dir = write_checkpoint.run_dir

    file_name = 'ckpt_best.pth.tar' if best else 'ckpt_{0}.pth.tar'.format(checkpoint_number)
    ckpt = osp.join(ckpt_dir, file_name)
    if best:
        print('Saving epoch {0} as new best'.format(checkpoint_number))

    if not reduced_data:
        os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(dict(model=model.state_dict()), ckpt)
    return ckpt

def debug():
    """Run the object-condensation loss once on a single hand-picked event.

    Sets ``object_condensation.DEBUG`` to True (and leaves it set), loads the
    listed npz file(s) from ``data/taus`` as one batch, evaluates a freshly
    constructed, untrained ``GravnetModel`` on it and feeds the output to
    ``object_condensation.calc_LV_Lbeta``.

    Returns
    -------
    The result of ``calc_LV_Lbeta``.
    """
    # The module is imported as `object_condensation`; there is no `oc` alias.
    object_condensation.DEBUG = True
    dataset = TauDataset('data/taus')
    dataset.npzs = [
        # 'data/taus/49_nanoML_84.npz',
        # 'data/taus/37_nanoML_4.npz',
        'data/taus/26_nanoML_93.npz',
        # 'data/taus/142_nanoML_75.npz',
        ]
    # Load the whole (tiny) dataset as one batch.
    data = next(iter(DataLoader(dataset, batch_size=len(dataset), shuffle=False)))

    print(data.y.sum())
    model = GravnetModel(input_dim=9, output_dim=4)
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.batch)

    pred_betas = torch.sigmoid(out[:,0])
    pred_cluster_space_coords = out[:,1:4]
    return object_condensation.calc_LV_Lbeta(
        pred_betas,
        pred_cluster_space_coords,
        data.y.long(),
        data.batch.long()
        )

def run_profile():
    """Profile a very short training loop and print the top CPU-time entries.

    Builds its own loader (the first ``batch_size * n_batches`` events of
    ``data/taus``), model and optimizer, so the notebook-level training state
    is not touched.  The loop runs under ``torch.profiler`` with CPU activity
    recording, and the ten rows with the largest ``cpu_time`` are printed.
    Uses the notebook-level ``loss_fn``.
    """
    from torch.profiler import profile, record_function, ProfilerActivity
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using device', device)

    batch_size = 2
    n_batches = 2
    dataset = TauDataset('data/taus')
    dataset.npzs = dataset.npzs[:batch_size*n_batches]
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print(f'Running profiling for {len(dataset)} events, batch_size={batch_size}, {len(loader)} batches')

    # Same output width as the model that is actually trained in this
    # notebook (loss_fn only reads output columns 0-3), so the profile
    # reflects the real network.
    model = GravnetModel(input_dim=9, output_dim=4).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-7, weight_decay=1e-4)

    print('Start limited training loop')
    model.train()
    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        with record_function("model_inference"):
            pbar = tqdm(loader, total=len(loader), leave=False)
            pbar.set_postfix({'loss': '?'})
            for data in pbar:
                data = data.to(device)
                optimizer.zero_grad()
                result = model(data.x, data.batch)
                loss = loss_fn(result, data)
                loss_value = float(loss)
                print(f'loss={loss_value}')
                loss.backward()
                optimizer.step()
                pbar.set_postfix({'loss': loss_value})

    print(prof.key_averages().table(sort_by="cpu_time", row_limit=10))
    # Other valid keys:
    # cpu_time, cuda_time, cpu_time_total, cuda_time_total, cpu_memory_usage,
    # cuda_memory_usage, self_cpu_memory_usage, self_cuda_memory_usage, count

In [None]:
# train the model
min_loss = 1e9
n_epochs = 50
tb_writer = SummaryWriter(log_dir='logs')
for i_epoch in tqdm(range(n_epochs)):
    train_loss = train(i_epoch, train_loader, tb_writer)
    write_checkpoint(i_epoch)
    test_loss = test(i_epoch)
    if test_loss < min_loss:
        min_loss = test_loss
        write_checkpoint(i_epoch, best=True)
    
    tb_writer.add_scalar('test loss', test_loss, i_epoch)
    

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/3978 [00:00<?, ?it/s]

  0%|          | 0/995 [00:00<?, ?it/s]

In [None]:
# Placeholder cell: only prints a fixed string.
print("nothing")