### Optimizing Ming's graph-generating code for speed

In [12]:
# Re-import edited modules automatically before each cell runs (mode 2 = reload all modules)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
### Generic imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize, LogNorm
import scipy
import uproot
from tqdm import tqdm
import functools
from glob import glob

### ML-related
import tensorflow as tf
import atlas_mpl_style as ampl
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import sonnet as snt

### GNN-related
from graph_nets import blocks
from graph_nets import graphs
from graph_nets import modules
from graph_nets import utils_np
from graph_nets import utils_tf
import networkx as nx

In [14]:
# ### GPU Setup
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "3" # pick a number between 0 & 3
# gpus = tf.config.list_physical_devices('GPU') 
# tf.config.experimental.set_memory_growth(gpus[0], True)

In [15]:
### Other setup
# Pandas display limits: show up to 200 columns but only 20 rows per frame
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 20)

# Matplotlib font sizes for legends and axis labels
params = {'legend.fontsize': 13, 'axes.labelsize': 18}
plt.rcParams.update(params)

# Seed both NumPy and TensorFlow so stochastic steps are repeatable
SEED = 15
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Load files

First, a sample of single neutral pions:

In [5]:
# Open the single-neutral-pion ROOT file; `f_pi0` is read by the DataFrame cells below
file_path = '../data/neutral_pion_sample.root'
f_pi0 = uproot.open(file_path)

Define the primary DataFrame:

In [6]:
# Read the per-event cluster branches and flatten the MultiIndex in one chained expression
df = (
    f_pi0['EventTree']
    .arrays(
        ["cluster_cell_E", "cluster_cell_ID", "cluster_E", "cluster_Eta", "cluster_Phi"],
        library="pd",
    )
    .reset_index()  # flatten MultiIndexing
)

Define the cell geometry DataFrame:

In [7]:
# Cell geometry table: one row per calorimeter cell, without the uproot index columns
df_geo = (
    f_pi0['CellGeo']
    .arrays(library="pd")
    .reset_index()                          # remove redundant multi-indexing
    .drop(columns=["entry", "subentry"])
)

### Add x,y,z coordinates
# Polar angle from pseudorapidity: theta = 2 * atan(exp(-eta))
cell_geo_theta = 2*np.arctan(np.exp(-df_geo["cell_geo_eta"]))
r_perp = df_geo["cell_geo_rPerp"]
phi = df_geo["cell_geo_phi"]
df_geo["cell_geo_x"] = r_perp * np.cos(phi)
df_geo["cell_geo_y"] = r_perp * np.sin(phi)
df_geo["cell_geo_z"] = r_perp / np.tan(cell_geo_theta)

#### Define graph-making functions

In [16]:
def make_graph(event: pd.Series, geo_df: pd.DataFrame, is_charged=False):
    """
    Creates a graph representation of an event

    The node/edge/global arrays are built by make_dict (defined in this same cell);
    this function only wraps the two resulting data dicts into graph_nets GraphsTuples,
    so the two functions cannot drift apart.

    inputs
    event (pd.Series) one event/row from EventTree
    geo_df (pd.DataFrame) the CellGeo DataFrame mapping cell_geo_ID to information about the cell
    is_charged (bool) True for charged pion, False for uncharged pion

    returns
    A pair of graph representations of the event for the GNN (input_graph, target_graph)
    returns (None, None) whenever make_dict returns (None, None), e.g. no cell energies detected
    """
    input_datadict, target_datadict = make_dict(event, geo_df, is_charged=is_charged)

    ### Nothing to convert for this event
    if input_datadict is None or target_datadict is None:
        return None, None

    ### Each GraphsTuple holds exactly one graph (a one-element list of data dicts)
    input_graph = utils_tf.data_dicts_to_graphs_tuple([input_datadict])
    target_graph = utils_tf.data_dicts_to_graphs_tuple([target_datadict])

    return input_graph, target_graph

def make_dict(event: pd.Series, geo_df: pd.DataFrame, is_charged=False):
    """
    Builds the graph_nets-style data dicts for an event (no GraphsTuple conversion)

    Nodes are the event's cells (energy + geometry features), edges connect each cell
    to its nearest neighbours in (x, y, z) with the distance as the edge feature.

    inputs
    event (pd.Series) one event/row from EventTree; "cluster_cell_ID" and "cluster_cell_E"
        are iterated as lists of per-cluster lists
    geo_df (pd.DataFrame) the CellGeo DataFrame mapping cell_geo_ID to information about the cell;
        must also contain the cell_geo_x / cell_geo_y / cell_geo_z columns
    is_charged (bool) True for charged pion, False for uncharged pion

    returns
    A pair of data dicts (input_datadict, target_datadict) with keys
    n_node, n_edge, nodes, edges, senders, receivers, globals.
    The two dicts differ only in "globals": cluster_E / cluster_Eta / cluster_Phi for the
    input, [float(is_charged)] for the target.
    returns (None, None) if no cell energies detected or none of the cells is found in geo_df
    """

    ### No cell energies present
    if len(event["cluster_cell_E"]) == 0:
        return None, None

    ### Flatten the per-cluster lists once (previously re-flattened three times)
    cell_ids = [cell_id for cluster in event["cluster_cell_ID"] for cell_id in cluster]
    cell_energies = [cell_e for cluster in event["cluster_cell_E"] for cell_e in cluster]
    # If an ID occurs more than once, the last energy wins (same as sequential assignment)
    energy_by_id = {int(cell_id): cell_e for cell_id, cell_e in zip(cell_ids, cell_energies)}

    ### Get cell geometry information for this particular event
    event_geo = geo_df[geo_df["cell_geo_ID"].isin(cell_ids)].set_index("cell_geo_ID")

    ### None of the event's cells is known to the geometry table: no nodes to build
    if event_geo.empty:
        return None, None

    ### Assign cell energies
    # One pass over the matched cells instead of one .loc write per cell. Cell IDs that are
    # absent from geo_df are ignored here; assigning them through .loc would append a row
    # with NaN geometry, which the neighbour search below cannot handle.
    event_geo = event_geo.assign(
        cell_E=[energy_by_id.get(int(cell_id), np.nan) for cell_id in event_geo.index]
    )

    ### Define node features
    n_nodes = event_geo.shape[0]
    node_features = ["cell_E", "cell_geo_eta",
                     "cell_geo_phi", "cell_geo_rPerp",
                     "cell_geo_deta", "cell_geo_dphi",
                     "cell_geo_volume"]
    nodes = event_geo[node_features].to_numpy(dtype=np.float32).reshape(-1, len(node_features))

    ### Apply k-NN search to find cell neighbors
    # NOTE FAIR also has a faster algo for KNN search. Might want to try it
    k = min(n_nodes, 6)                     # cannot ask for more neighbours than there are cells

    coords = event_geo[["cell_geo_x", "cell_geo_y", "cell_geo_z"]]
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(coords)
    distances, indices = nbrs.kneighbors(coords)

    # NOTE(review): column 0 is taken to be the query cell itself (distance 0); this
    # presumably holds as long as no two cells share coordinates -- confirm.
    senders = np.repeat(indices[:, 0], k-1)                         # k-1 for no self edges
    receivers = indices[:, 1:].flatten()                            # [:, 1:] for no self edges
    edges = distances[:, 1:].astype(np.float32).reshape(-1, 1)      # edge feature = distance
    n_edges = len(senders)

    global_features = ["cluster_E", "cluster_Eta", "cluster_Phi"]
    global_values = np.asarray(event[global_features]).astype('float32')

    ### Topology and node/edge features shared by the input and the target
    shared = {
        "n_node": n_nodes,
        "n_edge": n_edges,
        "nodes": nodes,
        "edges": edges,
        "senders": senders,
        "receivers": receivers,
    }

    input_datadict = dict(shared, globals=global_values)
    target_datadict = dict(shared, globals=np.array([int(is_charged)], dtype=np.float32))

    return input_datadict, target_datadict

In [None]:
%%timeit
n_entries = 100
graph_list = []
for i in range(n_entries):
    graph_list.append(make_graph(df.iloc[i], geo_df=df_geo, is_charged=False))
    
input_graph_list = [tuple[0] for tuple in graph_list]
target_graph_list = [tuple[1] for tuple in graph_list]

Pick an event to look at:

In [9]:
# graph_list = []
# for i in tqdm(range(len(df))):
#     graph_list.append(make_graph(df.iloc[i], geo_df=df_geo, is_charged=False))

 26%|██▌       | 5393/20963 [05:22<15:31, 16.71it/s]


KeyboardInterrupt: 

In [None]:
n_entries = 20
# Build the (input, target) graph pair for each of the first n_entries events
graph_list = [
    make_graph(df.iloc[i], geo_df=df_geo, is_charged=False)
    for i in range(n_entries)
]

# Split the pairs into two parallel lists
input_graph_list = [pair[0] for pair in graph_list]
target_graph_list = [pair[1] for pair in graph_list]

event_index = 15 # pick event #15, say
graph = input_graph_list[event_index]
print(graph.globals)

In [None]:
# Round-trip check, step 1: write the list of (input, target) graph pairs to test.pkl
import pickle 
with open('test.pkl', 'wb') as f:
    pickle.dump(graph_list, f)

In [None]:
# try reading it back in...
# (pickle.load can execute arbitrary code -- only use it on files written by this notebook)
with open('test.pkl', 'rb') as f:
    graph_list_loaded = pickle.load(f)

In [None]:
# Node count of the first event's input graph
input_graph_list[0].n_node

In [None]:
# Node count of the first event's target graph (built from the same n_node value as the input graph)
target_graph_list[0].n_node

In [None]:
# Globals of the input graph (element 0 of the pair) for event #10
graph_list[10][0].globals

# Try running over all the files!

In [None]:
# All ROOT files of the single-pi0 sample, via an absolute, site-specific path.
# NOTE(review): `files` is reassigned from a relative '../data' glob in a later cell.
files = glob('/global/cfs/cdirs/m3246/mpettee/ml4pions/LCStudies/data/user.angerami.mc16_13TeV.900246.PG_singlepi0_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/*.root')

In [None]:
# Print the length of the EventTree DataFrame for every file.
# Each file is opened in a `with` block so its handle is released before the next one is read.
for file in files:
    with uproot.open(file) as root_file:
        print(len(root_file['EventTree'].arrays(["cluster_cell_E", "cluster_cell_ID", "cluster_E", "cluster_Eta", "cluster_Phi"], library="pd")))

In [17]:
def divide_chunks(l, n):
    """
    Splits a sequence into n interleaved slices

    inputs
    l (sequence) anything that supports extended slicing (list, str, ...)
    n (int) number of slices to produce

    returns
    A list of n slices; slice i holds elements i, i+n, i+2n, ... of l.
    Slices are empty when n exceeds len(l); the list itself is empty when n <= 0.
    """
    chunks = []
    for offset in range(n):
        chunks.append(l[offset::n])
    return chunks

In [18]:
# How the file list is shared out, and which share this notebook processes
n_workers = 10
worker_id = 0

# glob() does not guarantee any ordering; sort so the partition is reproducible
# and every worker derives the same chunks from the same directory.
files = sorted(glob('../data/*singlepi0*/*.root'))
chunks = list(divide_chunks(files, n_workers))
worker_files = chunks[worker_id]
print("{} files for worker #{}:".format(len(worker_files), worker_id))
# print(worker_files) 

50 files for worker #0:


In [20]:
### Output directory for the pickled graph lists
# NOTE(review): this cell used to read `args.save_dir`, but no `args` object is defined
# anywhere in the notebook, so saving raised NameError. The path below is a placeholder --
# confirm the intended location before a full run.
save_dir = os.path.join('..', 'data', 'graphs')
os.makedirs(save_dir, exist_ok=True)      # loop-invariant, so done once up front

for file in tqdm(worker_files):
    ### Read both trees, then release the ROOT file before the slow graph-building step
    with uproot.open(file) as root_file:
        df = root_file['EventTree'].arrays(["cluster_cell_E", "cluster_cell_ID", "cluster_E", "cluster_Eta", "cluster_Phi"], library="pd")
        df_geo = root_file['CellGeo'].arrays(library="pd")

    ### Define primary dataframe
    df.reset_index(inplace=True) # flatten MultiIndexing

    ### Define cell geometry dataframe
    df_geo = df_geo.reset_index() # remove redundant multi-indexing
    df_geo.drop(columns = ["entry", "subentry"], inplace=True)

    ### Add x,y,z coordinates
    df_geo["cell_geo_x"] = df_geo["cell_geo_rPerp"] * np.cos(df_geo["cell_geo_phi"])
    df_geo["cell_geo_y"] = df_geo["cell_geo_rPerp"] * np.sin(df_geo["cell_geo_phi"])
    cell_geo_theta = 2*np.arctan(np.exp(-df_geo["cell_geo_eta"]))
    df_geo["cell_geo_z"] = df_geo["cell_geo_rPerp"] / np.tan(cell_geo_theta)

    ### Make the graphs for the specified events
    # leave=False clears each per-file bar when it finishes, so only the outer bar accumulates
    graph_list = []
    for i in tqdm(range(len(df)), leave=False):
        graph_list.append(make_graph(df.iloc[i], geo_df=df_geo, is_charged=False))

    ### Save Pickle file, with zero-indexing:
    # The name is the second-to-last dot-separated field of the path minus its first character.
    # NOTE(review): presumably file names end in something like "._000001.root" -- confirm.
    filepath = os.path.join(save_dir,file.split('.')[-2][1:]+'.pkl')
    # Separate handle name so the ROOT file variable is never rebound to the pickle file
    with open(filepath, 'wb') as pkl_file:
        pickle.dump(graph_list, pkl_file)

  0%|          | 0/50 [00:00<?, ?it/s]
  0%|          | 0/21147 [00:00<?, ?it/s][A
  0%|          | 1/21147 [00:00<1:04:24,  5.47it/s][A
  0%|          | 2/21147 [00:00<55:47,  6.32it/s]  [A
  0%|          | 5/21147 [00:00<26:58, 13.07it/s][A
  0%|          | 8/21147 [00:00<25:07, 14.02it/s][A
  0%|          | 10/21147 [00:00<25:10, 13.99it/s][A
  0%|          | 13/21147 [00:00<20:17, 17.36it/s][A
  0%|          | 15/21147 [00:01<20:15, 17.38it/s][A
  0%|          | 18/21147 [00:01<18:58, 18.56it/s][A
  0%|          | 21/21147 [00:01<17:34, 20.03it/s][A
  0%|          | 24/21147 [00:01<17:02, 20.66it/s][A
  0%|          | 27/21147 [00:01<16:28, 21.36it/s][A
  0%|          | 30/21147 [00:01<15:48, 22.26it/s][A
  0%|          | 33/21147 [00:01<14:37, 24.07it/s][A
  0%|          | 36/21147 [00:01<14:35, 24.12it/s][A
  0%|          | 39/21147 [00:02<18:56, 18.57it/s][A
  0%|          | 42/21147 [00:02<18:16, 19.25it/s][A
  0%|          | 45/21147 [00:02<18:28, 19.04it/s][

KeyboardInterrupt: 