# Extra dependencies installation

In [1]:
!pip install sparse

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sparse
  Downloading sparse-0.14.0-py2.py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.17
  Downloading numpy-1.23.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: numpy, sparse
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.2
    Uninstalling numpy-1.24.2:
      Successfully uninstalled numpy-1.24.2
Successfully installed numpy-1.23.5 sparse-0.14.0


# Imports

In [1]:
import os
from tqdm import tqdm
import pyarrow.parquet as pq
from collections import defaultdict, Counter
import numpy as np
import scipy
import sparse

# Setup dataset 

In [2]:
try:
  import google.colab
  COLAB_ENV = True
except:
  COLAB_ENV = False

if COLAB_ENV:
  data_filepath = "/" + os.path.join("content","drive","MyDrive","datasets", "ML4SCI_GSOC23")
  from google.colab import drive
  drive.mount('/content/drive')
else:
  data_filepath = "data"


os.listdir(data_filepath)

Mounted at /content/drive


['SinglePhotonPt50_IMGCROPS_n249k_RHv1.hdf5',
 'SingleElectronPt50_IMGCROPS_n249k_RHv1.hdf5',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272.test.snappy.parquet',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run1_n47540.test.snappy.parquet',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run2_n55494.test.snappy.parquet',
 'models',
 'tensorboard',
 'gravitational-lensing',
 'Task2']

# Important Hyperparameters

In [4]:
CREATE_DATASET_FROM_PARQUET = True

data_path_img = data_filepath
data_path_img_local = "data"

np_images_filepath = os.path.join(data_path_img_local, "Task2")
class_to_label = {0 : "Gluon",
                  1 : "Quark"}

# Dataset Creation

In [6]:
def read_and_process_parquet(name):
  print("-"*64)
  print(name)
  filepath = os.path.join(data_filepath, name)
  df = pq.read_table(filepath)
  print(df.schema)
  print(Counter(list(df['y'])))
  return df

def read_parquet(name):
  filepath = os.path.join(data_filepath, name)
  return pq.read_table(filepath)

def get_jet_images_numpy(name):
  filepath = os.path.join(data_filepath, name)
  return pq.read_table(filepath)["X_jets"].to_numpy()

def save_as_np_images(parquet_file, data_root, class_to_filename, starts = defaultdict(int), debug=False, imgs = -1):
  filepath = os.path.join(data_filepath, parquet_file)
  df = pq.read_table(filepath)
  
  class_to_filename = {k: os.path.join(data_root,v) for k,v in class_to_filename.items()}
  i = 0
  counters = starts

  os.makedirs(data_root,exist_ok=True)
  for k, v in class_to_filename.items():
    os.makedirs(v, exist_ok=True)

  for batch in tqdm(df.to_batches(max_chunksize=None)):
    _dict = batch.to_pydict()
    x, y, m0, pt = np.array(_dict["X_jets"])[0], _dict["y"][0], _dict["m0"][0], _dict["pt"][0]

    x = x.transpose(1, 2, 0)
    x = sparse.COO(x)

    filename = os.path.join(class_to_filename[y], f"{counters[y]}.npz")
    i += 1
    counters[y] += 1
    
    sparse.save_npz(filename, x)
    
    if imgs != -1 and i == imgs:
        break

    if debug:
      print("")
      print(y)
      print(f"saving file to {filename}")
      if i % 1 == 0:
       break

  return counters


In [7]:
parquet_files = ["QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272.test.snappy.parquet", 
                 "QCDToGGQQ_IMGjet_RH1all_jet0_run1_n47540.test.snappy.parquet",
                 "QCDToGGQQ_IMGjet_RH1all_jet0_run2_n55494.test.snappy.parquet"]

In [None]:
if CREATE_DATASET_FROM_PARQUET:
  counts = defaultdict(int)
  for pq_file in parquet_files:
    counts = save_as_np_images(pq_file, 
                    np_images_filepath,
                    class_to_label,
                    starts=counts)

100%|██████████| 36272/36272 [16:51<00:00, 35.85it/s]
 73%|███████▎  | 34584/47540 [15:54<05:54, 36.54it/s]

In [None]:
print(len(os.listdir("data/Task2/Gluon")))
print(len(os.listdir("data/Task2/Quark")))

In [None]:
print(data_path_img)

In [None]:
!zip -rq Task2.zip data/

In [4]:
!rsync -Prv Task2.zip /content/drive/MyDrive/datasets/ML4SCI_GSOC23/Task2

sending incremental file list
Task2.zip
    130,457,996 100%  213.35MB/s    0:00:00 (xfr#1, to-chk=0/1)

sent 130,489,934 bytes  received 35 bytes  86,993,312.67 bytes/sec
total size is 130,457,996  speedup is 1.00


In [5]:
!ls /content/drive/MyDrive/datasets/ML4SCI_GSOC23/Task2

Task2.zip


In [6]:
def get_statistics_of_data(parquet_file, starts = defaultdict(int), debug=False, imgs = -1):
  filepath = os.path.join(data_filepath, parquet_file)
  df = pq.read_table(filepath)
  
  total = 0
  non_zero = 0
  i = 0
  for batch in tqdm(df.to_batches(max_chunksize=None)):
    _dict = batch.to_pydict()
    x, y, m0, pt = np.array(_dict["X_jets"])[0], _dict["y"][0], _dict["m0"][0], _dict["pt"][0]
    
    x = x.transpose(1, 2, 0)
    non_zero += (x != 0).reshape(3, -1).sum(axis=1)
    total +=  np.array([1,1,1]) * (np.prod(x.shape) / 3)
    i += 1

  return {"total" : total, "non_zero" : non_zero}


In [7]:
stats = get_statistics_of_data(parquet_files[0])
print(stats)

NameError: ignored

In [None]:
100 * stats["non_zero"] / stats["total"]

array([0.89917142, 3.47077212, 0.87892969])

# calculate statistics

In [4]:
!rsync -P /content/drive/MyDrive/datasets/ML4SCI_GSOC23/Task2/Task2.zip .
!unzip Task2.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: data/Task2/Quark/2607.npz  
  inflating: data/Task2/Quark/11931.npz  
  inflating: data/Task2/Quark/948.npz  
  inflating: data/Task2/Quark/6480.npz  
  inflating: data/Task2/Quark/14372.npz  
  inflating: data/Task2/Quark/15291.npz  
  inflating: data/Task2/Quark/9201.npz  
  inflating: data/Task2/Quark/6305.npz  
  inflating: data/Task2/Quark/17658.npz  
  inflating: data/Task2/Quark/12580.npz  
  inflating: data/Task2/Quark/2817.npz  
  inflating: data/Task2/Quark/4247.npz  
  inflating: data/Task2/Quark/12776.npz  
  inflating: data/Task2/Quark/11937.npz  
  inflating: data/Task2/Quark/7037.npz  
  inflating: data/Task2/Quark/8295.npz  
  inflating: data/Task2/Quark/10627.npz  
  inflating: data/Task2/Quark/14241.npz  
  inflating: data/Task2/Quark/7508.npz  
  inflating: data/Task2/Quark/7167.npz  
  inflating: data/Task2/Quark/8208.npz  
  inflating: data/Task2/Quark/17308.npz  
  inflating: data/Task2/

In [5]:
names = []
for root, dirs, files in os.walk("data/Task2", topdown=False):
   for name in files:
      names.append(os.path.join(root, name))


In [6]:
mat = []
for f in names:
  mat.append(sparse.load_npz(f).todense())
df = np.concatenate(mat)
del mat

In [7]:
import gc
gc.collect()

825

In [8]:
print(f"mean = {np.mean(df.reshape(-1, 3), axis=0)}")
print(f"std = {np.std(df.reshape(-1,3), axis=0)}")

mean = [0.00782386 0.00482207 0.00304506]
std = [0.63704916 0.18686025 0.04598732]


mean = [0.00782386 0.00482207 0.00304506] <br>
std = [0.63704916 0.18686025 0.04598732]
