# Extra dependencies installation

In [1]:
!pip install sparse

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sparse
  Downloading sparse-0.14.0-py2.py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sparse
Successfully installed sparse-0.14.0


# Imports

In [2]:
import os
from tqdm import tqdm
import pyarrow.parquet as pq
from collections import defaultdict, Counter
import numpy as np
import scipy
import sparse

# Setup dataset 

In [3]:
try:
  import google.colab
  COLAB_ENV = True
except:
  COLAB_ENV = False


if COLAB_ENV:
  data_filepath = "/" + os.path.join("content","drive","MyDrive","datasets", "ML4SCI_GSOC23")
  from google.colab import drive
  drive.mount('/content/drive')
else:
  data_filepath = "data"


os.listdir(data_filepath)

Mounted at /content/drive


['SinglePhotonPt50_IMGCROPS_n249k_RHv1.hdf5',
 'SingleElectronPt50_IMGCROPS_n249k_RHv1.hdf5',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272.test.snappy.parquet',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run1_n47540.test.snappy.parquet',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run2_n55494.test.snappy.parquet',
 'models',
 'tensorboard',
 'gravitational-lensing',
 'Task2']

# Important Hyperparameters

In [4]:
CREATE_DATASET_FROM_PARQUET = True

data_path_img = data_filepath
data_path_img_local = "data"

np_images_filepath = os.path.join(data_path_img_local, "Task2")
class_to_label = {0 : "Gluon",
                  1 : "Quark"}

# Dataset Creation

In [5]:
def read_and_process_parquet(name):
  print("-"*64)
  print(name)
  filepath = os.path.join(data_filepath, name)
  df = pq.read_table(filepath)
  print(df.schema)
  print(Counter(list(df['y'])))


def read_parquet(name):
  filepath = os.path.join(data_filepath, name)
  return pq.read_table(filepath)

def save_as_np_images(parquet_file, data_root, class_to_filename, starts = defaultdict(int), debug=False, imgs = -1):
  filepath = os.path.join(data_filepath, parquet_file)
  df = pq.read_table(filepath)
  
  class_to_filename = {k: os.path.join(data_root,v) for k,v in class_to_filename.items()}
  i = 0
  counters = starts

  os.makedirs(data_root,exist_ok=True)
  for k, v in class_to_filename.items():
    os.makedirs(v, exist_ok=True)

  for batch in tqdm(df.to_batches(max_chunksize=None)):
    _dict = batch.to_pydict()
    x, y, m0, pt = np.array(_dict["X_jets"])[0], _dict["y"][0], _dict["m0"][0], _dict["pt"][0]

    x = x.transpose(1, 2, 0)
    x = sparse.COO(x)

    filename = os.path.join(class_to_filename[y], f"{counters[y]}.npz")
    i += 1
    counters[y] += 1
    
    sparse.save_npz(filename, x)
    
    if imgs != -1 and i == imgs:
        break

    if debug:
      print("")
      print(y)
      print(f"saving file to {filename}")
      if i % 1 == 0:
       break

  return counters


In [6]:
parquet_files = ["QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272.test.snappy.parquet", 
                 "QCDToGGQQ_IMGjet_RH1all_jet0_run1_n47540.test.snappy.parquet",
                 "QCDToGGQQ_IMGjet_RH1all_jet0_run2_n55494.test.snappy.parquet"]

if CREATE_DATASET_FROM_PARQUET:
  saved = save_as_np_images(parquet_files[0], 
                    np_images_filepath,
                    class_to_label)

100%|██████████| 36272/36272 [22:00<00:00, 27.47it/s]


In [12]:
print(len(os.listdir("data/Task2/Gluon")))
print(len(os.listdir("data/Task2/Quark")))

18136
18136


In [13]:
print(data_path_img)

/content/drive/MyDrive/datasets/ML4SCI_GSOC23


In [14]:
!zip -rq Task2.zip data/

In [10]:
!rsync -Prv Task2.zip /content/drive/MyDrive/datasets/ML4SCI_GSOC23/Task2

sending incremental file list
Task2.zip
    130,457,996 100%   72.57MB/s    0:00:01 (xfr#1, to-chk=0/1)

sent 130,489,934 bytes  received 35 bytes  52,195,987.60 bytes/sec
total size is 130,457,996  speedup is 1.00


In [11]:
!ls /content/drive/MyDrive/datasets/ML4SCI_GSOC23/Task2

Task2.zip


In [18]:
def get_statistics_of_data(parquet_file, starts = defaultdict(int), debug=False, imgs = -1):
  filepath = os.path.join(data_filepath, parquet_file)
  df = pq.read_table(filepath)
  
  total = 0
  non_zero = 0
  i = 0
  for batch in tqdm(df.to_batches(max_chunksize=None)):
    _dict = batch.to_pydict()
    x, y, m0, pt = np.array(_dict["X_jets"])[0], _dict["y"][0], _dict["m0"][0], _dict["pt"][0]
    
    x = x.transpose(1, 2, 0)
    non_zero += (x != 0).reshape(3, -1).sum(axis=1)
    total +=  np.array([1,1,1]) * (np.prod(x.shape) / 3)
    i += 1

  return {"total" : total, "non_zero" : non_zero}


In [19]:
stats = get_statistics_of_data(parquet_files[0])
print(stats)

100%|██████████| 36272/36272 [14:22<00:00, 42.08it/s]


{'total': array([5.6675e+08, 5.6675e+08, 5.6675e+08]), 'non_zero': array([ 5096054, 19670601,  4981334])}


In [21]:
100 * stats["non_zero"] / stats["total"]

array([0.89917142, 3.47077212, 0.87892969])

In [37]:
print(data_path_img)

data


In [15]:
sparse.load_npz

<module 'sparse' from '/usr/local/lib/python3.8/dist-packages/sparse/__init__.py'>