# Extra dependencies installation

# Imports

In [5]:
import os
from tqdm import tqdm
import pyarrow.parquet as pq
from collections import defaultdict, Counter
import numpy as np

# Setup dataset 

In [2]:
# Detect Google Colab by probing for the `google.colab` package.
# Only an import failure means "not on Colab"; any other exception
# (including KeyboardInterrupt) is left to propagate instead of being swallowed.
try:
  import google.colab
  COLAB_ENV = True
except ImportError:
  COLAB_ENV = False


# Choose the dataset directory for the current environment.
if not COLAB_ENV:
  # Local run: datasets are read from a relative "data" directory.
  data_filepath = "data"
else:
  # Colab run: mount Google Drive and read the datasets from it.
  from google.colab import drive
  drive.mount('/content/drive')
  data_filepath = "/" + os.path.join("content", "drive", "MyDrive", "datasets", "ML4SCI_GSOC23")


# Last expression of the cell: display the directory contents.
os.listdir(data_filepath)

Mounted at /content/drive


['SinglePhotonPt50_IMGCROPS_n249k_RHv1.hdf5',
 'SingleElectronPt50_IMGCROPS_n249k_RHv1.hdf5',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272.test.snappy.parquet',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run1_n47540.test.snappy.parquet',
 'QCDToGGQQ_IMGjet_RH1all_jet0_run2_n55494.test.snappy.parquet',
 'models',
 'tensorboard',
 'gravitational-lensing']

# Important Hyperparameters

In [3]:
# Whether to run the parquet -> .npy export step.
CREATE_DATASET_FROM_PARQUET = True

# Root directory that receives the exported .npy images.
np_images_filepath = os.path.join(data_filepath, "Task2")

# Integer class id -> class name.
class_to_label = {0: "Gluon", 1: "Quark"}

# Dataset Creation

In [6]:
def read_and_process_parquet(name):
  """Print a short report for one parquet file located under `data_filepath`.

  The report consists of a 64-dash rule, the file name, the table schema
  and a `Counter` over the entries of the `y` column. Nothing is returned.
  """
  # Header goes out before the file is read, so it is shown even if reading fails.
  print("-" * 64, name, sep="\n")

  table = pq.read_table(os.path.join(data_filepath, name))
  print(table.schema)

  label_counts = Counter(list(table['y']))
  print(label_counts)


def read_parquet(name):
  """Load the parquet file `name` from `data_filepath` and return the pyarrow table."""
  return pq.read_table(os.path.join(data_filepath, name))

def save_as_np_images(parquet_file, data_root, class_to_filename, starts=None, debug=False, imgs=-1):
  """Export the images of one parquet file as individual `.npy` files.

  Each row's `X_jets` entry is converted to a NumPy array, its first axis is
  moved to the end (presumably channels-first -> channels-last; confirm
  against the dataset) and the result is saved as
  `<data_root>/<class_to_filename[y]>/<index>.npy`, where `index` is a
  per-class running counter.

  Args:
    parquet_file: File name of the parquet file, relative to `data_filepath`.
    data_root: Directory that receives one sub-directory per class.
    class_to_filename: Mapping from the value of the `y` column to the name
      of the class sub-directory.
    starts: Optional mapping class -> first index to use, e.g. the counters
      returned by a previous call so that several files can be exported into
      the same directories without overwriting. It is copied, never mutated.
      `None` (default) starts every class at 0.
    debug: If true, print the label and target path of the first saved image
      and stop right after it.
    imgs: Maximum number of images to save; `-1` means no limit.

  Returns:
    A `defaultdict(int)` mapping each class to its next free index.
  """
  table = pq.read_table(os.path.join(data_filepath, parquet_file))

  class_dirs = {label: os.path.join(data_root, name) for label, name in class_to_filename.items()}
  os.makedirs(data_root, exist_ok=True)
  for class_dir in class_dirs.values():
    os.makedirs(class_dir, exist_ok=True)

  # Fresh counters on every call: a shared mutable default (or mutating the
  # caller's mapping) would make re-runs continue from stale indices.
  counters = defaultdict(int)
  if starts is not None:
    counters.update(starts)

  saved = 0
  # max_chunksize=1 yields single-row batches, so reading index 0 below never
  # skips rows, whatever the chunk layout of the file is.
  for batch in tqdm(table.to_batches(max_chunksize=1)):
    if batch.num_rows == 0:
      continue

    row = batch.to_pydict()
    label = row["y"][0]
    image = np.array(row["X_jets"])[0].transpose(1, 2, 0)

    filename = os.path.join(class_dirs[label], f"{counters[label]}.npy")
    np.save(filename, image)
    counters[label] += 1
    saved += 1

    if imgs != -1 and saved == imgs:
      break

    if debug:
      print("")
      print(label)
      print(f"saving file to {filename}")
      # Debug mode only inspects the first image.
      break

  return counters


In [7]:
# Parquet files expected under `data_filepath`.
parquet_files = [
  "QCDToGGQQ_IMGjet_RH1all_jet0_run0_n36272.test.snappy.parquet",
  "QCDToGGQQ_IMGjet_RH1all_jet0_run1_n47540.test.snappy.parquet",
  "QCDToGGQQ_IMGjet_RH1all_jet0_run2_n55494.test.snappy.parquet",
]

# Only the first file is exported here; `saved` keeps the per-class counters
# returned by the export.
if CREATE_DATASET_FROM_PARQUET:
  saved = save_as_np_images(parquet_file=parquet_files[0],
                            data_root=np_images_filepath,
                            class_to_filename=class_to_label)

100%|██████████| 36272/36272 [24:48<00:00, 24.37it/s]
