In [1]:
import warnings

warnings.filterwarnings('ignore')
## We will be using Numpy, Pyplot and Tensorflow as our scientific tool box
import numpy as np 
import matplotlib.pyplot as plt
import tensorflow as tf

## BytesIO for defining in-memory file-like objects
from io import BytesIO

## Dask and in particular dask array for defining OOM pipelines
import dask
import dask.array as da

## Progress bars
from tqdm import tqdm

import mlflow

2024-12-05 16:04:30.640950: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 16:04:30.684155: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-05 16:04:30.684237: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-05 16:04:30.684265: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 16:04:30.692318: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 16:04:30.693621: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [None]:
## Reference to the trained classifier: it is passed as `artifact_uri` to
## mlflow.artifacts.download_artifacts further down in this notebook.
## NOTE(review): presumably a notebook parameter overridden at run time - confirm
model="classifier.keras"

In [2]:
! wget "https://agenda.infn.it/event/40829/attachments/120545/191628/object_list%20(1).csv"
! wget "https://pandora.infn.it/public/cdf340/dl/soscdata.zip"
! rm -fr input
! mkdir -p input && cd input && unzip ../soscdata.zip
! mkdir -p downloads

object_names = []
with open('object_list (1).csv') as f:
    object_names = [x.strip("\n") for x in f.readlines()]

--2024-12-05 16:04:33--  https://agenda.infn.it/event/40829/attachments/120545/191628/object_list%20(1).csv%20
Resolving agenda.infn.it (agenda.infn.it)... 131.154.52.174
Connecting to agenda.infn.it (agenda.infn.it)|131.154.52.174|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: /event/40829/attachments/120545/191628/object_list%20(1).csv [following]
--2024-12-05 16:04:33--  https://agenda.infn.it/event/40829/attachments/120545/191628/object_list%20(1).csv
Reusing existing connection to agenda.infn.it:443.
HTTP request sent, awaiting response... 200 OK
Length: 170142 (166K) [text/csv]
Saving to: ‘object_list (1).csv .1’


2024-12-05 16:04:33 (99.2 MB/s) - ‘object_list (1).csv .1’ saved [170142/170142]

--2024-12-05 16:04:33--  https://pandora.infn.it/public/cdf340/dl/soscdata.zip
Resolving pandora.infn.it (pandora.infn.it)... 131.154.52.50
Connecting to pandora.infn.it (pandora.infn.it)|131.154.52.50|:443... connected.
HTTP request sent, awaiting response

In [4]:
def load_npz_from_minio(object_name):
    """Open the npz file named `object_name` stored in the local `input/` directory.

    The result of `np.load` is returned as is; the file name is appended
    verbatim to the "input/" prefix.
    """
    npz_path = "input/" + object_name
    return np.load(npz_path)


def inspect_np(np_file):
    """Print a summary of an npz-like mapping.

    The first printed line lists all the keys; it is followed by one row per
    key reporting the shape and the dtype of the array stored under it.
    Nothing is returned.
    """
    names = np_file.keys()
    print("Keys in file: ", ", ".join(names))

    ## Fixed-width columns keep the rows aligned across keys
    row = " - {:<15s}   shape: {:<20s}   dtype: {}"
    for name in names:
        entry = np_file[name]
        print(row.format(name, str(entry.shape), entry.dtype))

## Open the last object of the list and show which arrays it contains
last_object = object_names[-1]
npz_file = load_npz_from_minio(last_object)
print(npz_file)
inspect_np(npz_file)

NpzFile 'input/data-chunk-2023-10-18T20:44:35.884254.npz' with keys: image, tstamp
Keys in file:  image, tstamp
 - image             shape: (10, 128, 128)         dtype: uint8
 - tstamp            shape: (10,)                  dtype: datetime64[us]


In [None]:
## Fetch the classifier artifact referenced by `model` into ./downloads
local_model_path = mlflow.artifacts.download_artifacts(
    artifact_uri=model, dst_path="./downloads"
)
print(local_model_path)

## download_artifacts returns the local path of either a single file or a
## directory, depending on what the artifact URI points to. The file name is
## appended only when a directory was downloaded; otherwise the returned path
## is taken to be the model file itself.
if tf.io.gfile.isdir(local_model_path):
    classifier_path = f"{local_model_path}/classifier.keras"
else:
    classifier_path = local_model_path
classifier = tf.keras.models.load_model(classifier_path)

In [None]:
## Solution

@dask.delayed
def load_array_from_minio(object_name, npz_key):
    """Lazily load one array from an npz file.

    Arguments:
     - object_name: name of the npz file, as accepted by load_npz_from_minio
     - npz_key:     key of the array to extract from the npz file

    Returns:
     - the array stored under npz_key; because of the dask.delayed decorator
       a call returns a delayed object that is evaluated on demand
    """
    ## Use the npz file as a context manager so that its handle is released
    ## as soon as the requested array has been read
    with load_npz_from_minio(object_name) as npz:
        return npz[npz_key]

## The shape and dtype passed to from_delayed are metadata declared up front;
## they follow what the inspection of a sample file reports:
## image (10, 128, 128) uint8 and tstamp (10,) datetime64[us].
## NOTE(review): assumes every object holds exactly 10 entries - confirm
delayed_images = [
    da.from_delayed(
        load_array_from_minio(obj, 'image'),
        shape=(10, 128, 128),
        dtype=np.uint8
    )
    for obj in object_names
]

images = da.concatenate(delayed_images)


### 
delayed_tstamps = [
    da.from_delayed(
        load_array_from_minio(obj, 'tstamp'),
        shape=(10,),
        dtype='datetime64[us]'
    )
    for obj in object_names
]

tstamps = da.concatenate(delayed_tstamps)

display(images)
display(tstamps)

In [None]:
## Solution

## Rechunk the images in batches of 10 complete frames
rechunked_images = images.rechunk((10, -1, -1))
n_batches = rechunked_images.numblocks[0]

## Run the CNN on one block at a time, flattening the response of each batch
batch_outputs = []
for batch in tqdm(rechunked_images.blocks, total=n_batches):
    batch_outputs.append(classifier.predict_on_batch(batch).flatten())

## Join the per-batch responses into a single 1D array
predictions = np.concatenate(batch_outputs)

In [None]:
## Solution
def plot_histogram(predictions):
    """Draw the distribution of the CNN response on the CYGNO-SIM acquired data.

    The response is histogrammed in 50 equal-width bins spanning [0, 1] and
    the number of events is shown on a logarithmic scale.
    """
    response_bins = np.linspace(0, 1, 51)

    axes = plt.gca()
    axes.hist(predictions, bins=response_bins, label="CYGNO-SIM")
    axes.set_yscale('log')
    axes.set_xlabel("Response of the CNN")
    axes.set_ylabel("Number of acquired events")
    axes.legend()
    plt.show()

plot_histogram(predictions)

In [None]:
def make_timestamp_histogram(
    timestamps, 
    predictions,
    threshold,
    bin_width=1,
):
    """
    Creates a histogram of the selected events in bins of about `bin_width` minutes.

    Arguments:
     - timestamps:  dask array (anything exposing `.compute()`) or numpy array
                    of type np.datetime64
     - predictions: numpy array of the classifier response with the same 
                    shape as timestamps
     - threshold:   float, threshold in range [0,1]; events whose classifier
                    response is strictly above it are selected as NR candidates
     - bin_width:   float, approximate dimension of a bin in minutes, default: 1

    Returns:
     - a tuple of 1D arrays (time_in_minutes, number_of_selected_events);
       both arrays are empty when no event passes the threshold

    Raises:
     - ValueError if bin_width is not strictly positive
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be strictly positive")

    ## Obtain the timestamps as a numpy array (dask arrays are evaluated here)
    if hasattr(timestamps, "compute"):
        np_tstamps = timestamps.compute()
    else:
        np_tstamps = np.asarray(timestamps)

    ## Select only the timestamps associated to a positive response
    ## of the CNN classifier
    t = np_tstamps[np.asarray(predictions) > threshold]

    ## Nothing selected: there is no time span to histogram
    if t.size == 0:
        return np.array([], dtype=float), np.array([], dtype=int)

    ## Retrieve the first timestamp as "START"
    t0 = np_tstamps[0]

    ## Convert the timestamp in a number of minutes since START
    minutes_since_start = (t-t0)/np.timedelta64(1, 'm')

    ## Compute the end of the time span as LATEST
    tot_minutes = minutes_since_start.max()

    ## Compute the number of bins as the first integer exceeding 
    ## the total number of minutes divided per the width of each bin;
    ## at least one bin is needed when every selected event sits at START
    bins = max(1, int(np.ceil(tot_minutes / bin_width)))

    ## Fill a histogram of "minutes since start"; the bin edges span the
    ## range of the selected events
    n_selected_events, boundaries = np.histogram(minutes_since_start, bins=bins)

    ## Given the boundaries obtained from the histogram, compute the 
    ## center of each bin
    time_in_minutes = 0.5 * (boundaries[1:] + boundaries[:-1])

    ## Return the tuple with the center of the bin and the number of selected
    ## events falling in that bin
    return time_in_minutes, n_selected_events

##############
## Histogram of the counts of the events whose classifier response exceeds
## 0.9, i.e. those more likely to be due to nuclear recoil
t, counts = make_timestamp_histogram(tstamps, predictions, threshold=0.9)

## Uncertainty on the counts, assuming Poissonian distribution of the
## contents of each bin
error = np.sqrt(counts)

## Draw the counts with their error bars as a function of time
plt.figure(figsize=(12,3))
plt.xlabel("Time [m]")
plt.ylabel("Selected events")
plt.errorbar(t, counts, yerr=error, fmt='ko')

In [None]:
## Trainable parameters of the model b + s * (1 + sin(2 pi x / T + phi)),
## with b and s initialised from the observed range of the counts
b = tf.Variable(float(np.min(counts)))
s = tf.Variable(float(np.max(counts) - np.min(counts))/2)
## Initial guess for the period (same unit as the time axis) and the phase
T = tf.Variable(8.)
phi = tf.Variable(1.2)

## Poissonian variance of each bin, floored at 1 so that empty bins do not
## cause a division by zero in the chi2
count_variance = np.maximum(counts, 1).astype(np.float32)

def fitfun(x):
    """Evaluate the sinusoidal model at `x` with the current parameter values."""
    return b + s * (1 + tf.sin(2*np.pi * x/T + phi))

def chi2():
    """Sum over the bins of the squared residuals divided by the bin variance."""
    return tf.reduce_sum((counts - fitfun(t))**2 / count_variance)

optimizer = tf.keras.optimizers.Adam(3e-2)

## Minimise the chi2, recording its value after each optimisation step
losses = []
for epoch in range(1000):
    optimizer.minimize(chi2, var_list=[b, s, T, phi])
    losses.append (chi2().numpy())

## Show the evolution of the chi2 during the optimisation
plt.plot(losses)
plt.xlabel("Epoch")
plt.ylabel("chi2")
plt.show()

In [None]:
## Draw the measured counts with their Poissonian error bars
plt.figure(figsize=(12, 3))
plt.errorbar(t, counts, yerr=np.sqrt(counts), fmt='ko', label="CYGNO-SIM data")

## Overlay the fitted model, drawn on top of the data points
plt.plot(t, fitfun(t), color='red', linewidth=3, zorder=10, label="Fitted model")

plt.xlabel("Time [m]")
plt.ylabel("Selected events")
plt.legend()
plt.show()