In [1]:
# general imports
import os, glob, numpy as np, pandas as pd
import matplotlib.pyplot as plt
import time
# Global matplotlib defaults for every figure in this notebook:
# large canvas and a bigger base font, set in a single call.
plt.rcParams.update({
    'figure.figsize': [25, 15],
    'font.size': 18,
})

In [2]:
# function to load data from multiple input files
def load_dataframe_from_files(dirin, fileprefix):
    """Load every CSV file matching a pattern into a single DataFrame.

    Parameters
    ----------
    dirin : str
        Directory that contains the input files.
    fileprefix : str
        File name or glob pattern (e.g. ``"Ar39_*.csv"``); it is joined to
        ``dirin`` and expanded with ``glob``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matching files, in sorted path order.
        Text following ``#`` is treated as a comment by the CSV parser.
        The row index of each file is kept as read (it is not reset).

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern.
    """
    # glob returns paths in arbitrary order: sort them so that the row order
    # of the result is reproducible from run to run.
    files = sorted(glob.glob(os.path.join(dirin, fileprefix)))
    print("[Info] Loading {} files wt prefix:\n{}".format(len(files), fileprefix))
    if not files:
        raise FileNotFoundError(
            "No file matching '{}' found in '{}'".format(fileprefix, dirin)
        )
    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies the accumulated rows at every iteration.
    frames = []
    for n_loaded, file in enumerate(files):
        if n_loaded > 0:
            print(".", end='')   # progress marker, one per additional file
        frames.append(pd.read_csv(file, comment='#', index_col=False))
    print("")
    return pd.concat(frames)

In [3]:
# Load Ar39 Data
import numpy as np

dirin = "../data/Ar39/dataset_all1ar39"    # directory the input CSV files are read from
infilename = "Ar39_1Pileup_2000000.csv"    # file name (or glob pattern) looked up inside dirin

# Read every matching file into one DataFrame
df_ar39 = load_dataframe_from_files(dirin, infilename)
# Largest value of the `pedetected` column over all loaded rows
maxPEar39 = df_ar39.pedetected.max()
print("[Info] The Ar39 decays have maximum {} PE detected".format(maxPEar39))

[Info] Loading 1 files wt prefix:
Ar39_1Pileup_2000000.csv

[Info] The Ar39 decays have maximum 51 PE detected


# Full cylinder features
Considering the 72 slices, we propose the following features:
#### Features on Detection:
- NPE: total number of PE detected
- NActiveSlices: total number of slices with NPE>0
- MeanNPE: mean number of PE detected per slice
- StdNPE: standard deviation of the per-slice NPE

#### Features on Spatial Distribution:
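- Range: difference between the highest and the lowest ID among the active slices
- Var: variance of the slice IDs (each ID repeated once per PE detected in that slice)
- Std: standard deviation of the slice IDs (each ID repeated once per PE detected in that slice)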

In [4]:
# Keep every column from the third one onward. The first two columns are
# presumably per-event metadata rather than per-slice counts — TODO confirm
# against the CSV header.
df_slices = df_ar39.iloc[:, 2:]
# Remove the name df_ar39 from the namespace. NOTE: because of this, running
# this cell a second time raises NameError until the loading cell is re-run.
del df_ar39

In [None]:
# compute npe
# Benchmark: total number of PE per event (row-wise sum over all slices),
# timed on the first 10000 rows. time.perf_counter() is used because it is
# the monotonic, high-resolution clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump.
# NUMPY
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    row.to_numpy().sum()
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

# PANDAS
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    row.sum()
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

In [None]:
# compute max detection
# Benchmark: largest per-slice PE count of each event, timed on the first
# 10000 rows. time.perf_counter() is the monotonic, high-resolution clock
# intended for measuring elapsed time (time.time() follows the wall clock).
# NUMPY
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    row.to_numpy().max()
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

# PANDAS
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    row.max()
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

In [None]:
# compute mean detection
# Benchmark: mean PE count per slice of each event, timed on the first
# 10000 rows. time.perf_counter() is the monotonic, high-resolution clock
# intended for measuring elapsed time (time.time() follows the wall clock).
# NUMPY
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    np.mean(row.to_numpy())
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

# PANDAS
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    row.mean()
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

In [None]:
# compute std detection
# Benchmark: standard deviation of the per-slice PE counts of each event,
# timed on the first 10000 rows with time.perf_counter() (monotonic,
# high-resolution clock intended for measuring elapsed time).
# NUMPY
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    np.std(row.to_numpy())
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

# PANDAS
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    # np.std defaults to the population formula (ddof=0) while Series.std
    # defaults to the sample formula (ddof=1): pass ddof=0 explicitly so that
    # both timed variants compute the same quantity.
    row.std(ddof=0)
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

In [None]:
# compute nr active slices
# Benchmark: number of slices with at least one PE, timed on the first 10000
# rows with time.perf_counter() (monotonic, high-resolution clock intended
# for measuring elapsed time).
# NOTE: the numpy variant counts non-zero entries while the pandas variant
# counts entries > 0; the two agree as long as the counts are never negative.
# NUMPY
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    np.nonzero(row.to_numpy())[0].shape[0]
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

# PANDAS
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    len(row[row>0])
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

In [None]:
# compute range
# Benchmark: distance between the highest and the lowest active slice ID of
# each event, timed on the first 10000 rows with time.perf_counter()
# (monotonic, high-resolution clock intended for measuring elapsed time).
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    # Locate the active slices once instead of evaluating the mask twice.
    active_ids = np.flatnonzero(row.to_numpy() > 0)
    # An event without any detected PE has no active slice: use a range of 0
    # rather than failing with IndexError on the empty index array.
    active_ids[-1] - active_ids[0] if active_ids.size else 0
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

In [None]:
# compute var of ids
# Benchmark: variance of the slice IDs of each event, where every ID is
# repeated once per PE detected in that slice. Timed on the first 10000 rows
# with time.perf_counter() (monotonic, high-resolution clock intended for
# measuring elapsed time).
# NOTE: an event without any PE yields an empty population, for which np.var
# returns nan and emits a RuntimeWarning.
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    rownp = row.to_numpy()
    active = rownp > 0
    # flatnonzero gives the 1-D IDs directly (argwhere returns a 2-D column
    # that np.repeat would have to flatten implicitly).
    np.var(np.repeat(np.flatnonzero(active), rownp[active]))
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    # Pure-Python construction of the same population; a dedicated name for
    # the slice index avoids reusing the outer loop variable `i`.
    id_population = [
        slice_id
        for slice_id in range(len(row))
        for _ in range(int(row.iloc[slice_id]))
    ]
    np.var(id_population)
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

In [None]:
# compute std dev of ids
# Benchmark: standard deviation of the slice IDs of each event, where every
# ID is repeated once per PE detected in that slice. Timed on the first 10000
# rows with time.perf_counter() (monotonic, high-resolution clock intended
# for measuring elapsed time).
# NOTE: an event without any PE yields an empty population, for which np.std
# returns nan and emits a RuntimeWarning.
t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    rownp = row.to_numpy()
    active = rownp > 0
    # flatnonzero gives the 1-D IDs directly (argwhere returns a 2-D column
    # that np.repeat would have to flatten implicitly).
    np.std(np.repeat(np.flatnonzero(active), rownp[active]))
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))

t = time.perf_counter()
for i, row in df_slices.iloc[0:10000, :].iterrows():
    # Pure-Python construction of the same population; a dedicated name for
    # the slice index avoids reusing the outer loop variable `i`.
    id_population = [
        slice_id
        for slice_id in range(len(row))
        for _ in range(int(row.iloc[slice_id]))
    ]
    np.std(id_population)
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

# Quadrant-based Features
The above features can also be computed per quadrant, where a *quadrant* is a contiguous subsequence of slices (wrapping around the end of the cylinder when needed). In other words, we can decompose the cylinder into a certain number of quadrants, possibly overlapping.

In [5]:
# Quadrant extraction: build `nshiftings` quadrants of `quadrant_width`
# consecutive slices each, the k-th one starting at column k*shift and
# wrapping around to column 0 when it runs past the last slice.
# The same extraction is done with pandas and with numpy to compare timings.
nslices=72
nshiftings=4
quadrant_width=36

# A quadrant wider than the cylinder would wrap onto its own starting columns.
assert 0 < quadrant_width <= nslices
shift = nslices // nshiftings   # derived

# PANDAS
t = time.perf_counter()   # monotonic clock intended for measuring elapsed time
df_quadrants_pd = []
for i_shift in range(nshiftings):
    first_col = i_shift*shift
    end_col = first_col + quadrant_width
    quadrant = df_slices.iloc[:, first_col:end_col]
    # Wrap only when the quadrant really extends past the last column; when it
    # ends exactly on it there is nothing to append.
    if end_col > nslices:
        quadrant = pd.concat([quadrant, df_slices.iloc[:, :end_col - nslices]], axis=1)
    df_quadrants_pd.append(quadrant)
print("Time wt Pandas: {:5f} sec".format(time.perf_counter() - t))

# NUMPY
t = time.perf_counter()
df_quadrants = []
df_numpy = df_slices.to_numpy()
for i_shift in range(nshiftings):
    first_col = i_shift*shift
    end_col = first_col + quadrant_width
    quadrant = df_numpy[:, first_col:end_col]
    if end_col > nslices:
        quadrant = np.concatenate([quadrant, df_numpy[:, :end_col - nslices]], axis=1)
    df_quadrants.append(quadrant)
print("Time wt Numpy: {:5f} sec".format(time.perf_counter() - t))


Time wt Pandas: 0.160734 sec
Time wt Numpy: 0.174299 sec


In [None]:
# Check that the pandas and the numpy extraction agree, quadrant by quadrant.
# Comparing a single array with `==` against the whole list of quadrants
# would broadcast it against every quadrant at once and display a very large
# boolean array instead of a yes/no answer.
all(
    np.array_equal(quadrant_pd.to_numpy(), quadrant_np)
    for quadrant_pd, quadrant_np in zip(df_quadrants_pd, df_quadrants)
)

In [None]:
# Display columns 0..35 of the numpy data for visual inspection; this is the
# column span taken by the first quadrant (i_shift=0, quadrant_width=36).
df_numpy[:, 0:36]