# Setup

In [43]:
import numpy as np
import math

from PIL import Image
import glob, os

Need to use `.item()` as per https://stackoverflow.com/questions/24565916/why-is-numpy-shape-empty (and also Frank's message)

Also note that `mat1` contains only protons, and `mat2` contains only iron nuclei.

In [210]:
# Each .npy file holds one dict wrapped in a 0-d object array, so `.item()` is
# needed to unwrap it. Object arrays are stored pickled, and NumPy >= 1.16.3
# refuses to unpickle them unless `allow_pickle=True` is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
mat1 = np.load('../data/sim_12360_00.npy', allow_pickle=True).item()
mat2 = np.load('../data/sim_12362_00.npy', allow_pickle=True).item()

# Exploration

In [211]:
# List the top-level keys of the loaded dict.
mat1.keys()

dict_keys(['Charges', 'Energy', 'File_info', 'dir_reco', 'core_MC', 'Gain', 'core_reco', 'Position', 'dir_MC', 'Fit_status', 'Composition'])

Each index in each array corresponds to one cosmic ray.
* Charges: dictionaries, where each dictionary has about (but not exactly!) 6-8 values. Each key is a sensor and each value is the value that sensor gathered. Presumably the sensors that did not receive a significant signal are not included.
* Energy: one value per event. Unclear what that value is.
* File_info: some information about the file that the event was drawn from 
* Gain: a list with one item-- a dictionary from sensor to whether it is a high-gain or low-gain sensor. 
* Position: the position of each sensor. Consists of an x-position array and a y-position array, but it is unclear what order they are in. 
* Fit_status: mostly `'OK'` with some `'InsufficientHits'` and `'FailedToConverge'`. Unclear where that comes into play.
* Composition: a string representing the composition of each ray.

New fields:
* `dir_MC`
* `dir_reco`: (Occasional `'NaN'`s)
* `core_MC`
* `core_reco`

In [212]:
def createImgArray(data):
    """Pair every recorded charge with the coordinates of its sensor.

    Args:
        data: mapping with a ``'Charges'`` sequence (one sensor -> charge
            dict per event) and a ``'Position'`` sequence whose items 0 and 1
            are sensor -> coordinate dicts.

    Returns:
        A list with one entry per event. Each entry is a list of
        ``[sensor, charge, Position[0][sensor], Position[1][sensor]]`` rows.
        A sensor whose position lookup raises ``KeyError`` is skipped, so an
        entry may be empty.
    """
    allEvents = []
    for charges in data['Charges']:
        rows = []
        for sensor in charges.keys():
            try:
                # The position lookups stay inside the try block: a sensor
                # missing from either coordinate dict is dropped silently.
                row = [
                    sensor,
                    charges[sensor],
                    data['Position'][0][sensor],
                    data['Position'][1][sensor],
                ]
            except KeyError:
                continue
            rows.append(row)
        allEvents.append(rows)
    return allEvents

def generatePictures(imgArray, length, name):
    """Render the first ``length`` events and save each one as a PNG file.

    Args:
        imgArray: per-event row lists, indexed from 0 to ``length - 1``.
        length: number of events to render.
        name: file-name prefix; event ``i`` is written to
            ``"<name>_<i>.png"`` with ``i`` zero-padded to three digits.

    Returns:
        The rendered images, in event order.
    """
    pictures = []
    for index in range(length):
        picture = convertChargeToPicture(imgArray[index])
        picture.save("{}_{:0>3d}.png".format(name, index))
        pictures.append(picture)

    return pictures

def convertChargeToPicture(event):
    """Draw one event as a single-channel ("L" mode) image.

    Args:
        event: iterable of rows where ``row[1]`` is the charge and
            ``row[2]`` / ``row[3]`` are the two sensor coordinates (the row
            layout built by ``createImgArray``).

    Returns:
        A new image in which every sensor is painted as a small square whose
        pixel value is ``round(charge * 100)``.
    """
    # Position[0] has a range of [-553.5,604.2] and Position[1] has a range of [-497.9,503.6]
    margin = 10
    # Shifts move the most negative coordinate to at least `margin` pixels in.
    shiftX, shiftY = 554 + margin, 498 + margin
    extentX, extentY = 605 + margin, 504 + margin
    halfX, halfY = 2, 2

    canvas = Image.new("L", (shiftX + extentX, shiftY + extentY))
    pixels = canvas.load()
    for row in event:
        value = round(row[1] * 100)
        baseX = row[2] + shiftX
        baseY = row[3] + shiftY
        # range(-2, 2) covers offsets -2..1, i.e. a 4x4 pixel square.
        for dx in range(-halfX, halfX):
            for dy in range(-halfY, halfY):
                pixels[baseX + dx, baseY + dy] = value

    return canvas

def createImgSet(data, length, name):
    """Build the per-event rows from ``data`` and render the first ``length`` events.

    Chains ``createImgArray`` and ``generatePictures``; ``name`` is passed
    through as the file-name prefix. Returns the list of rendered images.
    """
    return generatePictures(createImgArray(data), length, name)

In [213]:
# Render the first 5 events of mat2; generatePictures saves them as
# Fe_000.png ... Fe_004.png relative to the working directory.
images = createImgSet(mat2, 5, "Fe")

In [9]:
# sortedArray = sorted(imgArray)

In [10]:
# sensorCount = {}
# for event in sortedArray:
#     if sensorCount.get(event[0]) == None:
#         sensorCount[event[0]] = 1
#     else:
#         sensorCount[event[0]] += 1

In [209]:
# sensorCount

## Questions

* Do the x/y positions correlate to some real-world data? Is it possible to, for example, normalize to all positive?
* What does `mat1['File_info']` contain? My instinct is some information about the original simulation files that this was drawn from but perhaps not.
* Is it the case that sensors that did not see any signal are not included in `'Charges'`?
* What is stored under `Energy`?
* Is the `position` array `[x_dict, y_dict]` or `[y_dict, x_dict]`?
* Do we have access to the actual/calculated initial conditions information (angle/center) or is that something we need to calculate? If the latter, is there a script somewhere?

# Build a gain-differentiated dataset

Find all the sensors that are high/low gain

In [16]:
# Split the sensor names (keys of Position[0]) by their entry in Gain[0].
hgain = [sensor for sensor in mat1['Position'][0].keys() if mat1['Gain'][0][sensor] == 'High']
lgain = [sensor for sensor in mat1['Position'][0].keys() if mat1['Gain'][0][sensor] == 'Low']

Construct a dictionary that goes from name -> column index in the matrix we're about to build

In [17]:
#build a name->index dict
def get_index_dict(sensors):
    """Map each sensor name to its position in ``sensors``.

    Args:
        sensors: ordered iterable of hashable sensor names.

    Returns:
        dict of ``name -> index``. If a name occurs more than once, the last
        index wins.
    """
    # Iterate the argument itself: the previous version looped over the global
    # `hgain` list, so every call returned the high-gain mapping regardless of
    # which sensors were passed in.
    return {name: index for index, name in enumerate(sensors)}

In [18]:
# Build the name -> column-index lookups by calling get_index_dict once per
# gain group.
hgain_indices = get_index_dict(hgain)
lgain_indices = get_index_dict(lgain)

Build the matrix by setting up an empty one and populating it with the values according to the indices we created above. This is pretty slow atm but I'm sure there are ways to make it more efficient. Perhaps we don't need to make it dense in order to pop it into numpy? 

In [30]:
def _fill_event_rows(matrix, charges, column_of, row_offset, label):
    """Copy per-event sensor charges into consecutive rows of ``matrix``.

    Args:
        matrix: 2-D array, modified in place.
        charges: sequence of sensor -> charge dicts, one per event.
        column_of: dict mapping sensor name -> column index in ``matrix``.
        row_offset: row that receives the first event of ``charges``.
        label: value written to the last column of every filled row.
    """
    for i, event in enumerate(charges):
        row = row_offset + i
        for sensor, charge in event.items():
            if sensor not in column_of:
                # Sensor has no column in this matrix (not in this gain group).
                continue
            # NaN readings are stored as 0.
            matrix[row, column_of[sensor]] = 0 if math.isnan(charge) else charge
        matrix[row, -1] = label


# One row per event (mat1 events first, then mat2 events); one column per
# sensor in hgain_indices plus a trailing label column.
hgain_events = np.zeros((len(mat1['Composition']) + len(mat2['Composition']), len(hgain_indices) + 1))
# Label 1 marks mat1 events and label 0 marks mat2 events.
_fill_event_rows(hgain_events, mat1['Charges'], hgain_indices, 0, 1)
_fill_event_rows(hgain_events, mat2['Charges'], hgain_indices, len(mat1['Charges']), 0)

In [22]:
# Spot-check event 0: the value stored in the matrix should equal the raw
# charge for each sensor (printed side by side).
for key in mat1['Charges'][0].keys():
    print(key)
    print(hgain_events[0, hgain_indices[key]], mat1['Charges'][0][key])

8163
2.1687159538269043 2.1687159538269043
3563
0.814879298210144 0.814879298210144
8161
1.5435057878494263 1.5435057878494263
3763
0.42499926686286926 0.42499926686286926
3761
0.2992176115512848 0.2992176115512848
4661
0.9247297048568726 0.9247297048568726
4663
0.544894814491272 0.544894814491272
3561
0.3185567855834961 0.3185567855834961


In [28]:
# Inspect one full row of the matrix: the sensor columns followed by the
# trailing label column.
hgain_events[16529, :]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  8.47929478,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.05663168,  0.        ,  0.        ,  0.        ,
        1.85151255,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

# Split high-gain into training/test set

In [29]:
# Shuffle the rows in place with a locally seeded generator so that a fresh
# "Restart & Run All" always yields the same train/test split. A local
# RandomState leaves the global NumPy random state untouched.
# Re-running only this cell reshuffles the already-shuffled matrix.
SPLIT_SEED = 0
TRAIN_FRACTION = .9
np.random.RandomState(SPLIT_SEED).shuffle(hgain_events)
train_size = int(hgain_events.shape[0] * TRAIN_FRACTION)

# Both slices are views into hgain_events, not copies.
trainset = hgain_events[:train_size]
testset = hgain_events[train_size:]

In [38]:
# Shape of the training rows whose label column (last column) is 0.
trainset[trainset[:,-1]==0].shape

(13570, 163)

In [39]:
# Persist both splits as .npy files in the current working directory.
np.save('small_train.npy', trainset)
np.save('small_test.npy', testset)