# Setup

## Imports

In [16]:
import numpy as np
import math

## Useful Functions

In [2]:
def gen_matrix_single_data(data, feature_dict, direction=False):
    """Build a 2-D matrix with one row per event and a trailing label column.

    Args:
        data: dict read through the keys 'Charges' (one sensor -> charge
            mapping per event) and 'Composition' (one entry per event).
            'dir_MC' is read only when ``direction`` is True.
        feature_dict: mapping from feature name to column index. Sensors
            that are not keys of this mapping are skipped.
        direction: when True, elements 0 and 1 of each 'dir_MC' entry are
            written to the 'zenith' and 'azimuth' columns.

    Returns:
        numpy array of shape (len(data['Composition']), len(feature_dict) + 1).
        The last column is 0 when the composition is 'PPlus' and 1 otherwise.
    """
    n_rows = len(data['Composition'])
    n_cols = len(feature_dict) + 1
    event_matrix = np.zeros((n_rows, n_cols))

    for row, charges in enumerate(data['Charges']):
        # sensor charges: NaN readings are stored as 0
        for sensor in charges.keys():
            reading = charges[sensor]
            value = 0 if math.isnan(reading) else reading
            try:
                column = feature_dict[sensor]
            except KeyError:
                # this sensor is not one of the requested features
                continue
            event_matrix[row, column] = value

        # optional direction columns
        if direction:
            event_direction = data['dir_MC'][row]
            event_matrix[row, feature_dict['zenith']] = event_direction[0]
            event_matrix[row, feature_dict['azimuth']] = event_direction[1]

        # class label lives in the last column
        if data['Composition'][row] == 'PPlus':
            label = 0
        else:
            label = 1
        event_matrix[row, -1] = label
    return event_matrix

In [3]:
def get_index_dict(sensors, direction_data=False):
    """Construct a dictionary mapping feature name -> column index.

    Args:
        sensors: sized iterable of sensor names; a sensor's position in the
            iteration order becomes its column index.
        direction_data: when True, 'zenith' and 'azimuth' are assigned the
            two column indices immediately after the sensors.

    Returns:
        dict mapping each name to its column index.
    """
    # enumerate() works on any iterable, so dict key views are accepted too.
    name_index_dict = {name: index for index, name in enumerate(sensors)}
    if direction_data:
        first_direction_column = len(sensors)
        name_index_dict['zenith'] = first_direction_column
        name_index_dict['azimuth'] = first_direction_column + 1
    return name_index_dict

In [4]:
def get_all_data(list_of_files, keys, dirty_keys):
    """Load several .npy files and concatenate their contents into one dict.

    Each file is loaded with ``np.load(...).item()`` and passed through
    ``cleanup_data`` with ``dirty_keys``. For every key in ``keys`` the
    per-file values are then concatenated in file order.

    Args:
        list_of_files: paths of the .npy files to load.
        keys: keys to pull from each loaded dictionary.
        dirty_keys: keys whose values are passed to ``cleanup_list``.

    Returns:
        dict mapping each key in ``keys`` to a single concatenated list.
    """
    # allow_pickle=True is required to load object arrays on NumPy >= 1.16.3.
    # NOTE(security): unpickling can execute arbitrary code -- only load
    # files from a trusted source.
    list_of_dicts = [
        cleanup_data(np.load(filepath, allow_pickle=True).item(), dirty_keys)
        for filepath in list_of_files
    ]
    return {
        key: [item for file_dict in list_of_dicts for item in file_dict[key]]
        for key in keys
    }

def cleanup_data(data, keys_to_clean):
    """Replace data[key] with cleanup_list(data[key]) for each key to clean.

    The dictionary is modified in place and the same object is returned.
    """
    for dirty_key in keys_to_clean:
        paired_values = cleanup_list(data[dirty_key])
        data[dirty_key] = paired_values
    return data

def cleanup_list(li):
    """Group a flat list into consecutive pairs.

    Takes a list structured like [1a, 1b, 2a, 2b, 3a, 3b, ...] where a and b
    are sub-parts of a single piece of data, and returns
    [(1a, 1b), (2a, 2b), (3a, 3b), ...]. A trailing element without a
    partner (odd-length input) is dropped.
    """
    return [(li[start], li[start + 1]) for start in range(0, len(li) - 1, 2)]

In [5]:
def filter_data(data, filter_func, keys_to_filter):
    """Return a filtered copy of ``data`` according to ``filter_func``.

    Args:
        data: dict of per-event sequences. Values stored under keys that are
            not filtered must provide a ``copy()`` method.
        filter_func: callable ``filter_func(data, event_index)``; an event is
            kept when it returns a truthy value.
        keys_to_filter: sequence of keys whose values are filtered. The
            number of events is taken from the first of these keys.

    Returns:
        A new dict with the same keys as ``data``. Filtered keys hold lists
        of the kept elements; every other key holds a (shallow) copy of the
        original value. With an empty ``keys_to_filter`` every value is
        simply copied.
    """
    # Nothing to filter: the original raised IndexError on keys_to_filter[0].
    if not keys_to_filter:
        return {key: value.copy() for key, value in data.items()}

    # Evaluate the filter once per event, then reuse the kept indices.
    n_events = len(data[keys_to_filter[0]])
    kept_indices = [
        event_ind for event_ind in range(n_events) if filter_func(data, event_ind)
    ]

    new_data = {}
    for key, values in data.items():
        if key in keys_to_filter:
            new_data[key] = [values[event_ind] for event_ind in kept_indices]
        else:
            new_data[key] = values.copy()
    return new_data


## Load Data

In [6]:
# Keys pulled from every simulation file.
keys = (
    'Charges', 'Energy', 'File_info', 'dir_reco', 'core_MC', 'Gain',
    'core_reco', 'Position', 'dir_MC', 'Fit_status', 'Composition',
)
# Keys whose values are passed through cleanup_list when the files are loaded.
dirty_keys = ('dir_MC', 'core_MC', 'dir_reco', 'core_reco')
prefixes = ('../data/sim_12360_', '../data/sim_12362_')
# 20 files per prefix, numbered 00 through 19.
files = []
for prefix in prefixes:
    files.extend('{0}{1:02d}.npy'.format(prefix, index) for index in range(20))

In [7]:
# Load every simulation file and merge the selected keys into one dictionary.
data = get_all_data(list_of_files=files, keys=keys, dirty_keys=dirty_keys)

## Filter Sensor Charge features

In [8]:
# The sensor names are the keys of the first event's 'Gain' entry.
first_event_gain = data['Gain'][0]
all_sensors = list(first_event_gain.keys())

## Filter events

In [9]:
# Keys that are filtered event by event; every other key of the data
# dictionary is copied unchanged by filter_data.
keys_to_filter = (
    'Charges',
    'Energy',
    'File_info',
    'dir_reco',
    'core_MC',
    'core_reco',
    'dir_MC',
    'Fit_status',
    'Composition',
)

# Different Experiments

## Use only events in a certain azimuth/zenith band
This got ~57% accuracy :(

In [11]:
# direction_data stays False, so only the sensor columns are indexed.
all_indices_dir = get_index_dict(sensors=all_sensors, direction_data=False)

In [14]:
def is_between(point, lower, upper):
    """Return whether ``point`` lies in the closed interval [lower, upper]."""
    return lower <= point <= upper
def is_in_band(d, i):
    """Return whether event ``i`` passes the direction-band and charge cuts.

    The event passes when the first element of d['dir_MC'][i] lies in
    [0, .17], the second lies in [0, 1], and at least one value of
    d['Charges'][i] is greater than 6.
    """
    ze, az = d['dir_MC'][i]
    if not (is_between(ze, 0, .17) and is_between(az, 0, 1)):
        return False
    # any() rather than max(): max() over values that include NaN depends on
    # iteration order, and max() raises ValueError on an empty mapping.
    # NaN > 6 is False, so NaN charges never satisfy the cut.
    return any(charge > 6 for charge in d['Charges'][i].values())
# Keep only the events accepted by is_in_band.
band_data = filter_data(data=data, filter_func=is_in_band, keys_to_filter=keys_to_filter)

In [17]:
# One row per kept event; direction columns are not requested.
mat = gen_matrix_single_data(data=band_data, feature_dict=all_indices_dir, direction=False)

In [18]:
# Shuffle the rows in place. No seed is set in the visible cells, so the
# split is presumably different on every run.
np.random.shuffle(mat)
# The first int(0.9 * rows) rows form the training split, the rest the test split.
train_size = int(len(mat) * .9)

trainset, testset = mat[:train_size], mat[train_size:]

In [19]:
# Write both splits to .npy files in the current working directory.
np.save(file='zeazfilter_train.npy', arr=trainset)
np.save(file='zeazfilter_test.npy', arr=testset)

## Use only events with max charges > 6

In [21]:
# direction_data stays False, so only the sensor columns are indexed.
all_indices_dir = get_index_dict(sensors=all_sensors, direction_data=False)

In [23]:
def has_high_energy(d, i):
    """Return True when event ``i`` has at least one sensor charge above 6.

    Args:
        d: dict whose 'Charges' entry holds one sensor -> charge mapping
            per event.
        i: index of the event to test.

    Returns:
        True if any value of d['Charges'][i] is greater than 6, else False.
        NaN charges never satisfy the cut, and an event with no charges
        returns False.
    """
    # any() rather than max(): max() over values that include NaN depends on
    # iteration order, and max() raises ValueError on an empty mapping.
    return any(charge > 6 for charge in d['Charges'][i].values())
# Keep only the events accepted by has_high_energy.
band_data = filter_data(data=data, filter_func=has_high_energy, keys_to_filter=keys_to_filter)

In [24]:
# One row per kept event; direction columns are not requested.
mat = gen_matrix_single_data(data=band_data, feature_dict=all_indices_dir, direction=False)

In [25]:
# Shuffle the rows in place. No seed is set in the visible cells, so the
# split is presumably different on every run.
np.random.shuffle(mat)
# The first int(0.9 * rows) rows form the training split, the rest the test split.
train_size = int(len(mat) * .9)

trainset, testset = mat[:train_size], mat[train_size:]

In [26]:
# Write both splits to .npy files in the current working directory.
np.save(file='highcharge_train.npy', arr=trainset)
np.save(file='highcharge_test.npy', arr=testset)

## Use events in a certain band with max charges > 6

In [28]:
# direction_data stays False, so only the sensor columns are indexed.
all_indices_dir = get_index_dict(sensors=all_sensors, direction_data=False)

In [30]:
# Keep events that pass both has_high_energy and is_in_band.
# NOTE(review): is_in_band already includes a charge > 6 condition, so
# has_high_energy looks redundant here -- confirm this is intended.
band_data = filter_data(
    data=data,
    filter_func=lambda d, i: has_high_energy(d, i) and is_in_band(d, i),
    keys_to_filter=keys_to_filter,
)

In [31]:
# One row per kept event; direction columns are not requested.
mat = gen_matrix_single_data(data=band_data, feature_dict=all_indices_dir, direction=False)

In [32]:
# Shuffle the rows in place. No seed is set in the visible cells, so the
# split is presumably different on every run.
np.random.shuffle(mat)
# The first int(0.9 * rows) rows form the training split, the rest the test split.
train_size = int(len(mat) * .9)

trainset, testset = mat[:train_size], mat[train_size:]

In [33]:
# Write both splits to .npy files in the current working directory.
np.save(file='zeazband_train.npy', arr=trainset)
np.save(file='zeazband_test.npy', arr=testset)

## Use events within a certain energy range

In [35]:
# direction_data stays False, so only the sensor columns are indexed.
all_indices_dir = get_index_dict(sensors=all_sensors, direction_data=False)

In [38]:
# Keep only the events whose 'Energy' entry is below 5000000.
energy_binned_data = filter_data(
    data=data,
    filter_func=lambda d, i: d['Energy'][i] < 5000000,
    keys_to_filter=keys_to_filter,
)

In [39]:
# One row per kept event; direction columns are not requested.
mat = gen_matrix_single_data(data=energy_binned_data, feature_dict=all_indices_dir, direction=False)

In [40]:
# Shuffle the rows in place. No seed is set in the visible cells, so the
# split is presumably different on every run.
np.random.shuffle(mat)
# The first int(0.9 * rows) rows form the training split, the rest the test split.
train_size = int(len(mat) * .9)

trainset, testset = mat[:train_size], mat[train_size:]

In [41]:
# Display the (rows, columns) shape of the training split.
trainset.shape

(176551, 324)

In [42]:
# Write both splits to .npy files in the current working directory.
np.save(file='enband_low_train.npy', arr=trainset)
np.save(file='enband_low_test.npy', arr=testset)