# Pandas Style

For reference: https://www.stickeria.com.br/img/products/panda-style_1_1200.jpg

In [1]:
# This lets the notebook import the "cosmic_rAI" module
import sys
sys.path.insert(0, '..')

import math
import itertools
import numpy as np
import pandas as pd

from cosmic_rAI import data_prep

In [2]:
mat1 = np.load('../data/sim_12360_00.npy').item()  # protons
mat2 = np.load('../data/sim_12362_00.npy').item()  # iron

## I. Making DataFrames from Arrays

### (a) Event DataFrame

In [3]:
def partition_list(lst, x_name='x', y_name='y'):
    """Given list that alternates between x,y values,
    partition into dictionary"""
    
    return {
        x_name: lst[::2],
        y_name: lst[1::2]}

In [4]:
def event_df_from_matrix(mat):
    """Creates multi-index DataFrame from select attrs in matrix"""
    frames = {
        'charges': pd.DataFrame(mat['Charges']),
        'energy': pd.DataFrame(mat['Energy']),
        'composition': pd.DataFrame(mat['Composition']),
        'core_MC': pd.DataFrame(partition_list(mat['core_MC'], 'x', 'y')),
        'core_reco': pd.DataFrame(partition_list(mat['core_reco'], 'x', 'y')),
        'dir_MC': pd.DataFrame(partition_list(mat['dir_MC'], 'azimuth', 'zenith')),
        'dir_reco': pd.DataFrame(partition_list(mat['dir_reco'], 'azimuth', 'zenith'))}
    return pd.concat(frames, axis=1).fillna(0)

In [5]:
def remove_nan_events(df, matrices):
    """Eliminates from dataframe any events (rows)
    where a sensor recorded NaN"""
    
    charges = list(itertools.chain.from_iterable(
        (m['Charges'] for m in matrices)))
    
    evil_indices = []
    for idx, event in enumerate(charges):
        if any(math.isnan(v) for k, v in event.items()):
            evil_indices.append(idx)     
            
    return df.drop(evil_indices)

In [6]:
matrices = [mat1, mat2]

_df = pd.concat(map(event_df_from_matrix, matrices))
event_df = remove_nan_events(_df, matrices)

In [7]:
print("Old df:", len(_df))
print("New df:", len(event_df))

Old df: 31620
New df: 31120


In [8]:
event_df.head()

Unnamed: 0_level_0,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,...,composition,core_MC,core_MC,core_reco,core_reco,dir_MC,dir_MC,dir_reco,dir_reco,energy
Unnamed: 0_level_1,0161,0162,0163,0164,0261,0262,0263,0264,0361,0362,...,0,x,y,x,y,azimuth,zenith,azimuth,zenith,0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,39.491699,27.012478,48.792127,25.608482,0.216553,0.765261,0.253442,0.899646,137553.657022
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,-249.483765,269.715753,-260.517562,262.071308,0.216553,0.765261,0.231127,0.715951,137553.657022
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,76.900929,23.952042,79.645692,32.718007,0.355619,1.739962,0.373549,1.58576,220232.485147
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,260.376586,-208.589063,273.115542,-237.33825,0.355619,1.739962,0.376335,1.702171,220232.485147
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,-232.829918,-61.683866,-254.864687,-43.366423,0.644303,1.370112,0.637948,1.358519,280275.782022


In [9]:
event_df.tail()

Unnamed: 0_level_0,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,...,composition,core_MC,core_MC,core_reco,core_reco,dir_MC,dir_MC,dir_reco,dir_reco,energy
Unnamed: 0_level_1,0161,0162,0163,0164,0261,0262,0263,0264,0361,0362,...,0,x,y,x,y,azimuth,zenith,azimuth,zenith,0
15084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Fe56Nucleus,306.264934,117.561933,363.32696,213.241584,0.989706,0.534672,1.088476,0.475856,7999586.0
15085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Fe56Nucleus,310.072884,-289.504751,272.222503,-295.656557,0.989706,0.534672,1.005638,0.541856,7999586.0
15086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Fe56Nucleus,-54.849927,24.51684,-81.159385,-4.965321,0.989706,0.534672,0.98935,0.537727,7999586.0
15087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Fe56Nucleus,241.163034,-242.21972,254.189208,-276.710978,0.989706,0.534672,0.992155,0.527354,7999586.0
15088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Fe56Nucleus,-281.7888,297.279114,-282.388461,292.040085,0.989706,0.534672,0.989696,0.57079,7999586.0


#### Another format...

In [10]:
def flatten(df):
    new_df = df.copy()
    lvl0 = df.columns.get_level_values(0).astype('str')
    lvl1 = df.columns.get_level_values(1).astype('str')
    cols = lvl0 + '_' +lvl1
    new_df.columns = cols
    return new_df

In [11]:
flat_df = flatten(event_df)
flat_df.head()

Unnamed: 0,charges_0161,charges_0162,charges_0163,charges_0164,charges_0261,charges_0262,charges_0263,charges_0264,charges_0361,charges_0362,...,composition_0,core_MC_x,core_MC_y,core_reco_x,core_reco_y,dir_MC_azimuth,dir_MC_zenith,dir_reco_azimuth,dir_reco_zenith,energy_0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,39.491699,27.012478,48.792127,25.608482,0.216553,0.765261,0.253442,0.899646,137553.657022
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,-249.483765,269.715753,-260.517562,262.071308,0.216553,0.765261,0.231127,0.715951,137553.657022
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,76.900929,23.952042,79.645692,32.718007,0.355619,1.739962,0.373549,1.58576,220232.485147
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,260.376586,-208.589063,273.115542,-237.33825,0.355619,1.739962,0.376335,1.702171,220232.485147
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,-232.829918,-61.683866,-254.864687,-43.366423,0.644303,1.370112,0.637948,1.358519,280275.782022


### (b) Sensor DataFrame

In [12]:
def sensor_df_from_matrix(mat):
    vals = {
        'gain': mat['Gain'][0],
        'pos_x': mat['Position'][0],
        'pos_y': mat['Position'][1],}
    return pd.DataFrame(vals)

In [13]:
sensor_df = sensor_df_from_matrix(mat1)  # mat1 and mat2 are eqv
len(sensor_df)

323

In [14]:
sensor_df.head()

Unnamed: 0,gain,pos_x,pos_y
161,High,-265.529999,-497.894989
162,Low,-265.529999,-497.894989
163,High,-255.699997,-496.070007
164,Low,-255.699997,-496.070007
261,High,-140.360001,-477.764999


### (c) Example Usage: Getting low gain sensors

In [15]:
charges_df = event_df['charges']
charges_df.head(n=2)

Unnamed: 0,0161,0162,0163,0164,0261,0262,0263,0264,0361,0362,...,7963,7964,8061,8062,8063,8064,8161,8162,8163,8164
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.543506,0.0,2.168716,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
high_gain_df = sensor_df.query("gain == 'High'").T
high_gain_df.head(n=2)

Unnamed: 0,0161,0163,0261,0263,0361,0363,0461,0463,0561,0563,...,7761,7763,7861,7863,7961,7963,8061,8063,8161,8163
gain,High,High,High,High,High,High,High,High,High,High,...,High,High,High,High,High,High,High,High,High,High
pos_x,-265.53,-255.7,-140.36,-130.635,-27.72,-20.395,105.655,115.265,214.74,219.905,...,-87.705,-97.1,2.11,-2.02,18.1,10.88,76.425,85.645,77.41,87.19


In [17]:
low_gain_df = charges_df.drop(high_gain_df.columns, axis=1)
low_gain_df.head(n=2)

Unnamed: 0,0162,0164,0262,0264,0362,0364,0462,0464,0562,0564,...,7762,7764,7862,7864,7962,7964,8062,8064,8162,8164
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## II. PCA

In [18]:
from sklearn.decomposition import PCA

In [19]:
X = event_df['charges']
pca = PCA(n_components=10)
new_X = pca.fit_transform(X)

In [20]:
new_X

array([[ -8.25673976,  -9.04288719,  -6.34136859, ...,  -1.11263346,
         -3.71086797,  -3.39013724],
       [ -8.26915015,  -9.07522236,  -6.34573617, ...,  -1.10866556,
         -3.73143335,  -3.39650306],
       [ -8.23496122,  -8.99733469,  -6.3366898 , ...,  -1.11171392,
         -3.68200503,  -3.36009534],
       ...,
       [ -8.24499959,  -8.86502283,  -6.32930137, ...,  -1.10057906,
         -3.59536475,  -3.32092882],
       [  6.02043238, -10.32846603,  -6.82124557, ...,  -1.16980935,
         -4.89366681,  -3.81262198],
       [ -8.27726841,  -9.07747229,  -6.35395312, ...,  -1.09080232,
         -3.7485829 ,  -3.36465105]])

In [21]:
len(new_X)

31120

## III. SVM

In [22]:
from sklearn import svm
from sklearn.model_selection import train_test_split

### (a) Gather Relevant Attributes

#### Do PCA on charge attributes

In [23]:
pca_n = 10

X = event_df['charges']
pca = PCA(n_components=10)
new_X = pca.fit_transform(X)

cols = ['charge_pca_{}'.format(i) for i in range(1, pca_n+1)]

#### Grab dir attributes

In [24]:
dir_df = event_df['dir_MC']

#### Combine into single DF

In [25]:
charge_df = pd.DataFrame(new_X, columns=cols)
charge_df.index = dir_df.index  # must make indices the same; dir_df is right

data = pd.concat([charge_df, dir_df], axis=1)

data.head(n=2)

Unnamed: 0,charge_pca_1,charge_pca_2,charge_pca_3,charge_pca_4,charge_pca_5,charge_pca_6,charge_pca_7,charge_pca_8,charge_pca_9,charge_pca_10,azimuth,zenith
0,-8.247843,-9.000194,-6.342442,-8.912251,-2.234489,-1.529589,-1.225201,1.094772,-2.066427,-2.617867,0.216553,0.765261
1,-8.260277,-9.032504,-6.347281,-8.91523,-2.255087,-1.533668,-1.217645,1.096937,-2.08364,-2.61399,0.216553,0.765261


### (b) Run sklearn

In [26]:
X = data
y = event_df['composition'][0]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [28]:
clf = svm.SVC()

In [29]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [30]:
clf.score(X_test, y_test)

0.5293086660175268