# How to really groove with pandas: A modern approach

In [1]:
# This lets the notebook import the "cosmic_rAI" module
import sys
sys.path.insert(0, '..')

import math
import numpy as np
import pandas as pd

from cosmic_rAI.data_prep import (
    event_df_from_matrices,
    sensor_df_from_matrices,
    get_charges_by_gain,
    get_log_charges,)

In [2]:
# Each file is read with np.load and unwrapped with .item(), i.e. it stores a
# single Python object inside a 0-d array (presumably a dict -- verify against
# the code that wrote the files).  Object arrays are pickled on disk, and
# NumPy >= 1.16.3 refuses to read them unless allow_pickle=True is given.
# Unpickling can run arbitrary code: only load files from a trusted source.
mat1 = np.load('../data/sim_12360_00.npy', allow_pickle=True).item()  # protons
mat2 = np.load('../data/sim_12362_00.npy', allow_pickle=True).item()  # iron

# Order matters downstream only in that proton rows come before iron rows.
matrices = [mat1, mat2]

## Creating base DataFrames

### Event DF

In [3]:
# Build the per-event DataFrame from both loaded matrices using the project
# helper; the trailing `.shape` displays (n_rows, n_columns) as the cell output.
event_df = event_df_from_matrices(matrices)
event_df.shape

(31120, 333)

In [4]:
# peek at both ends of the frame: rows 0 and 1, then the last two rows
event_df.iloc[[0, 1, -2, -1]]

Unnamed: 0_level_0,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,...,composition,core_MC,core_MC,core_reco,core_reco,dir_MC,dir_MC,dir_reco,dir_reco,energy
Unnamed: 0_level_1,0161,0162,0163,0164,0261,0262,0263,0264,0361,0362,...,0,x,y,x,y,azimuth,zenith,azimuth,zenith,0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,39.491699,27.012478,48.792127,25.608482,0.216553,0.765261,0.253442,0.899646,137553.7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,PPlus,-249.483765,269.715753,-260.517562,262.071308,0.216553,0.765261,0.231127,0.715951,137553.7
15087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Fe56Nucleus,241.163034,-242.21972,254.189208,-276.710978,0.989706,0.534672,0.992155,0.527354,7999586.0
15088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Fe56Nucleus,-281.7888,297.279114,-282.388461,292.040085,0.989706,0.534672,0.989696,0.57079,7999586.0


### Sensor DF

In [5]:
# Build the per-sensor DataFrame from the same matrices using the project
# helper; the trailing `.shape` displays (n_rows, n_columns) as the cell output.
sensor_df = sensor_df_from_matrices(matrices)
sensor_df.shape

(323, 3)

In [6]:
# peek at both ends of the frame: rows 0 and 1, then the last two rows
sensor_df.iloc[[0, 1, -2, -1]]

Unnamed: 0,gain,pos_x,pos_y
161,High,-265.529999,-497.894989
162,Low,-265.529999,-497.894989
8163,High,87.190002,39.84
8164,Low,87.190002,39.84


## Doing stuff with DataFrames

**Note:** We never change `event_df`; we just extract parts of it and modify them.

### Filter to hgain or lgain sensors

In [7]:
# Keep only the charge columns that belong to sensors whose `gain` is 'Low'.
# Only the 'charges' sub-frame of event_df is passed in; event_df itself is
# not reassigned.  `.shape` displays (n_rows, n_columns) for the filtered frame.
lgain_charges_df = get_charges_by_gain(event_df['charges'], sensor_df, gain='Low')
lgain_charges_df.shape

(31120, 161)

In [8]:
# peek at both ends of the frame: rows 0 and 1, then the last two rows
lgain_charges_df.iloc[[0, 1, -2, -1]]

Unnamed: 0,0162,0164,0262,0264,0362,0364,0462,0464,0562,0564,...,7762,7764,7862,7864,7962,7964,8062,8064,8162,8164
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# view function definition
get_charges_by_gain??

Signature: get_charges_by_gain(charges_df, sensor_df, gain='Low')
Source:   
def get_charges_by_gain(charges_df, sensor_df, gain='Low'):
    """Filter charges_df to particular gain"""
    return charges_df[
        sensor_df.query(f'gain == "{gain}"').T.columns]
File:      ~/GitRepos/Carleton/AI/cosmic-rAI/cosmic_rAI/data_prep/data_prep_pandas.py
Type:      function


### Taking log of charges

In [10]:
# Apply the project's log transform to the full 'charges' sub-frame (all
# sensors, not just one gain).  Per the helper's source shown in the next cell,
# non-zero cells become math.log(x) and zero cells stay 0.
# `.shape` displays (n_rows, n_columns) of the transformed frame.
log_charges_df = get_log_charges(event_df['charges'])
log_charges_df.shape

(31120, 323)

In [11]:
# view function definition
get_log_charges??

Signature: get_log_charges(charges_df)
Source:   
def get_log_charges(charges_df):
    """Apply log function to every cell in charges_df, excluding 0s"""
    return charges_df.applymap(
        lambda x: math.log(x) if x != 0 else 0)
File:      ~/GitRepos/Carleton/AI/cosmic-rAI/cosmic_rAI/data_prep/data_prep_pandas.py
Type:      function


## Usage with Machine Learning

In [12]:
from sklearn import svm
from sklearn.model_selection import train_test_split

### Gather attributes

In [13]:
# get lgain sensors
low_gain_df = get_charges_by_gain(event_df['charges'], sensor_df, gain='Low')

# take logs of charges -- use the frame built on the line above so this cell
# does not silently depend on a variable left over from an earlier cell
charge_df = get_log_charges(low_gain_df)

# combine charge data with dir data: the 'dir_MC' columns (azimuth, zenith)
# are appended to the right of the per-sensor log charges
data = pd.concat([charge_df, event_df['dir_MC']], axis=1)
data.shape

(31120, 163)

In [14]:
# peek at both ends of the frame: rows 0 and 1, then the last two rows
data.iloc[[0, 1, -2, -1]]

Unnamed: 0,0162,0164,0262,0264,0362,0364,0462,0464,0562,0564,...,7862,7864,7962,7964,8062,8064,8162,8164,azimuth,zenith
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216553,0.765261
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216553,0.765261
15087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.989706,0.534672
15088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.989706,0.534672


### Run sklearn

In [15]:
# Features: X is just another name for `data` (no copy is made), i.e. the
# log low-gain charges plus the two 'dir_MC' columns.
X = data
# Labels: 'composition' is a top-level column group in event_df whose single
# sub-column is labelled 0; indexing with [0] pulls it out as a Series.
y = event_df['composition'][0]

In [16]:
# Hold out 33% of the events for testing.  The shuffle is seeded so that the
# split -- and therefore the score computed from it -- is identical on every
# Restart & Run All.
# NOTE: any accuracy recorded before the seed was added came from an unseeded
# split; re-run the notebook to refresh it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [17]:
# Support-vector classifier with every hyper-parameter left at the
# scikit-learn default; nothing is tuned here.
clf = svm.SVC()

In [18]:
# Train on the training split only; fit() returns the estimator, so its repr
# (listing the hyper-parameters in effect) is shown as the cell output.
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
# Mean accuracy of the fitted classifier on the held-out test split.
# NOTE(review): a value near 0.5 would be about what guessing achieves if the
# two compositions are roughly balanced -- confirm the class balance before
# reading anything into the score.
clf.score(X_test, y_test)

0.5226874391431353