# Unpacking Data with Pandas

In [1]:
import sys
sys.path.append("../")

import math
import numpy as np
import pandas as pd
from cosmic_rAI import data_prep

In [2]:
# Each .npy file holds a single pickled Python object wrapped in a 0-d array;
# `.item()` unwraps it into the mapping that the cells below index by string key.
# `allow_pickle=True` is required on NumPy >= 1.16.3, where loading object
# arrays is disabled by default.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
mat1 = np.load('../data/sim_12360_00.npy', allow_pickle=True).item()  # protons
mat2 = np.load('../data/sim_12362_00.npy', allow_pickle=True).item()  # iron

## Making DataFrames from Arrays

### Event DataFrame

In [3]:
def event_df_from_matrix(mat):
    """Build a per-event DataFrame from one simulation mapping.

    Parameters
    ----------
    mat : mapping
        Must provide the keys ``'Charges'``, ``'Energy'`` and
        ``'Composition'``. Each value is passed to ``pd.DataFrame`` as-is,
        so it must be something that constructor accepts.

    Returns
    -------
    pandas.DataFrame
        The three frames concatenated column-wise. Columns are a two-level
        MultiIndex whose first level is ``'charges'``, ``'energy'`` or
        ``'composition'`` and whose second level is the column label of the
        corresponding sub-frame.

    Raises
    ------
    KeyError
        If ``mat`` lacks one of the required keys.
    """
    # Read from the `mat` argument (not a notebook global) so the function
    # works for any simulation file, not only the first one loaded.
    frames = {
        'charges': pd.DataFrame(mat['Charges']),
        'energy': pd.DataFrame(mat['Energy']),
        'composition': pd.DataFrame(mat['Composition']),
    }
    # Passing a dict to concat uses its keys as the outer column level.
    return pd.concat(frames, axis=1)

In [4]:
# Build the event table from the proton simulation (mat1) and show how many
# rows (events) it contains.
df = event_df_from_matrix(mat1)
len(df)

16531

In [5]:
# Preview the first two events; columns are a two-level MultiIndex.
df.head(n=2)

Unnamed: 0_level_0,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,composition,energy
Unnamed: 0_level_1,0161,0162,0163,0164,0261,0262,0263,0264,0361,0362,...,8061,8062,8063,8064,8161,8162,8163,8164,0,0
0,,,,,,,,,,,...,,,,,1.543506,,2.168716,,PPlus,137553.657022
1,,,,,,,,,,,...,,,,,,,,,PPlus,137553.657022


#### Add location data

In [6]:
keys = ('dir_MC', 'core_MC', 'dir_reco', 'core_reco')

# For every location key, split the sequence stored in mat1 into its
# even-indexed entries ('x') and odd-indexed entries ('y') and wrap the pair
# in a two-column frame.
# NOTE(review): presumably the values are interleaved x/y pairs -- confirm
# against the code that writes the .npy files.
d = {
    k: pd.DataFrame({'x': mat1[k][::2], 'y': mat1[k][1::2]})
    for k in keys
}

# The dict keys become the outer level of the resulting column MultiIndex.
pos_df = pd.concat(d, axis=1)

In [7]:
# Attach the location columns to the event table side by side (axis=1 aligns
# on the row index), then show the row count of the combined frame.
event_df = pd.concat([df, pos_df], axis=1)
len(event_df)

16531

In [8]:
# Preview the first two rows of the combined event + location table.
event_df.head(n=2)

Unnamed: 0_level_0,charges,charges,charges,charges,charges,charges,charges,charges,charges,charges,...,composition,energy,core_MC,core_MC,core_reco,core_reco,dir_MC,dir_MC,dir_reco,dir_reco
Unnamed: 0_level_1,0161,0162,0163,0164,0261,0262,0263,0264,0361,0362,...,0,0,x,y,x,y,x,y,x,y
0,,,,,,,,,,,...,PPlus,137553.657022,39.491699,27.012478,48.792127,25.608482,0.216553,0.765261,0.253442,0.899646
1,,,,,,,,,,,...,PPlus,137553.657022,-249.483765,269.715753,-260.517562,262.071308,0.216553,0.765261,0.231127,0.715951


#### Another format: flattened (single-level) column names

In [24]:
def flatten(df):
    """Return a copy of ``df`` with its two-level columns joined into strings.

    Each new column name is ``"<level0>_<level1>"``, with both level values
    converted via ``str``. Only the first two column levels are used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are a MultiIndex with at least two levels.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with flat string column labels; ``df`` itself is
        left unmodified.
    """
    new_df = df.copy()
    # Derive the names from the frame that was passed in (not from a notebook
    # global), so the function works for any two-level frame.
    lvl0 = df.columns.get_level_values(0)
    lvl1 = df.columns.get_level_values(1)
    new_df.columns = ['{}_{}'.format(a, b) for a, b in zip(lvl0, lvl1)]
    return new_df

In [25]:
# Flatten the two-level columns of the event table into "<outer>_<inner>"
# names and preview the first two rows.
flat_df = flatten(event_df)
flat_df.head(n=2)

Unnamed: 0,charges_0161,charges_0162,charges_0163,charges_0164,charges_0261,charges_0262,charges_0263,charges_0264,charges_0361,charges_0362,...,composition_0,energy_0,core_MC_x,core_MC_y,core_reco_x,core_reco_y,dir_MC_x,dir_MC_y,dir_reco_x,dir_reco_y
0,,,,,,,,,,,...,PPlus,137553.657022,39.491699,27.012478,48.792127,25.608482,0.216553,0.765261,0.253442,0.899646
1,,,,,,,,,,,...,PPlus,137553.657022,-249.483765,269.715753,-260.517562,262.071308,0.216553,0.765261,0.231127,0.715951


### Sensor DataFrame

In [26]:
def sensor_df_from_matrix(mat):
    """Build a per-sensor DataFrame from one simulation mapping.

    Parameters
    ----------
    mat : mapping
        Must provide ``'Gain'`` (first element used) and ``'Position'``
        (first two elements used, as x and y respectively).

    Returns
    -------
    pandas.DataFrame
        One row per sensor with columns ``'gain'``, ``'pos_x'`` and
        ``'pos_y'``.

    Raises
    ------
    KeyError
        If ``mat`` lacks ``'Gain'`` or ``'Position'``.
    """
    # Read from the `mat` argument (not a notebook global) so the function
    # works for any simulation file.
    vals = {
        'gain': mat['Gain'][0],
        'pos_x': mat['Position'][0],
        'pos_y': mat['Position'][1],
    }
    return pd.DataFrame(vals)

In [27]:
# Build the sensor table from the proton simulation (mat1) and show how many
# sensors it contains.
sensor_df = sensor_df_from_matrix(mat1)
len(sensor_df)

323

In [28]:
# Preview the first rows of the sensor table (gain and x/y position).
sensor_df.head()

Unnamed: 0,gain,pos_x,pos_y
161,High,-265.529999,-497.894989
162,Low,-265.529999,-497.894989
163,High,-255.699997,-496.070007
164,Low,-255.699997,-496.070007
261,High,-140.360001,-477.764999


## PCA

In [127]:
from sklearn.decomposition import PCA

In [128]:
# Selecting the outer 'charges' level yields a frame with one column per
# inner label; preview it to see how many entries are NaN.
X = event_df['charges']
X.head()

Unnamed: 0,0161,0162,0163,0164,0261,0262,0263,0264,0361,0362,...,7963,7964,8061,8062,8063,8064,8161,8162,8163,8164
0,,,,,,,,,,,...,,,,,,,1.543506,,2.168716,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,0.439023,,0.496192,,7.982038,,9.587134,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


#### Idea 1: impute the missing charges, then run PCA

In [133]:
# PCA doesn't like sparse data (the charge table contains NaNs).
# Here's one (bad!) idea: fill each missing charge using the imputer's
# default strategy (the column mean) before fitting.

try:
    # scikit-learn >= 0.20 provides the imputer here.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older scikit-learn releases; this class was removed in 0.22.
    from sklearn.preprocessing import Imputer

imp = Imputer()
# Always restart from event_df so this cell can be re-run safely.
X = event_df['charges']
X = imp.fit_transform(X)

In [134]:
# Fit a 2-component PCA on the imputed charge matrix; the bare `fit` call is
# the cell's last expression, so the fitted estimator's repr is displayed.
pca = PCA(n_components=2)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [135]:
# Shape of the imputed matrix before projection (recorded output: 16531 x 323).
X.shape

(16531, 323)

In [136]:
# Project the imputed charges onto the fitted principal components.
# NOTE(review): this rebinds X from the input matrix to its projection, so the
# cell is not idempotent -- re-running it without first re-running the
# imputation cell passes the already-projected array back into pca.transform.
X = pca.transform(X)

In [137]:
# Shape after projection (recorded output: 16531 x 2, one column per component).
X.shape

(16531, 2)