In [16]:
import scipy.io
from pyarrow.feather import write_feather
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Import the data


In [3]:
data_in = scipy.io.loadmat('./arrayDATA.mat')

## Convert Data to pandas DataFrame

The code below uses list comprehension to loop through each trial's data and turn it into a pandas DataFrame. This results in a list of DataFrames, each being the data from one trial. We then use `pd.concat()` to convert the list of DataFrames to a single pandas DataFrame.

This uses a couple of NumPy functions we haven't seen before. `np.c_` concatenates a series of NumPy objects into columns. We do this because we want to combine the three-column data array with additional columns that encode the orientation and trial number information. We create those latter columns using `np.repeat`, which generates a NumPy array by broadcasting (repeating) the input (first argument) a specified number of times (second argument; in this case, the length of the input NumPy data array, which we get as `data_in['DATA'][trial][0][0].shape[0]`).

In [4]:
cols = ['channel', 'sort', 'time', 'orientation', 'trial']

df_list = [pd.DataFrame(np.c_[data_in['DATA'][trial][0][0],
                              np.repeat(data_in['DATA'][trial][0][1][0][0],
                                        data_in['DATA'][trial][0][0].shape[0]),
                              np.repeat(trial, 
                                        data_in['DATA'][trial][0][0].shape[0])
                             ],
                        columns=cols)
           for trial in np.arange(data_in['DATA'].shape[0])]

df = pd.concat(df_list)

In [5]:
df.head()

Unnamed: 0,channel,sort,time,orientation,trial
0,20.0,1.0,0.624,90.0,0.0
1,20.0,1.0,0.63,90.0,0.0
2,20.0,3.0,0.652,90.0,0.0
3,20.0,3.0,0.659,90.0,0.0
4,20.0,255.0,0.686,90.0,0.0


In [6]:
df.sample(12)

Unnamed: 0,channel,sort,time,orientation,trial
740,79.0,0.0,0.468,0.0,1203.0
1003,33.0,1.0,1.584,90.0,1086.0
222,72.0,1.0,0.123,0.0,1928.0
375,74.0,2.0,0.309,90.0,1446.0
3342,40.0,255.0,-0.323,0.0,1772.0
674,90.0,255.0,0.923,0.0,156.0
357,33.0,1.0,0.272,90.0,256.0
547,80.0,255.0,0.684,90.0,1197.0
1333,9.0,255.0,2.094,0.0,2249.0
856,93.0,2.0,1.264,90.0,1518.0


In [7]:
df.shape

(3425176, 5)

### Clean the data

We know from the provider of the data that sort codes 0 and 255 represent bad data, so above we defined these as `noise_codes`. Here we remove all rows that have these sort codes. Note that the `~` operator in front of a statement means "not". So here we are saying to keep in `df` only those rows where `sort` is *not* equal to one of the `noise_codes`.

In [8]:
noise_codes = [0, 255] # explained later

df = df[~df['sort'].isin(noise_codes)]

In [9]:
df.shape

(2340739, 5)

In [10]:
df['channel'].unique()

array([20., 42., 72., 94., 53.,  1.,  6., 37., 10., 63., 66., 74., 77.,
       22., 39., 84., 38., 85., 15., 55., 81., 86., 69., 40., 11., 23.,
       19., 79., 80., 60., 76., 90., 87.,  9., 61., 75., 35., 17., 64.,
       46., 71., 89., 58., 44.,  2., 49., 91., 78., 65., 18., 96., 25.,
       36., 33., 88., 43., 29., 12.,  7., 93., 41., 30., 92.,  5., 83.,
       14.,  8., 45.,  4., 24., 13., 67., 62., 16., 82., 48., 51., 50.,
       54., 57., 73.,  3., 26., 47., 70., 59., 52., 27., 95., 32., 28.])

In [24]:
pd.set_option('display.max_rows', None)
df.groupby(['channel', 'sort'])['time'].count()

channel  sort
1.0      1.0      21263
2.0      1.0       1953
3.0      1.0       1550
         2.0       1894
4.0      1.0       1638
5.0      1.0       3289
6.0      1.0      10911
         2.0      35184
7.0      1.0       2271
8.0      1.0      16169
9.0      1.0      57536
10.0     1.0       9498
11.0     1.0       5848
12.0     1.0       3169
13.0     1.0       7914
14.0     1.0      30385
15.0     1.0      43942
16.0     1.0       5609
17.0     1.0       8242
18.0     1.0     100534
19.0     1.0      28172
         2.0       1829
20.0     1.0      52161
         2.0      20953
         3.0      96756
         4.0      63667
22.0     1.0      66749
23.0     1.0      72019
24.0     1.0       3453
25.0     1.0       8598
26.0     1.0       3557
27.0     1.0       2093
28.0     1.0        757
29.0     1.0       3838
         2.0       5393
30.0     1.0       4909
32.0     1.0        223
33.0     1.0      26440
         2.0        424
35.0     1.0      11676
36.0     1.0      33271
37

## Select subset of channels

Based on looking at the PSTH from all channels (previous version of lessons), selected the following to have amixture of response patterns (and a couple of bad ones) 

In [29]:
keep_chans = [4, 7, 10, 22, 23, 32, 35, 42, 44, 45, 63, 67, 73, 75, 79, 80, 86, 91, 95, 96]
print(len(keep_chans))
df_use = df[df['channel'].isin(keep_chans)]

20


In [30]:
sorted(keep_chans)

[4, 7, 10, 22, 23, 32, 35, 42, 44, 45, 63, 67, 73, 75, 79, 80, 86, 91, 95, 96]

In [31]:
df_use.groupby(['channel', 'sort'])['time'].count()

channel  sort
4.0      1.0       1638
7.0      1.0       2271
10.0     1.0       9498
22.0     1.0      66749
23.0     1.0      72019
32.0     1.0        223
35.0     1.0      11676
42.0     1.0     118478
44.0     1.0      15929
45.0     1.0        558
63.0     1.0       7507
67.0     1.0      10506
73.0     1.0       7201
75.0     1.0      50969
79.0     1.0      41536
80.0     1.0      35279
86.0     1.0     141910
91.0     1.0      10211
95.0     1.0        662
96.0     1.0      18586
Name: time, dtype: int64

In [34]:
df_use = df_use.drop(['sort'], axis=1)

## Export data to CSV

In [35]:
df_use.to_csv('../data/multielectrode_data.csv', index=False)

In [18]:
# write_feather(df_use, '../data/multielectrode_data.fea')

In [19]:
# write_feather(df, '../data/multielectrode_data_all.fea')