In [1]:
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Import the data


In [3]:
# Read the MATLAB .mat file; the result is indexed by variable name below
# (e.g. data_in['DATA']).
data_in = scipy.io.loadmat(file_name='extras/arrayDATA.mat')

## Convert Data to pandas DataFrame

The code below uses a list comprehension to loop through each trial's data and turn it into a pandas DataFrame. This results in a list of DataFrames, each containing the data from one trial. We then use `pd.concat()` to combine the list of DataFrames into a single pandas DataFrame.

This uses a couple of NumPy tools we haven't seen before. `np.c_` concatenates a series of NumPy objects column-wise. We do this because we want to combine the three-column data array with additional columns that encode the orientation and trial number information. We create those latter columns using `np.repeat`, which generates a NumPy array by repeating the input (first argument) a specified number of times (second argument; in this case, the number of rows in the trial's data array, which we get as `data_in['DATA'][trial][0][0].shape[0]`).

In [4]:
cols = ['channel', 'sort', 'time', 'orientation', 'trial']

# Build one DataFrame per trial. `record` is data_in['DATA'][trial][0], so
# record[0] is the trial's data array (data_in['DATA'][trial][0][0]) and
# record[1][0][0] is the value placed in the 'orientation' column. The
# orientation and trial values are each repeated once per row of the data
# array so that np.c_ can join them on as two extra columns.
df_list = [
    pd.DataFrame(
        np.c_[
            record[0],
            np.repeat(record[1][0][0], record[0].shape[0]),
            np.repeat(trial, record[0].shape[0]),
        ],
        columns=cols,
    )
    for trial, record in (
        (idx, data_in['DATA'][idx][0])
        for idx in np.arange(data_in['DATA'].shape[0])
    )
]

# Stack the per-trial frames into a single DataFrame (each trial keeps its
# own row index).
df = pd.concat(df_list)

In [5]:
# Preview the first five rows of the combined DataFrame.
df.head(n=5)

Unnamed: 0,channel,sort,time,orientation,trial
0,20.0,1.0,0.624,90.0,0.0
1,20.0,1.0,0.63,90.0,0.0
2,20.0,3.0,0.652,90.0,0.0
3,20.0,3.0,0.659,90.0,0.0
4,20.0,255.0,0.686,90.0,0.0


In [6]:
# Show 12 randomly chosen rows. A fixed random_state makes the same rows
# appear on every run (the seed value itself is arbitrary), so the displayed
# output is reproducible after Restart & Run All.
df.sample(n=12, random_state=42)

Unnamed: 0,channel,sort,time,orientation,trial
578,6.0,2.0,0.521,0.0,586.0
1409,16.0,1.0,2.573,0.0,1370.0
102,36.0,1.0,-0.257,90.0,1331.0
1013,75.0,255.0,1.381,0.0,859.0
136,75.0,0.0,-0.058,0.0,1123.0
251,14.0,255.0,0.179,0.0,1643.0
1452,20.0,255.0,2.298,90.0,629.0
1201,23.0,1.0,2.088,0.0,1099.0
990,87.0,1.0,1.599,90.0,1594.0
1202,44.0,0.0,2.397,0.0,1177.0


In [9]:
# (number of rows, number of columns) of df
df.shape

(2340739, 5)

### Clean the data

We know from the provider of the data that sort codes 0 and 255 represent bad data, so in the cell below we define these as `noise_codes`. We then remove all rows that have these sort codes. Note that the `~` operator in front of a boolean expression means "not". So here we are saying to keep in `df` only those rows where `sort` is *not* equal to one of the `noise_codes`.

In [8]:
# Sort codes that the data provider identifies as bad data.
noise_codes = [0, 255]

# Keep only the rows whose sort code is not one of the noise codes.
df = df.loc[~df['sort'].isin(noise_codes)]

In [9]:
# (number of rows, number of columns) of df, following the noise-code filter
# in the preceding cell
df.shape

(2340739, 5)

In [11]:
# List the distinct values in the 'channel' column, in order of first appearance.
df.loc[:, 'channel'].unique()

array([20., 42., 72., 94., 53.,  1.,  6., 37., 10., 63., 66., 74., 77.,
       22., 39., 84., 38., 85., 15., 55., 81., 86., 69., 40., 11., 23.,
       19., 79., 80., 60., 76., 90., 87.,  9., 61., 75., 35., 17., 64.,
       46., 71., 89., 58., 44.,  2., 49., 91., 78., 65., 18., 96., 25.,
       36., 33., 88., 43., 29., 12.,  7., 93., 41., 30., 92.,  5., 83.,
       14.,  8., 45.,  4., 24., 13., 67., 62., 16., 82., 48., 51., 50.,
       54., 57., 73.,  3., 26., 47., 70., 59., 52., 27., 95., 32., 28.])

## Export data to CSV

In [None]:
# Write df to CSV. index=True is the pandas default, spelled out here to make
# clear that the DataFrame index is saved as the first (unnamed) column.
df.to_csv('data/multielectrode_data.csv', index=True)

In [12]:
# Some new stuff