In [1]:
import pandas as pd 
import numpy as np
import os 
from glob import glob

# Data directory, resolved relative to the notebook's working directory.
path = os.path.join(os.getcwd(), 'data')

# glob returns matches in arbitrary, OS-dependent order and later cells pick
# files[0], so sort the list to make that choice reproducible.
files = sorted(glob(os.path.join(path, '*.csv')))
files

['/home/srg/Documents/git/EEG/pseudo-inverse/data_prep/data/bashirinzrf_och.csv']

In [2]:
# Load the first CSV found above; the file uses ';' as its field separator.
df = pd.read_csv(files[0], sep=';')
print(df.shape)
df.head()

(602370, 11)


Unnamed: 0,time,label,e1,e2,e3,e4,e5,e6,e7,e8,e9
0,0.0,0,-0.88806,-0.069266,-7.4e-05,-0.670848,0.076333,-0.019143,0.250427,0.366932,-0.000989
1,0.000994,0,-0.891288,-0.003727,-0.000101,-0.658991,0.139277,-0.020493,0.216534,0.411316,-0.001753
2,0.001988,0,-0.753324,0.130316,-0.000121,-0.537161,0.280191,-0.018788,0.266752,0.528652,-0.002153
3,0.002982,0,-0.800268,0.159339,-0.00014,-0.560644,0.304739,-0.020451,0.188607,0.54155,-0.00289
4,0.003976,0,-0.839113,0.222631,-0.000168,-0.555244,0.367433,-0.022216,0.150791,0.582864,-0.003714


In [32]:
# All non-zero entries of the label column, in row order.
df.label.values[df.label.values > 0]

array([16,  6, 14, 17,  7, 14,  4, 17,  7, 13,  3, 17,  7, 17,  7,  5, 11,
        1,  5, 15,  5, 16,  6, 14, 12,  2, 14,  4,  4, 11,  1, 15,  5, 11,
        1, 15,  5, 17,  7, 14,  4, 16,  6, 15,  5, 15,  5, 15,  5, 13,  3,
       17,  7, 16,  6, 14,  4, 15,  5, 11,  1, 12, 14, 16,  6,  4,  5, 13,
       14, 14, 17, 12,  2, 11,  1, 13, 16,  6, 16,  6, 12,  2, 14,  4, 17,
       13, 13,  3,  4, 11,  1, 17,  7, 11,  1, 12,  2, 15,  5, 17, 11, 15,
        5, 14,  4, 16,  6, 17,  7,  1, 12,  2, 14,  4,  1, 12,  2, 16,  6,
       13, 13,  3, 13,  3, 17,  7, 15,  5, 14,  4, 12,  2, 11, 15,  5, 12,
        2, 17,  7, 13,  3,  2, 15,  5, 17,  7, 16,  6, 13,  3, 12,  2, 13,
        3, 13,  3, 13, 16, 13,  3, 14,  4, 16,  6,  7, 17,  7, 12,  2, 16,
        6, 11,  1, 11, 12,  2, 14,  4, 13,  3, 17,  7, 14,  6, 11,  4, 13,
        3, 16,  6, 17,  7,  6, 11,  1, 13,  3, 17,  7, 11,  1, 17,  7, 17,
        7, 13,  3, 11,  1, 17,  7, 11,  1, 16, 17,  7, 15,  5, 13,  3, 12,
        2, 11,  1, 11,  1

In [50]:
def compute_covariance(X):
    """
    Computes the pairwise channel covariances of X as a flat vector.

    Only the strictly upper triangle of the covariance matrix is kept
    (the diagonal, i.e. the per-channel variances, is dropped), read row
    by row.

    X : np.array of shape (n_channels, n_samples)

    returns:
    c : covariance vector of shape (n_channels * (n_channels - 1) / 2,)
    """
    n_channels = X.shape[0]
    # np.cov squeezes its result to a 0-d array when there is a single
    # channel; atleast_2d restores a 2-D matrix so the triangular indexing
    # below returns an empty vector instead of raising IndexError.
    cov = np.atleast_2d(np.cov(X))
    return cov[np.triu_indices(n_channels, k=1)]

def get_listen_inds(labels):
    """Return the positions in `labels` whose value is greater than 10."""
    above_ten = labels > 10
    return np.nonzero(above_ten)[0]

def get_repeat_inds(labels):
    """Return the positions in `labels` whose value lies strictly between 0 and 10."""
    in_range = np.logical_and(labels > 0, labels < 10)
    return np.nonzero(in_range)[0]

def get_listen_repeat_inds(labels):
    """
    Finds the positions of labels that form consecutive pairs.

    Only entries greater than 0 are considered, in their original order.
    Two neighbouring entries form a pair when the first equals the second
    plus 10 (e.g. 16 followed by 6). Entries that are not part of such a
    pair are dropped, including a trailing entry that has no partner.

    labels : 1-D np.array of integer labels (not modified)

    returns:
    inds : np.array of positions into `labels`, laid out as
           [first_0, second_0, first_1, second_1, ...]
    """
    non_zero_inds = np.where(labels > 0)[0]
    non_zero = labels[non_zero_inds]
    keep = np.zeros(non_zero.shape[0], dtype=bool)

    left = 0
    while left + 1 < non_zero.shape[0]:
        if non_zero[left] == non_zero[left + 1] + 10:
            # Matched pair: keep both entries and move past them
            keep[left] = True
            keep[left + 1] = True
            left += 2
        else:
            # No partner directly after this entry: skip it
            left += 1

    # Anything never marked (including a lone last entry) is excluded, so
    # the result always has even length.
    return non_zero_inds[keep]

def add_zero_inds(inds, shift=1000): # only for paired inds
    """
    Adds one extra index after every pair of indices.

    inds  : np.array laid out as [first_0, second_0, first_1, second_1, ...]
    shift : offset added to the second index of each pair to obtain the
            extra index. It is applied in samples (array positions); the
            default of 1000 is the value that used to be hard-coded.

    returns:
    union : sorted np.array holding `inds` together with the shifted indices
    """
    rpt = inds[1::2]
    noise = rpt + shift
    # Sorting interleaves the extra indices with the pairs. The result reads
    # [first, second, extra, ...] only while each extra index falls before
    # the next pair's first index.
    union = np.sort(np.concatenate((inds, noise), axis=0))
    return union

def prolong_inds(inds, frame_size):
    """
    Expands every start index into `frame_size` consecutive indices.

    inds       : 1-D np.array of start indices
    frame_size : number of consecutive indices generated per start index

    returns:
    np.array of shape (len(inds) * frame_size,) with the runs laid end to end
    """
    starts = np.asarray(inds)
    offsets = np.arange(frame_size)
    # Broadcasting builds the (n_starts, frame_size) grid in one step; unlike
    # concatenating a list of per-start arrays it also works when `inds` is
    # empty (np.concatenate raises on an empty list).
    return (starts[:, None] + offsets).ravel()

def window_inds(inds, frame_size, window_size, hop_length):
    """
    Cuts every frame of indices into overlapping windows.

    inds        : flat np.array whose length is a multiple of frame_size
    frame_size  : number of indices per frame
    window_size : number of indices per window
    hop_length  : step between the starts of consecutive windows

    returns:
    integer np.array of shape (batch, n_windows, window_size), where
    n_windows = floor((frame_size - window_size) / hop_length + 1)
    """
    frames = inds.reshape(-1, frame_size)
    n_windows = int(np.floor((frame_size - window_size) / hop_length + 1))

    windows = np.zeros((frames.shape[0], n_windows, window_size))
    starts = [k * hop_length for k in range(n_windows)]
    for k, start in enumerate(starts):
        stop = start + window_size
        windows[:, k, :] = frames[:, start:stop]

    return windows.astype(int) # shape (batch, n_windows, window_size)

def split_signals(X, inds):
    """
    Gathers the samples of X addressed by a 3-D array of indices.

    X: np array of shape (n_channels, n_samples)
    inds: np array of shape (batch, n_windows, window_size)

    returns:
    S: np array of shape (n_channels, batch * n_windows, window_size)
    """
    dims = inds.shape
    n_rows = dims[0] * dims[1]
    # Indexing the sample axis with the full index array yields
    # (n_channels, batch, n_windows, window_size); merge batch and window axes.
    gathered = X[:, inds]
    return gathered.reshape(X.shape[0], n_rows, dims[2])

def get_cov(S):
    """
    Builds one covariance vector per window.

    S: np array of shape (n_channels, batch * n_windows, window_size)

    returns:
    C: np array of shape (batch * n_windows, n_channels*(n_channels - 1)/2)
    """
    n_channels, n_rows = S.shape[0], S.shape[1]
    n_pairs = int(n_channels * (n_channels - 1) / 2)
    C = np.zeros((n_rows, n_pairs))
    for win, row in enumerate(C):
        # row is a view into C, so the assignment fills C in place
        row[:] = compute_covariance(S[:, win, :])

    return C

def convert(X, labels, frame=306, window=50, hop=32):
    """
    Turns a labelled multi-channel signal into windowed covariance features.

    Steps: locate the paired label positions, add one extra position after
    each pair, expand every position into a frame of `frame` samples, cut
    each frame into windows, and compute a covariance vector per window.

    X      : np array of shape (n_channels, n_samples)
    labels : np array of shape (n_samples,)
    frame  : number of samples per frame
    window : number of samples per window
    hop    : step in samples between consecutive windows of a frame

    returns:
    S : np array of shape (n_frames * n_windows, n_channels*(n_channels - 1)/2)
    y : np array of shape (n_frames,), the label at the start of each frame

    Note that S holds n_windows consecutive rows for every entry of y.

    Side effect: creates the 'converted' directory under the module-level
    `path`; nothing is written into it here.
    """
    storage = os.path.join(path, 'converted')
    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs(storage, exist_ok=True)

    pair_inds = get_listen_repeat_inds(labels)
    frame_starts = add_zero_inds(pair_inds)
    y = labels[frame_starts]

    sample_inds = prolong_inds(frame_starts, frame)
    windowed = window_inds(sample_inds, frame, window, hop)
    S = get_cov(split_signals(X, windowed))

    return S, y

In [None]:
# Sanity check: run the index pipeline with a short frame of 5 samples and
# look at the labels found at the resulting positions.
some_inds = prolong_inds(
    add_zero_inds(get_listen_repeat_inds(df.label.values)),
    5,
)
df.label[some_inds].values[:1000]

In [51]:
# Labels as a flat array; signal matrix taken from the third column onwards
# and transposed to (n_channels, n_samples), the layout convert() expects.
labels = df.label.values
X = df.iloc[:, 2:].to_numpy().T

S, y = convert(X, labels)
S.shape, y.shape

((3267, 36), (363,))

In [52]:
y

array([16,  6,  0, 17,  7,  0, 14,  4,  0, 17,  7,  0, 13,  3,  0, 17,  7,
        0, 17,  7,  0, 11,  1,  0, 15,  5,  0, 16,  6,  0, 12,  2,  0, 14,
        4,  0, 11,  1,  0, 15,  5,  0, 11,  1,  0, 15,  5,  0, 17,  7,  0,
       14,  4,  0, 16,  6,  0, 15,  5,  0, 15,  5,  0, 15,  5,  0, 13,  3,
        0, 17,  7,  0, 16,  6,  0, 14,  4,  0, 15,  5,  0, 11,  1,  0, 16,
        6,  0, 12,  2,  0, 11,  1,  0, 16,  6,  0, 16,  6,  0, 12,  2,  0,
       14,  4,  0, 13,  3,  0, 11,  1,  0, 17,  7,  0, 11,  1,  0, 12,  2,
        0, 15,  5,  0, 15,  5,  0, 14,  4,  0, 16,  6,  0, 17,  7,  0, 12,
        2,  0, 14,  4,  0, 12,  2,  0, 16,  6,  0, 13,  3,  0, 13,  3,  0,
       17,  7,  0, 15,  5,  0, 14,  4,  0, 12,  2,  0, 15,  5,  0, 12,  2,
        0, 17,  7,  0, 13,  3,  0, 15,  5,  0, 17,  7,  0, 16,  6,  0, 13,
        3,  0, 12,  2,  0, 13,  3,  0, 13,  3,  0, 13,  3,  0, 14,  4,  0,
       16,  6,  0, 17,  7,  0, 12,  2,  0, 16,  6,  0, 11,  1,  0, 12,  2,
        0, 14,  4,  0, 13

### Covariance Matrix Parameters:

    1.  Window (one row): 50 ms (50 samples), matching 1102 sound samples (6 FFT windows)
    2.  HOP_LENGTH: 32 ms (32 samples)
    3.  Frame size: 306 samples
    4.  9 windows per frame: (306 - 50) / 32 + 1 = 9

### STFT Parameters:

    1.  FRAME_SIZE = 512
    2.  SR = 22050
    3.  HOP_LENGTH = 118
    4.  Each row: 1102 samples = 6 windows, since (1102 - 512) / 118 + 1 = 6. Entire phoneme = 9 rows (6766 samples = 8 * 708 + 1102, where 708 = 6 * HOP_LENGTH)
