In [1]:
import h5py
import pandas as pd

In [2]:
filepath = "t15_copyTask_neuralData/hdf5_data_final/t15.2023.08.11/data_train.hdf5"

In [3]:
import h5py

def load_h5py_file(file_path):
    data = {
        'neural_features': [],
        'n_time_steps': [],
        'seq_class_ids': [],
        'seq_len': [],
        'transcriptions': [],
        'sentence_label': [],
        'session': [],
        'block_num': [],
        'trial_num': [],
    }
    # Open the hdf5 file for that day
    with h5py.File(file_path, 'r') as f:

        keys = list(f.keys())

        # For each trial in the selected trials in that day
        for key in keys:
            g = f[key]

            neural_features = g['input_features'][:]
            n_time_steps = g.attrs['n_time_steps']
            seq_class_ids = g['seq_class_ids'][:] if 'seq_class_ids' in g else None
            seq_len = g.attrs['seq_len'] if 'seq_len' in g.attrs else None
            transcription = g['transcription'][:] if 'transcription' in g else None
            sentence_label = g.attrs['sentence_label'][:] if 'sentence_label' in g.attrs else None
            session = g.attrs['session']
            block_num = g.attrs['block_num']
            trial_num = g.attrs['trial_num']

            data['neural_features'].append(neural_features)
            data['n_time_steps'].append(n_time_steps)
            data['seq_class_ids'].append(seq_class_ids)
            data['seq_len'].append(seq_len)
            data['transcriptions'].append(transcription)
            data['sentence_label'].append(sentence_label)
            data['session'].append(session)
            data['block_num'].append(block_num)
            data['trial_num'].append(trial_num)
    return data


In [4]:
data = load_h5py_file(filepath)

In [5]:
list(data.keys())

['neural_features',
 'n_time_steps',
 'seq_class_ids',
 'seq_len',
 'transcriptions',
 'sentence_label',
 'session',
 'block_num',
 'trial_num']

In [6]:
f = h5py.File(filepath, 'r')

In [7]:
list(f.keys())

['trial_0000',
 'trial_0001',
 'trial_0002',
 'trial_0003',
 'trial_0004',
 'trial_0005',
 'trial_0006',
 'trial_0007',
 'trial_0008',
 'trial_0009',
 'trial_0010',
 'trial_0011',
 'trial_0012',
 'trial_0013',
 'trial_0014',
 'trial_0015',
 'trial_0016',
 'trial_0017',
 'trial_0018',
 'trial_0019',
 'trial_0020',
 'trial_0021',
 'trial_0022',
 'trial_0023',
 'trial_0024',
 'trial_0025',
 'trial_0026',
 'trial_0027',
 'trial_0028',
 'trial_0029',
 'trial_0030',
 'trial_0031',
 'trial_0032',
 'trial_0033',
 'trial_0034',
 'trial_0035',
 'trial_0036',
 'trial_0037',
 'trial_0038',
 'trial_0039',
 'trial_0040',
 'trial_0041',
 'trial_0042',
 'trial_0043',
 'trial_0044',
 'trial_0045',
 'trial_0046',
 'trial_0047',
 'trial_0048',
 'trial_0049',
 'trial_0050',
 'trial_0051',
 'trial_0052',
 'trial_0053',
 'trial_0054',
 'trial_0055',
 'trial_0056',
 'trial_0057',
 'trial_0058',
 'trial_0059',
 'trial_0060',
 'trial_0061',
 'trial_0062',
 'trial_0063',
 'trial_0064',
 'trial_0065',
 'trial_00

In [8]:
import os 

for folder, _, files in os.walk('t15_copyTask_neuralData/hdf5_data_final'):
    
    if 'data_train.hdf5' in files:

        print(folder, files)

t15_copyTask_neuralData/hdf5_data_final/t15.2024.07.21 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2024.04.28 ['data_train.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2024.07.28 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2025.03.30 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2023.10.13 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2023.09.24 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2025.01.10 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2023.12.08 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2023.08.27 ['data_test.hdf5', 'data_train.hdf5', 'data_val.hdf5']
t15_copyTask_neuralData/hdf5_data_final/t15.2

In [9]:
import torch 

class NeuralDataset(torch.utils.data.Dataset): 

    def __init__(self, dir):
        
        self.data = {
            'neural_features': [],
            'n_time_steps': [],
            'seq_class_ids': [], 
            'seq_len': [], 
            'transcription': [], 
            'sentence_label': [], 
            'session': [], 
            'block_num': [], 
            'trial_num': []
        }

        for folder, __, files in os.walk(dir): 

            if 'data_train.hdf5' in files: 

                # load file 
                f = h5py.File(os.path.join(folder, 'data_train.hdf5'))

                # loop through trials 
                for i in list(f.keys()): 

                    trial = f[i]

                    neural_features = trial['input_features'][:]
                    n_time_steps = trial.attrs['n_time_steps']
                    seq_class_ids = trial['seq_class_ids'][:] if 'seq_class_ids' in trial else None
                    seq_len = trial.attrs['seq_len'] if 'seq_len' in trial.attrs else None
                    transcription = trial['transcription'][:] if 'transcription' in trial else None
                    sentence_label = trial.attrs['sentence_label'][:] if 'sentence_label' in trial.attrs else None
                    session = trial.attrs['session']
                    block_num = trial.attrs['block_num']
                    trial_num = trial.attrs['trial_num']

                    # append trial features to data list 
                    self.data['neural_features'].append(neural_features)
                    self.data['n_time_steps'].append(n_time_steps)
                    self.data['seq_class_ids'].append(seq_class_ids)
                    self.data['seq_len'].append(seq_len)
                    self.data['transcription'].append(transcription)
                    self.data['sentence_label'].append(sentence_label)
                    self.data['session'].append(session)
                    self.data['block_num'].append(block_num)
                    self.data['trial_num'].append(trial_num)

    def __len__(self): 

        return len(self.data['neural_features'])

    def __getitem__(self, idx): 

        return {
            'neural_features': torch.tensor(self.data['neural_features'][idx]),
            'n_time_steps': torch.tensor(self.data['n_time_steps'][idx]),
            'seq_class_ids': torch.tensor(self.data['seq_class_ids'][idx]),
            'seq_len': torch.tensor(self.data['seq_len'][idx]),
            'transcription': self.data['transcription'][idx],
            'sentence_label': self.data['sentence_label'][idx],
            'session': self.data['session'][idx],
            'block_num': self.data['block_num'][idx],
            'trial_num': self.data['trial_num'][idx]
        }

        



In [10]:
brainDataset = NeuralDataset('t15_copyTask_neuralData/hdf5_data_final')

In [11]:
brainDataset.__getitem__(4)['neural_features'].shape

torch.Size([517, 512])

In [12]:
# create dataloader 
from torch.nn.utils.rnn import pad_sequence 
import numpy as np

def collate_fn(batch):

    # gather all the variable length items (neural_features, seq_labels) from the batch into separate python lists first so we can find the max length of each group and pad each group together

    neural_features = [i['neural_features'] for i in batch]
    seq_class_ids = [i['seq_class_ids'] for i in batch]
    n_time_steps = [i['n_time_steps'] for i in batch]
    seq_len = [i['seq_len'] for i in batch] 
    transcription = [i['transcription'] for i in batch]
    sentence_label = [i['sentence_label'] for i in batch]
    session = [i['session'] for i in batch]
    block_num = [i['block_num'] for i in batch]
    trial_num = [i['trial_num'] for i in batch]
    # neural_lengths = [len(i) for i in neural_features]
    # seq_class_lengths = [len(i) for i in seq_class_ids]

    # max_neural_idx = np.argmax(neural_lengths)
    # max_seq_class_idx = np.argmax(seq_class_lengths)

    # max_neural_len = neural_lengths[max_neural_idx]
    # max_seq_len = seq_class_lengths[max_seq_class_idx]

    neural_features_padded = pad_sequence(neural_features, batch_first=True, padding_value=0)
    seq_class_ids_padded = pad_sequence(seq_class_ids, batch_first=True, padding_value=0)

    return {
        'neural_features': neural_features_padded,
        'seq_class_ids': seq_class_ids_padded,
        'n_time_steps': n_time_steps,
        'seq_len': seq_len,
        'transcription': transcription,
        'sentence_label': sentence_label,
        'session': session,
        'block_num': block_num,
        'trial_num': trial_num
    }

    

In [13]:

from torch.utils.data import DataLoader

trainLoader = DataLoader(brainDataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

In [14]:
next(iter(trainLoader))

{'neural_features': tensor([[[-0.3003,  0.4238, -0.7064,  ..., -0.3712, -0.3446,  0.2089],
          [-0.3003, -0.7120, -0.7064,  ...,  3.3335,  0.9658, -0.3117],
          [-0.3003, -0.7120, -0.7064,  ...,  0.9588, -1.1609,  1.6493],
          ...,
          [-0.3003, -0.7120,  3.0599,  ..., -0.8327,  1.0939, -0.1702],
          [-0.3003, -0.7120,  0.5490,  ..., -0.9575, -0.3140, -1.1289],
          [-0.3003, -0.7120, -0.7064,  ...,  0.0992, -1.4184,  0.5835]],
 
         [[-0.2886, -0.6776,  0.6852,  ..., -0.8004,  0.0621,  0.5236],
          [-0.2886,  0.5355,  3.3447,  ...,  1.2232,  0.5247, -0.9197],
          [-0.2886,  0.5355, -0.6445,  ..., -0.2722, -0.6868, -0.5445],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[-0.2746,  0.5775, -0.6807,  ...,  0.9687, -0.0815,  0.6085],
         

In [43]:
LOGIT_TO_PHONEME = [
   # "BLANK" = CTC blank symbol
'AA', 'AE', 'AH', 'AO', 'AW',
'AY', 'B', 'CH', 'D', 'DH',
'EH', 'ER', 'EY', 'F', 'G',
'HH', 'IH', 'IY', 'JH', 'K',
'L', 'M', 'N', 'NG', 'OW',
'OY', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UW', 'V',
'W', 'Y', 'Z', 'ZH',
' | ',    # "|" = silence token
] 

len(LOGIT_TO_PHONEME)

40

In [16]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [74]:
# define model 

import torch.nn.functional as F

class BaselineLSTM(torch.nn.Module):

    def __init__(self):

        super().__init__()

        # input (B x T x 512) --> output (B x T x 41)

        self.rnn = torch.nn.LSTM(input_size=512,hidden_size=768, num_layers=5)
        self.proj = torch.nn.Linear(in_features=768, out_features=41)


    def forward(self, x): 
        # print('rnn input: ', x.shape)
        x, _ = self.rnn(x)
        # print('linear input: ', x.shape)
        x = F.log_softmax(self.proj(x),dim=2)

        return x 


In [75]:
model = BaselineLSTM().to(device)

In [76]:
print(model)

BaselineLSTM(
  (rnn): LSTM(512, 768, num_layers=5)
  (proj): Linear(in_features=768, out_features=41, bias=True)
)


In [None]:
# define training loop 
import torch.optim as optim 

optimizer = optim.SGD(params=model.parameters(), lr=1e-5)
loss_fn = torch.nn.CTCLoss()
num_epochs = 2

model.train()

epoch_loss = 0
train_losses = []
for epoch in range(num_epochs):

    
    train_loss = 0
    num_batches = len(trainLoader)
    for i, batch in enumerate(trainLoader):

        # place tensors on device
        inputs, targets = batch['neural_features'].to(device), batch['seq_class_ids'].to(device)
        inputs = torch.transpose(inputs, 0, 1)
        # print(f'inputs shape: {inputs.shape}')
        # print(f'targets shape: {targets.shape}')
        input_lengths, target_lengths = torch.tensor(batch['n_time_steps']).to(device), torch.tensor(batch['seq_len']).to(device)
        # print(f'input_lengths shape: {input_lengths.shape}')
        # print(f'target_lengths shape: {target_lengths.shape}')

        # zero optimizer
        optimizer.zero_grad()

        # forward 
        output = model.forward(inputs)
        # print(f'output shape: {output.shape}' )

        # compute loss 
        loss = loss_fn(output, targets, input_lengths, target_lengths)
        train_loss += loss.item()
        train_losses.append(train_loss)

        # backprop 
        loss.backward()

        # update weights 
        optimizer.step()
    
    train_loss /= num_batches 
    print(f'Epoch {epoch} loss: {train_loss}')





KeyboardInterrupt: 

In [21]:
sample = brainDataset.__getitem__(27)

In [22]:
''.join([chr(int(i)) for i in sample['transcription']]).replace('\x00','')

"You can't make a decision."

In [23]:
LOGIT_TO_PHONEME[1]

'AA'

In [24]:
[LOGIT_TO_PHONEME[i] for i in sample['seq_class_ids']]

['Y',
 'UW',
 ' | ',
 'K',
 'AE',
 'N',
 'T',
 ' | ',
 'M',
 'EY',
 'K',
 ' | ',
 'AH',
 ' | ',
 'D',
 'IH',
 'S',
 'IH',
 'ZH',
 'AH',
 'N',
 ' | ',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',
 'BLANK',


In [68]:
import nltk
from nltk.corpus import cmudict

In [26]:
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to /home/max-
[nltk_data]     rivera/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [27]:
d = cmudict.dict()

In [28]:
d['carnivore'][0]

['K', 'AA1', 'R', 'N', 'IH0', 'V', 'AO2', 'R']

In [29]:
[i.replace('\d+','') for i in d['shoe'][0]]

  [i.replace('\d+','') for i in d['shoe'][0]]


['SH', 'UW1']

In [30]:
import re 

s = 'UW1'

re.sub(r'\d','',s)


'UW'

In [31]:
[re.sub(r'\d','',s) for s in d['shoe'][0]]

['SH', 'UW']

In [59]:
LOGIT_TO_PHONEME = [
"BLANK",# "BLANK" = CTC blank symbol
'AA', 'AE', 'AH', 'AO', 'AW',
'AY', 'B', 'CH', 'D', 'DH',
'EH', 'ER', 'EY', 'F', 'G',
'HH', 'IH', 'IY', 'JH', 'K',
'L', 'M', 'N', 'NG', 'OW',
'OY', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UW', 'V',
'W', 'Y', 'Z', 'ZH',
# ' | ',    # "|" = silence token
] 


In [60]:
from pyctcdecode import build_ctcdecoder

In [64]:
decoder = build_ctcdecoder(
    labels=LOGIT_TO_PHONEME,
    kenlm_model_path="4-gram.arpa",
    alpha=0.4,
    beta=1.6,
    unigrams=None
)

Loading the LM will be faster if you build a binary file.
Reading /home/max-rivera/git/kaggle-brain-to-text-25/4-gram.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Space token ' ' missing from vocabulary.
Unigrams and labels don't seem to agree.


In [67]:
# infer on a sample 
sample = brainDataset.__getitem__(25)
print('sample: ', ''.join([chr(i) for i in sample['transcription']]))

model.eval()

with torch.no_grad():

    inputs, targets = sample['neural_features'].to(device), sample['seq_class_ids'].to(device)

    print(inputs.shape)

    inputs = torch.unsqueeze(inputs,0)

    print(inputs.shape)

    outputs = model(inputs)

    logits = outputs[0]  # or outputs.logits[0] depending on your model output

    logits_np = logits.detach().cpu().numpy()

    print(decoder.decode(logits_np,beam_width=50))

sample:  I grew up in Alabama.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
torch.Size([857, 512])
torch.Size([1, 857, 512])
RUHR


In [65]:
model.eval()
with torch.no_grad():
    # sample['neural_features'] has shape [T, features]
    inputs = sample['neural_features'].to(device)          # -> [T, 512]
    inputs = inputs.unsqueeze(1)                          # -> [T, 1, 512] (seq_len, batch=1, feat)
    outputs = model(inputs)                              # -> [T, 1, num_labels]
    logits = outputs[:, 0, :]                            # -> [T, num_labels]
    logits_np = logits.detach().cpu().numpy()

    # Try decode with log-probs first, if it looks off try np.exp(logits_np)
    print("outputs.shape:", outputs.shape)
    print("logits.shape:", logits.shape)
    print("decode (log-probs):", decoder.decode(logits_np, beam_width=50))
    print("decode (probs):", decoder.decode(np.exp(logits_np), beam_width=50))

outputs.shape: torch.Size([857, 1, 41])
logits.shape: torch.Size([857, 41])
decode (log-probs): R
decode (probs): R
