In [6]:
# Formats the competitionData into tfRecords for RNN training, including
# block-wise feature normalization.
#
# Assumes the notebook is run from inside the speechBCI checkout (i.e. `ls`
# shows AnalysisExamples).
import os

# Root folder: the notebook reads `<baseDir>/competitionData` and writes
# `<baseDir>/derived/tfRecords`.  Set the SPEECHBCI_BASE_DIR environment
# variable to point somewhere else without editing the notebook.
# Locations used previously, kept for reference:
#   "/home/s2/nlp002/nlp/speechBCI"
#   "/oak/stanford/groups/henderj/fwillett/speechPaperRelease_08_20"
baseDir = os.environ.get("SPEECHBCI_BASE_DIR") or "/home/s2/nlp002/pj_data"

os.makedirs(baseDir + "/derived/tfRecords", exist_ok=True)

In [12]:
import scipy.io
import numpy as np
import tensorflow as tf
import os
from pathlib import Path
import matplotlib.pyplot as plt
from g2p_en import G2p
import re
# from neuralDecoder.datasets.speechDataset import PHONE_DEF, VOWEL_DEF, CONSONANT_DEF, SIL_DEF, PHONE_DEF_SIL

import pathlib
import random
import numpy as np
import tensorflow as tf

# The 39 phoneme symbols, in the order that defines their class index.
PHONE_DEF = [
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B',  'CH',
    'D',  'DH', 'EH', 'ER', 'EY', 'F',  'G',  'HH',
    'IH', 'IY', 'JH', 'K',  'L',  'M',  'N',  'NG',
    'OW', 'OY', 'P',  'R',  'S',  'SH', 'T',  'TH',
    'UH', 'UW', 'V',  'W',  'Y',  'Z',  'ZH',
]

# Silence marker on its own.
SIL_DEF = ['SIL']

# Phoneme inventory followed by the silence symbol (a separate list object).
PHONE_DEF_SIL = PHONE_DEF + ['SIL']

# Same ordering as PHONE_DEF, but without AO, CH, JH, OY, SH and ZH.
CHANG_PHONE_DEF = [
    'AA', 'AE', 'AH', 'AW', 'AY', 'B',  'D',  'DH',
    'EH', 'ER', 'EY', 'F',  'G',  'HH', 'IH', 'IY',
    'K',  'L',  'M',  'N',  'NG', 'OW', 'P',  'R',
    'S',  'T',  'TH', 'UH', 'UW', 'V',  'W',  'Y',
    'Z',
]

# Split of PHONE_DEF into consonants and vowels.
CONSONANT_DEF = [
    'CH', 'SH', 'JH', 'R',  'B',  'M',  'W',  'V',
    'F',  'P',  'D',  'N',  'L',  'S',  'T',  'Z',
    'TH', 'G',  'Y',  'HH', 'K',  'NG', 'ZH', 'DH',
]
VOWEL_DEF = [
    'EY', 'AE', 'AY', 'EH', 'AA', 'AW', 'IY', 'IH',
    'OY', 'OW', 'AO', 'UH', 'AH', 'UW', 'ER',
]

class SpeechDataset():
    """Input pipeline that turns a folder of `*.tfrecord` files into a `tf.data.Dataset`.

    Every record is parsed into a dict with the keys `inputFeatures`,
    `newClassSignal`, `ceMask`, `seqClassIDs`, `nTimeSteps`, `nSeqElements`
    and `transcription`.  Optionally the pipeline samples from a second
    ("synthetic") folder, applies a random time warp to each example, and
    keeps only a subset of the input channels.

    Args:
        rawFileDir: Folder that is searched for `*.tfrecord` files.
        nInputFeatures: Number of values per time step in `inputFeatures`.
        nClasses: Stored on the instance; not used by `build`.
        maxSeqElements: Fixed length of the `seqClassIDs` and `transcription`
            vectors in each record.
        bufferSize: Shuffle-buffer size used when building a training dataset.
        syntheticFileDir: Optional second folder of `*.tfrecord` files.
        syntheticMixingRate: Sampling weight of the synthetic dataset; the raw
            dataset gets `1 - syntheticMixingRate`.
        subsetSize: If > 0, a training dataset keeps only the first
            `subsetSize` parsed examples.
        labelDir: Stored on the instance; not used by `build`.
        timeWarpSmoothSD: Standard deviation of the Gaussian kernel that
            smooths the time-warp noise.
        timeWarpNoiseSD: Standard deviation of the white noise that drives the
            time warp.  Warping is active only when both SDs are > 0.
        chanIndices: Optional indices (last axis of `inputFeatures`) to keep;
            the selected channels are zero-padded back to a width of 256.
    """

    def __init__(self,
                 rawFileDir,
                 nInputFeatures,
                 nClasses,
                 maxSeqElements,
                 bufferSize,
                 syntheticFileDir=None,
                 syntheticMixingRate=0.33,
                 subsetSize=-1,
                 labelDir=None,
                 timeWarpSmoothSD=0.0,
                 timeWarpNoiseSD=0.0,
                 chanIndices=None
                 ):

        self.rawFileDir = rawFileDir
        self.nInputFeatures = nInputFeatures
        self.nClasses = nClasses
        self.maxSeqElements = maxSeqElements
        self.bufferSize = bufferSize
        self.syntheticFileDir = syntheticFileDir
        self.syntheticMixingRate = syntheticMixingRate
        self.timeWarpSmoothSD = timeWarpSmoothSD
        self.timeWarpNoiseSD = timeWarpNoiseSD
        self.subsetSize = subsetSize
        # Previously accepted but silently discarded; keep it so callers can read it back.
        self.labelDir = labelDir
        self.chanIndices = chanIndices

    def _selectAndPadChannels(self, inputFeatures):
        """Keep only `self.chanIndices` on the last axis, then zero-pad that axis to 256."""
        selectChans = tf.gather(inputFeatures, tf.constant(self.chanIndices), axis=-1)
        # NOTE(review): the target width 256 is hard-coded rather than taken
        # from nInputFeatures - confirm this is intended before changing it.
        paddings = [[0, 0], [0, 256-tf.shape(selectChans)[-1]]]
        return tf.pad(selectChans, paddings, 'CONSTANT', constant_values=0)

    def build(self, batchSize, isTraining):
        """Assemble the dataset.

        Args:
            batchSize: Batch size passed to `padded_batch`.
            isTraining: When True, the file order is shuffled, examples are
                shuffled with `bufferSize`, and a second dataset for adapting
                a normalization layer is returned as well.

        Returns:
            `(dataset, datasetForAdapt)` when `isTraining` is True, where
            `datasetForAdapt` yields un-batched `inputFeatures + 0.001`;
            otherwise just the batched `dataset`.
        """
        def _loadDataset(fileDir):
            files = sorted([str(x) for x in pathlib.Path(fileDir).glob("*.tfrecord")])
            if isTraining:
                random.shuffle(files)

            return tf.data.TFRecordDataset(files)

        print(f'Load data from {self.rawFileDir}')
        rawDataset = _loadDataset(self.rawFileDir)
        if self.syntheticFileDir and self.syntheticMixingRate > 0:
            print(f'Load data from {self.syntheticFileDir}')
            syntheticDataset = _loadDataset(self.syntheticFileDir)
            # Both sources repeat forever, so the mixed dataset never ends.
            dataset = tf.data.experimental.sample_from_datasets(
                [rawDataset.repeat(), syntheticDataset.repeat()],
                weights=[1.0 - self.syntheticMixingRate, self.syntheticMixingRate])
        else:
            dataset = rawDataset

        # Schema of one serialized example.  `classLabelsOneHot` may be present
        # in the files but is deliberately not parsed.
        datasetFeatures = {
            "inputFeatures": tf.io.FixedLenSequenceFeature([self.nInputFeatures], tf.float32, allow_missing=True),
            #"classLabelsOneHot": tf.io.FixedLenSequenceFeature([self.nClasses+1], tf.float32, allow_missing=True),
            "newClassSignal": tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
            "ceMask": tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
            "seqClassIDs": tf.io.FixedLenFeature((self.maxSeqElements), tf.int64),
            "nTimeSteps": tf.io.FixedLenFeature((), tf.int64),
            "nSeqElements": tf.io.FixedLenFeature((), tf.int64),
            "transcription": tf.io.FixedLenFeature((self.maxSeqElements), tf.int64)
        }

        if self.timeWarpNoiseSD>0 and self.timeWarpSmoothSD>0:
            # `scipy.ndimage.filters` is a deprecated alias; import from the public namespace.
            from scipy.ndimage import gaussian_filter1d

            # Build the smoothing kernel by filtering a unit impulse.
            inp = np.zeros([200])
            inp[int(len(inp)/2)] = 1
            gaussKernel = gaussian_filter1d(inp, self.timeWarpSmoothSD)

            # Drop negligible taps and renormalize.  Boolean masking keeps the
            # kernel 1-D even when a single tap survives.
            gaussKernel = gaussKernel[gaussKernel>0.001]
            gaussKernel = gaussKernel/np.sum(gaussKernel)
            # conv1d filter layout: [width, in_channels, out_channels].
            convKernel = gaussKernel[:,np.newaxis,np.newaxis].astype(np.float32)

            timeWarpNoiseSD = self.timeWarpNoiseSD

            def parseDatasetFunctionWarp(exampleProto):
                dat = tf.io.parse_single_example(exampleProto, datasetFeatures)

                warpDat = {}
                warpDat['seqClassIDs'] = dat['seqClassIDs']
                warpDat['nSeqElements'] = dat['nSeqElements']
                warpDat['transcription'] = dat['transcription']

                # Smoothed noise around a nominal rate of 1; twice as many
                # steps as the trial has so the warp can also slow it down.
                whiteNoise = tf.random.normal([dat['nTimeSteps']*2], mean=0, stddev=timeWarpNoiseSD)
                rateNoise = tf.nn.conv1d(whiteNoise[tf.newaxis,:,tf.newaxis], convKernel, 1, 'SAME')

                rateNoise = rateNoise[0,:,0]
                toSum = tf.ones([dat['nTimeSteps']*2], dtype=tf.float32) + rateNoise
                # Clamp negative rates so the warp function never runs backwards.
                toSum = tf.nn.relu(toSum)

                # The cumulative rate gives, per output step, the source index to read.
                warpFun = tf.cumsum(toSum)
                resampleIdx = tf.cast(warpFun, dtype=tf.int32)
                resampleIdx = resampleIdx[resampleIdx<tf.cast(dat['nTimeSteps'],dtype=tf.int32)]

                # Number of retained indices == length of the warped trial.
                warpDat['nTimeSteps'] = tf.cast(tf.reduce_sum(tf.cast(resampleIdx>-1,dtype=tf.int32)), dtype=tf.int32)
                warpDat['inputFeatures'] = tf.gather(dat['inputFeatures'], resampleIdx, axis=0)
                if self.chanIndices is not None:
                    warpDat['inputFeatures'] = self._selectAndPadChannels(warpDat['inputFeatures'])
                warpDat['newClassSignal'] = tf.gather(dat['newClassSignal'], resampleIdx, axis=0)
                warpDat['ceMask'] = tf.gather(dat['ceMask'], resampleIdx, axis=0)

                return warpDat

            dataset = dataset.map(parseDatasetFunctionWarp, num_parallel_calls=tf.data.AUTOTUNE)

        else:
            def parseDatasetFunctionSimple(exampleProto):
                dat = tf.io.parse_single_example(exampleProto, datasetFeatures)
                if self.chanIndices is None:
                    return dat

                newDat = {}
                newDat['seqClassIDs'] = dat['seqClassIDs']
                newDat['nSeqElements'] = dat['nSeqElements']
                newDat['transcription'] = dat['transcription']
                newDat['nTimeSteps'] = dat['nTimeSteps']
                newDat['newClassSignal'] = dat['newClassSignal']
                newDat['ceMask'] = dat['ceMask']
                newDat['inputFeatures'] = self._selectAndPadChannels(dat['inputFeatures'])
                return newDat

            dataset = dataset.map(parseDatasetFunctionSimple, num_parallel_calls=tf.data.AUTOTUNE)

        if isTraining:
            # Use all elements to adapt normalization layer
            datasetForAdapt = dataset.map(lambda x: x['inputFeatures'] + 0.001,
                num_parallel_calls=tf.data.AUTOTUNE)

            # Take a subset of the data if specified
            if self.subsetSize > 0:
                dataset = dataset.take(self.subsetSize)

            # Shuffle and transform data if training
            dataset = dataset.shuffle(self.bufferSize)
            # Repeat only when the mixing rate is exactly 0; the mixed dataset
            # above is already infinite.
            if self.syntheticMixingRate == 0:
                dataset = dataset.repeat()
            dataset = dataset.padded_batch(batchSize)
            dataset = dataset.prefetch(tf.data.AUTOTUNE)

            return dataset, datasetForAdapt
        else:
            dataset = dataset.padded_batch(batchSize)
            dataset = dataset.prefetch(tf.data.AUTOTUNE)

            return dataset

def makeTFRecordsFromCompetitionFiles(sessionName, dataPath, tfRecordFolder):
    """Convert one session's competition `.mat` files into TFRecord folders.

    For each partition (`train`, `test`, `competitionHoldOut`) the file
    `<dataPath>/<partition>/<sessionName>.mat` is loaded if it exists, the
    per-trial features are z-scored block by block, and the result is handed
    to `convertToTFRecord`, which writes into `<tfRecordFolder>/<partition>`.

    Args:
        sessionName: File stem of the session (without `.mat`).
        dataPath: Folder containing the partition sub-folders.
        tfRecordFolder: Output root for this session.

    Returns:
        None.  Partitions whose `.mat` file is missing are skipped silently.
    """
    for partName in ['train', 'test', 'competitionHoldOut']:
        sessionPath = dataPath + '/' + partName + '/' + sessionName + '.mat'
        if not os.path.isfile(sessionPath):
            continue

        dat = scipy.io.loadmat(sessionPath)
        n_trials = dat['sentenceText'].shape[0]

        input_features = []
        transcriptions = []
        frame_lens = []

        #collect area 6v tx1 and spikePow features
        for i in range(n_trials):
            #get time series of TX and spike power for this trial
            #first 128 columns = area 6v only
            features = np.concatenate([dat['tx1'][0,i][:,0:128], dat['spikePow'][0,i][:,0:128]], axis=1)

            input_features.append(features)
            transcriptions.append(dat['sentenceText'][i].strip())
            frame_lens.append(features.shape[0])

        #block-wise feature normalization
        # Flatten instead of squeeze so a single-trial file still yields a 1-D array.
        blockNums = np.asarray(dat['blockIdx']).reshape(-1)
        for blockNum in np.unique(blockNums):
            trialsInBlock = np.flatnonzero(blockNums == blockNum)

            # Pool exactly the trials that belong to this block.  Slicing from
            # the first to the last index would also pull in trials of other
            # blocks whenever a block's trials are not contiguous.
            feats = np.concatenate([input_features[i] for i in trialsInBlock], axis=0)
            feats_mean = np.mean(feats, axis=0, keepdims=True)
            feats_std = np.std(feats, axis=0, keepdims=True)
            for i in trialsInBlock:
                # The small epsilon guards against constant (zero-variance) channels.
                input_features[i] = (input_features[i] - feats_mean) / (feats_std + 1e-8)

        #convert to tfRecord file
        session_data = {
            'inputFeatures': input_features,
            'transcriptions': transcriptions,
            'frameLens': frame_lens
        }

        folderName = tfRecordFolder+'/'+partName
        convertToTFRecord(session_data,
                          folderName,
                          np.arange(0,len(input_features)).astype(np.int32))
        
def convertToTFRecord(sessionData, recordFolder, partIdx, nClasses=31, maxSeqLen=500, verbose=True):
    """Serialize selected trials of a session into `<recordFolder>/chunk_0.tfrecord`.

    For every trial the sentence text is cleaned, converted to a phoneme
    sequence with `g2p_en` (a 'SIL' symbol is emitted for every space and once
    at the end), and written together with the neural features as one
    `tf.train.Example`.

    Args:
        sessionData: Dict with the parallel lists `inputFeatures` (one 2-D
            array per trial), `transcriptions` (sentence strings) and
            `frameLens` (number of valid time steps per trial).
        recordFolder: Output folder; created if it does not exist.
        partIdx: Iterable of trial indices to write.
        nClasses: Width of the all-zero `classLabelsOneHot` placeholder.
        maxSeqLen: Fixed length of the stored `seqClassIDs` and
            `transcription` vectors; shorter sequences are zero-padded.
        verbose: When True (default), print the indices and, per trial, the
            phonemes, cleaned transcription and first ten class IDs.

    Raises:
        ValueError: If a phoneme sequence or cleaned transcription is longer
            than `maxSeqLen`, or a phoneme is not in `PHONE_DEF_SIL`.
    """
    g2p = G2p()

    def _floats_feature(value):
        return tf.train.Feature(float_list=tf.train.FloatList(value=value))

    def _ints_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

    def _convert_to_ascii(text):
        return [ord(char) for char in text]

    def phoneToId(p):
        return PHONE_DEF_SIL.index(p)

    saveDir = Path(recordFolder)
    saveDir.mkdir(parents=True, exist_ok=True)
    if verbose:
        print(partIdx)

    with tf.io.TFRecordWriter(str(saveDir.joinpath('chunk_0.tfrecord'))) as writer:
        for trialIdx in partIdx:
            inputFeats = sessionData['inputFeatures'][trialIdx]
            nFrames = inputFeats.shape[0]

            # All-zero placeholders that are written to the record as-is.
            classLabels = np.zeros([nFrames, nClasses]).astype(np.float32)
            newClassSignal = np.zeros([nFrames, 1]).astype(np.float32)
            seqClassIDs = np.zeros([maxSeqLen]).astype(np.int32)

            thisTranscription = sessionData['transcriptions'][trialIdx]

            # Remove punctuation
            thisTranscription = re.sub(r'[^a-zA-Z\- \']', '', thisTranscription)
            thisTranscription = thisTranscription.replace('--', '').lower()
            if len(thisTranscription) == 0:
                # Copy so the module-level SIL_DEF list is never shared.
                phonemes = list(SIL_DEF)
            else:
                phonemes = []
                for p in g2p(thisTranscription):
                    if p==' ':
                        phonemes.append('SIL')

                    p = re.sub(r'[0-9]', '', p)  # Remove stress
                    if re.match(r'[A-Z]+', p):  # Only keep phonemes
                        phonemes.append(p)

                #add one SIL symbol at the end so there's one at the end of each word
                phonemes.append('SIL')

            seqLen = len(phonemes)
            if seqLen > maxSeqLen:
                raise ValueError(
                    f'Trial {trialIdx}: phoneme sequence has {seqLen} elements, '
                    f'which exceeds maxSeqLen={maxSeqLen}')
            # IDs are shifted by one so that 0 is left for padding.
            seqClassIDs[0:seqLen] = [phoneToId(p) + 1 for p in phonemes]
            if verbose:
                print(phonemes)
                print(thisTranscription)

            ceMask = np.zeros([nFrames]).astype(np.float32)
            ceMask[0:sessionData['frameLens'][trialIdx]] = 1

            if len(thisTranscription) > maxSeqLen:
                raise ValueError(
                    f'Trial {trialIdx}: transcription has {len(thisTranscription)} characters, '
                    f'which exceeds maxSeqLen={maxSeqLen}')
            # Character codes of the cleaned sentence, zero-padded to maxSeqLen.
            paddedTranscription = np.zeros([maxSeqLen]).astype(np.int32)
            paddedTranscription[0:len(thisTranscription)] = np.array(_convert_to_ascii(thisTranscription))

            feature = {'inputFeatures': _floats_feature(np.ravel(inputFeats).tolist()),
                'classLabelsOneHot': _floats_feature(np.ravel(classLabels).tolist()),
                'newClassSignal': _floats_feature(np.ravel(newClassSignal).tolist()),
                'seqClassIDs': _ints_feature(seqClassIDs),
                'nTimeSteps': _ints_feature([sessionData['frameLens'][trialIdx]]),
                'nSeqElements': _ints_feature([seqLen]),
                'ceMask': _floats_feature(np.ravel(ceMask).tolist()),
                'transcription': _ints_feature(paddedTranscription)}

            if verbose:
                print(seqClassIDs[0:10])
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())


In [13]:
from getSpeechSessionBlocks import getSpeechSessionBlocks

# Each entry of the block list starts with the session name; convert every
# session's competition files into its own tfRecords folder under baseDir.
blockLists = getSpeechSessionBlocks()

for sessIdx, sessionEntry in enumerate(blockLists):
    sessionName = sessionEntry[0]
    dataPath = f"{baseDir}/competitionData"
    tfRecordFolder = f"{baseDir}/derived/tfRecords/{sessionName}"
    makeTFRecordsFromCompetitionFiles(sessionName, dataPath, tfRecordFolder)

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 24