In [1]:
#This notebook uses forced-alignment HMMs to label ALL datasets and BOTH of the 
#train/test partitions ('HeldOutBlocks' and 'HeldOutTrials').

#For a walkthrough of how the data labeling step works see 'Step2_hmmDataLabeling_walkthrough', which steps 
#through a single example sentence while visualizing the key variables.

In [2]:
import numpy as np
import scipy.io
from characterDefinitions import getHandwritingCharacterDefinitions
from characterDefinitionsOrig import getHandwritingCharacterDefinitionsOrig
from dataLabelingStep import labelDataset, constructRNNTargets
import os
import datetime

#point this towards the top level dataset directory
#NOTE(review): this is a machine-specific absolute path. os.path.expanduser() only rewrites paths
#that start with '~', so it leaves the current value unchanged.
rootDir = os.path.expanduser('C:/handwritingBCI-main') + '/handwritingBCIData/'

#define which datasets to process
#the T5 sessions are kept here for reference; they are only processed if assigned to dataDirs
t5DataDirs = ['t5.2019.05.08','t5.2019.11.25','t5.2019.12.09','t5.2019.12.11','t5.2019.12.18',
              't5.2019.12.20','t5.2020.01.06','t5.2020.01.08','t5.2020.01.13','t5.2020.01.15']
#other selections used previously: 'IamOnline', 'IamOnline1' ... 'IamOnline12' (alone or combined with t5DataDirs)
dataDirs = ['IamOnline3']

#saves all labels in this folder
#makedirs also creates the intermediate 'RNNTrainingSteps' folder if it is missing, and
#exist_ok=True makes re-running this cell safe when the folder already exists
os.makedirs(rootDir + 'RNNTrainingSteps/Step2_HMMLabels', exist_ok=True)

In [3]:
#Labels every dataset in dataDirs once per train/test partition and saves the resulting
#time series labels to 'RNNTrainingSteps/Step2_HMMLabels/<partition>/<dataset>_timeSeriesLabels.mat'.

#The train/test partition files are the same for every dataset, so they are loaded once here
#instead of being re-read on each loop iteration.
cvPart_heldOutBlocks = scipy.io.loadmat(rootDir+'RNNTrainingSteps/trainTestPartitions_HeldOutBlocks.mat')
cvPart_heldOutTrials = scipy.io.loadmat(rootDir+'RNNTrainingSteps/trainTestPartitions_HeldOutTrials.mat')
cvParts = [cvPart_heldOutBlocks, cvPart_heldOutTrials]
cvFolderNames = ['HeldOutBlocks', 'HeldOutTrials']

#datasets whose HMM letter boundaries are overwritten with the contents of 'IamOnline_bdry_step2.mat'
boundaryOverrideDirs = ('IamOnline1', 'IamOnline2', 'IamOnline3', 'IamOnline5')

for dataDir in dataDirs:
    timeStart = datetime.datetime.now()

    #'IamOnline' datasets use their own file names, character definitions and partition indices
    isIamOnline = "IamOnline" in dataDir
    datasetPath = rootDir+'Datasets/'+dataDir

    #defines the list of all characters and what to call them
    if isIamOnline:
        charDef = getHandwritingCharacterDefinitions()
    else:
        charDef = getHandwritingCharacterDefinitionsOrig()
    print('Labeling ' + dataDir + ' dataset')

    #load sentences, single letter, time-warped files, and train/test partitions
    print(datasetPath)

    if isIamOnline:
        #this file is read only once; besides the sentence data it also holds the
        #'trainPartitionIdx' / 'testPartitionIdx' entries used further below
        sentenceDat = scipy.io.loadmat(datasetPath+'/'+'neuralCubeStruct_IamOnline.mat')
        sentenceDat['blockList'] = []
        singleLetterDat = scipy.io.loadmat(datasetPath+'/'+'singleChar_IamOnline.mat')
        print("Modifying to Iamonline data")
    else:
        sentenceDat = scipy.io.loadmat(datasetPath+'/sentences.mat')
        singleLetterDat = scipy.io.loadmat(datasetPath+'/singleLetters.mat')

    twCubes = scipy.io.loadmat(rootDir+'RNNTrainingSteps/Step1_TimeWarping/'+dataDir+'_warpedCubes.mat')

    #the last two sessions have hashmarks (#) to indicate that T5 should take a brief pause
    #here we remove these from the sentence prompts, otherwise the code below will get confused (because # isn't a character)
    for x in range(sentenceDat['sentencePrompt'].shape[0]):
        sentenceDat['sentencePrompt'][x,0][0] = sentenceDat['sentencePrompt'][x,0][0].replace('#','')

    sentences = sentenceDat['sentencePrompt'][:,0]
    sentenceLens = sentenceDat['numTimeBinsPerSentence'][:,0]

    #construct separate labels for each training partition
    for cvPart, cvFolder in zip(cvParts, cvFolderNames):
        print("Labeling '" + cvFolder + "' partition")
        if isIamOnline:
            #NOTE(review): these indices come from the dataset file and do not depend on cvPart, so
            #both partitions are labeled with identical inputs for 'IamOnline' datasets - confirm this is intended
            trainPartitionIdx = sentenceDat['trainPartitionIdx']
            testPartitionIdx = sentenceDat['testPartitionIdx']
        else:
            trainPartitionIdx = cvPart[dataDir+'_train']
            testPartitionIdx = cvPart[dataDir+'_test']
        print("Train Partition Index is :")
        print(trainPartitionIdx)
        print("Test Partition Index is :")
        print(testPartitionIdx)

        #label the data with an iterative forced alignment HMM
        letterStarts, letterDurations, blankWindows = labelDataset(dataDir, sentenceDat,
                                                                   singleLetterDat,
                                                                   twCubes,
                                                                   trainPartitionIdx,
                                                                   testPartitionIdx,
                                                                   charDef)

        if dataDir in boundaryOverrideDirs:
            #replace the HMM letter starts/durations row by row with the stored boundaries
            #(blankWindows from the HMM step is left as-is)
            modStep2 = scipy.io.loadmat(datasetPath+'/'+"IamOnline_bdry_step2.mat")
            idx = modStep2['idx']
            dur = modStep2['dur']
            lenX, lenY = idx.shape
            print("Now updating the word boundaries .....")
            print(lenX, lenY)
            for jj in range(lenX):
                letterStarts[jj] = idx[jj]
                letterDurations[jj] = dur[jj]
                print(jj, letterStarts[jj])

        #construct targets for supervised learning
        charStartTarget, charProbTarget, ignoreErrorHere = constructRNNTargets(letterStarts,
                                                                               letterDurations,
                                                                               sentenceDat['neuralActivityCube'].shape[1],
                                                                               sentences,
                                                                               charDef)

        saveDict = {}
        saveDict['letterStarts'] = letterStarts
        saveDict['letterDurations'] = letterDurations
        saveDict['charStartTarget'] = charStartTarget.astype(np.float32)
        saveDict['charProbTarget'] = charProbTarget.astype(np.float32)
        saveDict['ignoreErrorHere'] = ignoreErrorHere.astype(np.float32)
        saveDict['blankWindows'] = blankWindows
        saveDict['timeBinsPerSentence'] = sentenceDat['numTimeBinsPerSentence']

        #makedirs creates any missing parent folders and is a no-op if the folder already exists
        os.makedirs(rootDir + 'RNNTrainingSteps/Step2_HMMLabels/'+cvFolder, exist_ok=True)

        scipy.io.savemat(rootDir + 'RNNTrainingSteps/Step2_HMMLabels/'+cvFolder+'/'+dataDir+'_timeSeriesLabels.mat', saveDict)

    timeEnd = datetime.datetime.now()
    print('Total time taken: ' + str((timeEnd - timeStart).total_seconds()) + ' seconds')
    print(' ')


Labeling IamOnline3 dataset
C:/handwritingBCI-main/handwritingBCIData/Datasets/IamOnline3
Modifying to Iamonline data
Labeling 'HeldOutBlocks' partition
Train Partition Index is :
[[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
   72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
   90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
  108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
  126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
  144 145 146 147 148 149]]
Test Partition Index is :
[]
HMM Iteration 0
HMM Iteration 1
Now updating the word boundaries .....
150 200
0 [   4.  111.  172.  248.  378.  473.  533.  678.  801.  963. 1112. 1296.
 1406. 1545. 1677

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
Labeling 'HeldOutTrials' partition
Train Partition Index is :
[[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
   72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
   90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
  108 109 110 111 112 113 114 115 116 117 118 119 1

17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
Total time taken: 1745.540086 seconds
 
