# Process File Of Aligned Transcripts

In [1]:
import json
import numpy as np
import copy

In [2]:
### punctuation encoder
# Lookup table from a punctuation character to the label used for it.
# Built from two parallel tuples; position i of one pairs with position i
# of the other.
_punChars = (",", ".", ";", ":", "!", "?")
_punLabels = ("COMMA", "PERIOD", "SEMICOLON", "COLON", "EXCLAMATION", "QUESTION")
punEnc = dict(zip(_punChars, _punLabels))

In [3]:
### Extract All The Words + Words Begins + Words Ends
### The JSON file is read as a dictionary; every value is expected to carry
### the three lists 'words', 'words_begins' and 'words_ends'. The lists of
### all entries are concatenated, in key order, into three flat lists.

# fileName = "AudioFeatures/xaa_00"
fileName = "AudioFeatures/aligned_transcripts.json"
with open(fileName, "r") as inpFile:
    data = json.load(inpFile)

listDictio = list(data.keys())
numDictio = len(listDictio)

listAllWords = []
listAllBegins = []
listAllEnds = []
for key in listDictio:
    entry = data[key]
    listAllWords.extend(entry['words'])
    listAllBegins.extend(entry['words_begins'])
    listAllEnds.extend(entry['words_ends'])

# the three totals should agree: one begin and one end per word
print("Number Of Words = ", len(listAllWords))
print("Number Of Begins = ", len(listAllBegins))
print("Number Of Ends = ", len(listAllEnds))

Number Of Words =  1671024
Number Of Begins =  1671024
Number Of Ends =  1671024


In [4]:
### Write The Output File
### One row per word:  word (lower-cased) <TAB> label <TAB> begin <TAB> end
### The label is the punEnc value of the punctuation that follows the word,
### or "SPACE" when no punctuation follows it.

numWords = len(listAllWords)
nameOutFile = "outFile_00.txt"
countBegins = 0  # not read anywhere in this cell; kept in case another cell uses it
# "with" closes the file even if a row raises part-way through
with open(nameOutFile, "w") as outFile:
    # a single loop covers every word, the last one included, so the last
    # word follows exactly the same rules as all the others
    for i in range(numWords):
        word = listAllWords[i]
        begin = str(listAllBegins[i])
        end = str(listAllEnds[i])
        # the last word has no successor
        nextWord = listAllWords[i+1] if i+1 < numWords else None
        # a stand-alone punctuation token never gets a row of its own; it only
        # serves as the label of the word that precedes it
        if word in punEnc:
            continue
        text = word
        if nextWord in punEnc:
            # the next token is punctuation
            label = punEnc[nextWord]
        elif len(word) > 1 and word[-1] in punEnc:
            # the punctuation is glued to the end of the word: strip it.
            # The length guard applies to every punctuation character, so an
            # empty word cannot raise IndexError on word[-1].
            text = word[:-1]
            label = punEnc[word[-1]]
        else:
            label = "SPACE"
        outFile.write(text.lower() + "\t" + label + "\t" + begin + "\t" + end + "\n")

In [5]:
### Clean The Output File, Eliminate Words Where Begin Time And/Or End Time Is 'None'
### NOTE: the input is whatever nameOutFile currently names. Later cells rebind
### nameOutFile, so this cell must run right after the cell that writes outFile_00.txt.

nameOutFileCleaned = "outFile_01.txt"
with open(nameOutFile, "r") as outFile_00, open(nameOutFileCleaned, "w") as outFile_01:
    # stream the file line by line instead of loading it whole
    for line in outFile_00:
        # keep the row only if the text 'None' appears nowhere in it
        # (presumably only a missing begin/end time can produce it -- the
        # words are written lower-cased and the labels upper-cased)
        if 'None' not in line:
            outFile_01.write(line)

In [6]:
### Apply Corrections To The Output File
### The rows are processed in consecutive sequences of sequenceSize rows
### (the last sequence may be shorter). Inside each sequence the time stamps
### are corrected and then shifted so that the first word begins at zero.

nameInpFile = "outFile_01.txt"
nameOutFile = "outFile_02.txt"
sequenceSize = 32

with open(nameInpFile, "r") as inpFile:
    lines = inpFile.readlines()
numLine = len(lines)

dataWords = []
dataPuncs = []
dataBegins = []
dataEnds = []

# Slicing stops at the end of the list, so the final window simply holds
# whatever rows are left over.
for start in range(0, numLine, sequenceSize):
    window = [row.split("\t") for row in lines[start:start + sequenceSize]]
    sequenceWords = [fields[0] for fields in window]
    sequencePuncs = [fields[1] for fields in window]
    sequenceBegins = [float(fields[2]) for fields in window]
    sequenceEnds = [float(fields[3]) for fields in window]

    ### CORRECTION 1
    ### Work on array copies; the comparisons below always use the
    ### uncorrected values.
    sequenceBeginsCorr = np.array(sequenceBegins)
    sequenceEndsCorr = np.array(sequenceEnds)
    pairs = zip(sequenceEnds, sequenceBegins[1:])
    for nextPos, (wordEnd, nextWordBegin) in enumerate(pairs, start=1):
        ### A word that ends after the next one begins marks a jump back in
        ### time: shift every later time stamp of the sequence by that end.
        ### The 0.021 margin is the original author's extra condition:
        ### sometimes wordEnd > nextWordBegin, but not because of the start
        ### of a new sentence.
        if wordEnd > nextWordBegin and abs(wordEnd - nextWordBegin) > 0.021:
            sequenceBeginsCorr[nextPos:] += wordEnd
            sequenceEndsCorr[nextPos:] += wordEnd

    ### CORRECTION 2
    ### Take the beginning of the first word of the sequence as time zero.
    timeZero = sequenceBegins[0]
    sequenceBeginsCorr -= timeZero
    sequenceEndsCorr -= timeZero

    ### collect the corrected values
    dataWords.extend(sequenceWords)
    dataPuncs.extend(sequencePuncs)
    dataBegins.extend(sequenceBeginsCorr)
    dataEnds.extend(sequenceEndsCorr)

### Output The Corrected File
with open(nameOutFile, "w") as outFile:
    for w, p, b, e in zip(dataWords, dataPuncs, dataBegins, dataEnds):
        outFile.write("\t".join((w, p, str(b), str(e))) + "\n")

In [7]:
### Check The Output File
### A word may end after the next word starts only when the two words belong
### to different sequences. Every other such case is counted as an issue.

with open(nameOutFile, "r") as f:
    lines = f.readlines()
numLines = len(lines)

lineIndexList = []
# lineIndex is the index of the second row of each consecutive pair
for lineIndex, (row, nextRow) in enumerate(zip(lines, lines[1:]), start=1):
    endTimeWord = float(row.split("\t")[3])
    startTimeNextWord = float(nextRow.split("\t")[2])
    # a multiple of sequenceSize means nextRow opens a new sequence: not an issue
    if endTimeWord > startTimeNextWord and lineIndex % sequenceSize != 0:
        lineIndexList.append(lineIndex)
print("Number of issues = ", len(lineIndexList))

Number of issues =  37100


In [8]:
### Obtain another output file where there is only SPACE and PERIOD.
### Every punctuation label other than PERIOD is rewritten as PERIOD; the
### remaining columns are copied through unchanged.

nameInpFile = "outFile_02.txt"
nameOutFile = "outFile_03.txt"
with open(nameInpFile, 'r') as inpFile:
    lines = inpFile.readlines()
numLines = len(lines)

# labels that are collapsed into PERIOD
labelsToPeriod = ("COMMA", "SEMICOLON", "COLON", "EXCLAMATION", "QUESTION")
with open(nameOutFile, 'w') as outFile:
    for line in lines:
        items = line.split("\t")
        label = "PERIOD" if items[1] in labelsToPeriod else items[1]
        # items[3] still carries the line's own newline, so none is added
        outFile.write("\t".join((items[0], label, items[2], items[3])))