# Process File Of Aligned Transcripts

In [1]:
import json
import numpy as np
import copy

In [2]:
### punctuation encoder
# Lookup table: punctuation character -> label name.
punEnc = {
    ",": "COMMA", ".": "PERIOD",
    ";": "SEMICOLON", ":": "COLON",
    "!": "EXCLAMATION", "?": "QUESTION",
}

In [3]:
### Extract All The Words + Words Begins + Words Ends
### The per-entry lists of the JSON file are concatenated, in key order, into
### three parallel lists.

fileName = "AudioFeatures/aligned_transcripts.json"

with open(fileName, "r") as inpFile:
    data = json.load(inpFile)

listDictio = list(data.keys())
numDictio = len(listDictio)

listAllWords = []
listAllBegins = []
listAllEnds = []
for key in listDictio:
    entry = data[key]
    listAllWords.extend(entry['words'])
    listAllBegins.extend(entry['words_begins'])
    listAllEnds.extend(entry['words_ends'])

# the three counts are expected to match (one begin and one end per word)
print("Number Of Words = ", len(listAllWords))
print("Number Of Begins = ", len(listAllBegins))
print("Number Of Ends = ", len(listAllEnds))

Number Of Words =  1671024
Number Of Begins =  1671024
Number Of Ends =  1671024


In [4]:
### Write The Output File
### One row per word:  word <TAB> label <TAB> begin <TAB> end
### where label is the name (see punEnc) of the punctuation attached to the
### word, or "SPACE" when there is none. Tokens that are themselves a
### punctuation character do not get a row of their own.

nameOutFile = "AudioFeatures/outFile_00.txt"


def _encodeWord(word, nextWord):
    """Return the (text, label) pair to write for one token.

    word     -- the current token
    nextWord -- the token that follows it, or None for the last token

    Returns None when `word` is itself a punctuation token: its label has
    already been attached to the word before it, so no row is written.
    """
    if word in punEnc:
        return None
    # punctuation given as a separate token right after the word
    if nextWord in punEnc:
        return word.lower(), punEnc[nextWord]
    # punctuation glued to the end of the word; the length test guards every
    # punctuation character, so an empty token falls through to "SPACE"
    # instead of failing on word[-1]
    if len(word) > 1 and word[-1] in punEnc:
        return word[:-1].lower(), punEnc[word[-1]]
    return word.lower(), "SPACE"


numWords = len(listAllWords)
with open(nameOutFile, "w") as outFile:
    for i in range(numWords):
        # the last token has no successor
        nextWord = listAllWords[i + 1] if i + 1 < numWords else None
        encoded = _encodeWord(listAllWords[i], nextWord)
        if encoded is None:
            continue
        text, label = encoded
        begin = str(listAllBegins[i])
        end = str(listAllEnds[i])
        outFile.write(text + "\t" + label + "\t" + begin + "\t" + end + "\n")

In [5]:
### Correct The Output File
### Eliminate Words Where Begin Time And/Or End Time Is 'None'

nameInpFile = "AudioFeatures/outFile_00.txt"
nameOutFile = "AudioFeatures/outFile_01.txt"

# Copy every row whose text does not contain 'None'; all other rows are dropped.
with open(nameInpFile, "r") as inpFile, open(nameOutFile, "w") as outFile:
    outFile.writelines(row for row in inpFile if 'None' not in row)

In [6]:
### Correct The Output File
### Sometimes a word begins slightly before the previous word ends. When the
### overlap is smaller than 0.03 the begin time is moved to the end time of the
### previous word; larger overlaps are left untouched.

nameInpFile = "AudioFeatures/outFile_01.txt"
nameOutFile = "AudioFeatures/outFile_02.txt"

with open(nameInpFile, "r") as inpFile:
    lines = inpFile.readlines()

rowFormat = "{:15s} \t {:13s} \t {:7.4f} \t {:7.4f} \n"

with open(nameOutFile, "w") as outFile:
    # the first row has no predecessor: it is only re-formatted
    fields = lines[0].split("\t")
    prevWordEnd = float(fields[3])
    outFile.write(rowFormat.format(fields[0], fields[1], float(fields[2]), prevWordEnd))
    for row in lines[1:]:
        fields = row.split("\t")
        wordBeg = float(fields[2])
        wordEnd = float(fields[3])
        delta = wordBeg - prevWordEnd
        if -0.03 < delta < .0:
            wordBeg = prevWordEnd
        outFile.write(rowFormat.format(fields[0], fields[1], wordBeg, wordEnd))
        # the comparison for the next row uses the end time as read from the input
        prevWordEnd = wordEnd

In [7]:
### Correct The Output File
### The rows are processed in consecutive sequences of sequenceSize rows (the
### last sequence holds whatever rows are left) and the time-stamps of each
### sequence are corrected independently.

nameInpFile = "AudioFeatures/outFile_02.txt"
nameOutFile = "AudioFeatures/outFile_03.txt"

sequenceSize = 32

with open(nameInpFile, "r") as inpFile:
    lines = inpFile.readlines()
numLine = len(lines)

dataWords = []
dataPuncs = []
dataBegins = []
dataEnds = []

for start in range(0, numLine, sequenceSize):
    # slicing stops at the end of the list, so the last sequence may be shorter
    fields = [row.split("\t") for row in lines[start:start + sequenceSize]]
    sequenceWords = [f[0] for f in fields]
    sequencePuncs = [f[1] for f in fields]
    sequenceBegins = [float(f[2]) for f in fields]
    sequenceEnds = [float(f[3]) for f in fields]

    # corrections are accumulated in arrays; the lists keep the values as read
    sequenceBeginsCorr = np.asarray(sequenceBegins)
    sequenceEndsCorr = np.asarray(sequenceEnds)

    ### CORRECTION 1
    ### A word ending after the next word begins: shift every following
    ### time-stamp of the sequence by the end time of that word.
    for k in range(len(fields) - 1):
        wordEnd = sequenceEnds[k]
        if wordEnd > sequenceBegins[k + 1]:
            sequenceBeginsCorr[k + 1:] += wordEnd
            sequenceEndsCorr[k + 1:] += wordEnd

    ### CORRECTION 2
    ### Set beginning of first word in the sequence as time zero.
    sequenceBeginsCorr -= sequenceBegins[0]
    sequenceEndsCorr -= sequenceBegins[0]

    ### collect the corrected values
    dataWords.extend(sequenceWords)
    dataPuncs.extend(sequencePuncs)
    dataBegins.extend(sequenceBeginsCorr)
    dataEnds.extend(sequenceEndsCorr)

### Output The Corrected File
with open(nameOutFile, "w") as outFile:
    for word, punc, wordBeg, wordEnd in zip(dataWords, dataPuncs, dataBegins, dataEnds):
        outFile.write("{:15s} \t {:13s} \t {:7.4f} \t {:7.4f} \n".format(word, punc, wordBeg, wordEnd))

In [8]:
# ### Check The Output File
# ### endTimeWord > startTimeNextWord should occur only when the two words belong to different sequences.

# nameInpFile = "AudioFeatures/outFile_03.txt"

# lineIndexList = []
# with open(nameInpFile, "r") as f:
#     lines = f.readlines()
# numLines = len(lines)
# for i in range(numLines-1):
#     items = lines[i].split("\t")
#     nextItems = lines[i+1].split("\t")
#     endTimeWord = float(items[3])
#     startTimeNextWord = float(nextItems[2])
#     if endTimeWord > startTimeNextWord:
#         lineIndex = i+1
#         if (lineIndex % sequenceSize) == 0:
#             continue
#         else:
#             lineIndexList.append(lineIndex)
# print("Number of issues = ", len(lineIndexList))

In [9]:
### Obtain another output file where there is only SPACE and PERIOD.

nameInpFile = "AudioFeatures/outFile_03.txt"
nameOutFile = "AudioFeatures/outFile_04.txt"

with open(nameInpFile, 'r') as inpFile:
    lines = inpFile.readlines()

# every label in this set is rewritten as PERIOD; any other label is kept
toPeriod = {"COMMA", "SEMICOLON", "COLON", "EXCLAMATION", "QUESTION"}
rowFormat = "{:15s} \t {:13s} \t {:7.4f} \t {:7.4f} \n"

with open(nameOutFile, 'w') as outFile:
    for row in lines:
        fields = row.split("\t")
        # strip the padding around the two text columns before re-formatting
        word = fields[0].strip()
        punc = fields[1].strip()
        wordBeg = float(fields[2])
        wordEnd = float(fields[3])
        label = "PERIOD" if punc in toPeriod else punc
        outFile.write(rowFormat.format(word, label, wordBeg, wordEnd))

In [10]:
### Add two columns:
###     . gap from previous word
###     . cumulative gap

nameInpFile = "AudioFeatures/outFile_04.txt"
nameOutFile = "AudioFeatures/outFile_05.txt"

with open(nameInpFile, "r") as inpFile:
    lines = inpFile.readlines()

numLines = len(lines)

# first row of out: there is no previous word, so both gaps are zero
row = lines[0].split("\t")
row[2:4] = float(row[2]), float(row[3])
row += [.0, .0]  # gap, cumulative gap
out = [row]

for rawLine in lines[1:]:
    row = rawLine.split("\t")
    wordBeg = float(row[2])
    wordEnd = float(row[3])
    prevRow = out[-1]
    gap = wordBeg - float(prevRow[3])
    if gap < .0:
        # the word begins before the previous one ends (in between two
        # sequences): both columns restart from zero
        gap = .0
        cumGap = .0
    else:
        cumGap = float(prevRow[-1]) + gap
    row[2:4] = wordBeg, wordEnd
    row += [gap, cumGap]
    out.append(row)

### write output file
rowFormat = "{:15s} \t {:13s} \t {:7.4f} \t {:7.4f} \t {:7.4f} \t {:7.4f} \n"
with open(nameOutFile, "w") as outFile:
    for row in out:
        outFile.write(rowFormat.format(row[0], row[1], row[2], row[3], row[4], row[5]))