In [36]:
# textgrid_short2dataframe
# H. Muller
# 2023-08-04

# Define functions

In [37]:
import tgt

# open file
def inputtextlines(filename):
    textgrid_short = tgt.io.read_textgrid(filename, encoding='latin-1', include_empty_intervals=True)
    textgrid_long = tgt.io.export_to_long_textgrid(textgrid_short)
    linelist = textgrid_long.split('\n')
    return(linelist)

In [38]:
# Conversion routines
def converttextgrid2list(textgridlines,textgridname):

    data = []

    newtier = False
    for line in textgridlines[9:]:
        line = re.sub('\n','',line)
        line = re.sub('\t','',line)
        line = re.sub('^ *','',line)
        linepair = line.split(' = ')
        if len(linepair) == 2:
            if linepair[0] == 'class':
                classname = linepair[1].strip().strip('\"')
            if linepair[0] == 'name':
                tiername = linepair[1].strip().strip('\"')
            if linepair[0] == 'xmin':
                xmin = float(linepair[1])
            if linepair[0] == 'xmax':
                xmax = float(linepair[1])
            if linepair[0] == 'text':
                text = linepair[1].strip().strip('\"')
                diff = xmax-xmin
                data.append([textgridname, classname, tiername, text, xmin, xmax, diff])
                
    return(data)

# Load modules and data

In [39]:
import sys, re, os
import pandas as pd

In [40]:
# Parse arguments
if os.environ.get('inputfiles'):
    inputPath = os.environ.get('inputfiles')
else:
    inputPath = '../DataProcessed/CGN_alignment_comp-o_nl/'
    
if os.environ.get('outputpath'):
    outputPath = os.environ.get('outputpath')
else:
    outputPath = '../DataProcessed/cgn_alignments_comp-o_nl.pkl'

In [41]:
# get all files
onlyfiles = [os.path.join(inputPath,f) for f in os.listdir(inputPath) if os.path.isfile(os.path.join(inputPath, f))]
onlyfiles[:10]

['../DataProcessed/CGN_alignment_comp-o_nl/fn001092.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001093.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001094.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001095.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001096.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001097.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001098.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001099.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001100.awd',
 '../DataProcessed/CGN_alignment_comp-o_nl/fn001101.awd']

In [42]:
# set run to True to use subset for faster development
RUN=False
if RUN==True:
    onlyfiles = onlyfiles[:5]

# Parse textgrids

In [43]:
data = []
for filePath in onlyfiles:
    
    fileName = os.path.split(filePath)[1]
    textgrid = inputtextlines(filePath)
    data.extend(converttextgrid2list(textgrid, fileName))

df = pd.DataFrame(data, columns =['FileName', 'TierType', 'TierName', 'Label', 'Start', 'End', 'Duration'])
df['FileNameTierName'] = df.FileName + df.TierName
display(df.head())
display(df.tail())

Unnamed: 0,FileName,TierType,TierName,Label,Start,End,Duration,FileNameTierName
0,fn001092.awd,IntervalTier,N00551,,0.0,0.922,0.922,fn001092.awdN00551
1,fn001092.awd,IntervalTier,N00551,ik,0.922,1.087,0.165,fn001092.awdN00551
2,fn001092.awd,IntervalTier,N00551,vertelde,1.087,1.529,0.442,fn001092.awdN00551
3,fn001092.awd,IntervalTier,N00551,meneer,1.529,1.786,0.257,fn001092.awdN00551
4,fn001092.awd,IntervalTier,N00551,Wild,1.786,2.125,0.339,fn001092.awdN00551


Unnamed: 0,FileName,TierType,TierName,Label,Start,End,Duration,FileNameTierName
3647892,fn001091.awd,IntervalTier,N00551_SEG,l,231.015,231.086,0.071,fn001091.awdN00551_SEG
3647893,fn001091.awd,IntervalTier,N00551_SEG,E+,231.086,231.238,0.152,fn001091.awdN00551_SEG
3647894,fn001091.awd,IntervalTier,N00551_SEG,k,231.238,231.339,0.101,fn001091.awdN00551_SEG
3647895,fn001091.awd,IntervalTier,N00551_SEG,@,231.339,231.399,0.06,fn001091.awdN00551_SEG
3647896,fn001091.awd,IntervalTier,N00551_SEG,,231.399,231.5,0.101,fn001091.awdN00551_SEG


In [44]:
# extract relevant tiernames
fileTierNames = df.FileNameTierName.drop_duplicates().to_list()
fileTierNames = [fileTier for fileTier in fileTierNames if "_" not in fileTier]
fileTierNames[:10]

['fn001092.awdN00551',
 'fn001093.awdN00552',
 'fn001094.awdN00552',
 'fn001095.awdN00554',
 'fn001096.awdN00554',
 'fn001097.awdN00555',
 'fn001098.awdN00555',
 'fn001099.awdN00557',
 'fn001100.awdN00557',
 'fn001101.awdN00558']

In [45]:
# set up list to store results and index to iterate through labels
data = []

# for each tier
for fileTier in fileTierNames:
    labelIndex = 1
    dfLabel = df.loc[df.FileNameTierName==fileTier,]
    dfFon = df.loc[df.FileNameTierName==fileTier+'_FON',]
    dfSeg = df.loc[df.FileNameTierName==fileTier+'_SEG',]
    
    # align segments to labels 
    for index, row in dfSeg.iterrows():
    
        # extract segment with start and end value
        Seg = row['Label']
        start = row['Start']
        end = row['End']
    
        # extract corresponding label (=word)
        label = dfLabel.iloc[labelIndex, dfLabel.columns.get_loc('Label')]
        trans = dfFon.iloc[labelIndex, dfFon.columns.get_loc('Label')]
        labelStart = dfLabel.iloc[labelIndex, dfLabel.columns.get_loc('Start')]
        labelEnd = dfLabel.iloc[labelIndex, dfLabel.columns.get_loc('End')]
        ID = str(labelIndex) + '-' + label
    
        # append everything to list
        data.append([fileTier, ID, label, labelStart, labelEnd, trans, Seg, start, end])
    
        # check if segment matches last segment of label or
        # is smaller (CGN does not perfectly align word boundaries with phone boundaries)
        if labelEnd <= end:
            labelIndex += 1
    
results = pd.DataFrame(data, columns =['FileNameTierName', 'ID', 'WordOrtho', 'WordStart', 'WordEnd', 'WordPhono', 'Phone', 
                    'PhoneStart', 'PhoneEnd',])
results[:20]

Unnamed: 0,FileNameTierName,ID,WordOrtho,WordStart,WordEnd,WordPhono,Phone,PhoneStart,PhoneEnd
0,fn001092.awdN00551,1-ik,ik,0.922,1.087,Ik,,0.0,0.922
1,fn001092.awdN00551,1-ik,ik,0.922,1.087,Ik,I,0.922,1.025
2,fn001092.awdN00551,1-ik,ik,0.922,1.087,Ik,k,1.025,1.087
3,fn001092.awdN00551,2-vertelde,vertelde,1.087,1.529,f@rtEld@,f,1.087,1.138
4,fn001092.awdN00551,2-vertelde,vertelde,1.087,1.529,f@rtEld@,@,1.138,1.2
5,fn001092.awdN00551,2-vertelde,vertelde,1.087,1.529,f@rtEld@,r,1.2,1.23
6,fn001092.awdN00551,2-vertelde,vertelde,1.087,1.529,f@rtEld@,t,1.23,1.302
7,fn001092.awdN00551,2-vertelde,vertelde,1.087,1.529,f@rtEld@,E,1.302,1.395
8,fn001092.awdN00551,2-vertelde,vertelde,1.087,1.529,f@rtEld@,l,1.395,1.436
9,fn001092.awdN00551,2-vertelde,vertelde,1.087,1.529,f@rtEld@,d,1.436,1.477


In [46]:
results.loc[results.WordOrtho=='boodschappenlijstje', ]

Unnamed: 0,FileNameTierName,ID,WordOrtho,WordStart,WordEnd,WordPhono,Phone,PhoneStart,PhoneEnd


In [47]:
results.to_pickle(outputPath)

Load results into environment with:

`results = pd.read_pickle('../DataProcessed/cgn_alignments_dataframe.pkl')`  