In [1]:
# textgrid_short2dataframe
# H. Muller
# 2023-08-04

# Define functions

In [2]:
import tgt

# open file
def inputtextlines(filename):
    textgrid_short = tgt.io.read_textgrid(filename, encoding='latin-1', include_empty_intervals=True)
    textgrid_long = tgt.io.export_to_long_textgrid(textgrid_short)
    linelist = textgrid_long.split('\n')
    return(linelist)

In [3]:
# Conversion routines
def converttextgrid2list(textgridlines,textgridname):

    data = []

    newtier = False
    for line in textgridlines[9:]:
        line = re.sub('\n','',line)
        line = re.sub('\t','',line)
        line = re.sub('^ *','',line)
        linepair = line.split(' = ')
        if len(linepair) == 2:
            if linepair[0] == 'class':
                classname = linepair[1].strip().strip('\"')
            if linepair[0] == 'name':
                tiername = linepair[1].strip().strip('\"')
            if linepair[0] == 'xmin':
                xmin = float(linepair[1])
            if linepair[0] == 'xmax':
                xmax = float(linepair[1])
            if linepair[0] == 'text':
                text = linepair[1].strip().strip('\"')
                diff = xmax-xmin
                data.append([textgridname, classname, tiername, text, xmin, xmax, diff])
                
    return(data)

# Load modules and data

In [4]:
import sys, re, os
import pandas as pd

In [5]:
# Parse arguments
if os.environ.get('inputfiles'):
    inputPath = os.environ.get('inputfiles')
else:
    inputPath = '../DataProcessed/CGN_alignment_comp-o_nl/'
    
if os.environ.get('outputpath'):
    outputPath = os.environ.get('outputpath')
else:
    outputPath = '../DataProcessed/cgn_alignments_comp-o_nl.pkl'

In [6]:
# get all files
onlyfiles = [os.path.join(inputPath,f) for f in os.listdir(inputPath) if os.path.isfile(os.path.join(inputPath, f))]
onlyfiles[:10]

['../DataProcessed/CGN_alignment_comp-o_vl/fv800097.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800098.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800101.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800102.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800103.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800104.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800105.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800106.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800107.awd',
 '../DataProcessed/CGN_alignment_comp-o_vl/fv800108.awd']

In [7]:
# set run to True to use subset for faster development
RUN=False
if RUN==True:
    onlyfiles = onlyfiles[:5]

# Parse textgrids

In [8]:
data = []
for filePath in onlyfiles:
    
    fileName = os.path.split(filePath)[1]
    textgrid = inputtextlines(filePath)
    data.extend(converttextgrid2list(textgrid, fileName))

df = pd.DataFrame(data, columns =['FileName', 'TierType', 'TierName', 'Label', 'Start', 'End', 'Duration'])
df['FileNameTierName'] = df.FileName + df.TierName
display(df.head())
display(df.tail())

Unnamed: 0,FileName,TierType,TierName,Label,Start,End,Duration,FileNameTierName
0,fv800097.awd,IntervalTier,V80010,,0.0,0.803,0.803,fv800097.awdV80010
1,fv800097.awd,IntervalTier,V80010,Falcone,0.803,1.434,0.631,fv800097.awdV80010
2,fv800097.awd,IntervalTier,V80010,was,1.434,1.631,0.197,fv800097.awdV80010
3,fv800097.awd,IntervalTier,V80010,toen,1.631,1.825,0.194,fv800097.awdV80010
4,fv800097.awd,IntervalTier,V80010,nog,1.825,2.007,0.182,fv800097.awdV80010


Unnamed: 0,FileName,TierType,TierName,Label,Start,End,Duration,FileNameTierName
2391556,fv800096.awd,IntervalTier,V80010_SEG,s,117.06,117.2,0.14,fv800096.awdV80010_SEG
2391557,fv800096.awd,IntervalTier,V80010_SEG,i,117.2,117.28,0.08,fv800096.awdV80010_SEG
2391558,fv800096.awd,IntervalTier,V80010_SEG,p,117.28,117.34,0.06,fv800096.awdV80010_SEG
2391559,fv800096.awd,IntervalTier,V80010_SEG,@,117.34,117.444,0.104,fv800096.awdV80010_SEG
2391560,fv800096.awd,IntervalTier,V80010_SEG,,117.444,117.88,0.436,fv800096.awdV80010_SEG


In [9]:
# extract relevant tiernames
fileTierNames = df.FileNameTierName.drop_duplicates().to_list()
fileTierNames = [fileTier for fileTier in fileTierNames if "_" not in fileTier]
fileTierNames[:10]

['fv800097.awdV80010',
 'fv800098.awdV80010',
 'fv800101.awdV80011',
 'fv800102.awdV80011',
 'fv800103.awdV80011',
 'fv800104.awdV80011',
 'fv800105.awdV80011',
 'fv800106.awdV80011',
 'fv800107.awdV80011',
 'fv800108.awdV80011']

In [10]:
# set up list to store results and index to iterate through labels
data = []

# for each tier
for fileTier in fileTierNames:
    labelIndex = 1
    dfLabel = df.loc[df.FileNameTierName==fileTier,]
    dfFon = df.loc[df.FileNameTierName==fileTier+'_FON',]
    dfSeg = df.loc[df.FileNameTierName==fileTier+'_SEG',]
    
    # align segments to labels 
    for index, row in dfSeg.iterrows():
    
        # extract segment with start and end value
        Seg = row['Label']
        start = row['Start']
        end = row['End']
    
        # extract corresponding label (=word)
        label = dfLabel.iloc[labelIndex, dfLabel.columns.get_loc('Label')]
        trans = dfFon.iloc[labelIndex, dfFon.columns.get_loc('Label')]
        labelStart = dfLabel.iloc[labelIndex, dfLabel.columns.get_loc('Start')]
        labelEnd = dfLabel.iloc[labelIndex, dfLabel.columns.get_loc('End')]
        ID = str(labelIndex) + '-' + label
    
        # append everything to list
        data.append([fileTier, ID, label, labelStart, labelEnd, trans, Seg, start, end])
    
        # check if segment matches last segment of label or
        # is smaller (CGN does not perfectly align word boundaries with phone boundaries)
        if labelEnd <= end:
            labelIndex += 1
    
results = pd.DataFrame(data, columns =['FileNameTierName', 'ID', 'WordOrtho', 'WordStart', 'WordEnd', 'WordPhono', 'Phone', 
                    'PhoneStart', 'PhoneEnd',])
results[:20]

Unnamed: 0,FileNameTierName,ID,WordOrtho,WordStart,WordEnd,WordPhono,Phone,PhoneStart,PhoneEnd
0,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,,0.0,0.803
1,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,f,0.803,0.88
2,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,A,0.88,0.94
3,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,l,0.94,1.01
4,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,k,1.01,1.08
5,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,o,1.08,1.21
6,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,w,1.21,1.24
7,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,n,1.24,1.28
8,fv800097.awdV80010,1-Falcone,Falcone,0.803,1.434,fAlkowne,e,1.28,1.434
9,fv800097.awdV80010,2-was,was,1.434,1.631,wAs,w,1.434,1.48


In [11]:
results.loc[results.WordOrtho=='boodschappenlijstje', ]

Unnamed: 0,FileNameTierName,ID,WordOrtho,WordStart,WordEnd,WordPhono,Phone,PhoneStart,PhoneEnd
1165898,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,b,58.618,58.68
1165899,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,o,58.68,58.76
1165900,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,t,58.76,58.8
1165901,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,s,58.8,58.86
1165902,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,x,58.86,58.92
1165903,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,A,58.92,58.97
1165904,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,p,58.97,59.09
1165905,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,@,59.09,59.12
1165906,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,l,59.12,59.19
1165907,fv801246.awdV80125,175-boodschappenlijstje,boodschappenlijstje,58.618,59.515,botsxAp@lE+stS@,E+,59.19,59.31


In [12]:
results.to_pickle(outputPath)

Load results into environment with:

`results = pd.read_pickle('../DataProcessed/cgn_alignments_dataframe.pkl')`  