In [37]:
from datasets import Dataset
from pympi import Elan
import os
from glob import glob
import pandas as pd
os.chdir('/Users/markjos/projects/malachor5')
import sys
sys.path.append('scripts')
from eval import get_word_language

In [38]:
eafs = glob('meta/*.eaf')
eafs

['meta/HH20240424.eaf',
 'meta/HH20210312.eaf',
 'meta/HH20210913.eaf',
 'meta/HH20220327-2.eaf']

In [39]:
def convert_eaf_to_pandas(eaf_path):
    eaf = Elan.Eaf(eaf_path)
    data = []
    for tier in eaf.get_tier_names():
        if len(tier)!=3 and not tier.isupper():
            continue
        for annotation in eaf.get_annotation_data_for_tier(tier):
            data.append({
                'tier': tier,
                'file': eaf_path,
                'start': annotation[0],
                'end': annotation[1],
                'duration': annotation[1]-annotation[0],
                'value': annotation[2]
            })
    return pd.DataFrame(data)

In [72]:
df_list=[]
for eaf_fp in eafs:
    # skip noisy recording
    if '20240424' in eaf_fp:
        continue
    df_list.append(convert_eaf_to_pandas(eaf_fp))
df = pd.concat(df_list)
df['duration_min']=df['duration']/60000
df.head()

Unnamed: 0,tier,file,start,end,duration,value,duration_min
0,SHA,meta/HH20210312.eaf,10340,13050,2710,"Alright, we got it set up.",0.045167
1,SHA,meta/HH20210312.eaf,23170,26460,3290,Ok we ready? So you can stop sharing.,0.054833
2,SHA,meta/HH20210312.eaf,44260,64300,20040,"Now I just have to, alright, so now I'm re-rec...",0.334
3,SHA,meta/HH20210312.eaf,72560,72780,220,-pulled them,0.003667
4,SHA,meta/HH20210312.eaf,96470,98280,1810,"And can you add ""ùnɛ́ɾɛ́"" after it?",0.030167


In [73]:
get_pct_lang = lambda s, lang: [get_word_language(w) for w in s.split()].count(lang)/len(s.split()) if s else 0
get_pct_tira = lambda s: get_pct_lang(s, 'tira')
get_pct_eng = lambda s: get_pct_lang(s, 'eng')
get_pct_misc = lambda s: get_pct_lang(s, 'misc')

df['pct_tira']=df['value'].apply(get_pct_tira)
df['pct_eng']=df['value'].apply(get_pct_eng)
df['pct_misc']=df['value'].apply(get_pct_misc)

In [74]:
df['codeswitched'] = True
df.loc[
    (df['pct_tira']==0)|(df['pct_eng']==0), 'codeswitched'
] = False
df['matrix_language'] = 'eng'
df.loc[
    (df['pct_tira']>df['pct_eng']), 'matrix_language'
] = 'tira'

In [75]:
df['codeswitched'].value_counts()

codeswitched
False    1673
True      334
Name: count, dtype: int64

In [76]:
df.pivot_table(index=['matrix_language', 'codeswitched'], aggfunc='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,duration_min,end,file,pct_eng,pct_misc,pct_tira,start,tier,value
matrix_language,codeswitched,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
eng,False,1375,1375,1375,1375,1375,1375,1375,1375,1375,1375
eng,True,274,274,274,274,274,274,274,274,274,274
tira,False,298,298,298,298,298,298,298,298,298,298
tira,True,60,60,60,60,60,60,60,60,60,60


In [79]:
df.pivot_table(
    index=['file',],
    values=['pct_eng', 'pct_tira', 'duration_min'],
    aggfunc={'pct_eng': 'mean', 'pct_tira': 'mean', 'duration_min': 'sum'},
)

Unnamed: 0_level_0,duration_min,pct_eng,pct_tira
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
meta/HH20210312.eaf,21.040517,0.685613,0.123367
meta/HH20210913.eaf,22.1731,0.725026,0.233066
meta/HH20220327-2.eaf,35.3138,0.712899,0.228862
