<a href="https://colab.research.google.com/github/spicytigermeat/labbu/blob/main/MFA_TextGrid_%3E_Lab_with_Fixes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@markdown # Mount Drive and Setup Dependencies
#@markdown ---
from IPython.display import clear_output
from google.colab import drive
#@markdown <font size=-1.5> Uncheck if you do not want to mount your drive. (TextGrids are very small files and you should have no issue uploading them to the runtime.)
mount_drive = False #@param {type: 'boolean'}

if mount_drive:
  drive.mount('/content/drive')

#clone repo and install dependencies
!git clone https://github.com/spicytigermeat/labbu.git
!pip install -r "/content/labbu/requirements.txt"

clear_output()
print('Setup is complete!!')


In [None]:
#@markdown # Import "labbu" module, initialize specific functions and decompress TextGrids.
#@markdown ---
#@markdown <font size=-1.5> Define Language. If custom, upload a text file that looks like [this](https://github.com/spicytigermeat/cmu2dsdict/blob/main/phones_eng.txt). 'default' covers English and Japanese.
!cd labbu
import sys
import glob
import re
sys.path.append('/content/labbu')
import labbu

language = 'default' #@param ["default", "japanese", "custom"]
#@markdown <font size=-1.5> A zip/7z/rar of .TextGrid files from MFA.
textgrid_path = '' #@param {type: 'string'}
#@markdown <font size=-1.5> (This is optional, only if you're using a custom language.)
custom_lang_path = '' #@param {type: 'string'}

!mkdir '/content/textgrids'

#initialize labbu
if language == 'default':
	labu = labbu.labbu()
elif language == 'japanese':
	labu = labbu.labbu('japanese')
elif language == 'custom':
	labu = labbu.labbu(f"{custom_lang_path}")

#decompress textgrids
!7z e {textgrid_path} -o/content/textgrids

#turns short [t] or [d] between 2 vowels or after [r] into [dx]
def quick_dxer(labu):
	"""Relabel short [t]/[d] phones as [dx], in place.

	A phone is changed when it is 't' or 'd', the following phone is typed
	'vowel', the preceding phone is typed 'vowel' or 'r' (both per
	``labu.is_type``), and ``labu.get_pho_len`` for it falls inside
	``range(300000, 500000)``.

	labu: label object providing get_length, curr_phone, prev_phone,
	next_phone, is_type, get_pho_len and change_phone.
	"""
	# Length window for a "short" stop; units are whatever get_pho_len returns.
	dx_timing = range(300000, 500000)
	for idx in range(labu.get_length()):
		phone = labu.curr_phone(idx)
		before = labu.prev_phone(idx)
		after = labu.next_phone(idx)
		if phone not in ('t', 'd'):
			continue
		# NOTE(review): the [r] case goes through is_type rather than comparing
		# the phone directly -- confirm 'r' is a type is_type recognises.
		for before_kind in ('vowel', 'r'):
			if labu.is_type(before, before_kind) and labu.is_type(after, 'vowel') and labu.get_pho_len(idx) in dx_timing:
				labu.change_phone(idx, 'dx')

#merges [uh, r] into [er]
def fix_uh_r(labu):
	"""Merge every [uh] that is directly followed by [r] into a single [er].

	labu: label object providing get_length, curr_phone, next_phone and merge.
	The label is modified in place; nothing is returned.
	"""
	idx = 0
	# The length is re-read on every pass instead of being fixed up front:
	# a merge may shorten the label (presumably it folds idx+1 into idx --
	# TODO confirm), and iterating the old length would index past the end.
	while idx < labu.get_length():
		if labu.curr_phone(idx) == 'uh' and labu.next_phone(idx) == 'r':
			labu.merge(idx, 'er')
		idx += 1

#fixes [hh] preceding [w] in words like "what" [hh w ah t] > [w ah t]
def fix_hh_w(labu):
	"""Merge an [hh] into the [w] that follows it when a vowel comes next.

	For every index whose phone is 'hh', whose next phone is 'w' and where the
	phone after that is typed 'vowel' (per ``labu.is_type``), the entry is
	merged under the name 'w'. An IndexError raised while inspecting or
	merging an index is ignored and the scan moves on.

	labu: label object providing get_length, curr_phone, next_phone, is_type
	and merge. Modified in place.
	"""
	for idx in range(labu.get_length()):
		try:
			if labu.curr_phone(idx) != 'hh':
				continue
			if labu.next_phone(idx) != 'w':
				continue
			if labu.is_type(labu.next_phone(idx + 1), 'vowel'):
				labu.merge(idx, 'w')
		except IndexError:
			# Index ran past the end of the label; nothing to fix here.
			continue

#fixing "and [ah, n, d]" to be "and [ae, n]"
def fix_ah_n_d(labu):
	"""Rewrite every [ah n d] sequence as [ae n].

	When the phone at an index is 'n', the previous one is 'ah' and the next
	one is 'd', the entry is merged under the name 'n' and the preceding phone
	is renamed to 'ae'. An IndexError raised for an index is ignored and the
	scan moves on.

	labu: label object providing get_length, curr_phone, prev_phone,
	next_phone, merge and change_phone. Modified in place.
	"""
	for idx in range(labu.get_length()):
		try:
			is_and = (
				labu.curr_phone(idx) == 'n'
				and labu.prev_phone(idx) == 'ah'
				and labu.next_phone(idx) == 'd'
			)
			if not is_and:
				continue
			labu.merge(idx, 'n')
			labu.change_phone(idx - 1, 'ae')
		except IndexError:
			# Index ran past the end of the label; nothing to fix here.
			continue

#turns any [AH0] from the TextGrid into [ax]
def AH0_2_ax(labu):
	"""Rename every phone that is exactly 'AH0' to 'ax', in place.

	labu: label object providing get_length, curr_phone and change_phone.
	"""
	for idx in range(labu.get_length()):
		if labu.curr_phone(idx) != 'AH0':
			continue
		labu.change_phone(idx, 'ax')

#checks the label and normalizes time
def clean_and_check(labu):
	"""Run ``labu.check_label()`` and then ``labu.normalize_time()``.

	labu: label object providing check_label and normalize_time.
	"""
	for step in (labu.check_label, labu.normalize_time):
		step()

def jpnlbl2englbl(labu):
	"""Rename Japanese-style phone labels to the English-style set, in place.

	First applies a fixed, ordered list of ``labu.replace_all`` renames
	(pau > SP, br > AP, the vowels, N, j, t, h, cl, r). It then walks the
	label: phones typed 'palatal' are passed to ``labu.depalatize``, 'ts' is
	split into 't' + 's' and 'z' is split into 'd' + 'z'.

	labu: label object providing replace_all, get_length, curr_phone, is_type,
	depalatize and split_label.
	"""
	# Kept in the original order; replace_all is called once per pair.
	renames = (
		('pau', 'SP'),
		('br', 'AP'),
		('E', 'ee'),
		('A', 'aa'),
		('I', 'iy'),
		('U', 'ux'),
		('O', 'oo'),
		('a', 'aa'),
		('i', 'iy'),
		('e', 'ee'),
		('u', 'ux'),
		('o', 'oo'),
		('N', 'n'),
		('j', 'jh'),
		('t', 'tx'),
		('h', 'hh'),
		('cl', 'q'),
		('r', 'dx'),
	)
	for old_phone, new_phone in renames:
		labu.replace_all(old_phone, new_phone)

	# The length is re-read on every pass: depalatize/split_label may add
	# entries (assumed from their names -- TODO confirm), and a loop over the
	# starting length would stop before reaching the phones pushed to the end.
	idx = 0
	while idx < labu.get_length():
		if labu.is_type(labu.curr_phone(idx), 'palatal'):
			labu.depalatize(idx)
		if labu.curr_phone(idx) == 'ts':
			labu.split_label(idx, 't', 's')
		if labu.curr_phone(idx) == 'z':
			length_before = labu.get_length()
			labu.split_label(idx, 'd', 'z')
			# Step over whatever the split inserted so the new 'z' half is not
			# split again on the next pass (adds 0 if nothing was inserted).
			idx += labu.get_length() - length_before
		idx += 1

# Conversion
Only run one of the two cells below, depending on the language you are converting your TextGrids from!

In [None]:
import os
from pathlib import Path
#@markdown # English (Arpabet) TextGrid > Lab conversion with Cleanup
#@markdown ---
#@markdown <font size=-1.5> convert all [AH0] phonemes in textgrids to [ax]
AH02ax = True #@param {type: 'boolean'}
#@markdown <font size=-1.5> type of labels (only thing this changes is SP/AP vs pau/br)
output_type = 'diff' #@param ['diff', 'enunu']
#@markdown <font size=-1.5> merge any instance of [uh] with [r] and change to [er]
uhr2er = True #@param {type: 'boolean'}
#@markdown <font size=-1.5> turn any very short [t] or [d] between 2 vowels into [dx]
make_dx = True #@param {type: 'boolean'}
#@markdown <font size=-1.5> fix the weird [hh][w] thing (MFA turns words like 'what' into [hh w ah t])
fix_hhw = True #@param {type: 'boolean'}
#@markdown <font size=-1.5> Change any instance of [ah][n][d] with [ae][n]. MFA labels "and" as [ah n d] and it irks me.
fix_and = True #@param {type: 'boolean'}
#@markdown <font size=-1.5> Check labs and ensure timing is correct after conversion from TextGrid. This will not fix labels but will tell you if there are labels out of language and ensure no labels overlap.
check_labs_post = True #@param {type: 'boolean'}

# Create the output folder if it is missing; no error when it already exists.
os.makedirs('/content/outlabs', exist_ok=True)

for grid in glob.glob('/content/textgrids/*.TextGrid'):
  labu.load_lab_from_textgrid(grid)
  if AH02ax:
    AH0_2_ax(labu)

  labu.clean_all_phones()

  if output_type == 'diff':
    labu.enunu2diff()
    labu.fix_spap()
  elif output_type == 'enunu':
    labu.diff2enunu()

  # The uhr2er checkbox was previously read but never applied.
  if uhr2er:
    try:
      fix_uh_r(labu)
    except IndexError:
      # Same convention as fix_hh_w / fix_ah_n_d: merges can shorten the label
      # while the helper is still counting up to the old length, so an index
      # past the end only means the scan has finished.
      pass

  if make_dx:
    quick_dxer(labu)

  if fix_hhw:
    fix_hh_w(labu)

  if fix_and:
    fix_ah_n_d(labu)

  # Swap only the file extension. The old re.sub('.TextGrid', ...) treated the
  # dot as "any character" and rewrote every match in the path, mangling names
  # that contain "TextGrid" elsewhere.
  new_fn = str(Path(grid).with_suffix('.lab'))
  labu.export_lab(new_fn)
  # Report after the export so the message is only shown for written files.
  print(f"Wrote {new_fn}")
  labu.unload_lab()

# Move the labs next to the other outputs. os.replace overwrites an older copy
# like `mv` does, and is not affected by spaces in file names.
for lab in glob.glob('/content/textgrids/*.lab'):
  os.replace(lab, os.path.join('/content/outlabs', os.path.basename(lab)))

#check and clean the labs if it's selected :)
if check_labs_post:
  # Absolute path: the previous pattern lacked the leading slash, so it never
  # matched anything and the check was silently skipped.
  for lab in glob.glob('/content/outlabs/*.lab'):
    labu.load_lab(lab)
    # The helper defined earlier is named clean_and_check (not check_and_clean).
    clean_and_check(labu)
    labu.export_lab(lab)
    labu.unload_lab()

In [None]:
import os
from pathlib import Path
#@markdown # Japanese TextGrid > Lab conversion
#@markdown ---
#@markdown <font size=-1.5> type of labels (only thing this changes is SP/AP vs pau/br)
output_type = 'diff' #@param ['diff', 'enunu']
#@markdown <font size=-1.5> Check labs and ensure timing is correct after conversion from TextGrid. This will not fix labels but will tell you if there are labels out of language and ensure no labels overlap.
check_labs_post = True #@param {type: 'boolean'}

# NOTE(review): output_type is read from the form but nothing in this cell
# uses it, and jpnlbl2englbl is never called -- confirm whether the Japanese
# path should apply them.

# Start from an empty output folder. Done in Python so the first run (folder
# missing) and re-runs (folder present) both work without shell errors.
os.makedirs('/content/outlabs', exist_ok=True)
for stale in glob.glob('/content/outlabs/*'):
  if not os.path.isdir(stale):
    os.remove(stale)

for grid in glob.glob('/content/textgrids/*.TextGrid'):
  labu.load_lab_from_textgrid(grid)
  # Swap only the file extension. The old re.sub('.TextGrid', ...) treated the
  # dot as "any character" and rewrote every match in the path.
  new_fn = str(Path(grid).with_suffix('.lab'))
  labu.export_lab(new_fn)
  # Report after the export so the message is only shown for written files.
  print(f"Wrote {new_fn}")
  labu.unload_lab()

# Move the labs into the output folder; os.replace overwrites like `mv` does
# and is not affected by spaces in file names.
for lab in glob.glob('/content/textgrids/*.lab'):
  os.replace(lab, os.path.join('/content/outlabs', os.path.basename(lab)))

if check_labs_post:
  # Absolute path: the previous pattern lacked the leading slash, so it never
  # matched anything and the check was silently skipped.
  for lab in glob.glob('/content/outlabs/*.lab'):
    labu.load_lab(lab)
    # The helper defined earlier is named clean_and_check (not check_and_clean).
    clean_and_check(labu)
    labu.export_lab(lab)
    labu.unload_lab()


# Compress and Export

In [None]:
#@markdown # Compress Labs and export to drive
#@markdown ---
#@markdown <font size=-1.5> Choose Destination for zip folder of labs
export_path = '/content' #@param {type: 'string'}
!7z a 'conv_labs.zip' /content/outlabs -o{export_path}
print(f"Exported compressed .lab's to '{export_path}/conv_labs.zip'")
