In [1]:
# Import modules
import os, subprocess, shutil, re, csv, sys, importlib
import pandas as pd
# Turn Copy-On-Write on
pd.options.mode.copy_on_write = True
import numpy as np

# For creating textgrids
from praatio import textgrid

# For moving files concurrently
from concurrent.futures import ThreadPoolExecutor

# Import Path
from pathlib import Path

# Import functions from vxc_processing.py
import vxc_processing as vxcproc


In [2]:
###################################### Directories ################################################

# Root directory of the data downloaded from Common Voice. Each language lives in its own
# sub-folder named '<lang_code>_v<cv_version>' (that is how language_dir is built further down).
# NO (BACK)SLASH at the end!!!
commonVoice_dir = '/Users/miaozhang/Research/CorpusPhon/CorpusData/CommonVoice' 

# To use XPF as the G2P engine to process the lexicon, download the XPF data from
# https://github.com/CohenPr-XPF/XPF/tree/master/Data and save it on your computer.
# Specify the directory where your XPF data is saved.
# NO (BACK)SLASH at the end!!!
xpf_dir = '/Users/miaozhang/Research/CorpusPhon/CorpusData/G2P/XPF' 

######################### Language name/code and Common Voice version ##############################

# Language-related variables.
# The Common Voice code of the language. Unfortunately, Common Voice mixes ISO 639-3 and ISO 639-1
# codes (they use BCP 47 codes). This code is also used in XPF.
# The code must match the one used in the name of the folder you downloaded from Common Voice;
# it is also the value looked up in the 'code_cv' column of the tracking file.
lang_code = 'ca' 

# The version of the data in Common Voice.
# Only numbers, but written as a string: it is concatenated into the folder name.
cv_version = '17' 

######################### G2P ######################################################################

# Specify the G2P engine. Only these keywords are acceptable:
# 'xpf' for XPF
# 'epi' for Epitran
# 'chr' for Charsiu
# 'mfa' for MFA
# 'vxc' for a self-defined lexicon
g2p = 'vxc'

######################### The delimiter ############################################################

# Selects the path separator used to build every path below: 1 gives '/', any other value gives '\\'.
# Set it to 0 if you use a Windows machine.
if_mac = 1 

######################### What writing system is the language using? ###############################

# Specify if the language is Chinese/Japanese/Korean (1) or not (0).
# NOTE(review): not referenced in this part of the notebook; presumably read by later cells - confirm.
if_cjk = 0

######################### Using existing model? ###############################

# Set to 1 to use an existing acoustic model: acs_mod_path is then overwritten with the
# hard-coded path given near the end of this cell.
if_self_mod = 1

######################### Using existing lexicon? ###############################

# Set to 1 to use an existing lexicon: dict_file_path is then overwritten with the
# hard-coded path given near the end of this cell.
if_self_lex = 1

###################### G2P settings (XPF or Epitran) ################################################

# The CSV file with the per-language information. Rows are matched on the 'code_cv' column and the
# 'name_xpf' / 'code_chr' columns are read from the matching row. This is a file name, not a folder.
cv_tracking_file = 'VoxCommunis_Info.csv'

# Get the G2P processing code for the language.
# Only XPF and Charsiu need a lookup in the tracking file. The membership test replaces
# `g2p == 'xpf' or 'chr'`, which was always true (the non-empty string 'chr' is truthy), so the
# file used to be opened and indexed for every engine. As a consequence lang_row is now only
# defined when g2p is 'xpf' or 'chr'.
if g2p in ('xpf', 'chr'):
    # newline='' is what the csv module expects for files handed to a reader.
    with open(cv_tracking_file, 'r', newline='') as f:
        reader = csv.DictReader(f)
        # First row whose Common Voice code matches, or None when the language is not listed.
        lang_row = next((row for row in reader if row['code_cv'] == lang_code), None)
    if lang_row is None:
        raise ValueError(
            f"No row with code_cv == {lang_code!r} was found in {cv_tracking_file!r}."
        )
    # If you are using XPF, get the name of the language in the XPF corpus (spaces removed).
    if g2p == 'xpf':
        lang_name = lang_row['name_xpf'].replace(' ', '')
    # If you are using Charsiu, get the processing code for the language in Charsiu.
    else:
        code_chr = lang_row['code_chr']
elif g2p == 'epi':
    # If you are using Epitran, refer to the tracking file to get the processing code of the language.
    # !!!Do this manually: depending on the orthography, the Epitran code can differ!!!
    epi_code = 'ukr-Cyrl'


# Sub-version of the corpus. Leave if_subversion at 0 (the default) when no sub-version is used.
if_subversion = 0
# Suffix inserted into the acoustic model name when if_subversion is not 0.
# Ignore this if you don't have a sub-version of the corpus.
subversion = '_' + 'sub3'

###################################################################################################

# Path separator used to assemble every path in this notebook: '/' on a Mac, '\' on Windows.
# Default Praat locations, kept for reference (not used here):
#   Mac:     /Applications/Praat.app/Contents/MacOS/Praat
#   Windows: C:\Program Files\Praat.exe
path_sep = '/' if if_mac == 1 else '\\'

# The folder for the language: <commonVoice_dir>/<lang_code>_v<cv_version>
language_dir = path_sep.join((commonVoice_dir, lang_code + '_v' + cv_version))

# The file that contains the duration of each clip, inside the language folder.
clip_info_path = path_sep.join((language_dir, 'clip_durations.tsv'))

# MFA paths (NO (BACK)SLASH at the end of either one!!!)
# - the folder of the OOV word files
mfa_oov_path = '/Users/miaozhang/Documents/MFA/validated'
# - the folder where the acoustic model is saved once MFA training is done
mfa_mod_folder = '/Users/miaozhang/Documents/MFA/pretrained_models/acoustic'


############################################################################################################################


# Where the files to be uploaded to the OSF repo are saved once processing has finished
# (NO (BACK)SLASH at the end!!!)
osf_path = '/Users/miaozhang/Research/CorpusPhon/CorpusData/VoxCommunis_OSF'


####################################################################################################################################
####################################################################################################################################

# Get the naming schema.
# Every entry of the 'Python_code' column is a Python expression that is evaluated against the
# variables defined above to produce a file or folder name.
# NOTE: eval() executes whatever the CSV contains - only use a schema file you trust.
naming_schema = [
    eval(name)
    for name in pd.read_csv('vxc_naming_schema.csv', usecols = ['Python_code'])['Python_code'].tolist()
]

# The first five entries are, in order: acoustic model, TextGrid folder, word file,
# dictionary file and speaker file.
acs_mod_name, textgrid_folder_name, word_file_name, dict_file_name, spkr_file_name = (
    naming_schema[0],
    naming_schema[1],
    naming_schema[2],
    naming_schema[3],
    naming_schema[4],
)

# Everything except the acoustic model is located inside the language folder.
textgrid_folder_path = path_sep.join((language_dir, textgrid_folder_name))
word_file_path = path_sep.join((language_dir, word_file_name))
dict_file_path = path_sep.join((language_dir, dict_file_name))
spkr_file_path = path_sep.join((language_dir, spkr_file_name))

# The list itself is not needed any more.
del naming_schema

###################################################################################################################
###################################################################################################################

# For step 3: prepare the lexicon and pronunciation dictionary
# Path of validated.tsv inside the language folder.
validated_log = path_sep.join((language_dir, 'validated.tsv'))

###################################################################################################################
###################################################################################################################

# For step 4: G2P
# Set up only what the selected engine needs; the other engines define nothing here.
if g2p == 'xpf':
    xpf_translater_path = 'xpf_translate04.py'
    # Both XPF resources sit in <xpf_dir>/<lang_code>_<lang_name>/ and are named after lang_code.
    rule_file_path, verify_file_path = (
        path_sep.join((xpf_dir, lang_code + '_' + lang_name, lang_code + suffix))
        for suffix in ('.rules', '.verify.csv')
    )
elif g2p == 'epi':
    epitran_translater_path = 'epi_run.py'
elif g2p == 'chr':
    # Imported here so that transformers is only required when Charsiu is actually used.
    from transformers import T5ForConditionalGeneration, AutoTokenizer
    chr_model = T5ForConditionalGeneration.from_pretrained('charsiu/g2p_multilingual_byT5_tiny_16_layers_100')
    chr_tok = AutoTokenizer.from_pretrained('google/byt5-small')

###################################################################################################################
###################################################################################################################

# For step 6: running MFA
# The folder with the validated recordings (used to validate the corpus).
validated_recs_path = language_dir + path_sep + 'validated'

# When a sub-version is used, insert its suffix right before the '.zip' extension.
# Only a literal, trailing '.zip' is touched. The previous re.sub('.zip', ...) treated '.' as
# "any character" and was not anchored, so it could rewrite other parts of the name
# (for example the '_zip' in a name such as 'xx_zipf.zip').
if if_subversion != 0 and acs_mod_name.endswith('.zip'):
    acs_mod_name = acs_mod_name[:-len('.zip')] + subversion + '.zip'
acs_mod_path = mfa_mod_folder + path_sep + acs_mod_name

# The folder inside the language folder named 'output'.
output_path = language_dir + path_sep + 'output'

# Optional overrides: use an existing model and/or lexicon instead of the paths derived above.
if if_self_mod == 1:
    # Specify the path of the model
    acs_mod_path = '/Users/miaozhang/Documents/MFA/pretrained_models/acoustic/ca_vxc_acoustic16.zip'
if if_self_lex == 1:
    # Specify the path of the lexicon
    dict_file_path = '/Users/miaozhang/Documents/MFA/pretrained_models/acoustic/ca_lexicon-IPA.dict'

###################################################################################################################
###################################################################################################################

# Finale:
# <osf_path>/textgrids/<TextGrid folder name without its last four characters>
txtgrds_path = path_sep.join((osf_path, 'textgrids', textgrid_folder_name[:-4]))

###################################################################################################################
###################################################################################################################

# Summary of the paths this run will work with, one "label value" line each.
print("\n".join(
    f"{label} {path}"
    for label, path in (
        ("Processing the folder:\t", language_dir),
        ("The acoustic model to be trained/used:\t", acs_mod_path),
        ("The lexicon to be generated/used:\t", dict_file_path),
    )
))

Processing the folder:	 /Users/miaozhang/Research/CorpusPhon/CorpusData/CommonVoice/ca_v17
The acoustic model to be trained/used:	 /Users/miaozhang/Documents/MFA/pretrained_models/acoustic/ca_vxc_acoustic16.zip
The lexicon to be generated/used:	 /Users/miaozhang/Documents/MFA/pretrained_models/acoustic/ca_lexicon-IPA.dict


In [3]:
# Build the table `whole` with vxcproc.remap_spkr. The next cell filters it on its
# 'validation' column to keep the validated clips.
# NOTE(review): presumably this also writes the speaker file at spkr_file_path - confirm in vxc_processing.
whole = vxcproc.remap_spkr(language_dir, path_sep, spkr_file_path, lang_code)

In [7]:
# Group the files and create the subfolders
# Grouping only happens when the folder holds more mp3 files than this threshold. The same number
# is handed to vxcproc.make_groups (presumably as the group size - confirm in vxc_processing).
max_clips = 32000

valid_clips = whole[whole['validation'] == 'validated']
all_items = os.listdir(validated_recs_path)
all_mp3 = [item for item in all_items if os.path.splitext(item)[1] == '.mp3']
n_clips = len(all_mp3)
if n_clips > max_clips:
    # Create the paths in the subfolders for each recording according to their grouping,
    # keeping only the recordings that are actually present in the folder.
    splits = vxcproc.make_groups(validated_recs_path, valid_clips, max_clips)
    splits = splits[splits['path'].isin(all_mp3)]
    splits.to_csv(os.path.join(language_dir, 'all_splits.csv'), index = False)
else:
    # Nothing to group. `splits` used to be left undefined here, so the to_csv call raised a
    # NameError on a fresh kernel, or silently wrote a stale table left over from an earlier run.
    splits = None
    print(f"{n_clips} clips found (limit {max_clips}): no grouping needed, all_splits.csv not written.")
del all_items, all_mp3

In [None]:
def _move_rec_pairs(sources, destinations):
    """Move each recording and its companion TextGrid from a source to a destination path.

    The TextGrid paths are derived from the recording paths by swapping the suffix for
    '.TextGrid'. The destination folder is created when it does not exist yet. A failure
    is reported with print() and the loop carries on with the next recording; the TextGrid
    is only attempted after its recording has been moved.

    Args:
        sources: iterable of current recording paths.
        destinations: iterable of target recording paths, aligned with `sources`.
    """
    for src_snd, dst_snd in zip(sources, destinations):
        src_tg = Path(src_snd).with_suffix('.TextGrid')
        dst_tg = Path(dst_snd).with_suffix('.TextGrid')
        try:
            # Without this, every move into a not-yet-existing subfolder fails.
            dst_tg.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(src_snd, dst_snd)
            shutil.move(src_tg, dst_tg)
        except Exception as e:
            print(f"File moving error: {e}")


def split_recs(df):
    """Move the recordings (and their TextGrids) from `df.new_path` to `df.sub_path`.

    Args:
        df: object with aligned `new_path` and `sub_path` columns/attributes.
    """
    _move_rec_pairs(df.new_path, df.sub_path)


def merge_recs(df):
    """Undo split_recs: move the recordings (and their TextGrids) from `df.sub_path` back to `df.new_path`.

    Args:
        df: object with aligned `new_path` and `sub_path` columns/attributes.
    """
    _move_rec_pairs(df.sub_path, df.new_path)