## Ph[]neMask

In [2]:
import textgrid
from pydub import AudioSegment
import random
import os
from tqdm import tqdm

In [3]:
# Consonant inventory: manner of articulation -> place of articulation -> list
# of phone symbols. Later cells pick phones with CONSONANTS[manner][place].
# NOTE(review): the AFFRICATE place key is spelled 'ALVEOPALATAL', while the
# other manners use 'ALVEOLOPALATAL'; lookups must use the exact key shown here.
CONSONANTS = dict(
    NASAL=dict(
        LABIAL=['m', 'mʲ', 'm̩'],
        LABIODENTAL=['ɱ'],
        DENTAL=['n̪'],
        ALVEOLAR=['n', 'n̩'],
        PALATAL=['ɲ'],
        VELAR=['ŋ'],
    ),
    STOP=dict(
        LABIAL=['p', 'b'],
        DENTAL=['t', 'd'],
        ALVEOLOPALATAL=['c', 'ɟ'],
        VELAR=['k', 'g'],
        GLOTTAL=['ʔ'],
    ),
    AFFRICATE=dict(
        ALVEOPALATAL=['tʃ', 'dʒ'],
    ),
    SIBILANT=dict(
        ALVEOLAR=['s', 'z'],
        ALVEOLOPALATAL=['ʃ', 'ʒ'],
    ),
    FRICATIVE=dict(
        LABIAL=['f', 'v', 'fʲ', 'vʲ'],
        DENTAL=['θ', 'ð'],
        PALATAL=['ç'],
        GLOTTAL=['h'],
    ),
    APPROXIMANT=dict(
        LABIAL=['w'],
        ALVEOLOPALATAL=['ɹ'],
        PALATAL=['j'],
    ),
    TAP=dict(
        ALVEOLAR=['ɾ'],
    ),
    LATERAL=dict(
        ALVEOLAR=['l', 'ɫ', 'ɫ̩'],
        PALATAL=['ʎ'],
    ),
)

In [4]:
# Vowel inventory: height -> backness -> list of phone symbols.
# One row per height; the inner keys keep their original order.
VOWELS = {
    'CLOSE': {'FRONT': ['i', 'iː'], 'NEAR-FRONT': ['ɪ'], 'CENTRAL': ['u', 'uː']},
    'CLOSE-MID': {'FRONT': ['ej', 'e'], 'CENTRAL': ['ə', 'ɘ'], 'BACK': ['ow']},
    'OPEN-MID': {'FRONT': ['ɛ', 'æ'], 'CENTRAL': ['ɜ', 'ɐ'], 'BACK': ['ʌ', 'ɔ', 'o']},
    'OPEN': {'BACK': ['ɑ', 'ɑː', 'ɒ', 'ɒː']},
}

In [11]:
# Flat list of the phones to silence, gathered from the selected
# (manner, place) cells of CONSONANTS in the order listed below.
PHONES_TO_MASK = [
    phone
    for manner, place in (
        ('NASAL', 'PALATAL'),
        ('AFFRICATE', 'ALVEOPALATAL'),
        ('SIBILANT', 'ALVEOLOPALATAL'),
        ('FRICATIVE', 'PALATAL'),
        ('APPROXIMANT', 'PALATAL'),
        ('LATERAL', 'PALATAL'),
    )
    for phone in CONSONANTS[manner][place]
]

# Alternative selection (disabled): every open-mid vowel.
# for position in VOWELS['OPEN-MID']:
#     PHONES_TO_MASK.extend(VOWELS['OPEN-MID'][position])

In [13]:
# Display the assembled list as a sanity check before running the masking loop.
PHONES_TO_MASK

['ɲ', 'tʃ', 'dʒ', 'ʃ', 'ʒ', 'ç', 'j', 'ʎ']

In [15]:
# Silence every occurrence of the phones in PHONES_TO_MASK in the recordings
# under DIR, using the forced-alignment TextGrids to locate them, and write the
# masked copies to OUTPUT_DIR.
DIR = os.path.join('audio', 'train')  # portable form of 'audio\\train'
ALIGNMENT_DIR = 'FA_output'
OUTPUT_DIR = os.path.join('augmented_data', 'palatals')


def mask_intervals(audio, intervals):
    """Return `audio` with each interval replaced by silence of equal duration.

    Args:
        audio: a pydub ``AudioSegment``.
        intervals: iterable of ``(start, end)`` pairs in seconds.

    Returns:
        An ``AudioSegment`` in which every listed interval has been overwritten
        with silence; `audio` itself is returned if `intervals` is empty.
    """
    for start_time, end_time in intervals:
        start_time_ms = start_time * 1000
        end_time_ms = end_time * 1000

        # AudioSegment.silent() defaults to 11025 Hz. When segments with
        # different frame rates are combined, pydub converts them to the
        # highest rate, so the default would resample the silence on every
        # splice and would upsample recordings sampled below 11025 Hz.
        silence = AudioSegment.silent(
            duration=end_time_ms - start_time_ms,
            frame_rate=audio.frame_rate,
        )
        audio = audio[:start_time_ms] + silence + audio[end_time_ms:]
    return audio


# Nothing to do when no phones are selected; skip the directory scan entirely.
if PHONES_TO_MASK:
    phones_to_mask = set(PHONES_TO_MASK)  # O(1) membership test per interval
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # export fails if the folder is missing

    for filename in tqdm(os.listdir(DIR)):
        file = os.path.join(DIR, filename)
        rec = filename.split('.')[0]

        # Only regular files whose base name ends in 'w' are processed.
        if not os.path.isfile(file) or not rec.endswith('w'):
            continue

        # The text before the first 'u' in the file name selects the
        # sub-folder of ALIGNMENT_DIR that holds this recording's TextGrid.
        speaker = filename.split('u')[0]

        audio = AudioSegment.from_file(file)
        alignment = textgrid.TextGrid.fromFile(
            os.path.join(ALIGNMENT_DIR, speaker, '{}.TextGrid'.format(rec))
        )
        phones_tier = alignment.getList('phones')[0]

        f_intervals = [
            (interval.minTime, interval.maxTime)
            for interval in phones_tier
            if interval.mark in phones_to_mask
        ]
        # Optional: mask each occurrence with probability 0.5 instead of always.
        # f_intervals = [x for x in f_intervals if bool(random.getrandbits(1))]

        # Recordings without any target phone are not exported.
        if f_intervals:
            audio = mask_intervals(audio, f_intervals)
            audio.export(os.path.join(OUTPUT_DIR, '{}_masked.WAV'.format(rec)), format="wav")

100%|██████████| 37140/37140 [33:38<00:00, 18.40it/s] 
