In [1]:
# !pip3 install git+https://github.com/mesolitica/malaya-speech@8fe9cfea37fc32ac63399d9ae5fff22af697f4be
# !pip3 install num2words

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from transformers.models.bart.modeling_bart import shift_tokens_right
from transformers.models.gpt2 import modeling_gpt2
import malaya
from malaya.text.normalization import cardinal
import malaya_speech
from malaya_speech.utils.subword import merge_sentencepiece_tokens
import re
import itertools
import unicodedata
import json
import numpy as np
from num2words import num2words
from streaming import MDSWriter, LocalDataset

tokenizer = malaya.tokenizer.Tokenizer(hypen = False, parliament = False, time = False, time_pukul = False,
                                      temperature = False, distance = False, volume = False, duration = False,
                                      weight = False, date = False, money = False)

def cardinal_en(x):
    cp_x = x[:]
    try:
        if re.match('.*[A-Za-z]+.*', x):
            return x
        x = re.sub(',', '', x, count=10)

        if re.match('.+\\..*', x):
            x = num2words(float(x))
        elif re.match('\\..*', x):
            x = num2words(float(x))
        else:
            x = num2words(int(x))
        x = re.sub('-', ' ', x, count=10)
        return x
    except BaseException as e:
        logger.debug(traceback.format_exc())
        return cp_x

Cannot import beam_search_ops from Tensorflow 1, ['malaya.jawi_rumi.deep_model', 'malaya.phoneme.deep_model', 'malaya.rumi_jawi.deep_model', 'malaya.stem.deep_model'] for stemmer will not available to use, make sure Tensorflow 1 version >= 1.15
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [4]:
def tokenize_and_replace(t, en = False):
    tokenized = tokenizer.tokenize(t)
    for i in range(len(tokenized)):
        if en:
            c = cardinal_en(tokenized[i])
        else:
            c = cardinal(tokenized[i])
        if c != tokenized[i]:
            tokenized[i] = c
    return ' '.join(tokenized)

tokenize_and_replace('pada 15 ogos 1940 70% pihak berikat menyerang perancis selatan serangan ini dipanggil operation dragoon')

'pada lima belas ogos seribu sembilan ratus empat puluh 70% pihak berikat menyerang perancis selatan serangan ini dipanggil operation dragoon'

In [5]:
tokenize_and_replace(
    'pada 15 ogos 1940 pihak berikat menyerang perancis selatan serangan ini dipanggil operation dragoon',
en = True)

'pada fifteen ogos one thousand, nine hundred and forty pihak berikat menyerang perancis selatan serangan ini dipanggil operation dragoon'

In [6]:
vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]
sr = 16000

def preprocessing_text(string, en = False):
    
    string = tokenize_and_replace(string, en = en)
    string = unicodedata.normalize('NFC', string.lower())
    string = ''.join([c if c in vocabs else ' ' for c in string])
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = (
        ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))
    )
    return string

In [7]:
model = malaya_speech.force_alignment.transducer.huggingface(model = 'mesolitica/conformer-medium-mixed')

In [8]:
_ = model.cuda()

In [9]:
dataset = LocalDataset('pseudolabel')
len(dataset)

2221856

In [10]:
!mkdir force-alignment

mkdir: cannot create directory ‘force-alignment’: File exists


In [11]:
def convert(results):
    diag = np.diag(results['alignment']).tolist()
    subwords_alignment = results['subwords_alignment']

    for i in range(len(subwords_alignment)):
        subwords_alignment[i]['start'] = float(subwords_alignment[i]['start'])
        subwords_alignment[i]['end'] = float(subwords_alignment[i]['end'])
        subwords_alignment[i]['score'] = float(subwords_alignment[i]['score'])
        
    return diag, subwords_alignment

In [None]:
from tqdm import tqdm

for i in tqdm(range((len(dataset) // 2) * 1, (len(dataset) // 2) * 2, 1)):
    filename = os.path.join('force-alignment', f'{i}.json')
    if os.path.exists(filename):
        continue
    
    l = json.loads(dataset[i]['text'])
    if not os.path.exists(l['audio_filename']):
        continue
    
    y, _ = malaya_speech.load(l['audio_filename'])
    t_ms = l['predict_ms'][41:-13].strip()
    t_en = l['predict_en'][41:-13].strip()
    try:
        p_ms = preprocessing_text(t_ms)
    except:
        p_ms = None
    try:
        p_en = preprocessing_text(t_en, en = True)
    except:
        p_en = None
    
    try:
        results_ms = model.predict(y, p_ms)
        diag_ms, subwords_alignment_ms = convert(results_ms)
    except:
        diag_ms = None
        subwords_alignment_ms = None
        
    try:
        results_en = model.predict(y, p_en)
        diag_en, subwords_alignment_en = convert(results_en)
    except:
        diag_en = None
        subwords_alignment_en = None

    d = {
        'p_ms': p_ms,
        'p_en': p_en,
        'diag_ms': diag_ms,
        'subwords_alignment_ms': subwords_alignment_ms,
        'diag_en': diag_en,
        'subwords_alignment_en': subwords_alignment_en,
    }

    with open(filename, 'w') as fopen:
        json.dump(d, fopen)

  0%|          | 208/1110928 [06:22<589:03:45,  1.91s/it]

In [51]:
from glob import glob

files = glob('force-alignment/*.json')

In [52]:
with open(files[0]) as fopen:
    data = json.load(fopen)

In [54]:
data.keys()

dict_keys(['p_ms', 'p_en', 'diag_ms', 'subwords_alignment_ms', 'diag_en', 'subwords_alignment_en'])

In [57]:
split, temp = [], []
diag = data['diag_ms']
for no, r in enumerate(data['subwords_alignment_ms']):
    if r['score'] >= 0.05 or diag[no] > 0.1:
        temp.append(r)
    
    else:
        if len(temp):
            split.append(temp)
            temp = []
            
if len(temp):
    split.append(temp)
    
selected = []
for s in split:
    start = s[0]['start']
    end = s[-1]['start']
    if end - start >= 0.5:
        seq = [s_['text'] for s_ in s]
        merged = model.tokenizer.sp_model.Decode(model.tokenizer.sp_model.PieceToId(seq))
        selected.append((merged, start, end))
        
selected

[]

In [76]:
split, temp = [], []
diag = data['diag_en']
for no, r in enumerate(data['subwords_alignment_en']):
    if r['score'] >= 0.05 or diag[no] > 0.1:
        temp.append(r)
    
    else:
        if len(temp):
            split.append(temp)
            temp = []
            
if len(temp):
    split.append(temp)
    
selected = []
for s in split:
    start = s[0]['start']
    a = [s[0]]
    for s_ in s[1:]:
        a.append(s_)
        end = s_['end'] + 0.1
        if end - start >= 0.5:
            seq = [s__['text'] for s__ in a]
            merged = model.tokenizer.sp_model.Decode(model.tokenizer.sp_model.PieceToId(seq))
            selected.append((merged, start, end))
        
len(selected)

53