# Text normalization



In [1]:
#!docker run --gpus=all --net=host --rm -it -v $PWD:/myworkspace nvcr.io/nvidia/nemo:22.01 bash

In [3]:
from typing import List
import os
import json
import multiprocessing

from tqdm import tqdm
from functools import partial

from nemo_text_processing.text_normalization.normalize import Normalizer

def load_jsonl(filepath):
    """Read a JSON-lines file and return its records as a list.

    Lines that start with ``//`` and lines that contain only whitespace are
    skipped; every remaining line is decoded with ``json.loads``.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one decoded JSON value per retained line, in file order.
    """
    with open(filepath, 'r', encoding='utf_8') as handle:
        # Read every line up front, then filter and decode in a single pass.
        raw_lines = list(handle)
        return [
            json.loads(raw)
            for raw in raw_lines
            if not raw.startswith("//") and raw.strip() != ''
        ]


def dump_jsonl(filepath, data):
    """Write records to ``filepath`` as JSON lines (one JSON value per line).

    Records are serialized with ``ensure_ascii=False``, so non-ASCII
    characters are written verbatim instead of as ``\\uXXXX`` escapes. The
    file is therefore opened explicitly as UTF-8: relying on the platform
    default encoding could raise ``UnicodeEncodeError`` or produce a file
    that ``load_jsonl`` (which decodes as UTF-8) cannot read back.

    Args:
        filepath: Destination path; an existing file is overwritten.
        data: Iterable of JSON-serializable values.
    """
    with open(filepath, 'w', encoding='utf-8') as fp:
        for datum in data:
            fp.write(json.dumps(datum, ensure_ascii=False))
            fp.write('\n')
            
def normalize_manifest(input_manifest, output_manifest, normalizer):
    """Normalize every transcript of a JSON-lines manifest in parallel.

    Each record's ``'text_original'`` value is passed to
    ``normalizer.normalize(..., verbose=False)`` in a pool of worker
    processes, and the result is stored under the record's ``'text'`` key.
    The updated records are then written to ``output_manifest``.

    Args:
        input_manifest: Path of the JSON-lines manifest to read.
        output_manifest: Path the normalized manifest is written to.
        normalizer: Object exposing ``normalize(text, verbose=...)``.

    Raises:
        KeyError: If a record has no ``'text_original'`` field.
    """
    utterances = load_jsonl(input_manifest)
    transcripts = [utt['text_original'] for utt in utterances]
    normalize_fn = partial(normalizer.normalize, verbose=False)

    # The context manager tears the pool down on exit, so worker processes do
    # not pile up when this function is called once per manifest. All results
    # must be consumed inside the block because leaving it terminates the pool.
    with multiprocessing.Pool(processes=os.cpu_count()) as pool:
        # imap yields results in input order; passing total lets tqdm show
        # percentage and ETA instead of a bare iteration counter.
        normalized_result = tqdm(
            pool.imap(normalize_fn, transcripts),
            total=len(transcripts),
        )
        for i, text in enumerate(normalized_result):
            utterances[i]['text'] = text

    dump_jsonl(output_manifest, utterances)

In [None]:
# Build the normalizer once (cased input, lang='de') and reuse it for every
# manifest processed below.
normalizer = Normalizer(input_case="cased", lang='de')

for dataset in ['mls', 'voxpopuli', 'mcv']:
    # All splits of one dataset share the same directory.
    dataset_dir = os.path.join('./data/processed/', dataset)
    for subset in ['train', 'dev', 'test']:
        input_manifest = os.path.join(dataset_dir, f"{dataset}_{subset}_manifest.json")
        output_manifest = os.path.join(dataset_dir, f"{dataset}_{subset}_manifest_normalized.json")
        print("Processing ", input_manifest)
        normalize_manifest(input_manifest, output_manifest, normalizer)
            

[NeMo I 2022-05-05 13:23:46 tokenize_and_classify:83] Creating ClassifyFst grammars. This might take some time...
Processing  ./data/processed/mls/mls_train_manifest.json


2703it [14:12,  2.67it/s]