In [1]:
import json
import random
import os
import string
import re
import hashlib


def get_manifest_lines(manifest_items, shuffle=True):
    """Serialize manifest entries into newline-terminated JSON strings.

    Args:
        manifest_items: iterable of JSON-serializable objects, one per
            manifest entry.
        shuffle: when true, the resulting lines are shuffled in place with
            the module-level ``random`` generator before being returned.

    Returns:
        A list of strings, one per entry, each a JSON document followed by
        a newline. Non-ASCII characters are written as-is, not escaped.
    """
    serialized = []
    for entry in manifest_items:
        serialized.append(json.dumps(entry, ensure_ascii=False) + '\n')

    if not shuffle:
        return serialized

    random.shuffle(serialized)
    return serialized

    
current_dir = os.path.abspath('')

# Load the source manifest, which holds one JSON object per line.
# The file is opened explicitly as UTF-8: the transcripts contain Hebrew text
# and the platform default encoding is not guaranteed to be UTF-8.
with open(f'{current_dir}/manifest/jasper_manifest.json', encoding='utf-8') as f:
    # Lines read from a file keep their trailing newline, so a blank line is
    # '\n' rather than ''. Strip before testing so blank lines are skipped
    # instead of being handed to json.loads (which would raise).
    manifest_items = [json.loads(line) for line in f if line.strip()]

# Translation table mapping each Hebrew final-form letter to its regular form.
final_letter_map = str.maketrans(
    'ךםןףץ',
    'כמנפצ',
)


def transform(s):
    """Normalize a transcript string.

    Replaces Hebrew final-form letters with their regular forms, removes
    every '.' and ',' character, and strips leading/trailing whitespace.

    Args:
        s: the transcript text.

    Returns:
        The normalized text.
    """
    r = s.translate(final_letter_map)
    r = re.sub(r'[\.,]', r'', r)
    return r.strip()


# Entries whose 'duration' exceeds this value are dropped
# (presumably seconds; confirm against the manifest producer).
max_duration = 15

transformed_manifest = []
for item in manifest_items:
    if item['duration'] > max_duration:
        continue

    # Copy every field of the entry, overriding only the transcript.
    transformed_item = {
        **item,
        'text': transform(item['text']),
    }
    transformed_manifest.append(transformed_item)
    
# Fraction of the hash space routed to the validation and test splits.
val_lines_factor = 0.02
test_lines_factor = 0.02

train_items, val_items, test_items = [], [], []

# Each entry is bucketed by the low 32 bits of the SHA-1 of its audio path,
# so the split an entry lands in depends only on that path string.
_hash_space = 2 ** 32
_val_cutoff = val_lines_factor * _hash_space
_test_cutoff = (val_lines_factor + test_lines_factor) * _hash_space

for item in transformed_manifest:
    digest = hashlib.sha1(item['audio_filepath'].encode()).digest()
    # The last four digest bytes, read big-endian, are the digest value
    # modulo 2**32.
    bucket = int.from_bytes(digest[-4:], 'big')

    if bucket < _val_cutoff:
        destination = val_items
    elif bucket < _test_cutoff:
        destination = test_items
    else:
        destination = train_items
    destination.append(item)


# Serialize each split (shuffled, per get_manifest_lines' default).
train_lines = get_manifest_lines(train_items)
val_lines = get_manifest_lines(val_items)
test_lines = get_manifest_lines(test_items)

# num_lines = len(manifest_lines)
# num_val_lines = int(num_lines * 0.02)
# num_test_lines = int(num_lines * 0.02)
# num_train_lines = num_lines - num_val_lines - num_test_lines
# 
# start = end = 0
# 
# start = end
# end += num_train_lines
# train_lines = manifest_lines[start:end]
# 
# start = end
# end += num_val_lines
# val_lines = manifest_lines[start:end]
# 
# start = end
# end += num_test_lines
# test_lines = manifest_lines[start:end]

target_dir = f'{current_dir}/manifest/normalized'
os.makedirs(target_dir, exist_ok=True)

# Full filtered/normalized manifest, written as a single JSON array.
# All output files are opened explicitly as UTF-8: the split manifests hold
# unescaped non-ASCII text, which a non-UTF-8 default encoding cannot write.
transformed_manifest_path = f'{target_dir}/transformed_manifest.json'
with open(transformed_manifest_path, 'w', encoding='utf-8') as f:
    json.dump(transformed_manifest, f)

train_manifest_path = f'{target_dir}/jasper_manifest.train.json'
val_manifest_path = f'{target_dir}/jasper_manifest.val.json'
test_manifest_path = f'{target_dir}/jasper_manifest.test.json'

# One file per split, each holding one JSON object per line.
for split_name, split_path, split_lines in (
    ('train', train_manifest_path, train_lines),
    ('val', val_manifest_path, val_lines),
    ('test', test_manifest_path, test_lines),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        print(f'Writing {len(split_lines)} lines to {split_name} manifest')
        f.writelines(split_lines)

Writing 303034 lines to train manifest
Writing 6207 lines to val manifest
Writing 6294 lines to test manifest


In [94]:
# Train the Jasper model with NeMo's examples/asr/jasper.py script.
# - WORKDIR is exported so the shell command below can use it for logs,
#   checkpoints and tensorboard output; CUDA_VISIBLE_DEVICES=0 limits the
#   run to the first CUDA device.
# - stdout and stderr are merged and copied to $WORKDIR/logs/jasper.log.
# - --load_dir presumably restores weights from jasper_checkpoints/jasper_orig;
#   confirm against the NeMo script.
# NOTE(review): the train/eval manifests are read from $PWD/manifest/, while
# the preprocessing cell writes its split manifests under
# manifest/normalized/ -- confirm which set of files is intended here.
%set_env WORKDIR={os.environ['PWD']}/hebrew-stt/workdir/jasper
%set_env CUDA_VISIBLE_DEVICES=0
!mkdir -p $WORKDIR/logs && \
    python ../nemo/examples/asr/jasper.py \
    --model_config $PWD/jasper_checkpoints/jasper_orig/jasper10x5dr_custom.yaml \
    --load_dir $PWD/jasper_checkpoints/jasper_orig \
    --checkpoint_dir $WORKDIR/model_checkpoints \
    --work_dir $WORKDIR \
    --train_dataset $PWD/manifest/jasper_manifest.train.json \
    --eval_datasets $PWD/manifest/jasper_manifest.val.json \
    --tensorboard_dir $WORKDIR/tensorboard \
    --batch_size 8 \
    --num_epochs 100 \
    --lr 0.015 \
    --warmup_steps 8000 \
    --weight_decay 0.00 \
    2>&1 \
    | tee $WORKDIR/logs/jasper.log;

env: WORKDIR=/code/hebrew-stt/workdir/jasper
env: CUDA_VISIBLE_DEVICES=0
^C


In [None]:
# Train the QuartzNet model using the same NeMo examples/asr/jasper.py script,
# selected via the quartznet15x5_custom.yaml model config.
# - WORKDIR points at the quartznet work directory; CUDA_VISIBLE_DEVICES=1
#   limits the run to the second CUDA device.
# - No --load_dir is passed, so this run presumably starts without
#   pre-trained weights -- confirm against the NeMo script.
# - stdout and stderr are merged and copied to $WORKDIR/logs/jasper.log
#   (the log keeps the "jasper" file name even for this QuartzNet run).
# NOTE(review): the train/eval manifests are read from $PWD/manifest/, while
# the preprocessing cell writes its split manifests under
# manifest/normalized/ -- confirm which set of files is intended here.
%set_env WORKDIR={os.environ['PWD']}/hebrew-stt/workdir/quartznet
%set_env CUDA_VISIBLE_DEVICES=1
!mkdir -p $WORKDIR/logs && \
    python ../nemo/examples/asr/jasper.py \
    --model_config $PWD/jasper_checkpoints/quartznet_orig/quartznet15x5_custom.yaml \
    --checkpoint_dir $WORKDIR/model_checkpoints \
    --work_dir $WORKDIR \
    --train_dataset $PWD/manifest/jasper_manifest.train.json \
    --eval_datasets $PWD/manifest/jasper_manifest.val.json \
    --tensorboard_dir $WORKDIR/tensorboard \
    --batch_size 16 \
    --num_epochs 100 \
    --lr 0.015 \
    --warmup_steps 8000 \
    --weight_decay 0.00 \
    2>&1 \
    | tee $WORKDIR/logs/jasper.log;

env: WORKDIR=/code/hebrew-stt/workdir/quartznet
env: CUDA_VISIBLE_DEVICES=1
[NeMo I 2020-04-04 10:53:42 collections:138] Dataset loaded with 317153 files totalling 968.00 hours
[NeMo I 2020-04-04 10:53:42 collections:139] 15507 files were filtered totalling 84.14 hours
[NeMo I 2020-04-04 10:53:42 jasper:99] Have 317153 examples to train on.
[NeMo I 2020-04-04 10:53:42 features:144] PADDING: 16
[NeMo I 2020-04-04 10:53:42 features:152] STFT using conv
[NeMo I 2020-04-04 10:53:45 collections:138] Dataset loaded with 6927 files totalling 21.95 hours
[NeMo I 2020-04-04 10:53:45 collections:139] 3 files were filtered totalling 0.00 hours
[NeMo I 2020-04-04 10:53:45 jasper:147] Number of parameters in encoder: 18894656
[NeMo I 2020-04-04 10:53:45 jasper:148] Number of parameters in decoder: 27675
[NeMo I 2020-04-04 10:53:45 jasper:149] Total number of parameters in model: 18922331
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults f