# init

In [None]:
!pip install torch soundfile ruamel.yaml omegaconf nemo-toolkit datasets gdown

In [None]:
!pip install lhotse jiwer pytorch-lightning nemo-asr hydra-core ruamel.yaml webdataset pyannote.audio

In [None]:
!pip install nemo_asr

In [4]:
!git clone https://github.com/AbdelrhmanElnenaey/ASR_for_egyptian_dialect

Cloning into 'ASR_for_egyptian_dialect'...
remote: Enumerating objects: 588, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 588 (delta 108), reused 35 (delta 17), pack-reused 406 (from 1)[K
Receiving objects: 100% (588/588), 31.70 MiB | 19.40 MiB/s, done.
Resolving deltas: 100% (317/317), done.


In [5]:
# Make the cloned repository the working directory. The path is relative, so
# this only resolves while the current directory is the clone's parent.
%cd ASR_for_egyptian_dialect

/content/ASR_for_egyptian_dialect


In [6]:
import nemo
import torch

# Report the versions of the two frameworks the rest of the notebook relies on.
for label, module in (("Nemo", nemo), ("PyTorch", torch)):
    print(f"{label} version: {module.__version__}")

Nemo version: 2.2.1
PyTorch version: 2.6.0+cu124


# Code

In [14]:
import torch
import librosa
import nemo.collections.asr as nemo_asr
from ruamel.yaml import YAML
from omegaconf import OmegaConf
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Load ASR model
def load_asr_model(
    ckpt_path,
    config_path='/content/ASR_for_egyptian_dialect/configs/FC-transducer-inference.yaml',
    incompatible_keys=(
        'decoder.prediction.embed.weight',
        'joint.joint_net.2.weight',
        'joint.joint_net.2.bias',
    ),
):
    """Build an ``EncDecRNNTBPEModel`` from a YAML config and load checkpoint weights.

    Args:
        ckpt_path: Path to a checkpoint file whose top-level ``'state_dict'``
            entry holds the model weights.
        config_path: Path to the YAML model config. Defaults to the inference
            config inside the cloned repository.
        incompatible_keys: State-dict keys that are never loaded from the
            checkpoint. Checkpoint entries whose shape differs from the
            freshly built model are skipped as well.

    Returns:
        The model, moved to ``device`` and switched to eval mode.

    Warns:
        UserWarning: When parameters of the model receive no weights from the
            checkpoint (they keep their freshly initialised values, so the
            transcripts produced by those layers are not meaningful until they
            are trained or restored), or when the checkpoint contains keys the
            model does not know.
    """
    import warnings

    yaml = YAML(typ='safe')
    with open(config_path, encoding='utf-8') as f:
        params = yaml.load(f)
    # No test dataset is needed for inference.
    params['model'].pop('test_ds', None)
    conf = OmegaConf.create(params)
    # NOTE(review): presumably the tokenizer's 32000 pieces plus one blank
    # symbol -- confirm against the tokenizer that the checkpoint was trained with.
    conf['model'].vocab_size = 32001

    model = nemo_asr.models.EncDecRNNTBPEModel(cfg=conf['model']).to(device)

    # SECURITY: weights_only=False unpickles arbitrary Python objects; only
    # load checkpoints that come from a trusted source.
    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)['state_dict']

    # strict=False tolerates missing/unexpected keys but still raises on shape
    # mismatches, so those entries have to be dropped before loading.
    model_state = model.state_dict()
    shape_mismatched = {
        key
        for key, value in checkpoint.items()
        if key in model_state
        and hasattr(value, 'shape')
        and tuple(value.shape) != tuple(model_state[key].shape)
    }
    skipped = set(incompatible_keys) | shape_mismatched
    filtered_checkpoint = {k: v for k, v in checkpoint.items() if k not in skipped}

    load_result = model.load_state_dict(filtered_checkpoint, strict=False)
    missing = list(getattr(load_result, 'missing_keys', []) or [])
    unexpected = list(getattr(load_result, 'unexpected_keys', []) or [])
    if missing:
        warnings.warn(
            "These parameters were not loaded from the checkpoint and keep their "
            f"initial values: {sorted(missing)}"
        )
    if unexpected:
        warnings.warn(
            f"These checkpoint entries do not exist in the model and were ignored: {sorted(unexpected)}"
        )

    model.eval()

    return model

# Function to load audio using librosa
def load_audio(audio_path, target_sr=16000):
    """Read an audio file with librosa, resampled to ``target_sr``.

    Args:
        audio_path: Path of the audio file to read.
        target_sr: Sample rate passed to ``librosa.load``. Defaults to 16000,
            the rate this notebook feeds to the ASR model.

    Returns:
        A ``(audio, sr)`` tuple exactly as returned by ``librosa.load``.
    """
    audio, sr = librosa.load(audio_path, sr=target_sr)
    return audio, sr

# Inference function
def infer(model, audio):
    """Transcribe a single audio input.

    The input is wrapped in a one-element list before being handed to
    ``model.transcribe``; whatever that call returns is passed back unchanged.
    """
    batch = [audio]
    return model.transcribe(batch)

# Inputs for this run: the recording to transcribe and the model checkpoint.
audio_path = "/content/recorded_audio.wav"
ckpt_path = "/content/drive/MyDrive/asr_model.ckpt"

# Step 1: build the model and restore its weights.
print("Loading ASR model...")
asr_model = load_asr_model(ckpt_path)

# Step 2: read the recording.
print(f"Loading audio from {audio_path}...")
audio, sr = load_audio(audio_path)

# Step 3: transcribe with gradient tracking disabled.
print("Starting inference...")
with torch.no_grad():
    transcript = infer(model=asr_model, audio=audio)

# Step 4: show the text of the first (and only) returned hypothesis.
best_hypothesis = transcript[0]
print(f"\n\nTranscript: {best_hypothesis.text}")

Loading ASR model...
[NeMo I 2025-04-16 17:09:21 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 32000 tokens


[NeMo W 2025-04-16 17:09:52 nemo_logging:405] Could not load dataset as `manifest_filepath` was None. Provided config : {'manifest_filepath': None, 'sample_rate': 16000, 'batch_size': 32, 'shuffle': True, 'num_workers': 8, 'pin_memory': True, 'max_duration': 16.7, 'min_duration': 0.1, 'is_tarred': False, 'tarred_audio_filepaths': None, 'shuffle_n': 2048, 'bucketing_strategy': 'fully_randomized', 'bucketing_batch_size': None}
[NeMo W 2025-04-16 17:09:57 nemo_logging:405] Could not load dataset as `manifest_filepath` was None. Provided config : {'manifest_filepath': None, 'sample_rate': 16000, 'batch_size': 32, 'shuffle': False, 'use_start_end_token': False, 'num_workers': 8, 'pin_memory': True}


[NeMo I 2025-04-16 17:09:57 nemo_logging:393] PADDING: 0
[NeMo I 2025-04-16 17:10:10 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-04-16 17:10:10 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-04-16 17:10:10 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
Loading audio from /content/recorded_audio.wav...
Starting inference...


Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]
Beam search progress::   0%|          | 0/1 [00:00<?, ?sample/s][A
Beam search progress:: 100%|██████████| 1/1 [03:08<00:00, 188.41s/sample]
Transcribing: 100%|██████████| 1/1 [03:08<00:00, 188.72s/it]



Transcript: ٫٠٠ فالأ الحث





# Another tokenizer

In [1]:
from transformers import AutoTokenizer
import sentencepiece as spm

# Load the Aranizer tokenizer
tokenizer = AutoTokenizer.from_pretrained("riotu-lab/Aranizer-SP-32k")

vocab = tokenizer.get_vocab()

# Save the tokens to a file (one token per line), ordered by token id.
# The encoding is set explicitly: the tokens are Arabic text, and the platform
# default (e.g. a legacy code page on Windows) may be unable to encode them.
with open("vocab.txt", "w", encoding="utf-8") as f:
    for token in sorted(vocab, key=lambda x: vocab[x]):
        f.write(token + "\n")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sentencepiece as spm

# Train on the same relative "vocab.txt" that the export cell writes, instead
# of a user-specific absolute path, so both cells refer to one file.
# NOTE(review): the training input is a token list (one token per line), not a
# text corpus -- the resulting pieces/ids are not guaranteed to match the
# original tokenizer; confirm this is the intended way to obtain tokenizer.model.
spm.SentencePieceTrainer.train(
    input="vocab.txt",           # Token list exported from the pretrained tokenizer
    model_prefix="tokenizer",    # Outputs: tokenizer.model, tokenizer.vocab
    vocab_size=32000,            # Match the original tokenizer's vocab size
    model_type="bpe"             # Use "bpe" or "unigram" (match your original tokenizer's algorithm)
)

In [3]:
import sentencepiece as spm

# Load the trained SentencePiece model from disk.
tokenizer_model_path = "C:/Users/Patrickn/Jupyter_notebooks/Graduation/docker_image/ASR_for_egyptian_dialect/token/tokenizer.model"
sp = spm.SentencePieceProcessor()
sp.load(tokenizer_model_path)

# Encode one sample word both as sub-word pieces and as integer ids.
text = "باتريك"
pieces = sp.encode_as_pieces(text)
ids = sp.encode_as_ids(text)
print("Tokens:", pieces)
print("IDs:", ids)

Tokens: ['▁بات', 'ريك']
IDs: [2014, 8348]
