In [1]:
!pip install nemo_toolkit[asr] datasets transformers librosa jiwer git-lfs accelerate torch soundfile tqdm audiomentations wandb

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting git-lfs
  Downloading git_lfs-1.6-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting audiomentations
  Downloading audiomentations-0.37.0-py3-none-any.whl.metadata (11 kB)
Collecting nemo_toolkit[asr]
  Downloading nemo_toolkit-1.23.0-py3-none-any.whl.metadata (18 kB)
Collecting triton (from nemo_toolkit[asr])
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Collecting wget (from nemo_toolkit[asr])
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting braceexpand (from nemo_toolkit[asr])
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting editdistance (from nemo_toolkit[asr])
  Downloading editdistance-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting g2p-en (from nemo_toolkit[asr])
  Downloading g2p_en-2.1.0-py3-none-any.whl.metada

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset, DatasetDict

# Load the dataset
dataset = load_dataset("AILAB-VNUHCM/vivos", trust_remote_code=True)

# Split the dataset into train, validation, and test sets (80-10-10)
def split_dataset(dataset, train_size=0.8, val_size=0.1):
    # Split the dataset into train and remaining (val+test)
    train_test_split_data = dataset['train'].train_test_split(test_size=1 - train_size)
    train_dataset = train_test_split_data['train']
    remaining_dataset = train_test_split_data['test']

    # Split the remaining dataset into validation and test
    val_test_split_data = remaining_dataset.train_test_split(test_size=val_size / (1 - train_size))
    val_dataset = val_test_split_data['train']
    test_dataset = val_test_split_data['test']

    # Create a DatasetDict
    dataset_dict = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })
    
    return dataset_dict

# Split the dataset
dataset = split_dataset(dataset)

Downloading builder script:   0%|          | 0.00/6.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/345k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11660 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/760 [00:00<?, ? examples/s]

In [4]:
dataset = dataset.select_columns(['audio','sentence'])
dataset 

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 9328
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 1165
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 1167
    })
})

In [5]:
# Function to extract duration from audio array and sample rate
def extract_duration(batch):
    audio_array = batch["audio"]["array"]
    sample_rate = batch["audio"]["sampling_rate"]
    batch["duration"] = len(audio_array) / sample_rate
    return batch

# Apply the function to compute duration and add it as a new column
dataset = dataset.map(extract_duration, num_proc=4)

# Verify the dataset with the new duration column
print(dataset)

Map (num_proc=4):   0%|          | 0/9328 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1165 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1167 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'duration'],
        num_rows: 9328
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'duration'],
        num_rows: 1165
    })
    test: Dataset({
        features: ['audio', 'sentence', 'duration'],
        num_rows: 1167
    })
})


In [6]:
import re

number_to_words = {
    '0': 'KHÔNG',
    '1': 'MỘT',
    '2': 'HAI',
    '3': 'BA',
    '4': 'BỐN',
    '5': 'NĂM',
    '6': 'SÁU',
    '7': 'BẢY',
    '8': 'TÁM',
    '9': 'CHÍN'
}

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'

def normalize_text(text):
    # Replace numbers with words
    text = re.sub(r'\d', lambda x: number_to_words[x.group()], text)
    # Remove unwanted characters
    text = re.sub(chars_to_ignore_regex, ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()
    return text


In [7]:
dataset = dataset.map(lambda x: {'text': normalize_text(x['sentence'])}, remove_columns=['sentence'])

Map:   0%|          | 0/9328 [00:00<?, ? examples/s]

Map:   0%|          | 0/1165 [00:00<?, ? examples/s]

Map:   0%|          | 0/1167 [00:00<?, ? examples/s]

In [8]:
import random
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np

# Define audio augmentation pipeline
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.01, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.1, p=0.5),
    PitchShift(min_semitones=-3, max_semitones=3, p=0.5),
    Shift(min_shift=-0.3, max_shift=0.3, p=0.5),
])

# Function to augment audio
def augment_audio(batch):
    audio = batch["audio"]
    samples = np.array(audio["array"])
    sample_rate = audio["sampling_rate"]
    
    augmented_samples = augment(samples=samples, sample_rate=sample_rate)
    
    batch["audio"] = {
        "array": augmented_samples,
        "sampling_rate": sample_rate
    }
    return batch

In [8]:
# Apply audio augmentation
dataset = dataset.map(augment_audio, num_proc=4)

# dataset['train'] = dataset['train'].map(augment_audio, num_proc=4)
# dataset['validation'] = dataset['validation'].map(augment_audio, num_proc=4)

NameError: name 'augment_audio' is not defined

In [8]:
import os
import soundfile as sf
import tqdm
import json

def save_audio_file(array, path, sr=16000):
    sf.write(path, array, sr)

output_base_dir = "/kaggle/working/"

# Loop through each split (train, validation, test)
for split_name in ['train', 'validation', 'test']:
    split_data = dataset[split_name]
    output_dir = os.path.join(output_base_dir, split_name)
    output_wavs_dir = os.path.join(output_dir, "wavs")

    # Create output directories if they don't exist
    os.makedirs(output_wavs_dir, exist_ok=True)

    # JSON manifest file for the split
    json_file = os.path.join(output_base_dir, f'{split_name}_manifest.json')

    # Create and open the JSON manifest file
    with open(json_file, 'w', encoding='utf8') as jf:

        # Loop through dataset split and save audio files
        for index, row in tqdm.tqdm(enumerate(split_data)):
            text = row["text"]
            duration = row["duration"]
            path_wav_out = os.path.join(output_wavs_dir, f"{index}.wav")

            # Save audio file
            save_audio_file(row["audio"]["array"], path_wav_out)

            # Write metadata to JSON
            if os.path.exists(path_wav_out):
                manifest_item = {
                    "audio_filepath": path_wav_out,
                    "duration": duration,
                    "text": text
                }
                json.dump(manifest_item, jf, ensure_ascii=False)
                jf.write('\n')  # Write newline character after each JSON object

print("\nDataset processed successfully.")


9328it [01:05, 141.53it/s]
1165it [00:08, 145.13it/s]
1167it [00:08, 144.52it/s]


Dataset processed successfully.





In [9]:
import json
import textwrap

def create_transcript_all(dataset_dict, output_file):
    all_transcripts = []
    for split in dataset_dict.values():
        for item in split:
            all_transcripts.append(item["text"])

    # Nối tất cả các đoạn text lại với nhau bằng dấu cách
    combined_transcript = " ".join(all_transcripts)

    # Tách chuỗi thành các dòng không quá 10 từ
    wrapped_lines = textwrap.wrap(combined_transcript, width=40)

    # Ghi vào file JSON theo định dạng yêu cầu
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in wrapped_lines:
            f.write(line + "\n")

# Gọi hàm để tạo transcript file cho tất cả các phần của dataset
create_transcript_all(dataset, "transcript_all.json")

print("Transcript file created successfully.")


Transcript file created successfully.


In [10]:
import os

# Create the directory
os.makedirs('/kaggle/working/Fast-Conformer/hparam', exist_ok=True)
os.makedirs('/kaggle/working/Fast-Conformer/dict_N', exist_ok=True)

print("Created successfully.")


Created successfully.


In [11]:
!wget https://raw.githubusercontent.com/ThienKym123/Fast-Conformer/main/process_asr_text_tokenizer.py -O /kaggle/working/Fast-Conformer/process_asr_text_tokenizer.py
!wget https://raw.githubusercontent.com/ThienKym123/Fast-Conformer/main/hparam/fast-conformer_ctc_bpe.yaml -O /kaggle/working/Fast-Conformer/hparam/fast-conformer_ctc_bpe.yaml
!wget https://raw.githubusercontent.com/ThienKym123/Fast-Conformer/main/speech_to_text_ctc_bpe.py -O /kaggle/working/Fast-Conformer/speech_to_text_ctc_bpe.py

--2024-09-12 10:12:26--  https://raw.githubusercontent.com/ThienKym123/Fast-Conformer/main/process_asr_text_tokenizer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9132 (8.9K) [text/plain]
Saving to: '/kaggle/working/Fast-Conformer/process_asr_text_tokenizer.py'


2024-09-12 10:12:26 (79.5 MB/s) - '/kaggle/working/Fast-Conformer/process_asr_text_tokenizer.py' saved [9132/9132]

--2024-09-12 10:12:27--  https://raw.githubusercontent.com/ThienKym123/Fast-Conformer/main/hparam/fast-conformer_ctc_bpe.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting res

In [2]:
!python /kaggle/working/Fast-Conformer/process_asr_text_tokenizer.py\
        --data_file="/kaggle/working/transcript_all.json" \
        --data_root="/kaggle/working/Fast-Conformer/dict_N" \
        --vocab_size=1024 \
        --tokenizer="spe" \
        --no_lower_case \
        --spe_type="bpe" \
        --spe_character_coverage=1.0 \
        --log

Traceback (most recent call last):
  File "/kaggle/working/Fast-Conformer/process_asr_text_tokenizer.py", line 10, in <module>
    from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model
  File "/opt/conda/lib/python3.10/site-packages/nemo/collections/common/__init__.py", line 16, in <module>
    from nemo.collections.common import data, losses, parts, tokenizers
  File "/opt/conda/lib/python3.10/site-packages/nemo/collections/common/losses/__init__.py", line 15, in <module>
    from nemo.collections.common.losses.aggregator import AggregatorLoss
  File "/opt/conda/lib/python3.10/site-packages/nemo/collections/common/losses/aggregator.py", line 19, in <module>
    from nemo.core.classes import Loss, typecheck
  File "/opt/conda/lib/python3.10/site-packages/nemo/core/__init__.py", line 16, in <module>
    from nemo.core.classes import *
  File "/opt/conda/lib/python3.10/site-packages/nemo/core/classes/__init__.py", line 20, in <module>
    from nemo.core.

In [13]:
# import os

# file_path = '/kaggle/working/Fast-Conformer/hparam/fast-conformer_ctc_bpe.yaml'

# # Kiểm tra xem file tồn tại trước khi xóa
# if os.path.exists(file_path):
#     os.remove(file_path)
#     print(f"Đã xóa file {file_path}")
# else:
#     print(f"File {file_path} không tồn tại.")


In [14]:
!wandb login edfa6bb2b01b5b2e74dd9f112d7ba0da853ebd0e

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [15]:
!HYDRA_FULL_ERROR=1 python /kaggle/working/Fast-Conformer/speech_to_text_ctc_bpe.py --config-path="/kaggle/working/Fast-Conformer/hparam"

Traceback (most recent call last):
  File "/kaggle/working/Fast-Conformer/speech_to_text_ctc_bpe.py", line 3, in <module>
    import nemo.collections.asr as nemo_asr
  File "/opt/conda/lib/python3.10/site-packages/nemo/collections/asr/__init__.py", line 15, in <module>
    from nemo.collections.asr import data, losses, models, modules
  File "/opt/conda/lib/python3.10/site-packages/nemo/collections/asr/losses/__init__.py", line 15, in <module>
    from nemo.collections.asr.losses.angularloss import AngularSoftmaxLoss
  File "/opt/conda/lib/python3.10/site-packages/nemo/collections/asr/losses/angularloss.py", line 18, in <module>
    from nemo.core.classes import Loss, Typing, typecheck
  File "/opt/conda/lib/python3.10/site-packages/nemo/core/__init__.py", line 16, in <module>
    from nemo.core.classes import *
  File "/opt/conda/lib/python3.10/site-packages/nemo/core/classes/__init__.py", line 20, in <module>
    from nemo.core.classes.common import (
  File "/opt/conda/lib/python3.1

In [16]:
import nemo
from nemo.core import ModelPT

# Đường dẫn đến file .nemo
nemo_file = '/kaggle/working/nemo_experiments/FastConformer-CTC-BPE/2024-09-10_07-12-52/checkpoints/FastConformer-CTC-BPE.nemo'

# Load mô hình từ file .nemo
model = ModelPT.restore_from(restore_path=nemo_file)


[NeMo I 2024-09-10 08:56:29 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-09-10 08:56:30 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /kaggle/working/train_manifest.json
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 20
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-09-10 08:56:30 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /kaggle/working/validation_manifest.json
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    use_start_e

[NeMo I 2024-09-10 08:56:30 features:289] PADDING: 0
[NeMo I 2024-09-10 08:56:32 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /kaggle/working/nemo_experiments/FastConformer-CTC-BPE/2024-09-10_07-12-52/checkpoints/FastConformer-CTC-BPE.nemo.


In [17]:
model.eval()

EncDecCTCModelBPE(
  (preprocessor): AudioToMelSpectrogramPreprocessor(
    (featurizer): FilterbankFeatures()
  )
  (encoder): ConformerEncoder(
    (pre_encode): ConvSubsampling(
      (out): Linear(in_features=2560, out_features=512, bias=True)
      (conv): Sequential(
        (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (4): ReLU(inplace=True)
        (5): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (6): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (7): ReLU(inplace=True)
      )
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.05, inplace=False)
      (dropout_emb): Dropout(p=0.05, inplace=False)
    )
    (layers): ModuleList(
      (0-18): 19 x ConformerLayer(
        (norm_fe

In [19]:
##### import soundfile as sf
from IPython.display import Audio

# Đường dẫn đến file âm thanh
audio_file = '/kaggle/working/test/wavs/55.wav'

# Đọc file âm thanh
audio_data, sample_rate = sf.read(audio_file)

# Tiến hành nhận dạng tiếng nói (assuming `model` is your ASR model)
transcription = model.transcribe([str(audio_file)])

print(transcription)

Audio(data=audio_data, rate=sample_rate)

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

['tuy nhiên ngày ch tỉnh cuối cùng này không diễn ra bình y']


In [None]:
import json
from nemo.collections.asr.metrics.wer import word_error_rate

# Define the path to your test manifest file
test_manifest = "/kaggle/working/test_manifest.json"

# Read data from the manifest file
with open(test_manifest, 'r', encoding='utf-8') as f:
    manifest_data = [json.loads(line.strip()) for line in f]

predictions = []
references = []

# Loop through each sample in the manifest and perform predictions
for sample in manifest_data:
    audio_file = sample['audio_filepath']
    transcript = sample['text']

    # Perform inference with each audio sample
    transcription = model.transcribe([audio_file])[0]

    # Add the prediction and the reference to the respective lists
    predictions.append(transcription)
    references.append(transcript)

    # Print the result after each sample
    print(f"Reference: {transcript}")
    print(f"Prediction: {transcription}")
    print("------")


In [None]:
# Calculate Word Error Rate (WER)
wer = word_error_rate(predictions, references)
print(f"\nWord Error Rate (WER): {wer * 100:.2f}%")

In [None]:

# Calculate accuracy based on the number of correct sentences
correct_count = sum(1 for pred, ref in zip(predictions, references) if pred == ref)
total_count = len(predictions)
accuracy = correct_count / total_count

print(f"\nAccuracy: {accuracy * 100:.2f}%")