<a href="https://colab.research.google.com/github/monoramasn/speech-recognition/blob/main/Seamless.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup for the notebook runtime.
# PEFT is installed from the main branch of its GitHub repository; none of the
# visible cells import it.
!pip install -q git+https://github.com/huggingface/peft.git@main
!pip install -U accelerate
# `evaluate` and `jiwer` are installed for the WER computation at the end of the notebook.
!pip install evaluate
!pip install jiwer
# NOTE(review): this upgrades the whole torch stack in place; a runtime restart
# may be needed afterwards — confirm on the target environment.
!pip install --upgrade transformers bitsandbytes datasets torch torchvision torchaudio

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [

In [None]:
import os
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
import argparse
import evaluate
from scipy import signal
import numpy as np
from dataclasses import dataclass
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from typing import Any, Dict, List, Union
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict, Audio, load_from_disk, concatenate_datasets, load_metric
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from transformers import AutoProcessor, SeamlessM4TModel,SeamlessM4TTokenizer, SeamlessM4TForSpeechToText
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments



In [None]:
# Fetch the two EdAcc splits used throughout the notebook.
edacc_dev = load_dataset("edinburghcstr/edacc", split="validation")
edacc_test = load_dataset("edinburghcstr/edacc", split="test")

# Pre-create organized_data/<split>/<l1>/ for every L1 value found in the test
# split, so the export loops below always have a destination.
base_dir = "organized_data"
os.makedirs(base_dir, exist_ok=True)

languages = edacc_test.unique("l1")
split_names = ("test", "train", "validation")
language_dirs = [
    os.path.join(base_dir, split_name, language)
    for split_name in split_names
    for language in languages
]
for directory in language_dirs:
    os.makedirs(directory, exist_ok=True)

# Write one example (waveform + transcript) into its per-language folder
def save_audio(example, folder):
    """Store a dataset example under ``folder/<l1>/``.

    The audio is written under the basename of ``example['audio']['path']``
    and the transcript next to it as ``<basename>.txt``. Both destination
    paths are printed once the files are written.

    Args:
        example: Mapping with ``"l1"``, ``"text"`` and an ``"audio"`` entry
            holding ``"path"``, ``"array"`` and ``"sampling_rate"``.
        folder: Split-level directory (the language sub-folder is created on
            demand).
    """
    clip = example['audio']
    target_dir = os.path.join(folder, example["l1"])
    os.makedirs(target_dir, exist_ok=True)

    file_name = clip['path'].rsplit('/', 1)[-1]
    audio_path = os.path.join(target_dir, file_name)
    text_path = os.path.join(target_dir, f"{file_name}.txt")

    # torchaudio.save is given a 2-D tensor, hence the added leading axis.
    waveform = torch.tensor(clip['array']).unsqueeze(0)
    torchaudio.save(audio_path, waveform, clip['sampling_rate'])

    with open(text_path, "w") as transcript_file:
        transcript_file.write(example["text"])

    print(f"Saved audio: {audio_path}")
    print(f"Saved text: {text_path}")

# Export the test split unchanged, one sub-folder per L1 value.
print("Saving test data...")
test_root = os.path.join(base_dir, 'test')
for example in edacc_test:
    save_audio(example, test_root)

# The EdAcc "validation" split is divided 80/20 by row index into the train
# and validation sets used for fine-tuning; the seed keeps the split stable.
X = range(len(edacc_dev))
X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)
train_dataset, val_dataset = edacc_dev.select(X_train), edacc_dev.select(X_val)

# Export the training portion.
print("Saving train data...")
train_root = os.path.join(base_dir, 'train')
for example in train_dataset:
    save_audio(example, train_root)

# Export the validation portion.
print("Saving validation data...")
validation_root = os.path.join(base_dir, 'validation')
for example in val_dataset:
    save_audio(example, validation_root)


Downloading readme:   0%|          | 0.00/6.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/457M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/478M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/446M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/492M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/787M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/676M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/274M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/333M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/498M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/282M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/354M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/409M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/9848 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9289 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Saved text: organized_data/train/Bulgarian/EDACC-C22-203.wav.txt
Saved audio: organized_data/train/Italian/EDACC-C01-146.wav
Saved text: organized_data/train/Italian/EDACC-C01-146.wav.txt
Saved audio: organized_data/train/Scottish English/EDACC-C46_P1-29.wav
Saved text: organized_data/train/Scottish English/EDACC-C46_P1-29.wav.txt
Saved audio: organized_data/train/Arabic/EDACC-C40_P3-40.wav
Saved text: organized_data/train/Arabic/EDACC-C40_P3-40.wav.txt
Saved audio: organized_data/train/Romanian/EDACC-C30-616.wav
Saved text: organized_data/train/Romanian/EDACC-C30-616.wav.txt
Saved audio: organized_data/train/Arabic/EDACC-C40_P3-110.wav
Saved text: organized_data/train/Arabic/EDACC-C40_P3-110.wav.txt
Saved audio: organized_data/train/Irish English/EDACC-C50-255.wav
Saved text: organized_data/train/Irish English/EDACC-C50-255.wav.txt
Saved audio: organized_data/train/Romanian/EDACC-C30-100.wav
Saved text: organized_data/tr

In [None]:
# Load the pretrained SeamlessM4T speech-to-text checkpoint.
# NOTE(review): the next cell assigns `model_name_or_path` and `model` again
# with the same values, so this cell looks redundant — confirm before removing.
model_name_or_path = "facebook/hf-seamless-m4t-medium"
model = SeamlessM4TForSpeechToText.from_pretrained(model_name_or_path)

In [None]:
# Initialize processor, tokenizer and model from the same checkpoint.
# `processor` prepares the audio inputs and `tokenizer` encodes the transcripts
# in prepare_dataset below.
model_name_or_path = "facebook/hf-seamless-m4t-medium"  # Hub identifier of the SeamlessM4T checkpoint to fine-tune
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = SeamlessM4TTokenizer.from_pretrained(model_name_or_path)
model = SeamlessM4TForSpeechToText.from_pretrained(model_name_or_path)

# Resample an audio sample array to the target rate (16 kHz by default)
def downsample_audio(audio_array, source_sr, target_sr=16000):
    """Resample ``audio_array`` from ``source_sr`` to ``target_sr`` Hz.

    Args:
        audio_array: Audio samples; assumed to be a 1-D array — TODO confirm.
        source_sr: Sampling rate of ``audio_array``.
        target_sr: Sampling rate of the returned array.

    Returns:
        A float32 NumPy array holding the resampled samples.
    """
    # A leading axis is added for the transform and dropped again afterwards.
    samples = torch.tensor(audio_array, dtype=torch.float32)[None]
    resampler = torchaudio.transforms.Resample(source_sr, target_sr)
    return resampler(samples).squeeze(0).numpy()

# Turn one raw example into model inputs (audio features + label ids)
def prepare_dataset(batch):
    """Add ``input_features`` and ``labels`` to a single dataset example.

    The audio is brought to 16 kHz when needed and passed through the
    module-level ``processor``; the transcript is coerced to a non-empty
    string and encoded with the module-level ``tokenizer``.

    Args:
        batch: One example with ``"audio"``, ``"text"`` and ``"l1"`` entries.

    Returns:
        The same mapping with ``"input_features"`` and ``"labels"`` added and
        ``"audio"`` / ``"text"`` replaced by their normalised versions.
    """
    clip = batch["audio"]
    if clip["sampling_rate"] != 16000:
        clip["array"] = downsample_audio(clip["array"], clip["sampling_rate"], 16000)
        clip["sampling_rate"] = 16000

    encoded_audio = processor(
        audios=clip["array"],
        sampling_rate=clip["sampling_rate"],
        return_tensors="pt",
    )

    # Coerce the transcript to a plain, non-empty string.
    raw_text = batch["text"]
    transcript = raw_text[0] if isinstance(raw_text, list) else raw_text
    if not isinstance(transcript, str):
        transcript = str(transcript)
    transcript = transcript or "empty"  # placeholder for empty transcripts

    # NOTE(review): the transcript is encoded as plain `text`; confirm whether
    # the tokenizer's `text_target=` form is the intended one for labels.
    label_ids = tokenizer(transcript, return_tensors="pt").input_ids

    batch["input_features"] = encoded_audio.input_features[0]  # drop the batch axis
    batch["labels"] = label_ids.squeeze()
    batch["l1"] = batch["l1"]
    batch["audio"] = clip
    batch["text"] = transcript
    return batch

preprocessor_config.json:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

In [None]:
# Run prepare_dataset over every example of both splits. The "audio" column is
# listed in remove_columns, so the mapped datasets no longer carry it.
train_dataset = train_dataset.map(prepare_dataset, remove_columns=["audio"])
val_dataset = val_dataset.map(prepare_dataset, remove_columns=["audio"])

Map:   0%|          | 0/7878 [00:00<?, ? examples/s]

Map:   0%|          | 0/1970 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoProcessor, SeamlessM4TTokenizer, SeamlessM4TModel, Trainer, TrainingArguments
from dataclasses import dataclass
from typing import Any, Dict, List, Union, Optional
from torch.nn.utils.rnn import pad_sequence

# Define data collator
@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of preprocessed examples into one batch of tensors.

    Each example must provide ``"input_features"`` (a frames-by-dimension
    sequence) and ``"labels"`` (a sequence of token ids). Despite the class
    name, nothing in the collation is CTC-specific: it only pads.

    The ``processor``, ``padding``, ``max_length`` and ``pad_to_multiple_of``
    fields are kept so existing constructor calls keep working; the collation
    itself always pads to the longest item in the batch.
    """

    processor: Any
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into padded tensors.

        Returns:
            A dict with ``"input_features"`` (zero-padded along the frame
            axis), ``"labels"`` (padded with -100 so padded positions are
            ignored by the loss) and ``"attention_mask"`` (1 for real frames,
            0 for padding).
        """
        # as_tensor accepts both plain lists and tensors without the
        # copy-construct warning that torch.tensor() emits for tensors.
        input_features = [
            torch.as_tensor(feature["input_features"], dtype=torch.float32)
            for feature in features
        ]
        # reshape(-1) keeps a single-token label 1-D even if it was squeezed
        # down to a scalar upstream.
        labels = [
            torch.as_tensor(feature["labels"], dtype=torch.long).reshape(-1)
            for feature in features
        ]

        batch_input_features = pad_sequence(input_features, batch_first=True, padding_value=0.0)
        # Pad directly with the ignore index so genuine token ids are never
        # rewritten.
        batch_labels = pad_sequence(labels, batch_first=True, padding_value=-100)

        # Build the mask from the true frame counts. Inferring it from the
        # feature values would wrongly mask any real frame whose features
        # happen to sum to zero.
        frame_counts = torch.tensor([feat.size(0) for feat in input_features], dtype=torch.long)
        positions = torch.arange(batch_input_features.size(1))
        attention_mask = (positions.unsqueeze(0) < frame_counts.unsqueeze(1)).long()

        return {
            "input_features": batch_input_features,
            "labels": batch_labels,
            "attention_mask": attention_mask,
        }

# Collator instance handed to the Trainer below; it is built around the processor loaded earlier.
data_collator = DataCollatorCTCWithPadding(processor=processor)

In [None]:
# Fine-tuning configuration. Per-device batch size 2 with 2 gradient
# accumulation steps; logging and checkpointing both happen every 600 steps,
# and at most two checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    warmup_steps=100,
    num_train_epochs=3,
    evaluation_strategy="steps",
    fp16=True,
    per_device_eval_batch_size=2,
    logging_steps=600,
    save_steps=600,
    save_total_limit=2,
    # The collator reads "input_features"/"labels" itself, so the extra
    # dataset columns are passed through rather than stripped by the Trainer.
    remove_unused_columns=False,  # trailing comma was missing here (SyntaxError)
    gradient_checkpointing=True,  # Enable gradient checkpointing
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=processor,
)

# Turn off the decoder key/value cache on the model config for training.
model.config.use_cache = False



In [None]:
# Train the model
# The recorded output of this cell logs step 600 and then ends with a CUDA
# out-of-memory error.
trainer.train()



Step,Training Loss,Validation Loss
600,1.2213,0.84436




OutOfMemoryError: CUDA out of memory. Tried to allocate 3.70 GiB. GPU 

In [None]:
# Save the model and processor
# Both are written to the same directory, which the evaluation cell reads back.
model.save_pretrained("./saved_seamless_m4t_model")
processor.save_pretrained("./saved_seamless_m4t_model")

In [None]:
import os
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
from datasets import load_metric
import torchaudio

# Load the saved model and processor
# The checkpoint in ./saved_seamless_m4t_model was written from a
# SeamlessM4TForSpeechToText model and its processor, so it is reloaded with
# those same classes. The Wav2Vec2 CTC classes are a different architecture
# and do not match this checkpoint.
model = SeamlessM4TForSpeechToText.from_pretrained("./saved_seamless_m4t_model")
processor = AutoProcessor.from_pretrained("./saved_seamless_m4t_model")
if torch.cuda.is_available():
    model.to("cuda")
model.eval()

# Function to downsample audio
def downsample_audio(audio_array, source_sr, target_sr=16000):
    """Resample ``audio_array`` from ``source_sr`` to ``target_sr`` Hz.

    Returns a float32 NumPy array; a leading axis is added for the transform
    and removed again before returning.
    """
    waveform = torch.tensor(audio_array, dtype=torch.float32).unsqueeze(0)  # Add channel dimension
    resampled_waveform = torchaudio.transforms.Resample(source_sr, target_sr)(waveform)
    resampled_audio = resampled_waveform.squeeze(0).numpy()  # Remove channel dimension
    return resampled_audio

# Function to evaluate WER for each language folder
def evaluate_language_wer(folder_path, processor, model, batch_size=16, tgt_lang="eng"):
    """Transcribe every ``.wav`` in ``folder_path`` and return the WER.

    Each ``<name>.wav`` is expected to have its reference transcript in
    ``<name>.wav.txt`` in the same folder.

    Args:
        folder_path: Directory holding the audio/transcript pairs.
        processor: SeamlessM4T processor used for feature extraction and
            decoding.
        model: SeamlessM4T speech-to-text model used for generation.
        batch_size: Number of clips transcribed per ``generate`` call.
        tgt_lang: Language code passed to ``generate`` for the output text.

    Returns:
        The word error rate as a float, or ``nan`` when the folder contains
        no clip with a non-empty reference.
    """
    wer_metric = evaluate.load("wer")
    # Sorted so the evaluation order does not depend on the filesystem.
    audio_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".wav"))
    predictions = []
    references = []

    for start in range(0, len(audio_files), batch_size):
        waveforms = []
        for file_name in audio_files[start:start + batch_size]:
            audio_path = os.path.join(folder_path, file_name)
            with open(audio_path + ".txt", "r") as f:
                reference = f.read().strip()
            if not reference:
                # WER is undefined against an empty reference; skip the clip
                # so one empty transcript cannot abort the whole language.
                continue
            audio, sampling_rate = torchaudio.load(audio_path)
            # Average the channel axis: a no-op for mono files, a downmix
            # for anything else.
            audio = audio.mean(dim=0).numpy()
            if sampling_rate != 16000:
                audio = downsample_audio(audio, sampling_rate)
            waveforms.append(audio)
            references.append(reference)

        if not waveforms:
            continue

        # The processor pads the variable-length clips and returns the
        # matching attention mask, so no manual stacking is needed.
        inputs = processor(audios=waveforms, sampling_rate=16000, return_tensors="pt").to(model.device)
        with torch.no_grad():
            generated = model.generate(**inputs, tgt_lang=tgt_lang)
        # generate() may return a bare id tensor or an output object that
        # carries the ids in `.sequences`.
        sequences = getattr(generated, "sequences", generated)
        predictions.extend(processor.batch_decode(sequences, skip_special_tokens=True))

    if not references:
        return float("nan")
    return wer_metric.compute(predictions=predictions, references=references)

# Directory where test data is organized
base_dir = "organized_data"
languages = edacc_test.unique("l1")

# Evaluate and print WER for each language
for lang in languages:
    folder_path = os.path.join(base_dir, 'test', lang)
    wer = evaluate_language_wer(folder_path, processor, model)
    print(f"Language: {lang}, WER: {wer:.4f}")
