In [None]:
# Offline dependency setup: copy the bundled package archives from the input
# directory into the working directory, unpack each archive, and install its
# contents with pip. `--no-index` stops pip from contacting a package index and
# `-f ./` adds the current directory as a place to look for distributions.
!cp -r ../input/python-packages2 ./

# Install jiwer (Word Error Rate calculation)
!tar xvfz ./python-packages2/jiwer.tgz
!pip install ./jiwer/jiwer-2.3.0-py3-none-any.whl -f ./ --no-index

# Install bnunicodenormalizer (Bengali Unicode normalization)
!tar xvfz ./python-packages2/normalizer.tgz
!pip install ./normalizer/bnunicodenormalizer-0.0.24.tar.gz -f ./ --no-index

# Install pyctcdecode (CTC decoding for Wav2Vec2) together with the files bundled
# in the same archive. `--no-deps` makes pip install exactly the named file
# without resolving its requirements, which is why each file is listed one by one.
# NOTE(review): the numpy wheel is tagged cp37 -- confirm it matches the kernel's Python version.
!tar xvfz ./python-packages2/pyctcdecode.tgz
!pip install ./pyctcdecode/attrs-22.1.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/exceptiongroup-1.0.0rc9-py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/hypothesis-6.54.4-py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/pygtrie-2.5.0.tar.gz -f ./ --no-index --no-deps
!pip install ./pyctcdecode/sortedcontainers-2.4.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/pyctcdecode-0.4.0-py2.py3-none-any.whl -f ./ --no-index --no-deps

# Install pypi-kenlm (KenLM language model)
!tar xvfz ./python-packages2/pypikenlm.tgz
!pip install ./pypikenlm/pypi-kenlm-0.1.20220713.tar.gz -f ./ --no-index --no-deps

In [None]:
rm -r python-packages2 jiwer normalizer pyctcdecode pypikenlm


In [None]:
# Импорт необходимых библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer,Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Trainer, TrainingArguments
import torch
import os
import string
import librosa
from sklearn.model_selection import train_test_split
from pydub import AudioSegment
import IPython.display as ipd
from collections import Counter
from functools import partial
from dataclasses import dataclass, field
from bnunicodenormalizer import Normalizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
import typing as tp  # Typing module for type hints
from pathlib import Path  # For working with file paths
import torch
from torch.utils.data import Dataset
import pyctcdecode
import kenlm
import cloudpickle as cpkl

# Root of the competition dataset and the locations derived from it
BASE_DIR = '/kaggle/input/bengaliai-speech'
train_data_dir = f"{BASE_DIR}/train_mp3s/"
test_data_dir = f"{BASE_DIR}/test_mp3s/"
train_csv_path = f"{BASE_DIR}/train.csv"
domains = f"{BASE_DIR}/examples/"
DOMAINS = os.listdir(f'{BASE_DIR}/examples')

# Path to the folder with the example audio files
audio_dir = '/kaggle/input/bengaliai-speech/examples/'

# Names of the .wav files found in that folder
audio_files = list(filter(lambda name: name.endswith('.wav'), os.listdir(audio_dir)))

# DataFrame with one row per .wav example, holding its full path
df = pd.DataFrame({
    'audio_path': [os.path.join(audio_dir, name) for name in audio_files]
})


In [None]:
# Read the training metadata table from train_csv_path
train_df = pd.read_csv(filepath_or_buffer=train_csv_path)

# Show the first five rows as the cell output
train_df.head(n=5)

In [None]:
# Distinct values present in the `split` column
train_df["split"].unique()

In [None]:
# Count the rows belonging to each split. Series.sum() on the boolean mask is
# vectorized, whereas the built-in sum() walks the Series element by element.
n_train_samples = (train_df["split"] == "train").sum()
n_valid_samples = (train_df["split"] == "valid").sum()

# Plain strings: these messages contain no placeholders, so no f-prefix is needed
print("Total training samples : ", n_train_samples)
print("Total validation samples : ", n_valid_samples)
print("Validation/Train ratio : ", n_valid_samples / n_train_samples)

The validation set is very small compared to the training set, so we may need to hold out additional data from the training set for validation.


In [None]:
# Bar chart comparing the number of samples in the two splits
plt.bar(
    x=["train", "valid"],
    height=[n_train_samples, n_valid_samples],
    color=['blue', 'yellow'],
)

Now let's listen to some audio files and look at the corresponding transcriptions.

In [None]:
# Walk the training table in steps of 99,999 rows (starting at position 1),
# play each selected clip and print its reference transcription.
for idx in range(1, len(train_df), 99999):
    clip_id = train_df['id'].iloc[idx]
    mp3_path = os.path.join(train_data_dir, clip_id) + ".mp3"
    text = train_df['sentence'].iloc[idx]

    display(AudioSegment.from_file(mp3_path))
    print("Original transcription : ", text)

For the test set, we'll have data from 17 domains. A sample has been provided for each of them; let's listen to a few.



In [None]:
# (label, file) pairs for the two mp3 domain examples to play
example_clips = [
    ("Slang Profanity", f"{BASE_DIR}/examples/Slang Profanity.mp3"),
    ("Telemedicine", f"{BASE_DIR}/examples/Telemedicine.mp3"),
]

# Print each label, then render an audio player for the clip
for label, mp3_path in example_clips:
    print(label)
    display(AudioSegment.from_file(mp3_path))


In [None]:
# Fixed row positions to inspect (every 10th row among the first 50)
random_indices = list(range(0, 50, 10))

# NOTE(review): this relies on `librosa.display` being reachable after a plain
# `import librosa`; confirm that holds for the installed librosa version.
for idx in random_indices:
    row = train_df.iloc[idx]
    audio_file_path = os.path.join(train_data_dir, f"{row['id']}.mp3")

    # sr=None asks librosa to keep the file's own sampling rate
    audio, sr = librosa.load(audio_file_path, sr=None)

    # Mel spectrogram converted to dB relative to its maximum
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Figure 1: waveform
    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(audio, sr=sr)
    plt.title(f"Waveform - Audio File ID: {row['id']}")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.tight_layout()
    plt.show()

    # Figure 2: log Mel spectrogram
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f"Log Mel Spectrogram - Audio File ID: {row['id']}")
    plt.xlabel("Time (s)")
    plt.ylabel("Mel Frequency")
    plt.tight_layout()
    plt.show()

# EDA

How many unique sentences are there?

In [None]:
# Total number of rows and number of distinct transcriptions
n_sentences = len(train_df)
n_unique_sentences = train_df.sentence.nunique()

print("Total sentences :", n_sentences)
print("Total unique sentences : ", n_unique_sentences)
# The label promises a percentage, so format the ratio as one (e.g. "97.31%")
# instead of printing the raw fraction.
print(f"Percentage of unique sentences : {n_unique_sentences / n_sentences:.2%}")

In [None]:
# Character count of every transcription
x = train_df.sentence.apply(len)

# Label the axes first; the histogram call stays last so it remains the cell output
plt.title("Sentence Length Distribuition")
plt.ylabel("Frequency")
plt.xlabel("Sentence Length")
plt.hist(x)

Most of the sentences have a length of at most 100 characters.

Let's look at the overall vocabulary size and the most frequent words

In [None]:
# Full vocabulary: how often each space-separated word occurs across all
# training sentences. Counter handles first-time keys itself, replacing the
# previous try / bare-except increment, which would also have swallowed
# unrelated errors such as a keyboard interrupt.
vocab = Counter()
for sen in tqdm(train_df.sentence):
    vocab.update(sen.split(" "))
print("Total words in vocabulary : ",len(vocab))

In [None]:
# Rank the words from most to least frequent (ties keep their insertion order),
# then pair each word with its count again.
ranked_words = sorted(vocab, key=vocab.get, reverse=True)
sorted_vocab = [(word, vocab[word]) for word in ranked_words]

# Show the 30 most frequent entries
sorted_vocab[:30]

In [None]:
# Demonstrate a simple text-cleaning pipeline on the first 5 transcriptions
transcriptions = train_df['sentence'][:5].tolist()

# Convert transcriptions to lowercase
transcriptions_lower = [transcription.lower() for transcription in transcriptions]

# Remove punctuation. string.punctuation only contains ASCII symbols, so the
# Bengali danda "।" is added explicitly; otherwise it would survive this step
# and end up in the token lists below.
translator = str.maketrans("", "", string.punctuation + "।")
transcriptions_no_punct = [transcription.translate(translator) for transcription in transcriptions_lower]

# Tokenization
nltk.download('punkt')
transcriptions_tokens = [word_tokenize(transcription) for transcription in transcriptions_no_punct]

# Drop tokens found in NLTK's Bengali stop-word list
nltk.download('stopwords')
stop_words = set(stopwords.words('bengali'))
transcriptions_no_stopwords = [
    [word for word in tokens if word not in stop_words]
    for tokens in transcriptions_tokens
]

# Stemming
# NOTE(review): PorterStemmer implements English suffix rules and nothing in
# this cell reads the WordNet corpus -- confirm both the stemmer choice and the
# 'wordnet' download are intended for Bengali text.
nltk.download('wordnet')
stemmer = PorterStemmer()
transcriptions_stemmed = [
    [stemmer.stem(word) for word in tokens]
    for tokens in transcriptions_no_stopwords
]

# Print the preprocessed transcriptions
for i, transcription in enumerate(transcriptions_stemmed):
    print(f"Preprocessed transcription {i+1}: {' '.join(transcription)}")

In [None]:
# Take every training transcription (the whole `sentence` column)
transcriptions = train_df['sentence'].tolist()

# Tokenization
nltk.download('punkt')  # Download the Punkt tokenizer
transcriptions_tokens = list(map(word_tokenize, transcriptions))

# Spot-check: print one tokenized transcription out of every 100,000
for i, transcription_tokens in enumerate(transcriptions_tokens):
    if i % 100000 != 0:
        continue
    print(f"Tokenized transcription {i+1}: {transcription_tokens}")

In [None]:
# Descriptive statistics of the number of tokens per transcription
sentence_lengths = [len(tokens) for tokens in transcriptions_tokens]
min_length = min(sentence_lengths)
max_length = max(sentence_lengths)
mean_length = sum(sentence_lengths) / len(sentence_lengths)
# np.median averages the two middle values for an even-sized sample; the
# previous sorted(...)[n // 2] lookup returned the upper-middle element instead.
median_length = np.median(sentence_lengths)

# Plot the distribution of sentence lengths with mean and median markers
plt.figure(figsize=(10, 6))
plt.hist(sentence_lengths, bins=50, color='skyblue', edgecolor='black')
plt.axvline(mean_length, color='red', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(median_length, color='green', linestyle='dashed', linewidth=2, label='Median')
plt.xlabel('Sentence Length')
plt.ylabel('Frequency')
plt.title('Distribution of Sentence Lengths')
plt.legend()
plt.show()

# Print the descriptive statistics
print("Descriptive Statistics:")
print(f"Minimum Sentence Length: {min_length}")
print(f"Maximum Sentence Length: {max_length}")
print(f"Mean Sentence Length: {mean_length:.2f}")
print(f"Median Sentence Length: {median_length}")

In [None]:
# Number of distinct whitespace-separated words in each transcription
sentences = train_df['sentence'].tolist()
unique_word_counts = [len(set(sentence.split())) for sentence in sentences]

# Descriptive statistics
min_unique_words = min(unique_word_counts)
max_unique_words = max(unique_word_counts)
mean_unique_words = sum(unique_word_counts) / len(unique_word_counts)
# np.median averages the two middle values for an even-sized sample; the
# previous sorted(...)[n // 2] lookup returned the upper-middle element instead.
median_unique_words = np.median(unique_word_counts)

# Plot the distribution of unique word counts with mean and median markers
plt.figure(figsize=(10, 6))
plt.hist(unique_word_counts, bins=50, color='lightcoral', edgecolor='black')
plt.axvline(mean_unique_words, color='red', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(median_unique_words, color='green', linestyle='dashed', linewidth=2, label='Median')
plt.xlabel('Number of Unique Words')
plt.ylabel('Frequency')
plt.title('Distribution of Unique Words in Transcriptions')
plt.legend()
plt.show()

# Print the descriptive statistics
print("Descriptive Statistics:")
print(f"Minimum Number of Unique Words: {min_unique_words}")
print(f"Maximum Number of Unique Words: {max_unique_words}")
print(f"Mean Number of Unique Words: {mean_unique_words:.2f}")
print(f"Median Number of Unique Words: {median_unique_words}")

# Domain Analysis



In [None]:
# Play the first five entries of DOMAINS (the file names listed from the
# examples folder) and print each file name above its audio player.
for idx in np.arange(5):
    # `domains` already ends with "/", so the resulting path contains "//"
    audio_file_path = f'{domains}/{DOMAINS[idx]}'

    # Load the audio file using librosa; sr=None keeps the file's own sampling rate
    audio, sr = librosa.load(audio_file_path, sr=None)

    # Print the domain file name, then render the player
    print(DOMAINS[idx])
    ipd.display(ipd.Audio(audio, rate=sr))

In [None]:
# Explicit list of the example files, one per domain (this replaces whatever
# value DOMAINS held before, fixing the order in which files are visited)
DOMAINS = [
    'Audiobook.wav',
    'Parliament Session.wav',
    'Bangladeshi TV Drama.wav',
    'Poem Recital.wav',
    'Bengali Advertisement.wav',
    'Puthi Literature.wav',
    'Cartoon.wav',
    'Slang Profanity.mp3',
    'Debate.wav',
    'Stage Drama Jatra.wav',
    'Indian TV Drama.wav',
    'Talk Show Interview.wav',
    'Movie.wav',
    'Telemedicine.mp3',
    'News Presentation.wav',
    'Waz Islamic Sermon.wav',
    'Online Class.wav',
]

# For the first five files: draw the waveform and the spectrogram, then play the clip
for idx in np.arange(5):
    audio_file_path = os.path.join(domains, DOMAINS[idx])

    # sr=None keeps the file's own sampling rate
    audio, sr = librosa.load(audio_file_path, sr=None)

    # STFT magnitude in dB relative to its maximum
    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)

    # Figure 1: waveform
    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(audio, sr=sr)
    plt.title(f'Waveform - {DOMAINS[idx]}')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()

    # Figure 2: linear-frequency spectrogram
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='linear')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Spectrogram - {DOMAINS[idx]}')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.show()

    # Audio player
    print(f"Audio: {DOMAINS[idx]}")
    ipd.display(ipd.Audio(audio, rate=sr))

# Model Building

Public Wav2Vec2 model - no fine-tuning (FT), inference only.
In this notebook I use this baseline model to understand the leaderboard. After that, we will fine-tune it or add new models.

In [None]:
# Audio sampling rate (Hz) used throughout the inference pipeline
SAMPLING_RATE = 16_000

# Directory layout, resolved relative to the parent of the working directory
ROOT = Path.cwd().parent
INPUT = ROOT.joinpath("input")
DATA = INPUT.joinpath("bengaliai-speech")
TRAIN = DATA.joinpath("train_mp3s")
TEST = DATA.joinpath("test_mp3s")

# Pretrained acoustic model and language-model folders
MODEL_PATH = INPUT.joinpath("bengali-sr-download-public-trained-models/indicwav2vec_v1_bengali/")
LM_PATH = INPUT.joinpath("bengali-sr-download-public-trained-models/wav2vec2-xls-r-300m-bengali/language_model/")

In [None]:
# Load the pretrained Wav2Vec2 model (with its CTC head) from MODEL_PATH
model = Wav2Vec2ForCTC.from_pretrained(MODEL_PATH)  # CTC instance
# The processor is loaded from the same directory; later cells take its
# feature extractor (audio side) and tokenizer (text side) from it
processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)

In [None]:
# Build the vocabulary and a decoder

# Get the vocabulary from the model's tokenizer
# (presumably a token -> id mapping; the next cell sorts it by its values)
vocab_dict = processor.tokenizer.get_vocab()
print('lENGTH OF THE VOCABULARY: ',len(vocab_dict))
# Bare last expression so the notebook displays the whole mapping
vocab_dict

In [None]:
# Re-order the vocabulary by ascending token id so the label list handed to
# the decoder follows the id order
sorted_vocab_dict = dict(sorted(vocab_dict.items(), key=lambda pair: pair[1]))

# CTC decoder built from the ordered labels and the 5-gram language model file
decoder = pyctcdecode.build_ctcdecoder(
    list(sorted_vocab_dict),
    str(LM_PATH / "5gram.bin"),
)

In [None]:
# Create a combined processor for Wav2Vec2 model input and language model decoding.
# Later cells use its feature extractor to collate audio batches and its
# decode() method to turn logits into text.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,  # Feature extractor for audio data
    tokenizer=processor.tokenizer,  # Tokenizer for text data
    decoder=decoder  # pyctcdecode decoder built in the previous cell
)

In [None]:
class BengaliSRTestDataset(Dataset):
    """Dataset that yields the decoded waveform of each test audio file.

    Args:
        audio_paths: Paths of the audio files, in the order they are served.
        sampling_rate: Sampling rate (Hz) every file is resampled to on load.
    """

    # tp.List keeps the annotation valid on Python versions older than 3.9,
    # where the built-in `list[str]` raises TypeError when the def is evaluated.
    def __init__(self, audio_paths: tp.List[str], sampling_rate: int):
        # Store the list of audio file paths
        self.audio_paths = audio_paths

        # Store the sampling rate used for audio processing
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.audio_paths)

    def __getitem__(self, index: int):
        """Load the file at ``index`` and return its waveform.

        Args:
            index: Position of the file in ``audio_paths``.

        Returns:
            The audio signal returned by ``librosa.load`` (the sampling rate it
            also returns is discarded, since it equals ``self.sampling_rate``).
        """
        audio_path = self.audio_paths[index]

        # mono=True downmixes multi-channel files to a single channel. With
        # mono=False a stereo file would come back as a (channels, samples)
        # array, which the feature-extractor collate function cannot pad
        # together with single-channel clips.
        audio_signal, _ = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
        return audio_signal

In [None]:
# Read the sample submission; its `id` column (forced to str) names the test
# clips that the following cells locate and transcribe
test = pd.read_csv(DATA / "sample_submission.csv", dtype={"id": str})
print(test.head())

In [None]:
# Map every submission id to the path of its mp3 file under TEST
test_audio_paths = [str(TEST.joinpath(f"{aid}.mp3")) for aid in test["id"].values]


In [None]:
# Dataset serving the test waveforms, resampled to SAMPLING_RATE
test_dataset = BengaliSRTestDataset(test_audio_paths, SAMPLING_RATE)

# Collate function: the feature extractor pads the waveforms of a batch and
# returns them as PyTorch tensors
collate_func = partial(
    processor_with_lm.feature_extractor,
    sampling_rate=SAMPLING_RATE,
    padding=True,
    return_tensors="pt",
)

# Loader that keeps the submission order (no shuffling, no dropped remainder)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    drop_last=False,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_func,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Move the model to the chosen device and switch it to inference mode
model = model.to(device)
model = model.eval()

# Convert to half precision only on CUDA. On the CPU fallback the model stays
# in float32: the inference cell feeds float32 inputs and its CUDA autocast
# context does nothing without a GPU, so float16 weights would not match.
if device.type == "cuda":
    model = model.half()
torch.cuda.empty_cache()

In [None]:
# Decoded sentences, collected in the same order as the test loader yields them
pred_sentence_list = []

# Inference only: gradients are not needed because no weights are updated
# NOTE(review): only `input_values` is passed to the model; if the feature
# extractor also returns an attention mask for padded batches, confirm whether
# this checkpoint expects it.
with torch.no_grad():
    for batch in tqdm(test_loader):
        # Input features of the batch, moved to the model's device
        x = batch["input_values"].to(device, non_blocking=True)

        # Forward pass under CUDA automatic mixed precision
        with torch.cuda.amp.autocast(True):
            y = model(x).logits
        del x

        # Logits as a numpy array on the CPU, one row block per clip
        y = y.detach().cpu().numpy()

        # Beam-search decode each clip's logits with the language model
        pred_sentence_list.extend(
            processor_with_lm.decode(logits, beam_width=64).text for logits in y
        )

In [None]:
bnorm = Normalizer()  # Shared normalizer instance; postprocess() below calls it once per word

def postprocess(sentence):
    """Normalize a decoded sentence word by word and make sure it ends with punctuation.

    Every whitespace-separated word is passed through the module-level ``bnorm``
    normalizer and replaced by the ``'normalized'`` entry of its result. Words
    for which that entry is ``None`` or empty are dropped; joining them would
    otherwise raise ``TypeError`` or leave doubled spaces.

    Args:
        sentence: Decoded text to clean up.

    Returns:
        The normalized words joined by single spaces, with "।" appended when
        the text does not already end in ".", "?", "!" or "।".
    """
    # Sentence-ending punctuation; a tuple because str.endswith accepts one directly
    period_set = (".", "?", "!", "।")

    # str.split() with no arguments never yields empty strings, so only the
    # normalizer output needs filtering
    normalized_words = (bnorm(word)['normalized'] for word in sentence.split())
    sentence = " ".join(word for word in normalized_words if word)

    if not sentence.endswith(period_set):
        sentence += "।"
    return sentence

In [None]:
# Run postprocess over every predicted sentence, with a progress bar
pp_pred_sentence_list = list(map(postprocess, tqdm(pred_sentence_list)))

In [None]:
# Put the post-processed predictions into the `sentence` column of the
# submission table (one prediction per row, in loader order)
test["sentence"] = pp_pred_sentence_list

# Write the submission file without the DataFrame index column
test.to_csv("submission.csv", index=False)

# Quick look at the first rows that were written
print(test.head())