# Package and dependencies 

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [54]:
model_checkpoint = "facebook/wav2vec2-large-xlsr-53"
batch_size = 16
train_size = 2

In [None]:
!pip install torchaudio
!pip install seaborn
!pip install kaggle
!pip install -U transformers
!pip install datasets
!pip install fsspec==2021.5.0
!pip install jiwer==2.2.0
!pip install pydub
!pip install librosa
!pip install bnunicodenormalizer
!pip install pandarallel
!pip install accelerate 
!pip install nvidia-ml-py3

In [None]:
#!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl
#! pip install git+https://github.com/huggingface/accelerate

In [None]:
# unhash for collab only
'''
!mkdir /root/.kaggle


import json
dictionary ={
"username":"mushrafimunim",
"key":"e5c337a189ee0a5c867ff83c21df4d2a"
}
  
json_object = json.dumps(dictionary, indent = 4)
  
with open("kaggle.json", "w") as outfile:
    outfile.write(json_object)

%mv kaggle.json /root/.kaggle/kaggle.json
'''

In [2]:
import numpy as np
import pandas as pd
import random
import ast
from tqdm import tqdm
from IPython import display as ipd

# visualization
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns
#system files
import os
import json
import re
import glob

import zipfile
import shutil
import gc
from pydub import AudioSegment
from joblib import Parallel, delayed

#transformers
from transformers import AdamW,AutoTokenizer,AutoFeatureExtractor,AutoConfig,AutoModel,Wav2Vec2CTCTokenizer,Wav2Vec2ForCTC,Wav2Vec2Processor,Trainer,TrainingArguments,Wav2Vec2FeatureExtractor,get_linear_schedule_with_warmup,set_seed


# PyTorch 
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torch.nn.functional as F
import torchaudio.functional as FT
import torchaudio.transforms as TT


#sklearn
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric,Dataset,concatenate_datasets,set_caching_enabled, ClassLabel

import librosa

#normalization
from pandarallel import pandarallel
from bnunicodenormalizer import Normalizer 
pandarallel.initialize(progress_bar=True,nb_workers=8)
tqdm.pandas()
bnorm=Normalizer()

# Set environment variables
import warnings
warnings.filterwarnings('ignore')

#accelerator
from accelerate import Accelerator, DistributedType
from datasets import load_dataset, load_metric 
import datasets
import transformers

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
#unhash for collab only
'''
!kaggle competitions download -c dlsprint

!unzip dlsprint.zip

!rm -rf dlsprint.zip
!git clone https://gitlab.com/mushrafi88/dlsprint.git
!cp /content/dlsprint/vocab.json /content/
'''

In [None]:
df_train = pd.read_csv('train.csv')
df_validation = pd.read_csv('validation.csv')

In [None]:
#!cd train_files;for file in *.mp3;do l=$(echo -n "$file,"echo $(ffprobe $file 2>&1 | grep 'Duration' | cut -d',' -f1 | cut -d' ' -f4 | cut -d'.' -f1));echo "$l" >> train_files_duration.csv ;done
#!sed -i '1s/^/path,duration\n/' train_files_duration.csv

In [None]:
df_test_files_duration = pd.read_csv('dlsprint/test_files_duration.csv')
df_validation_files_duration = pd.read_csv('dlsprint/validation_files_duration.csv')
df_train_files_duration = pd.read_csv('dlsprint/train_files_duration.csv')

In [None]:
df_test_files_duration

In [None]:
df_train

In [None]:
#df_train = df_train.sample(n=1)
#df_validation = df_validation.sample(n=1)

In [None]:
df_train.info()

In [None]:
df_validation.info()

# Exploring the data

In [None]:
df_common = pd.merge(df_train, df_validation, on=['path'], how='inner')

In [None]:
df_common.info()

In [None]:
'''
!cd test_files && echo "Test files" && find . -maxdepth 1 -type f | sed 's/.*\.//' | sort | uniq -c && # for file in *.mp3;do ffprobe -v error -select_streams a:0 -show_entries stream=duration -of default=noprint_wrappers=1:nokey=1 "$file";done|paste -sd+|bc -l
!cd train_files && echo "Train files" &&find . -maxdepth 1 -type f | sed 's/.*\.//' | sort | uniq -c
!cd validation_files && echo "Validation files" && find . -maxdepth 1 -type f | sed 's/.*\.//' | sort | uniq -c
'''

In [None]:
def get_all_unique_values(df):
    df_empty = {}
    for i in df.columns:
        df_empty[i] = df[i].unique()
    return df_empty

In [None]:
def missing_data(df):
    a = df.isnull().sum()
    a=a.tolist()
    percent_missing = [ int(x)* 100 / len(df) for x in a]
    error_per = pd.DataFrame({
        'Columns':df.columns,         
        'Total error': a,
        'Error Percentage': percent_missing
    })
    print(tabulate(error_per, headers=['column','Total missing','% missing']))
    sns.barplot(x='Columns',y='Error Percentage',data=error_per)
    plt.ylabel('missing  %')
    plt.xticks(rotation=70)

In [None]:
missing_data(df_train)

In [None]:
missing_data(df_validation)

In [None]:
#get_all_unique_values(df_train)

In [None]:
#get_all_unique_values(df_validation)

# Preprocessing Data

In [None]:
errors=["common_voice_bn_31727562",
        'common_voice_bn_30998934',
        'common_voice_bn_31595526',
        'common_voice_bn_31534853',
        'common_voice_bn_31518061',
        'common_voice_bn_31518373',
        'common_voice_bn_31613621',
        'common_voice_bn_31555333',
        'common_voice_bn_31772113',
        'common_voice_bn_31605391',
        'common_voice_bn_31631175',
        'common_voice_bn_31563901',
        'common_voice_bn_31691690',
        'common_voice_bn_31692010',
        'common_voice_bn_31683653',
        'common_voice_bn_31692182',
        'common_voice_bn_31519976',
        'common_voice_bn_31675793',
        'common_voice_bn_31019914',
        'common_voice_bn_31660287',
        'common_voice_bn_31660384',
        'common_voice_bn_31557261',
        'common_voice_bn_31633101',
        'common_voice_bn_31599243',
        'common_voice_bn_31521515',
        'common_voice_bn_31777802',
        'common_voice_bn_31777848',
        'common_voice_bn_31669646',
        'common_voice_bn_31566083',
        'common_voice_bn_31530331',
        'common_voice_bn_31727697',
        'common_voice_bn_31513270',
        'common_voice_bn_31686295',
        'common_voice_bn_31753693',
        'common_voice_bn_31686334',
        'common_voice_bn_31765546',
        'common_voice_bn_31765548',
        'common_voice_bn_31662742',
        'common_voice_bn_31704856',
        'common_voice_bn_31635344',
        'common_voice_bn_31618327',
        'common_voice_bn_31743074',
        'common_voice_bn_31678862',
        'common_voice_bn_31626674',
        'common_voice_bn_31626677',
        'common_voice_bn_31523889',
        'common_voice_bn_31610804',
        'common_voice_bn_31769538',
        'common_voice_bn_31533273',
        'common_voice_bn_31445621',
        'common_voice_bn_31620650']
errors = [i+'.mp3' for i in errors]

In [None]:
index_to_be_dropped_train = df_train[df_train['path'].isin(errors)].index
index_to_be_dropped_validation = df_validation[df_validation['path'].isin(errors)].index

In [None]:
df_validation = df_validation.drop(index_to_be_dropped_validation)
df_train = df_train.drop(index_to_be_dropped_train)

## Text preprocessing

In [None]:
get_all_unique_values(df_train)['up_votes']

In [None]:
get_all_unique_values(df_train)['down_votes']

In [None]:
df_train[df_train['down_votes'] == 0].sort_values(by=['down_votes'],ascending=False)

In [None]:
def cleaning_csv(df):
    df['votes'] = df['up_votes'] - df['down_votes']
    df['votes'] = df['votes'] + abs(df['votes'].min())
    df.replace(0,df['votes'].mean(axis=0),inplace=True)
    df['votes'] = df['votes']/df['votes'].max() 
    df = df.drop(['client_id','age','gender','accents','locale','up_votes','down_votes'],axis=1)
    df = df.dropna(how='all')
    if len(df) > 200000:
        df['path'] = 'train_files/' + df['path']
    if len(df) < 10000:
        df['path'] = 'validation_files/' + df['path']
    return df

In [None]:
def remove_punctuations(my_str):
    punctuations = '''````¬£|¬¢|√ë+-*/=EROero‡ß≥‡ß¶‡ßß‡ß®‡ß©‡ß™‡ß´‡ß¨‡ß≠‡ßÆ‡ßØ012‚Äì34567‚Ä¢89‡•§!()-[]{};:'"‚Äú\‚Äô‚Ä¶,<>.‚Äö/?@#$%^&*_~‚Äò‚Äî‡••‚Äù‚Ä∞ü§£‚öΩÔ∏è‚úåÔøΩÔø∞‡ß∑Ôø∞'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct

In [None]:
def normalize(sen):
    _words = [bnorm(word)['normalized']  for word in sen.split()]
    return " ".join([word for word in _words if word is not None]) 

In [None]:
df_train = cleaning_csv(df_train)
df_validation = cleaning_csv(df_validation)

In [None]:
df_train['sentence'] = df_train['sentence'].apply(lambda x : remove_punctuations(x))
df_validation['sentence'] = df_validation['sentence'].apply(lambda x : remove_punctuations(x))

In [None]:
df_train = pd.concat([df_train, df_validation], ignore_index=True)

In [None]:
to_drop_train = df_train[df_train['sentence'].str.contains('V')]
df_train = df_train.drop(to_drop_train.index)
to_drop_train = df_train[df_train['sentence'].str.contains('A')]
df_train = df_train.drop(to_drop_train.index)
to_drop_train = df_train[df_train['sentence'].str.contains('B')]
df_train = df_train.drop(to_drop_train.index)

In [None]:
df_train["sentence"]=df_train["sentence"].parallel_apply(lambda x:normalize(x))

In [None]:
df_train.to_csv('dlsprint/df_sentence_normalized.csv',index=False)

## checkpoint 1

In [3]:
df_train = pd.read_csv('dlsprint/df_sentence_normalized.csv')

In [None]:
df_train.info(memory_usage='deep')

## Audio preprocessing

In [None]:
def audio_info_cleaning(df):
    df['duration'] = df['duration'].str.replace(':','0')
    df['duration'] = df['duration'].astype(int)
    return df

In [None]:
df_train_files_duration = audio_info_cleaning(df_train_files_duration)
df_train_files_duration['path'] = 'train_files/' + df_train_files_duration['path']

df_validation_files_duration = audio_info_cleaning(df_validation_files_duration)
df_validation_files_duration['path'] = 'validation_files/' + df_validation_files_duration['path']

df_train_files_duration = pd.concat([df_train_files_duration, df_validation_files_duration], ignore_index=True)

In [None]:
df_train = df_train.merge(df_train_files_duration,on='path')

In [None]:
df_train

In [None]:
get_all_unique_values(df_train)['duration']

In [None]:
df_train['duration'].value_counts()

In [None]:
# wav2vec2 works best for less than 5 s data
# we can try lstm in this part for greater than 5s data

df_train = df_train[(df_train['duration'] <= 5) & (df_train['duration'] >=2)]

In [None]:
df_train

In [None]:
df_train = df_train.drop(['votes','duration'],axis=1)

In [None]:
df_train.to_csv('dlsprint/df_train_sen+duration.csv',index=False)

## checkpoint 2

In [4]:
df_train = pd.read_csv('dlsprint/df_train_sen+duration.csv')

## Mp3 to Array

In [55]:
df_train

Unnamed: 0,path,sentence
0,train_files/common_voice_bn_30991326.mp3,‡¶¨‡¶æ‡¶¨‡¶æ ‡¶∏‡¶§‡ßç‡¶Ø‡ßá‡¶® ‡¶ò‡ßã‡¶∑
1,train_files/common_voice_bn_30991432.mp3,‡¶Ü‡¶™‡¶®‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶è‡¶ï‡¶ü‡¶æ ‡¶ï‡¶•‡¶æ ‡¶¨‡¶≤‡¶æ‡¶∞ ‡¶≤‡ßã‡¶ï ‡¶®‡¶® ‡¶§‡¶æ‡¶á ‡¶®‡¶æ
2,train_files/common_voice_bn_30991478.mp3,‡¶Ü‡¶™‡¶®‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶è‡¶ï‡¶ü‡¶æ ‡¶ï‡¶•‡¶æ ‡¶¨‡¶≤‡¶æ‡¶∞ ‡¶≤‡ßã‡¶ï ‡¶®‡¶® ‡¶§‡¶æ‡¶á ‡¶®‡¶æ
3,train_files/common_voice_bn_30991480.mp3,‡¶§‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶Æ‡ßç‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø ‡¶õ‡¶ø‡¶≤ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶ú‡ßÅ‡ßú‡ßá
4,train_files/common_voice_bn_30991488.mp3,‡¶Ü‡¶ï‡ßç‡¶∞‡¶Æ‡¶£‡¶æ‡¶§‡ßç‡¶Æ‡¶ï ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡¶ø‡¶Ç ‡¶ì ‡¶¶‡ßç‡¶∞‡ßÅ‡¶§ ‡¶∞‡¶æ‡¶® ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶æ‡¶π‡¶ï ‡¶π‡¶ø‡¶∏‡ßá‡¶¨...
...,...,...
107860,validation_files/common_voice_bn_31520874.mp3,‡¶¨‡ßç‡¶∞‡¶ø‡¶∏‡ßç‡¶ü‡¶≤ ‡¶®‡¶§‡ßÅ‡¶® ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨ ‡¶Ö‡¶®‡ßÅ‡¶∏‡¶®‡ßç‡¶ß‡¶æ‡¶®‡ßá‡¶∞ ‡¶Ø‡¶æ‡¶§‡ßç‡¶∞‡¶æ ‡¶∂‡ßÅ‡¶∞‡ßÅ ‡¶ï‡¶∞...
107861,validation_files/common_voice_bn_31541319.mp3,‡¶™‡¶æ‡¶£‡ßç‡¶°‡ßÅ‡¶∞ ‡¶¶‡ßÅ‡¶á ‡¶∏‡ßç‡¶§‡ßç‡¶∞‡ßÄ
107862,validation_files/common_voice_bn_31541568.mp3,‡¶§‡¶æ‡¶ï‡ßá ‡¶ß‡¶∞‡ßá ‡¶¨‡ßç‡¶∞‡¶ø‡¶ü‡¶ø‡¶∂ ‡¶™‡ßÅ‡¶≤‡¶ø‡¶∂
107863,validation_files/common_voice_bn_30998752.mp3,‡¶§‡ßÅ‡¶Æ‡¶ø ‡¶ì‡¶ï‡ßá ‡¶≠‡¶æ‡¶≤‡ßã ‡¶ï‡¶∞‡ßá ‡¶¨‡ßÅ‡¶ù‡¶ø‡ßü‡ßá ‡¶¶‡¶ø‡¶ì


In [56]:
train = Dataset.from_pandas(df_train.sample(train_size))
#submission = Dataset.from_pandas(df_submission)

In [57]:
train

Dataset({
    features: ['path', 'sentence', '__index_level_0__'],
    num_rows: 2
})

In [None]:
!du -hs * | grep "dlsprint"

In [58]:
train.save_to_disk("dlsprint/train")
#submission.save_to_disk("dlsprint/submission")

In [None]:
!du -hs * | grep "dlsprint"

## checkpoint 3

In [110]:
train = Dataset.load_from_disk("dlsprint/train")
#submission = Dataset.load_from_disk("dlsprint/submission")

In [6]:
train

Dataset({
    features: ['path', 'sentence', '__index_level_0__'],
    num_rows: 1000
})

In [8]:
def ran_(df):
    df['ra'] = df['path']
    return df

In [10]:
train = train.map(ran_)

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [7]:
type(train)

datasets.arrow_dataset.Dataset

In [None]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    return df

In [None]:
show_random_elements(train.remove_columns(["path"]), num_examples=20)

In [111]:
resamplers = {  
    48000: torchaudio.transforms.Resample(48000, 16000),
    44100: torchaudio.transforms.Resample(44100, 16000),
    32000: torchaudio.transforms.Resample(32000, 16000),
}

In [112]:
def speech_file_to_array_torch(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resamplers[sampling_rate](speech_array).squeeze().numpy()
    batch["speech"] = np.trim_zeros(batch["speech"])
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["sentence"]
    return batch
def speech_file_to_array_submission_torch(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resamplers[sampling_rate](speech_array).squeeze().numpy()
    batch["speech"] = np.trim_zeros(batch["speech"])
    batch["sampling_rate"] = sampling_rate
    return batch

In [18]:
gc.collect()

519

In [19]:
train

Dataset({
    features: ['path', 'sentence', '__index_level_0__'],
    num_rows: 1000
})

In [10]:
!du -hs * | grep "dlsprint"

984M	dlsprint
420K	dlsprint-Copy1.ipynb
420K	dlsprint.ipynb
540K	dlsprint_mp3_torch_audio.ipynb
56K	dlsprint_xla_tpu.ipynb


In [113]:
train = train.map(speech_file_to_array_torch, remove_columns=train.column_names,batch_size=500)

  0%|          | 0/2 [00:00<?, ?ex/s]

In [None]:
!du -hs * | grep "dlsprint"

In [8]:
train.features

{'speech': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None),
 'sampling_rate': Value(dtype='int64', id=None),
 'target_text': Value(dtype='string', id=None)}

In [None]:
train.cleanup_cache_files()

In [None]:
gc.collect()

In [None]:
!du -hs * | grep "dlsprint"

In [114]:
train.set_format("numpy", columns=["speech","sampling_rate"], output_all_columns=True)

In [None]:
import psutil
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

## Tokenizer

In [None]:
df_all_for_token = pd.read_csv('dlsprint/df_sentence_normalized.csv')

In [None]:
def extract_all_chars(df):
    all_text = " ".join(df["sentence"])
    vocab = list(set(all_text))
    vocab_dict = {v: k for k, v in enumerate(vocab)}
    return vocab_dict

In [None]:
vocab_dict = extract_all_chars(df_all_for_token)
vocab_dict

In [None]:
def vocab_dict_cleaner(vocab_dict):
    del vocab_dict[' ']

    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)
    print(len(vocab_dict))
    return vocab_dict

In [None]:
vocab_dict = vocab_dict_cleaner(vocab_dict)
vocab_dict

In [None]:
vocab=[ '\u200d',
        ' ','!',"'",',','-','.',':',';','=','?','‡•§',
        '‡¶Å','‡¶Ç','‡¶É',
        '‡¶Ö','‡¶Ü','‡¶á','‡¶à','‡¶â','‡¶ä','‡¶ã','‡¶è','‡¶ê','‡¶ì','‡¶î',
        '‡¶ï','‡¶ñ','‡¶ó','‡¶ò','‡¶ô',
        '‡¶ö','‡¶õ','‡¶ú','‡¶ù','‡¶û',
        '‡¶ü','‡¶†','‡¶°','‡¶¢','‡¶£',
        '‡¶§','‡¶•','‡¶¶','‡¶ß','‡¶®',
        '‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ',
        '‡¶Ø','‡¶∞','‡¶≤',
        '‡¶∂','‡¶∑','‡¶∏','‡¶π',
        '‡¶æ','‡¶ø','‡ßÄ','‡ßÅ','‡ßÇ','‡ßÉ','‡ßá','‡ßà','‡ßã','‡ßå','‡ßç',
        '‡ßé','‡ßú','‡ßù','‡ßü',
        '‡ß¶','‡ßß','‡ß®','‡ß©','‡ß™','‡ß´','‡ß¨','‡ß≠','‡ßÆ','‡ßØ']

In [None]:
vocab_dict_munim = list(vocab_dict.keys())

In [None]:
vocab_dict_munim

In [None]:
odertateaseamartatenai = [x for x in vocab if x not in vocab_dict_munim]

In [None]:
odertateaseamartatenai

In [None]:
amrtayase_odertay_nai = [x for x in vocab_dict_munim if x not in vocab]

In [None]:
amrtayase_odertay_nai

In [None]:
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
!cp vocab.json dlsprint/

In [14]:
config = AutoConfig.from_pretrained(model_checkpoint)
tokenizer_type = config.model_type if config.tokenizer_class is None else None
config = config if config.tokenizer_class is not None else None

In [102]:
tokenizer = AutoTokenizer.from_pretrained(
  "./",
  config=config,
  tokenizer_type=tokenizer_type,
  unk_token="[UNK]",
  pad_token="[PAD]",
  word_delimiter_token="‡•§",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [104]:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

In [105]:
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [106]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [107]:
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='./', vocab_size=64, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]', 'additional_special_tokens': [AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True)]})

In [29]:
#%mkdir model

In [21]:
processor.save_pretrained("model/wav2vec2")

## Preparing the inputs and labels

In [64]:
train

Dataset({
    features: ['speech', 'sampling_rate', 'target_text'],
    num_rows: 2
})

In [None]:
type(train[0]['speech'])

In [108]:
def prepare_dataset(batch):
    batch["input_values"] = processor(batch["speech"], sampling_rate=16000).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [32]:
def prepare_dataset_submission(batch):
    batch["input_values"] = processor(batch["speech"], sampling_rate=16000).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    return batch

In [None]:
gc.collect()

In [None]:
type

In [115]:
train=train.map(prepare_dataset, remove_columns=train.column_names)

  0%|          | 0/2 [00:00<?, ?ex/s]

In [None]:
!du -hs * | grep "dlsprint"

In [24]:
train.cleanup_cache_files()

5

In [None]:
gc.collect()

In [None]:
!du -hs * | grep "dlsprint"

In [67]:
train

Dataset({
    features: ['speech', 'sampling_rate', 'target_text', 'input_values', 'input_length', 'labels'],
    num_rows: 2
})

In [36]:
train[0]

{'speech': array([ 7.3093590e-14, -2.1868138e-11,  7.7021632e-11, ...,
         9.4649629e-05,  1.0731552e-04,  1.4034419e-04], dtype=float32),
 'sampling_rate': 32000,
 'input_values': array([-4.0990511e-05, -4.0990752e-05, -4.0989675e-05, ...,
         9.8870834e-04,  1.1265014e-03,  1.4858220e-03], dtype=float32),
 'input_length': 91480,
 'labels': array([61, 11, 22, 23, 42, 11, 38, 23,  1, 49, 55, 58, 32, 45, 23, 38, 11,
        45, 49, 16,  9, 22,  9, 33, 23, 21, 49, 58, 22, 11, 47, 50, 58, 49,
        45, 22, 33, 32, 49, 15, 45, 32, 49, 21, 11, 20, 11, 61, 23, 61, 49,
        35, 34, 29, 23, 14, 50, 49, 42, 23, 21, 50, 22, 50, 49, 54, 11, 28,
        23, 61, 11, 33, 49, 45, 22, 50, 49, 28, 27, 32,  5, 32]),
 'target_text': '‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡¶ø‡¶∑‡ßç‡¶ü ‡¶Ü‡¶™‡ßá‡¶ï‡ßç‡¶∑‡¶ø‡¶ï ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßç‡¶¨ ‡¶™‡¶∞‡¶ø‡¶Æ‡¶æ‡¶™ ‡¶ï‡¶∞‡¶§‡ßá ‡¶è‡¶ï‡ßá ‡¶¨‡¶ø‡¶≠‡¶ø‡¶®‡ßç‡¶® ‡¶∏‡¶Ç‡¶ñ‡ßç‡¶Ø‡¶æ ‡¶¶‡ßç‡¶¨‡¶æ‡¶∞‡¶æ ‡¶ö‡¶ø‡¶π‡ßç‡¶®‡¶ø‡¶§ ‡¶ï‡¶∞‡¶æ ‡¶π‡ßü‡ßá‡¶õ‡ßá'}

In [37]:
train.features

{'speech': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None),
 'sampling_rate': Value(dtype='int64', id=None),
 'target_text': Value(dtype='string', id=None),
 'input_values': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None),
 'input_length': Value(dtype='int64', id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

## Train test Split

In [None]:
train_test=train.train_test_split(test_size=0.1, shuffle=True)
train = train_test['train']
test = train_test['test']

In [None]:
train

In [None]:
test

In [68]:
train=train.remove_columns(['speech','sampling_rate','target_text','input_length'])

In [64]:
test=test.remove_columns(['speech','sampling_rate','target_text','input_length'])

In [None]:
rand_int = random.randint(0, len(train)-1)
print("Target text:", train[rand_int]["target_text"])
print("Input array shape:",train[rand_int]["speech"].shape)
print("Sampling rate:", train[rand_int]["sampling_rate"])
ipd.Audio(data=train['speech'][rand_int], autoplay=True, rate=16000)

In [None]:
rand_int = random.randint(0, len(test)-1)
print("Target text:", test[rand_int]["labels"])
#print("Input array shape:", test[rand_int]["input_values"])
ipd.Audio(data=test[rand_int]["input_values"], autoplay=True, rate=16000)

In [None]:
train[0]['input_values']

In [66]:
train[0]

{'input_values': array([-4.0990511e-05, -4.0990752e-05, -4.0989675e-05, ...,
         9.8870834e-04,  1.1265014e-03,  1.4858220e-03], dtype=float32),
 'labels': array([61, 11, 22, 23, 42, 11, 38, 23,  1, 49, 55, 58, 32, 45, 23, 38, 11,
        45, 49, 16,  9, 22,  9, 33, 23, 21, 49, 58, 22, 11, 47, 50, 58, 49,
        45, 22, 33, 32, 49, 15, 45, 32, 49, 21, 11, 20, 11, 61, 23, 61, 49,
        35, 34, 29, 23, 14, 50, 49, 42, 23, 21, 50, 22, 50, 49, 54, 11, 28,
        23, 61, 11, 33, 49, 45, 22, 50, 49, 28, 27, 32,  5, 32])}

# Training the data

## Set up trainer

In [53]:
train

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 1000
})

In [89]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, df: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        input_features = [{"input_values": row["input_values"]} for row in df]
        label_features = [{"input_ids": row["labels"]} for row in df]
        
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )
        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [90]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [87]:
print([x for x in train_debug])

{'input_values': array([-0.00016368, -0.00016368, -0.00016368, ..., -0.00138846,
       -0.00145205, -0.00079389], dtype=float32), 'labels': array([35, 32, 49, 35, 47, 27, 32, 49, 33, 11, 61, 11, 49, 45,  9, 47, 11,
       44, 23, 44, 50, 49, 35, 32, 61, 50, 61, 11, 21, 50, 35, 32, 49, 45,
       22, 23, 47, 22, 33, 49,  5, 11, 44, 32, 61])}


In [91]:
padded_data_=data_collator(train_debug)

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 2
})


[{'input_values': array([-0.00016368, -0.00016368, -0.00016368, ..., -0.00138846,
       -0.00145205, -0.00079389], dtype=float32), 'labels': array([35, 32, 49, 35, 47, 27, 32, 49, 33, 11, 61, 11, 49, 45,  9, 47, 11,
       44, 23, 44, 50, 49, 35, 32, 61, 50, 61, 11, 21, 50, 35, 32, 49, 45,
       22, 23, 47, 22, 33, 49,  5, 11, 44, 32, 61])}, {'input_values': array([-9.1227761e-05, -9.1230278e-05, -9.1220834e-05, ...,
        4.1401412e-02,  4.2974263e-01,  1.0727605e+00], dtype=float32), 'labels': array([15, 45,  1, 11, 49, 44, 50, 17,  1, 49, 17, 61, 51, 22, 47, 32, 53,
       61, 49, 48, 49, 39, 61, 23, 14,  1, 11, 49, 45, 50, 22, 32, 61, 23,
        1, 49, 39, 23, 14, 50, 51, 32, 27, 50, 22, 23, 35])}]


[{'input_values': array([-0.00016368, -0.00016368, -0.00016368, ..., -0.00138846,
       -0.00145205, -0.00079389], dtype=float32)}, {'input_values': array([-9.1227761e-05, -9.1230278e-05, -9.1220834e-05, ...

In [80]:
padded_data_train=data_collator(train_debug)

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 2
})


[{'input_values': array([-0.00016368, -0.00016368, -0.00016368, ..., -0.00138846,
       -0.00145205, -0.00079389], dtype=float32), 'labels': array([35, 32, 49, 35, 47, 27, 32, 49, 33, 11, 61, 11, 49, 45,  9, 47, 11,
       44, 23, 44, 50, 49, 35, 32, 61, 50, 61, 11, 21, 50, 35, 32, 49, 45,
       22, 23, 47, 22, 33, 49,  5, 11, 44, 32, 61])}, {'input_values': array([-9.1227761e-05, -9.1230278e-05, -9.1220834e-05, ...,
        4.1401412e-02,  4.2974263e-01,  1.0727605e+00], dtype=float32), 'labels': array([15, 45,  1, 11, 49, 44, 50, 17,  1, 49, 17, 61, 51, 22, 47, 32, 53,
       61, 49, 48, 49, 39, 61, 23, 14,  1, 11, 49, 45, 50, 22, 32, 61, 23,
        1, 49, 39, 23, 14, 50, 51, 32, 27, 50, 22, 23, 35])}]


[{'input_values': array([-0.00016368, -0.00016368, -0.00016368, ..., -0.00138846,
       -0.00145205, -0.00079389], dtype=float32)}, {'input_values': array([-9.1227761e-05, -9.1230278e-05, -9.1220834e-05, ...

In [74]:
padded_data_train

{'input_values': tensor([[-1.6368e-04, -1.6368e-04, -1.6368e-04,  ..., -1.3885e-03,
         -1.4520e-03, -7.9389e-04],
        [-9.1228e-05, -9.1230e-05, -9.1221e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32), 'labels': tensor([[  35,   32,   49,   35,   47,   27,   32,   49,   33,   11,   61,   11,
           49,   45,    9,   47,   11,   44,   23,   44,   50,   49,   35,   32,
           61,   50,   61,   11,   21,   50,   35,   32,   49,   45,   22,   23,
           47,   22,   33,   49,    5,   11,   44,   32,   61, -100, -100],
        [  15,   45,    1,   11,   49,   44,   50,   17,    1,   49,   17,   61,
           51,   22,   47,   32,   53,   61,   49,   48,   49,   39,   61,   23,
           14,    1,   11,   49,   45,   50,   22,   32,   61,   23,    1,   49,
           39,   23,   14,   50,   51,   32,   27,   50,   22,   23,   35]])}

In [41]:
def padma(batch):
    padded_data_train = data_collator(batch)
    return padded_data_train

In [52]:
padded_data_train['labels'].size()

torch.Size([1000, 162])

In [53]:
padded_data_train['input_values'].size()

torch.Size([1000, 94996])

In [83]:
padded_data_test=data_collator(train)

In [None]:
def create_dataloaders(train_batch_size=8, eval_batch_size=8):
    train_dataloader = DataLoader(
        data_collator(train), shuffle=True, batch_size=train_batch_size
    )
    eval_dataloader = DataLoader(
        data_collator(test), shuffle=False, batch_size=eval_batch_size
    )
    return train_dataloader, eval_dataloader

In [None]:
train_dataloader, eval_dataloader = create_dataloaders()

In [None]:
metric = load_metric("wer")

In [None]:
metric

In [None]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
len(processor.tokenizer)

In [None]:
from transformers import AutoModelForCTC

model = AutoModelForCTC.from_pretrained(
    model_checkpoint,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

In [None]:
model

In [None]:
#if hasattr(model, "freeze_feature_extractor"):
#    model.freeze_feature_extractor()

In [None]:
hyperparameters = {
    "learning_rate": 3e-5,
    "num_epochs": 30,
    "train_batch_size": 8, # Actual batch size will this x 8
    "eval_batch_size": 32, # Actual batch size will this x 8
    "seed": 42,
}

## Training the model

In [None]:
def training_function(model):
    # Initialize accelerator
    accelerator = Accelerator()

    # To have only one message (and not 8) per logs of Transformers or Datasets, we set the logging verbosity
    # to INFO for the main process only.
    if accelerator.is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    train_dataloader, eval_dataloader = create_dataloaders(
        train_batch_size=hyperparameters["train_batch_size"], eval_batch_size=hyperparameters["eval_batch_size"]
    )
    # The seed need to be set before we instantiate the model, as it will determine the random head.
    set_seed(hyperparameters["seed"])

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=hyperparameters["learning_rate"])

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    num_epochs = hyperparameters["num_epochs"]
    # Instantiate learning rate scheduler after preparing the training dataloader as the prepare method
    # may change its length.
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=len(train_dataloader) * num_epochs,
    )

    # Instantiate a progress bar to keep track of training. Note that we only enable it on the main
    # process to avoid having 8 progress bars.
    progress_bar = tqdm(range(num_epochs * len(train_dataloader)), disable=not accelerator.is_main_process)
    # Now we train the model
    for epoch in range(num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

In [None]:
from accelerate import notebook_launcher

notebook_launcher(training_function, (model,))

In [None]:
#for collab only
'''
!cp -R /content/wav2vec2_bn /content/drive/MyDrive/buet_cse_fest_dlsprint
from google.colab import drive
drive.flush_and_unmount()
'''

# Testing and evaluation

# Submission

In [None]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

model = AutoModelForCTC.from_pretrained("wav2vec2_bn")
processor = Wav2Vec2Processor.from_pretrained("wav2vec2_bn")

In [None]:
df_submission = pd.DataFrame()
df_submission['path'] = glob.glob("test_files/*.mp3")

In [None]:
#submission = submission.map(speech_file_to_array_submission_torch, remove_columns=submission.column_names)

In [None]:
#submission.set_format("numpy", columns=["speech","sampling_rate"], output_all_columns=True)