Installing Librosa

In [1]:
!pip install librosa

[0m

Importing Required Libraries

In [2]:
from datasets import load_dataset,Dataset,Audio #for Huggingface Dataset Works
import librosa
import pandas as pd 
import os
import re

Hugging Face Hub repository of the dataset to download

In [3]:
# Hub repository id of the source dataset.
# NOTE(review): `dataset_repo` is reassigned further down to the destination repo;
# re-running cells out of order will therefore load from / push to the wrong repository.
dataset_repo='Roh/ryanspeech'

Getting Dataset

In [4]:
# Load the source dataset from the Hub (or from the local cache), passing ignore_verifications=True.
# NOTE(review): newer `datasets` releases appear to replace `ignore_verifications` with
# `verification_mode` — confirm against the installed library version before upgrading.
ryanspeech=load_dataset(dataset_repo,ignore_verifications=True)

No config specified, defaulting to: ryanspeech/male
Reusing dataset ryanspeech (/root/.cache/huggingface/datasets/Roh___ryanspeech/male/1.0.0/6aa3c2ab705fac229e230a018a6e7a808e0b67053481894fbe2f8265f2b1acb7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [95]:
def huggingface_to_audiodataset(dataset, verbose=True):
    '''
    ##### IDRAK AI Experiments #####
    Collect the audio file paths and the transcripts of a Hugging Face dataset split.

    Every row is looked up exactly once. A row lookup can be expensive (for
    Hugging Face audio datasets it may decode the waveform), so reading the
    same row twice — once for the path and once for the text — is avoided.

    arguments:
        dataset: a dataset split (or any object supporting len() and integer
            indexing) whose rows expose row['audio']['path'] and row['text']
        verbose(bool): when True (default) print a carriage-return progress
            line for every row, as the original implementation did

    returns:
        audio_paths(list): paths to audio
        texts(list) : list of transcripts
    '''
    audio_paths=[]
    texts=[]
    for i in range(len(dataset)):
        row=dataset[i]  # single lookup per row; both fields are read from it
        audio_path=row['audio']['path']
        text=row['text']
        if verbose:
            print('Working on',text,audio_path,end="\r")
        audio_paths.append(audio_path)
        texts.append(text)
    return audio_paths,texts

Getting only the training texts and audio file paths; we will split them into train and test sets later on.

In [51]:
# Collect the audio paths and transcripts of the 'train' split only.
train_audio_paths,train_texts=huggingface_to_audiodataset(ryanspeech['train'])

Working on "I've been thinking of the same things," Montgomery answered. "There's my room with the outer door-" /root/.cache/huggingface/datasets/downloads/extracted/f5a37597fcdec7b0d827b74fc85a0dac2ba997dfe66c63577d073091587b34a1/train/wavs/RY0001-1631.wav-1126.wavavs/RY0002-0305.wav409.wav-2110.wavs/RY0002-0230.wav.wavvavwav053.wav55.wav

In [52]:
# Assemble transcripts and audio paths into one frame
# (column order: 'transcription', then 'audio').
df=pd.DataFrame({
    'transcription':train_texts,
    'audio':train_audio_paths,
})

In [53]:
df

Unnamed: 0,transcription,audio
0,"The first PC computers appeared around 1975,",/root/.cache/huggingface/datasets/downloads/ex...
1,"Sometimes it seems like the world is a cold, u...",/root/.cache/huggingface/datasets/downloads/ex...
2,I know it took all the courage I had to utter it.,/root/.cache/huggingface/datasets/downloads/ex...
3,Yes! he might do that; so when he had got to t...,/root/.cache/huggingface/datasets/downloads/ex...
4,"They went together a long, long way, till they...",/root/.cache/huggingface/datasets/downloads/ex...
...,...,...
7890,"Okay, so check in Feb 25th and check out Feb 2...",/root/.cache/huggingface/datasets/downloads/ex...
7891,The language we are now speaking is English.,/root/.cache/huggingface/datasets/downloads/ex...
7892,I can certainly try to tell you about it.,/root/.cache/huggingface/datasets/downloads/ex...
7893,That sounds great. Thank you very much for you...,/root/.cache/huggingface/datasets/downloads/ex...


In [73]:
# Persist the frame to 'ryanspeech.csv' without the index column.
# NOTE(review): the execution counter shows this cell (In [73]) ran after the
# cleaning cell (In [67]), but in top-to-bottom order it runs before it —
# confirm whether the CSV is meant to hold raw or cleaned transcripts.
df.to_csv('ryanspeech.csv',index=False)

In [58]:
import torchaudio

Getting Sample Info

In [65]:
# Read the audio metadata of one sample (the row at position 1100) and print it.
meta=torchaudio.info(df.audio.iloc[1100])
print(meta)

AudioMetaData(sample_rate=22050, num_frames=37973, num_channels=1, bits_per_sample=16, encoding=PCM_S)


In [97]:
# Patterns are written as raw strings so the backslash escapes reach the regex
# engine untouched (non-raw '\[' / '\|' are invalid string escapes), and they
# are compiled once instead of on every call.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators replaced by a space
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z ?]')  # everything except digits, a-z, space and '?'


def cleanify(text):
    '''
    #### IDRAK AI Text Cleaner#####
    # this function requires the re module, so import it
    #i.e >>>import re

    Normalize a transcript: convert the input to a lower-case string, replace
    the separator symbols / ( ) { } [ ] | @ , ; with a space, replace every
    remaining character that is not a digit, a-z, space or question mark with
    a space, and finally strip leading and trailing whitespace.

    Each replaced character becomes one space; runs of spaces inside the text
    are not collapsed.

    arguments:
        text: text to be cleaned (non-string values are converted with str())

    return:
        text(string): cleaned text
    '''
    text=str(text).lower()  # str() lets non-string cells (e.g. numbers) pass through
    text=_REPLACE_BY_SPACE_RE.sub(' ',text)
    text=_BAD_SYMBOLS_RE.sub(' ',text)
    return text.strip()

In [67]:
# Clean every transcript with cleanify; the 'transcription' column is overwritten.
df['transcription']=df['transcription'].apply(cleanify)

In [69]:
from sklearn.model_selection import train_test_split

In [98]:
#Training and testing splits

In [70]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same train/test membership is produced on every run of the notebook.
df_train,df_test=train_test_split(df,test_size=0.2,random_state=42)

In [72]:
def make_dictionary(df1=None,file_path=''):
    '''
    Convert a dataframe into the dictionary-of-lists layout used to build a
    Hugging Face dataset.

    arguments:
        df1(pandas.DataFrame): frame with 'audio' and 'transcription' columns (required)
        file_path(string): directory joined in front of every audio entry;
            the default '' leaves the entries exactly as stored in the frame

    returns:
        data_dict(dict): {'audio': [...], 'transcription': [...]}
    '''
    transcriptions=df1['transcription'].tolist()
    file_names=df1['audio'].tolist()
    if file_path:
        # Previously file_path was accepted but never applied; honour it now.
        file_names=[os.path.join(file_path,file_name) for file_name in file_names]
    return {'audio':file_names,'transcription':transcriptions}

In [99]:
#Packing into the Hugging Face Dataset format and pushing to the repo

In [74]:
# Dictionary of lists for the training rows; file_path is left empty, so the
# audio paths stored in the frame are used as they are.
train_dict=make_dictionary(df1=df_train,file_path='')

In [75]:
# Dictionary of lists for the testing rows; file_path is left empty, so the
# audio paths stored in the frame are used as they are.
test_dict=make_dictionary(df1=df_test,file_path='')

In [77]:
audio_dataset = Dataset.from_dict(train_dict,split='train').cast_column("audio", Audio(sampling_rate=16000)) #build the 'train' split; the "audio" column is cast to Audio with a 16 kHz sampling rate

In [78]:
# Destination repository on the Hub. This overwrites the source repo id that
# `dataset_repo` held earlier in the notebook.
dataset_repo='m-aliabbas/idrak_ryanspeech'

In [79]:
audio_dataset.push_to_hub(dataset_repo) #committing / pushing the dataset to the Hugging Face repo

  0%|          | 0/4 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?ba/s]

In [80]:
# Adding the testing split alongside the training split

In [83]:
audio_dataset = Dataset.from_dict(test_dict,split='test').cast_column("audio", Audio(sampling_rate=16000)) #build the 'test' split the same way; note this reuses (and replaces) the name audio_dataset

In [84]:
audio_dataset.push_to_hub(dataset_repo) #committing / pushing the dataset to the Hugging Face repo

  0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.


In [85]:
#Checking The Dataset Repo and Splits

In [86]:
# Load the dataset back from the destination repo to check what was pushed.
idrak_voice=load_dataset(dataset_repo)

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Using custom data configuration m-aliabbas--idrak_ryanspeech-caee615dadca058c


Downloading and preparing dataset None/None (download: 1.02 GiB, generated: 1.05 GiB, post-processed: Unknown size, total: 2.07 GiB) to /root/.cache/huggingface/datasets/m-aliabbas___parquet/m-aliabbas--idrak_ryanspeech-caee615dadca058c/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/218M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/437M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/m-aliabbas___parquet/m-aliabbas--idrak_ryanspeech-caee615dadca058c/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [90]:
# Spot-check the 'audio' field of one training example (row 4).
idrak_voice['train'][4]['audio']

{'path': None,
 'array': array([ 3.96761867e-05, -8.31791544e-05,  4.91017366e-05, ...,
        -3.79574425e-05,  1.59031853e-05,  0.00000000e+00]),
 'sampling_rate': 16000}

In [100]:
dataset_repo

'm-aliabbas/idrak_ryanspeech'