In [1]:
import csv
import os

import numpy as np
import pandas as pd

import IPython.display as ipd

import torchaudio
import datasets as hfd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.1)

In [2]:
import logging

# The root logger is configured at WARNING, while the notebook's own 'STT'
# logger is opened up to DEBUG so its progress messages are shown.
logging_format_str = '%(asctime)s:%(name)s:%(levelname)s:%(message)s'
logging.basicConfig(level=logging.WARNING, format=logging_format_str)

logger = logging.getLogger('STT')
logger.setLevel(logging.DEBUG)

In [3]:
# Root of the local data directory. It must be supplied through the DATA_HOME
# environment variable; a missing variable raises KeyError right here.
DATA_ROOT_DP = os.environ['DATA_HOME']

# Corpus release and locale are named once so that the raw and processed
# locations below cannot drift apart when either of them is changed.
CV_RELEASE = 'cv-corpus-8.0-2022-01-19'
CV_LOCALE = 'be'

# Raw corpus directory (holds the per-split TSV files) and its clips folder.
CV_DP = f'{DATA_ROOT_DP}/datasets/{CV_RELEASE}/{CV_LOCALE}'
CLIPS_DP = os.path.join(CV_DP, 'clips')

# Directory the processed dataset is saved to and later reloaded from.
CV_PROCESSED_DP = f'{DATA_ROOT_DP}/datasets/{CV_RELEASE}__{CV_LOCALE}__processed'

In [29]:
# !rm -r $CV_PROCESSED_DP

## Process: build the dataset from the Common Voice TSV splits

In [5]:
# Split names to process; each one is read from a matching `<split>.tsv` file.
splits = [
    'train',
    'dev',
    'test',
]

In [6]:
# Load each split's metadata TSV into a DataFrame, keyed by split name.
#
# quoting=csv.QUOTE_NONE makes pandas treat `"` as an ordinary character.
# With the default quoting, a sentence that contains double quotes can be
# parsed as a quoted field: surrounding quotes get stripped, and an unbalanced
# quote can swallow the following lines, silently merging rows.
df = {}
for split in splits:
    tsv_path = os.path.join(CV_DP, f'{split}.tsv')
    df[split] = pd.read_csv(tsv_path, sep='\t', quoting=csv.QUOTE_NONE)
    # Lazy %-formatting: the message is only built if DEBUG is enabled.
    logger.debug('df_%s.shape: %s', split, df[split].shape)

2022-03-28 21:07:15,362:STT:DEBUG:df_train.shape: (314305, 10)
2022-03-28 21:07:15,407:STT:DEBUG:df_dev.shape: (15803, 10)
2022-03-28 21:07:15,452:STT:DEBUG:df_test.shape: (15801, 10)


In [7]:
# Peek at a single metadata row to see which columns the TSV provides.
df['train'].head(1)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment
0,9356b041bf9c40eb40bc036a354b69f47709e0cae8d619...,common_voice_be_29365148.mp3,Прыкладам таму можа быць мой уласны досвед.,2,0,,,,be,


In [8]:
# Build one Dataset per split, carrying over only the clip filename (`path`)
# and its transcript (`sentence`) from the metadata frames.
ds = hfd.DatasetDict()
for split in splits:
    split_df = df[split][['path', 'sentence']]
    ds[split] = hfd.Dataset.from_pandas(split_df)

In [9]:
# Summary of the DatasetDict: features and row count for every split.
ds

DatasetDict({
    train: Dataset({
        features: ['path', 'sentence'],
        num_rows: 314305
    })
    dev: Dataset({
        features: ['path', 'sentence'],
        num_rows: 15803
    })
    test: Dataset({
        features: ['path', 'sentence'],
        num_rows: 15801
    })
})

In [10]:
# Cache files per split (the recorded output shows an empty list for each).
ds.cache_files

{'train': [], 'dev': [], 'test': []}

In [11]:
# First training example; at this point `path` is still the bare clip filename.
ds['train'][0]

{'path': 'common_voice_be_29365148.mp3',
 'sentence': 'Прыкладам таму можа быць мой уласны досвед.'}

In [12]:
# First test example, for comparison with the training one.
ds['test'][0]

{'path': 'common_voice_be_29793303.mp3',
 'sentence': 'І на гэтым кірунку нас чакаюць вялікія складанасці.'}

In [13]:
def get_full_path(example, root_dp: str):
    """Prefix an example's clip filename with the directory that holds it.

    Args:
        example: Mapping with a 'path' entry; it is modified in place.
        root_dp: Directory to prepend to ``example['path']``.

    Returns:
        The same ``example`` object, with 'path' replaced by
        ``os.path.join(root_dp, <old path>)``.
    """
    clip_name = example['path']
    example['path'] = os.path.join(root_dp, clip_name)
    return example

In [14]:
# Turn the filename column into an audio column, in three steps:
#   1. prefix every `path` with the clips directory,
#   2. declare the column as an Audio feature (16 kHz, mono),
#   3. rename it from `path` to `audio`.
# Re-running this cell on the already-converted `ds` will not work, because
# the `path` column no longer exists after step 3.
ds = (
    ds
    .map(get_full_path, fn_kwargs={'root_dp': CLIPS_DP})
    .cast_column('path', hfd.Audio(sampling_rate=16_000, mono=True))
    .rename_column('path', 'audio')
)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [24]:
# ds['train'][0]

In [25]:
# ds['dev'][0]

In [17]:
# Persist all splits under CV_PROCESSED_DP (the path is passed directly; a
# single-argument os.path.join would return it unchanged anyway).
# NOTE(review): depending on the `datasets` version, the saved Arrow data may
# reference the original clip files by path rather than embed the audio bytes;
# confirm before moving or deleting CLIPS_DP.
ds.save_to_disk(CV_PROCESSED_DP)

## Validate: inspect the saved files and reload the dataset

In [18]:
# List the saved dataset directory (recorded output: one sub-directory per split).
!ls $CV_PROCESSED_DP

dataset_dict.json  dev	test  train


In [19]:
# Files written for a single split, using `test` as the example.
!ls $CV_PROCESSED_DP/test

dataset.arrow  dataset_info.json  state.json


In [27]:
# !du -sh $CV_PROCESSED_DP/*

In [28]:
# !du -sh $CV_PROCESSED_DP/dev/*

In [22]:
# Reload the saved DatasetDict from disk to check that it can be read back.
x = hfd.load_from_disk(CV_PROCESSED_DP)

In [23]:
# Summary of the reloaded dataset: splits, features and row counts.
x

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 314305
    })
    dev: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 15803
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 15801
    })
})