# Speech Recognition with a hybrid CNN + RNN architecture

## Dataset: Mozilla Common Voice 11.0, Brazilian Portuguese

### 1. Download and explore the dataset

In [1]:
import os
from collections import Counter

import datasets



In [2]:
# Fetch the "pt" configuration of Common Voice 11.0 from the Hugging Face Hub.
# The current working directory is used as the cache folder, so after the
# first download the data is reused from the local cache.

dataset = datasets.load_dataset(
    "mozilla-foundation/common_voice_11_0",
    name="pt",
    cache_dir=os.getcwd(),
)

Found cached dataset common_voice_11_0 (/media/my_data/programming/lessons/asr_pt/mozilla-foundation___common_voice_11_0/pt/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)


  0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
# Display the DatasetDict to inspect its splits, their feature columns, and row counts

dataset

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 18211
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 8688
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 8693
    })
    other: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 16751
    })
    invalidated: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 4870
    })
})

In [4]:
# Tally the values of the `accent` column over the training split.
# `Counter` comes from the notebook's top-level import cell.
print(Counter(dataset["train"]["accent"]))

Counter({'': 15536, 'Português do Brasil,Região Sul do Brasil': 1308, 'Paulistano.': 324, 'neutro': 299, 'Soa levemente cantado.': 180, 'Paulista,Paulistano,Brasileiro': 118, 'Carioca,Brasileiro': 105, 'Pouco sotaque': 94, 'Carioca': 82, 'Paulista': 72, 'Mato Grosso': 52, 'Mineiro': 41})


In [5]:
# Number of training rows per `client_id`; the result has one entry per distinct id
clips_per_client = Counter(dataset["train"]["client_id"])
clips_per_client

Counter({'4736e6f95a939c9602c1e1f200221e1ed5164b6e7190a1e567fe047fd73621cda5d4d8c063c41ad801a2fdf79246b9fa6b1d8050c23f7827e4b355c27371032e': 38,
         '81f45e8a0072b3e8f486e6fcd9306e20f7b20c904790b5eebed510a35372e8b35069cc86375c78d11b6c6e0b29dac39ef150f7c0a26fe401212b3402808ffa6d': 37,
         'bafe6f99fffb941ad1b2cefeaf232134a85fcd4053fefeaff49a859a4c036b59812907c20950ae5a82cd104b9028d41daf93606c77a3af677e4e67218554de59': 52,
         'deaafb475e5f6790e785ffd439dde2d41487217f1d464d7c6dbcb59c800c340fb159b788b071f058383da444a4248e2a90e4ddd4088d2e1f2629d0cc52988f45': 38,
         'f7b523cec5e9c74ecc57fadaaf973cc21c7be13513b8f1f312262761e99ef2c8dd799d21ed2e017340ab50872dc8727244930345b0c5ebee37aa86137a95df04': 36,
         '180571bc2bccbba0abb666d8937e807da1c6f8ccb11ee246deb6638517d30c73aa8da3714e7ff93497d20dc6e01302ec76dca681937ddb67e48b13e905376885': 34,
         '69dd03210854d9c9d0423238bdb390b3cb8232c58609f50678daf12a9cd2753c9a11d5f421d07c93d7aae56522199256c7e09a72fa3891fe5ea5eb60

In [6]:
# Distribution of the `segment` column over the training split
segment_counts = Counter(dataset["train"]["segment"])
segment_counts

Counter({'': 18211})

In [7]:
# Drop columns that are not needed downstream.
#
# `dataset` is reassigned here, so re-running this cell would otherwise ask
# `remove_columns` to drop columns that are already gone. Only the columns
# still present are removed, which keeps the cell safe to re-run.
# NOTE(review): presence is checked on the "train" split only — this assumes
# every split shares the same columns; confirm if splits ever diverge.

columns_to_drop = ['up_votes', 'down_votes', 'locale', 'segment']
still_present = [
    name for name in columns_to_drop if name in dataset["train"].column_names
]
if still_present:
    dataset = dataset.remove_columns(still_present)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'accent'],
        num_rows: 18211
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'accent'],
        num_rows: 8688
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'accent'],
        num_rows: 8693
    })
    other: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'accent'],
        num_rows: 16751
    })
    invalidated: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'accent'],
        num_rows: 4870
    })
})


In [9]:
# Pull out the train / validation / test splits, each returned in "torch" format

train_set, val_set, test_set = (
    dataset[split_name].with_format("torch")
    for split_name in ("train", "validation", "test")
)