In [8]:
from datasets import load_from_disk

data = load_from_disk('pickles/merged_dataset/arabic_portuguese/low_cos_sim')

for i in range(10):
    print(data[i]['sentence'])

فَإِنَّ الْبَهَائِمَ الْوَحْشِيَّةَ تَأْبَى ذَلِكَ وَتَأْنَفُ مِنْهُ Não havia mais nada a dizer.
وَحُكِيَ أَنَّ الرَّشِيدَ حَبَسَ أَبِي الْعَتَاهِيَةِ فَكَتَبَ عَلَى حَائِطِ الْحَبْسِ Lua cercada, chuva ou ventosa.
وَقِيلَ فِي مَنْثُورِ الْحِكَمِ مَنْ أَظْهَرَ عَيْبَ نَفْسِهِ فَقَدْ زَكَّاهَا Quero falar com você.
أقبلت في غلالة كدم الخش Para prolongá-lo.
شَمَّ عَلِيٌ وَرْدَةً. Vicentinópolis
سائل البخيل محروم وماله مكتوم سارت به الرُّكْبانُ سافر تجد عوضا عما تفارقه Afeta positivamente a correta previsão do orçamento setorial.
لَعَلَّ الْمَرِيضَ نَائِمٌ. Pintadas
و صلاح الدين غطاها رماحا Cambuci
وَإِمَّا شَحِيحٌ يُرَوِّضُ نَفْسَهُ تَوْطِئَةً Trabalhos derivados.
وَإِنَّ التَّقْصِيرَ فِيهِ يَفُضُّ عَنْك الْمُؤَانِسِينَ Ponto Belo


In [7]:
print(data.column_names)
print(len(data))

['audio', 'sentence']
40000


In [5]:
import IPython.display as ipd

ipd.Audio(data=data[0]["audio"], autoplay=True, rate=48000)


In [3]:
ipd.Audio(data=data[1]["audio"], autoplay=True, rate=48000)

In [1]:
from datasets import load_dataset, load_metric, Audio, ClassLabel, load_from_disk, Features, Value
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, \
    TrainingArguments, Trainer
import torch
import torchaudio
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import random
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import re
import json
from torch.utils.tensorboard import SummaryWriter
import pyarabic.araby as araby
from unidecode import unidecode

# the writer is responsible for tensorboard logging
writer = SummaryWriter(comment="_arabic_portuguese_high")

print("----------------- Checking if cuda is available... -----------------")
print(f'Cuda Available = {torch.cuda.is_available()}\n\n')

print("----------------- Loading Datasets complete. -----------------")
data = load_from_disk('pickles/merged_dataset/arabic_portuguese/high_cos_sim')
print("----------------- Loading Datasets complete. -----------------\n")
# small dataset for testing purposes only (10 samples)
data = data.select(range(100))


# split the dataset to train and test sets (90% train, 10% test) randomly
print("----------------- Splitting dataset to train and test sets... -----------------")
splits1 = data.train_test_split(test_size=0.1, seed=42)

train = splits1['train']
test = splits1['test']
# save the test set for later evaluation
test.save_to_disk('pickles/merged_dataset/arabic_portuguese/test_high')
# split the train set to train and validation sets (80% from 40000 train, 10% from 40000 validation) randomly
splits2 = train.train_test_split(test_size=0.115, seed=42)
train = splits2['train']
validation = splits2['test']

del splits1
del splits2
del data


def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}


print("\n----------------- Extracting all characters... -----------------")
vocab_train = train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True,
                        remove_columns=train.column_names)
vocab_test = validation.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True,
                            remove_columns=validation.column_names)
print("----------------- Extracting all characters complete. -----------------\n\n")

# ----------------------------------- VOCAB -----------------------------------#

print("----------------- Preparing vocab... -----------------")
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}

print(f'Vocab_dict: {vocab_dict}')

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(f'Vocab_len: {len(vocab_dict)}')

print("----------------- Preparing vocab complete. -----------------\n\n")

print("----------------- Saving vocab to jason... -----------------")
with open('vocab/arabic_portu_high.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

print("----------------- Saving vocab to jason complete. -----------------\n\n")

tokenizer = Wav2Vec2CTCTokenizer("./vocab/arabic_portu_high.json", unk_token="[UNK]", pad_token="[PAD]",
                                 word_delimiter_token="|")

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True,
                                             return_attention_mask=True)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# print(f'Sanity Check sampling rate is 48khz: {train[0]["audio"]}\n')

# ----------------------------------- Resampling to 16khz -----------------------------------#

# print("----------------- Resampling to 16khz... -----------------")
# train = train.cast_column("audio", Audio(sampling_rate=16_000))
# validation = validation.cast_column("audio", Audio(sampling_rate=16_000))
# print(f'Making sure the sampling rate changed to 16khz {train[0]["audio"]}')
# print("----------------- Resampling complete. -----------------\n\n")


# ----------------------------------- Preparing datasets -----------------------------------#

def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio, sampling_rate=16000).input_values[0]

    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch


print("----------------- Preparing datasets... -----------------")
train = train.map(prepare_dataset, remove_columns=train.column_names, num_proc=4)  # maybe we'll have to reduce to 1
validation = validation.map(prepare_dataset, remove_columns=validation.column_names, num_proc=4)
print("\n\n----------------- Preparing datasets complete. -----------------\n\n")

2023-03-11 21:58:14.521886: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-11 21:58:14.669975: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-11 21:58:14.691686: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-11 21:58:14.691694: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

----------------- Checking if cuda is available... -----------------
Cuda Available = True


----------------- Loading Datasets complete. -----------------
----------------- Loading Datasets complete. -----------------

----------------- Splitting dataset to train and test sets... -----------------


Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]




----------------- Extracting all characters... -----------------


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

----------------- Extracting all characters complete. -----------------


----------------- Preparing vocab... -----------------
Vocab_dict: {'i': 0, 'ي': 1, 'ی': 2, 'e': 3, 'غ': 4, 'r': 5, 'و': 6, 'o': 7, 'ك': 8, 'ب': 9, 'ت': 10, 'ش': 11, 'ن': 12, 'ر': 13, 'c': 14, 'h': 15, 'ؤ': 16, 'ئ': 17, 'س': 18, 'ص': 19, 's': 20, 'p': 21, 'ة': 22, 'ع': 23, 'ق': 24, 'l': 25, 'ظ': 26, 'ه': 27, 'ض': 28, 'ج': 29, 't': 30, 'u': 31, 'x': 32, 'd': 33, 'f': 34, 'n': 35, 'أ': 36, 'a': 37, 'ا': 38, 'ط': 39, 'j': 40, 'ء': 41, 'ث': 42, 'ز': 43, 'k': 44, 'م': 45, 'آ': 46, 'g': 47, 'm': 48, 'v': 49, 'خ': 50, 'ح': 51, 'ى': 52, 'ف': 53, 'q': 54, 'د': 55, 'ذ': 56, 'y': 57, 'b': 58, ' ': 59, 'ل': 60, 'z': 61, 'إ': 62}
Vocab_len: 65
----------------- Preparing vocab complete. -----------------


----------------- Saving vocab to jason... -----------------
----------------- Saving vocab to jason complete. -----------------


----------------- Preparing datasets... -----------------
        

#0:   0%|          | 0/20 [00:00<?, ?ex/s]

#1:   0%|          | 0/20 [00:00<?, ?ex/s]

#2:   0%|          | 0/20 [00:00<?, ?ex/s]

#3:   0%|          | 0/19 [00:00<?, ?ex/s]



        

#1:   0%|          | 0/3 [00:00<?, ?ex/s]

#0:   0%|          | 0/3 [00:00<?, ?ex/s]

#2:   0%|          | 0/3 [00:00<?, ?ex/s]

#3:   0%|          | 0/2 [00:00<?, ?ex/s]





----------------- Preparing datasets complete. -----------------




In [7]:
import IPython.display as ipd

ipd.Audio(data=train[0]["input_values"], autoplay=True, rate=16000)