In [None]:
!pip install transformers
!pip install opendatasets
!pip install pydub
!pip install gdown
!pip install -U sentence-transformers

In [None]:
import torch
from transformers import AutoProcessor, Wav2Vec2Model
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from time import time
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
import os
from pydub import AudioSegment
import opendatasets as od
import pandas as pd
import gc
import random
import pickle
import gdown
import json
from nltk import word_tokenize
import string
from zipfile import ZipFile
from torch.nn.functional import cosine_similarity
from tabulate import tabulate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torchvision.models as models
import torchvision.transforms.functional as F
from PIL import Image
import cv2
from sentence_transformers import SentenceTransformer
from IPython.display import FileLink
from shutil import rmtree
from datasets import load_dataset
from IPython.display import Audio, display
import torchaudio

# Brown (add Spectrogram, Wav2vec2, LaBSE)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
url = "https://drive.google.com/file/d/1MWRTrInoCkdGF-Hj3f8iv0TnzYtGDhkc/view?usp=share_link"
output = "localized_metadata.json"
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
with open('localized_metadata.json', 'r') as f:
    localized_metdata = json.load(f)
localized_metdata.keys()

In [None]:
def download_data_part(url, output):
    """Download a zip archive with gdown, unpack it, and delete the archive.

    Args:
        url: Link forwarded to ``gdown.download`` (called with ``fuzzy=True``).
        output: Local path the archive is saved to, e.g. ``'dataset_part1.zip'``.
            The archive is extracted into a directory named after ``output``
            with its final extension removed (``'dataset_part1'``).
    """
    gdown.download(url, output, quiet=False, fuzzy=True)
    # splitext strips only the trailing extension, so a dot elsewhere in the
    # path (e.g. './data/part1.zip') no longer truncates the target directory.
    extract_dir = os.path.splitext(output)[0]
    with ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    os.remove(output)

In [None]:
audio_processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
audio_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)

In [None]:
# Load EfficientNet-B7 and place it on the same device as the other models.
# An unconditional .cuda() fails on CPU-only machines even though `device`
# already falls back to "cpu".
vision_model = models.efficientnet_b7(pretrained=True)
vision_model.to(device)
_ = vision_model.eval()

In [None]:
text_model = SentenceTransformer('sentence-transformers/LaBSE').to(device)

In [None]:
def get_image_embedding(path):
    """Run the image at ``path`` through ``vision_model`` and return its output.

    Args:
        path: Path of an image file readable by ``cv2.imread``.

    Returns:
        The first (only) row of the model output, moved to the CPU.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising; fail
        # here with a clear message rather than inside torch.from_numpy.
        raise FileNotFoundError(f'could not read image: {path}')
    # HWC array -> NCHW tensor with a batch dimension of 1.
    # NOTE(review): pixels are fed exactly as loaded (no resize, scaling or
    # channel reordering) — confirm this is what the model is meant to see.
    image_tensor = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2)
    image_tensor = image_tensor.to(device).float()
    with torch.no_grad():
        output = vision_model(image_tensor)
    return output[0].cpu()

In [None]:
def pipeline(part_id):
    """Build and pickle audio, spectrogram-image and text embeddings for one part.

    Runs three passes over ``localized_metdata[f'part{part_id}']``; a record is
    only carried into the next pass if the previous one succeeded for it:

    1. audio: the file under ``dataset_part{part_id}/audios/`` is decoded with
       pydub, divided by its peak absolute value, fed to ``audio_model``, and
       the last hidden state is averaged along dim 0 (after squeezing the
       batch axis);
    2. image: a log-frequency spectrogram is rendered to ``spec.png`` and
       passed through ``get_image_embedding``;
    3. text: the record's ``'text'`` is encoded with ``text_model``.

    Records that completed all three passes are written to
    ``pretrained_embeddings_part{part_id}.pkl`` as
    ``{id: [audio_embedding, image_embedding, text_embedding]}``.

    Args:
        part_id: Number of the dataset part; selects both the metadata key and
            the ``dataset_part{part_id}`` directory.
    """
    # Resolve the part's metadata once instead of on every access.
    records = localized_metdata[f'part{part_id}']
    audio_dir = f'dataset_part{part_id}' + '/audios/'
    embeddings_data = {}

    # Pass 1: audio embeddings. Sets keep the later membership checks O(1).
    valid_ids = set()
    for audio_idx in tqdm(range(len(records))):
        record = records[audio_idx]
        if audio_idx % 100 == 0:
            # Periodically release cached GPU memory and Python garbage.
            torch.cuda.empty_cache()
            gc.collect()
            torch.cuda.ipc_collect()
        try:
            file_path = audio_dir + record['file_path'].split('/')[-1]

            audio_file = AudioSegment.from_file(file_path)
            samples = np.array(audio_file.get_array_of_samples())
            samples = samples.reshape(-1, audio_file.channels)
            samples = samples / np.max(np.abs(samples))
            samples = samples.squeeze()

            audio = torch.from_numpy(samples)
            # NOTE(review): the samples are declared as 16 kHz without checking
            # audio_file.frame_rate — confirm the source files are 16 kHz.
            inputs = audio_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = audio_model(**inputs)
            last_hidden_states = outputs.last_hidden_state.squeeze(0)
            embeddings = last_hidden_states.mean(dim=0)
            embeddings_data[record['id']] = [embeddings]
        except Exception as e:
            # Report why the record was dropped, not just which one.
            print(e)
            print(record['id'])
            continue
        valid_ids.add(record['id'])

    # Pass 2: spectrogram image embeddings for records that survived pass 1.
    final_valid_ids = set()
    for audio_idx in tqdm(range(len(records))):
        record = records[audio_idx]
        if record['id'] not in valid_ids:
            continue

        if audio_idx % 100 == 0:
            torch.cuda.empty_cache()
            gc.collect()
            torch.cuda.ipc_collect()
        try:
            file_path = audio_dir + record['file_path'].split('/')[-1]
            y, sr = librosa.load(file_path)
            spec = librosa.stft(y)
            spec_db = librosa.amplitude_to_db(abs(spec))

            # Reuse the current figure; the image file is overwritten each time.
            plt.clf()
            librosa.display.specshow(spec_db, x_axis='time', y_axis='log')
            plt.xlabel('')
            plt.ylabel('')
            plt.tight_layout()

            _id = record['id']
            saved_path = f'spec.png'
            plt.savefig(saved_path, bbox_inches='tight', pad_inches=0)
            image_embeddings = get_image_embedding(saved_path)
            embeddings_data[_id].append(image_embeddings)

        except Exception as e:
            print(e)
            print(record['id'])
            continue
        final_valid_ids.add(record['id'])

    # Pass 3: text embeddings for records that survived pass 2.
    final_ids = set()
    for audio_idx in tqdm(range(len(records))):
        record = records[audio_idx]
        if record['id'] not in final_valid_ids:
            continue
        try:
            _id = record['id']
            text_embeddings = text_model.encode(record['text'])
            embeddings_data[_id].append(text_embeddings)
        except Exception as e:
            print(e)
            print(record['id'])
            continue
        final_ids.add(record['id'])

    # Keep only complete triples; store the text embedding as a tensor and the
    # audio embedding on the CPU so the pickle loads without a GPU.
    final_embeddings_data = {}
    for key in embeddings_data.keys():
        if key not in final_ids:
            continue
        final_embeddings_data[key] = embeddings_data[key]
        final_embeddings_data[key][2] = torch.tensor(final_embeddings_data[key][2])
        final_embeddings_data[key][0] = final_embeddings_data[key][0].cpu()

    with open(f'pretrained_embeddings_part{part_id}.pkl', 'wb') as f:
        pickle.dump(final_embeddings_data, f)


In [None]:
download_data_part('https://drive.google.com/file/d/1GkXhLbzHrobM4GaGX80PzRHl-TBPg8h4/view?usp=drive_link','dataset_part1.zip')

In [None]:
# pipeline(1)

In [None]:
# part 2
# download_data_part('https://drive.google.com/file/d/1n_xxsblXrw5tCVk7ZP6_kRtubjMOuC2l/view?usp=drive_link','dataset_part2.zip')

In [None]:
# pipeline(2)

In [None]:
# part 3
# download_data_part('https://drive.google.com/file/d/14-Tvd3jvgY2Ge5-cxOsX7Arx6esfUxeN/view?usp=drive_link','dataset_part3.zip')

In [None]:
# pipeline(3)

In [None]:
# part 4
# download_data_part('https://drive.google.com/file/d/1fId6jAxD5UzObg4zu30GPpcP-P9F7TSL/view?usp=drive_link','dataset_part4.zip')

In [None]:
# pipeline(4)

In [None]:
# part 5
# download_data_part('https://drive.google.com/file/d/1VssFVfqgnXy7JpgUKpTWEVg11bqYK2bn/view?usp=drive_link','dataset_part5.zip')

In [None]:
# pipeline(5)

In [None]:
# part 6
# download_data_part('https://drive.google.com/file/d/11vhiU4lV8w3IYTXebTDr-9j7834ig6OU/view?usp=drive_link','dataset_part6.zip')

In [None]:
# pipeline(6)

In [None]:
# part 7
# download_data_part('https://drive.google.com/file/d/1FbPz9VnDgU9Bj7sxtxOb7Etx1_dnobmH/view?usp=drive_link','dataset_part7.zip')

In [None]:
# pipeline(7)

In [None]:
# from shutil import rmtree
# rmtree('dataset_part6')
# rmtree('dataset_part7')

In [None]:
# part 8
# download_data_part('https://drive.google.com/file/d/1E_UWK88hWwhwKI8kiND3OCtdo3tnlOu5/view?usp=drive_link','dataset_part8.zip')

In [None]:
# pipeline(8)

In [None]:
# part 9
# download_data_part('https://drive.google.com/file/d/17PC8U5HYQ8r9wGlpQoeptjMyA7rK6PrZ/view?usp=drive_link','dataset_part9.zip')

In [None]:
# pipeline(9)

In [None]:
# part 10
# download_data_part('https://drive.google.com/file/d/1TQI49qhcDKSX0222sAQxDsQZVbZnjlBP/view?usp=drive_link','dataset_part10.zip')

In [None]:
# pipeline(10)

In [None]:
FileLink('pretrained_embeddings_part8.pkl')

In [None]:
FileLink('pretrained_embeddings_part9.pkl')

In [None]:
FileLink('pretrained_embeddings_part10.pkl')

In [None]:
# Sanity-check one saved pickle: number of entries and the type of each of the
# three stored embeddings.
# NOTE(review): this opens the part-2 file but keeps it in a variable named
# `pretrained_embeddings_part1` — confirm which part was intended.
with open('pretrained_embeddings_part2.pkl', 'rb') as f:
    pretrained_embeddings_part1 = pickle.load(f)
print(len(pretrained_embeddings_part1.keys()))
# print type of embeddings
sample_key = list(pretrained_embeddings_part1.keys())[0]
print('audio', type(pretrained_embeddings_part1[sample_key][0]))
print('image', type(pretrained_embeddings_part1[sample_key][1]))
print('text', type(pretrained_embeddings_part1[sample_key][2]))

In [None]:
# changes <class 'numpy.ndarray'> for text to tensor
for key in pretrained_embeddings_part1.keys():
    pretrained_embeddings_part1[key][2] = torch.tensor(pretrained_embeddings_part1[key][2])
    # also change the type of device for audio and remove cuda
    pretrained_embeddings_part1[key][0] = pretrained_embeddings_part1[key][0].cpu()

In [None]:
sample_key = list(pretrained_embeddings_part1.keys())[0]
print('audio', type(pretrained_embeddings_part1[sample_key][0]))
print('image', type(pretrained_embeddings_part1[sample_key][1]))
print('text', type(pretrained_embeddings_part1[sample_key][2]))

In [None]:
# store all embeddings files into a zip 
with ZipFile('pretrained_embeddings.zip', 'w') as zipObj:
    for part_id in range(1, 11):
        zipObj.write(f'pretrained_embeddings_part{part_id}.pkl')

In [None]:
# now read it to be sure
with ZipFile('pretrained_embeddings.zip', 'r') as zipObj:
    zipObj.extractall('pretrained_embeddings')

In [None]:
rmtree('dataset_part8')
rmtree('dataset_part9')
rmtree('dataset_part10')

In [None]:
FileLink('pretrained_embeddings.zip')

In [None]:
# create_new_localized_metadata
new_localized_metadata = {}
all_valid_ids = []
for part_id in range(1, 11):
    with open(f'pretrained_embeddings_part{part_id}.pkl', 'rb') as f:
        pretrained_embeddings_part = pickle.load(f)
    all_valid_ids.extend(list(pretrained_embeddings_part.keys()))

In [None]:
len(all_valid_ids)

In [None]:
# Keep only the metadata records whose id has saved embeddings, and tag each
# with the part it came from. Membership tests against a list are linear, so
# build a set once instead of scanning `all_valid_ids` for every record.
valid_id_lookup = set(all_valid_ids)
for key, value in localized_metdata.items():
    # The part number is the text after the leading 'part' in the key (up to
    # the first underscore, if any).
    part_number = int(key.split('_')[0][4:])
    for item in value:
        if item['id'] in valid_id_lookup:
            new_localized_metadata[item['id']] = item
            new_localized_metadata[item['id']]['part_id'] = part_number

In [None]:
len(new_localized_metadata)

In [None]:
random_key = random.choice(list(new_localized_metadata.keys()))
print(new_localized_metadata[random_key])

In [None]:
with open('metadata.json', 'w') as f:
    json.dump(new_localized_metadata, f)

In [None]:
FileLink('metadata.json')

# FLEURS (add Spectrogram, Wav2vec2, LaBSE)

In [None]:
fleurs_retrieval = load_dataset("google/fleurs", "en_us")

In [None]:
fleurs_retrieval['train'][0]['transcription']

In [None]:
fleurs_retrieval['train'][1000]

In [None]:
example = fleurs_retrieval['train'][0]
audio = example["audio"]["array"]
sampling_rate = example["audio"]["sampling_rate"]

display(Audio(audio, rate=sampling_rate))

In [None]:
audio.shape

In [None]:
samples = np.array(example["audio"]["array"])
# tensor = torchaudio.transforms.Resample(orig_sample_rate=array.shape[1], new_sample_rate=16000)(array)
# samples = samples.reshape(-1, audio_file.channels)
samples = samples / np.max(np.abs(samples))
samples = samples.squeeze()
audio = torch.from_numpy(samples)
audio = (audio - audio.mean()) / audio.std()
inputs = audio_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = audio_model(**inputs)
last_hidden_states = outputs.last_hidden_state.squeeze(0)
embeddings = last_hidden_states.mean(dim=0)
embeddings.shape

In [None]:
fleurs_retrieval['train']['path'][0]

In [None]:
file_path = f"/root/.cache/huggingface/datasets/downloads/extracted/ee72a6213f7cabb647adadc74f3888674aa278435dca4f54892382eda44c8014/{fleurs_retrieval['train'][0]['audio']['path']}"
audio_file = AudioSegment.from_file(file_path)
samples = np.array(audio_file.get_array_of_samples())
samples.shape

In [None]:
display(Audio(file_path))

In [None]:
print(fleurs_retrieval['train'].num_rows)
print(fleurs_retrieval['validation'].num_rows)
print(fleurs_retrieval['test'].num_rows)
print(fleurs_retrieval['train'].num_rows + fleurs_retrieval['validation'].num_rows + fleurs_retrieval['test'].num_rows)

In [None]:
with open('metadata.json', 'r') as f:
    metadata = json.load(f)

random_key = random.choice(list(metadata.keys()))
print(metadata[random_key])

In [None]:
# Largest id in the previously saved metadata; new ids are offset from it by
# 10000 so they start well above the existing ones.
max_id = max([int(key) for key in metadata.keys()])
start_id = max_id + 10000
start_id

In [None]:
fleurs_retrieval['validation'][0]

In [None]:
os.listdir('/root/.cache/huggingface/datasets/downloads/extracted/ca4597c8111fcd5b2597061f9372be99a157ee772eece66bfc4318463984a3af')

In [None]:
'/'.join(fleurs_retrieval['validation'][0]['path'].split('/')[:-1])

In [None]:
split = 'validation'
audio_idx = 0
file_path = f"{'/'.join(fleurs_retrieval[split][audio_idx]['path'].split('/')[:-1])}/{fleurs_retrieval[split][audio_idx]['audio']['path']}"
file_path

In [None]:
fleurs_retrieval

In [None]:
fleurs_metadata = {}

In [None]:
def pipeline_for_fleurs(split):
    """Build and pickle audio, spectrogram-image and text embeddings for a FLEURS split.

    Runs three passes over ``fleurs_retrieval[split]``; a row is only carried
    into the next pass if the previous one succeeded for it:

    1. audio: the file is decoded with pydub, divided by its peak absolute
       value, fed to ``audio_model``, and the last hidden state is averaged
       along dim 0 (after squeezing the batch axis);
    2. image: a log-frequency spectrogram is rendered to ``spec.png`` and
       passed through ``get_image_embedding``;
    3. text: the row's transcription is encoded with ``text_model``.

    The result is written to ``pretrained_embeddings_fleurs_{split}.pkl``.

    Args:
        split: Key of the split inside ``fleurs_retrieval``.

    Returns:
        dict mapping ``start_id + row index`` to a dict with the keys ``id``,
        ``fleus_id``, ``file_path``, ``audio_embedding``, ``text``,
        ``audio_idx``, ``image_embedding`` and ``text_embedding``.
    """
    # NOTE(review): ids are start_id + row index for every split, so the same
    # id can appear in more than one split's result — confirm that is intended.
    dataset_split = fleurs_retrieval[split]

    # Pass 1: audio embeddings and the per-row bookkeeping.
    data_v1 = {}
    for audio_idx in tqdm(range(len(dataset_split))):
        if audio_idx % 100 == 0:
            # Periodically release cached GPU memory and Python garbage.
            torch.cuda.empty_cache()
            gc.collect()
            torch.cuda.ipc_collect()
        try:
            # Fetch the row once; every lookup re-materialises it (and the
            # original did so several times per iteration).
            row = dataset_split[audio_idx]
            file_path = f"{'/'.join(row['path'].split('/')[:-1])}/{row['audio']['path']}"
            inner_data = {}
            inner_data['id'] = start_id + audio_idx
            inner_data['fleus_id'] = row['id']
            inner_data['file_path'] = file_path

            audio_file = AudioSegment.from_file(file_path)
            samples = np.array(audio_file.get_array_of_samples())
            samples = samples.reshape(-1, audio_file.channels)
            samples = samples / np.max(np.abs(samples))
            samples = samples.squeeze()

            audio = torch.from_numpy(samples)
            inputs = audio_processor(audio, sampling_rate=row['audio']['sampling_rate'], return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = audio_model(**inputs)
            last_hidden_states = outputs.last_hidden_state.squeeze(0)
            embeddings = last_hidden_states.mean(dim=0)
            inner_data['audio_embedding'] = embeddings.cpu()
            inner_data['text'] = row['transcription']
            inner_data['audio_idx'] = audio_idx
            data_v1[start_id + audio_idx] = inner_data
        except Exception as e:
            print(e)
            print(dataset_split[audio_idx]['id'])
            continue

    # Pass 2: spectrogram image embeddings for rows that survived pass 1. The
    # file path and FLEURS id were stored in pass 1, so no dataset lookup is
    # needed here.
    data_v2 = {}
    for audio_idx in tqdm(range(len(dataset_split))):
        sample_id = start_id + audio_idx
        if sample_id not in data_v1:
            continue

        if audio_idx % 100 == 0:
            torch.cuda.empty_cache()
            gc.collect()
            torch.cuda.ipc_collect()
        try:
            file_path = data_v1[sample_id]['file_path']
            y, sr = librosa.load(file_path)
            spec = librosa.stft(y)
            spec_db = librosa.amplitude_to_db(abs(spec))

            # Reuse the current figure; the image file is overwritten each time.
            plt.clf()
            librosa.display.specshow(spec_db, x_axis='time', y_axis='log')
            plt.xlabel('')
            plt.ylabel('')
            plt.tight_layout()

            saved_path = f'spec.png'
            plt.savefig(saved_path, bbox_inches='tight', pad_inches=0)
            image_embeddings = get_image_embedding(saved_path)
            data_v2[sample_id] = data_v1[sample_id].copy()
            data_v2[sample_id]['image_embedding'] = image_embeddings

        except Exception as e:
            print(e)
            print(data_v1[sample_id]['fleus_id'])
            continue

    # Pass 3: text embeddings for rows that survived pass 2.
    data_v3 = {}
    for audio_idx in tqdm(range(len(dataset_split))):
        sample_id = start_id + audio_idx
        if sample_id not in data_v2:
            continue
        try:
            text_embeddings = text_model.encode(data_v2[sample_id]['text'])
            data_v3[sample_id] = data_v2[sample_id].copy()
            data_v3[sample_id]['text_embedding'] = torch.tensor(text_embeddings)
        except Exception as e:
            print(e)
            print(data_v2[sample_id]['fleus_id'])
            continue
    with open(f'pretrained_embeddings_fleurs_{split}.pkl', 'wb') as f:
        pickle.dump(data_v3, f)
    return data_v3

In [None]:
split = 'validation'
fleurs_metadata[split] = pipeline_for_fleurs(split)

In [None]:
len(fleurs_metadata['validation'])

In [None]:
split = 'train'
fleurs_metadata[split] = pipeline_for_fleurs(split)

In [None]:
len(fleurs_metadata['train'])

In [None]:
split = 'test'
fleurs_metadata[split] = pipeline_for_fleurs(split)

In [None]:
len(fleurs_metadata['test'])

In [None]:
# save total metadata
with open('fleurs_data.pkl', 'wb') as f:
    pickle.dump(fleurs_metadata, f)

In [None]:
with open('fleurs_data.pkl', 'rb') as f:
    fleurs_data = pickle.load(f)
fleurs_data.keys()

In [None]:
random_key = random.choice(list(fleurs_data['validation'].keys()))
fleurs_data['validation'][random_key]

In [None]:
FileLink('fleurs_data.pkl')

In [None]:
print(len(fleurs_data['validation']) + len(fleurs_data['train']) + len(fleurs_data['test']))

# Common Voice (add Spectrogram, Wav2vec2, LaBSE)

In [None]:
data_path = "/kaggle/input/common-voice-dataset-version-4/data-file/train.tsv"
audio_path = "/kaggle/input/common-voice-dataset-version-4/new-clip"
data_train = pd.read_csv(data_path,comment='#',sep="\t")

In [None]:
len(data_train)

232975

In [None]:
data_train.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,4f29be8fe932d773576dd3df5e111929f4e22242232245...,common_voice_en_19664034.mp3,"These data components in turn serve as the ""bu...",2,0,thirties,male,
1,4f29be8fe932d773576dd3df5e111929f4e22242232245...,common_voice_en_19664035.mp3,The church is unrelated to the Jewish politica...,3,0,thirties,male,
2,4f29be8fe932d773576dd3df5e111929f4e22242232245...,common_voice_en_19664037.mp3,The following represents architectures which h...,2,0,thirties,male,
3,4f29be8fe932d773576dd3df5e111929f4e22242232245...,common_voice_en_19664038.mp3,"Additionally, the pulse output can be directed...",2,0,thirties,male,
4,4f29be8fe932d773576dd3df5e111929f4e22242232245...,common_voice_en_19664040.mp3,The two are robbed by a pickpocket who is losi...,3,0,thirties,male,


In [None]:
data_train['sentence'][4]

In [None]:
display(Audio(f"{audio_path}/{data_train['path'][4]}"))

In [None]:
audio_file = AudioSegment.from_file(f"{audio_path}/{data_train['path'][4]}")

In [None]:
len(audio_file.get_array_of_samples())

223488

In [None]:
# preprocess sentence
def preprocess_sentence(sentence):
    """Normalise a sentence for comparison.

    Lower-cases the text, deletes every character listed in
    ``string.punctuation``, then trims leading and trailing whitespace.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return sentence.lower().translate(punctuation_remover).strip()

In [None]:
preprocess_sentence(data_train['sentence'][10])

'what did you think of that trip'

In [None]:
def get_image_embedding(path):
    """Run the image at ``path`` through ``vision_model`` and return its output.

    Args:
        path: Path of an image file readable by ``cv2.imread``.

    Returns:
        The first (only) row of the model output, moved to the CPU.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising; fail
        # here with a clear message rather than inside torch.from_numpy.
        raise FileNotFoundError(f'could not read image: {path}')
    # HWC array -> NCHW tensor with a batch dimension of 1.
    # NOTE(review): pixels are fed exactly as loaded (no resize, scaling or
    # channel reordering) — confirm this is what the model is meant to see.
    image_tensor = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2)
    image_tensor = image_tensor.to(device).float()
    with torch.no_grad():
        output = vision_model(image_tensor)
    return output[0].cpu()

In [None]:
start_id = 200000

In [None]:
def pipeline_for_common_voice(dataset, limit, split, valid_ids, part_number, start_idx=0):
    """Build and pickle embeddings for one chunk of a Common Voice split.

    Runs three passes over the selected row indices; a row is only carried
    into the next pass if the previous one succeeded for it:

    1. audio: the clip is decoded with pydub, divided by its peak absolute
       value, resampled to 16 kHz, fed to ``audio_model``, and the last hidden
       state is averaged along dim 0 (after squeezing the batch axis). This
       pass stops after ``limit + 50`` successful rows;
    2. image: a log-frequency spectrogram is rendered to ``spec.png`` and
       passed through ``get_image_embedding``; stops after ``limit`` successes;
    3. text: the preprocessed sentence is encoded with ``text_model``; stops
       after ``limit`` successes.

    The result is written to
    ``pretrained_embeddings_common_voice_{split}_part_{part_number}.pkl``,
    keyed by ``start_id + row index``.

    Args:
        dataset: Table with ``'path'`` and ``'sentence'`` columns, indexed by
            the values in ``valid_ids``.
        limit: Maximum number of rows written to the pickle.
        split: Split name used in the output file name.
        valid_ids: Ordered row indices whose audio file exists.
        part_number: Chunk number used in the output file name.
        start_idx: ``0`` to start from the beginning of ``valid_ids``;
            otherwise processing resumes after the first entry of
            ``valid_ids`` equal to ``start_idx``.

    Returns:
        The row index at which the first pass stopped, or ``0`` if that pass
        ran through all selected rows without reaching ``limit + 50``.
    """
    data_v1 = {}
    counter = 0
    j = 0
    max_id = 0
    if start_idx == 0:
        val_ids_final = valid_ids
    else:
        for i, item in enumerate(valid_ids):
            if item == start_idx:
                j = i
                break
        val_ids_final = valid_ids[(j + 1):]

    # Pass 1: audio embeddings and the per-row bookkeeping.
    for audio_idx in tqdm(val_ids_final):
        if counter == 0:
            print(f'start_idx {audio_idx}')
        if counter % 100 == 0:
            # Periodically release cached GPU memory and Python garbage.
            torch.cuda.empty_cache()
            gc.collect()
            torch.cuda.ipc_collect()
        if counter == limit + 50:
            # The 50 extra rows leave slack for failures in the later passes.
            max_id = audio_idx
            break
        try:
            file_path = f"{audio_path}/{dataset['path'][audio_idx]}"
            inner_data = {}
            inner_data['id'] = start_id + audio_idx
            inner_data['file_path'] = file_path
            inner_data['audio_idx'] = audio_idx

            audio_file = AudioSegment.from_file(file_path)
            samples = np.array(audio_file.get_array_of_samples())
            samples = samples.reshape(-1, audio_file.channels)
            samples = samples / np.max(np.abs(samples))
            samples = samples.squeeze()

            # `resampy` is never imported in this notebook, so the original
            # call raised NameError for every row (silently swallowed below).
            # librosa is already imported and provides resampling.
            samples = librosa.resample(samples, orig_sr=audio_file.frame_rate, target_sr=16000)
            audio = torch.from_numpy(samples)
            inputs = audio_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = audio_model(**inputs)
            last_hidden_states = outputs.last_hidden_state.squeeze(0)
            embeddings = last_hidden_states.mean(dim=0)
            inner_data['audio_embedding'] = embeddings.cpu()
            inner_data['text'] = preprocess_sentence(dataset['sentence'][audio_idx])
            data_v1[start_id + audio_idx] = inner_data
            counter += 1
        except Exception as e:
            print(e)
            print(dataset['path'][audio_idx])
            continue
    print(f'max id {max_id}')

    # Pass 2: spectrogram image embeddings for rows that survived pass 1.
    data_v2 = {}
    counter = 0
    for audio_idx in tqdm(val_ids_final):
        if counter == limit:
            break

        if start_id + audio_idx not in data_v1.keys():
            continue

        if counter % 100 == 0:
            torch.cuda.empty_cache()
            gc.collect()
            torch.cuda.ipc_collect()

        try:
            file_path = f"{audio_path}/{dataset['path'][audio_idx]}"
            y, sr = librosa.load(file_path)
            spec = librosa.stft(y)
            spec_db = librosa.amplitude_to_db(abs(spec))

            # Reuse the current figure; the image file is overwritten each time.
            plt.clf()
            librosa.display.specshow(spec_db, x_axis='time', y_axis='log')
            plt.xlabel('')
            plt.ylabel('')
            plt.tight_layout()

            saved_path = f'spec.png'
            plt.savefig(saved_path, bbox_inches='tight', pad_inches=0)
            image_embeddings = get_image_embedding(saved_path)
            data_v2[start_id + audio_idx] = data_v1[start_id + audio_idx].copy()
            data_v2[start_id + audio_idx]['image_embedding'] = image_embeddings
            counter += 1

        except Exception as e:
            print(e)
            print(dataset['path'][audio_idx])
            continue

    # Pass 3: text embeddings for rows that survived pass 2.
    data_v3 = {}
    counter = 0
    for audio_idx in tqdm(val_ids_final):
        if start_id + audio_idx not in data_v2.keys():
            continue
        if counter == limit:
            break
        try:
            text_embeddings = text_model.encode(data_v2[start_id + audio_idx]['text'])
            data_v3[start_id + audio_idx] = data_v2[start_id + audio_idx].copy()
            data_v3[start_id + audio_idx]['text_embedding'] = torch.tensor(text_embeddings)
            counter += 1
        except Exception as e:
            print(e)
            print(dataset['path'][audio_idx])
            continue
    with open(f'pretrained_embeddings_common_voice_{split}_part_{part_number}.pkl', 'wb') as f:
        pickle.dump(data_v3, f)
    print(f'max id {max_id}')
    return max_id

In [None]:
def find_proper_audios(dataset):
    """Return the row indices of ``dataset`` whose audio file exists on disk.

    Each row's ``'path'`` value is looked up under ``audio_path``; the number
    of hits is printed before the list is returned.
    """
    audio_idxs = [
        row_idx
        for row_idx in tqdm(range(len(dataset)))
        if os.path.exists(f"{audio_path}/{dataset['path'][row_idx]}")
    ]
    print(f'len of audio idxs {len(audio_idxs)}')
    return audio_idxs

In [None]:
train_valid_ids = find_proper_audios(data_train)

100%|██████████| 232975/232975 [12:05<00:00, 321.04it/s]

len of audio idxs 178211





In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=1, start_idx=0)

  0%|          | 0/178211 [00:00<?, ?it/s]

start_idx 0


  3%|▎         | 5100/178211 [26:59<15:16:17,  3.15it/s]


max id 18105


  plt.tight_layout()
  3%|▎         | 4605/178211 [51:40<37:40:45,  1.28it/s]

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=2, start_idx=max_idx)

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=3, start_idx=max_idx)

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=4, start_idx=max_idx)

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=5, start_idx=max_idx)

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=6, start_idx=max_idx)

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=7, start_idx=max_idx)

  3%|▎         | 5050/147855 [29:29<13:54:08,  2.85it/s]


max id 71054


  plt.tight_layout()
  0%|          | 179/147855 [02:05<28:40:00,  1.43it/s]

In [None]:
# max_idx = 71054

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=8, start_idx=max_idx)

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=9, start_idx=max_idx)

In [None]:
# max_idx = pipeline_for_common_voice(data_train, 5000, 'train', train_valid_ids, part_number=10, start_idx=max_idx)

In [None]:
dev_data_path = "/kaggle/input/common-voice-dataset-version-4/data-file/dev.tsv"
data_dev = pd.read_csv(dev_data_path,comment='#',sep="\t")

In [None]:
len(data_dev)

15531

In [None]:
dev_valid_ids = find_proper_audios(data_dev)

100%|██████████| 15531/15531 [00:33<00:00, 457.66it/s]

len of audio idxs 3492





In [None]:
max_idx = pipeline_for_common_voice(data_dev, 5000, 'validation', dev_valid_ids, part_number=1, start_idx=0)

In [None]:
test_data_path = "/kaggle/input/common-voice-dataset-version-4/data-file/test.tsv"
data_test = pd.read_csv(test_data_path,comment='#',sep="\t")
len(data_test)

15531

In [None]:
test_valid_ids = find_proper_audios(data_test)

100%|██████████| 15531/15531 [00:33<00:00, 461.98it/s]

len of audio idxs 2197





In [None]:
max_idx = pipeline_for_common_voice(data_test, 5000, 'test', test_valid_ids, part_number=1, start_idx=0)

In [None]:
# mix all files pretrained
import pickle
from IPython.display import FileLink

data = {'train': {}, 'test': {}, 'validation': {}}
for i in range(1, 11):
    with open(f'pretrained_embeddings_common_voice_train_part_{i}.pkl', 'rb') as f:
        data['train'].update(pickle.load(f))
with open(f'pretrained_embeddings_common_voice_test_part_1.pkl', 'rb') as f:
    data['test'].update(pickle.load(f))
with open(f'pretrained_embeddings_common_voice_validation_part_1.pkl', 'rb') as f:
    data['validation'].update(pickle.load(f))

with open(f'pretrained_embeddings_common_voice.pkl', 'wb') as f:
    pickle.dump(data, f)

In [None]:
FileLink('pretrained_embeddings_common_voice.pkl')

In [None]:
with open(f'pretrained_embeddings_common_voice.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
print(len(data['train']))
print(len(data['validation']))
print(len(data['test']))

50000
3492
2197


# Wav2vec2 ASR Transcript

## Brown

In [None]:
import gdown
import pickle
import random

In [None]:
url = "https://kkb-production.jupyter-proxy.kaggle.net/k/139428569/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2IiwidHlwIjoiSldUIn0..5CBzgtoLZvb8BcQMfPnk4g.YCRbaBFFoNXwxFOKdpuHxZJKp4P-33myO3ZCed7eCKSU5b0xM04sCCnLY0PZdOm4mZbtof_O5lFZ9mDLpy2hSUwju3p1u-USToNMsrVXfLDggSQCFhPuv9HnV9blsYKP9B24HwqnO985bJ04IBN9AenMzQOL5Yis4qTp14gUGx2ULMit81PQiMfoFvjjMt6dVqclC1FnrSH0mFYGJUGETA.3eqncq_FXklnubs1jK2sRw/proxy/files/total_dataset_test_with_text_audio.pkl"
# NOTE(review): the `url` assigned on the line above points at a Kaggle
# notebook-proxy host and carries a long opaque path segment that looks like a
# session token — presumably short-lived; confirm, and move the file to a
# stable location so this download is reproducible and no token is committed.
output = "total_dataset_test_with_text_audio.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
with open('total_dataset_test_with_text_audio.pkl', 'rb') as f:
    total_dataset = pickle.load(f)

In [None]:
total_dataset.keys()

In [None]:
random_key = random.choice(total_dataset['audio_path'])
print(random_key)

In [None]:
def download_data_part(url, output):
    """Download a zip archive with gdown, unpack it, and delete the archive.

    Args:
        url: Link forwarded to ``gdown.download`` (called with ``fuzzy=True``).
        output: Local path the archive is saved to, e.g. ``'dataset_part1.zip'``.
            The archive is extracted into a directory named after ``output``
            with its final extension removed (``'dataset_part1'``).
    """
    gdown.download(url, output, quiet=False, fuzzy=True)
    # splitext strips only the trailing extension, so a dot elsewhere in the
    # path (e.g. './data/part1.zip') no longer truncates the target directory.
    extract_dir = os.path.splitext(output)[0]
    with ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    os.remove(output)

In [None]:
# Copy the dataset and add an 'asr-text' column with one empty placeholder
# string per 'pure-text' entry.
total_dataset_v2 = total_dataset.copy()
total_dataset_v2['asr-text'] = [''] * len(total_dataset_v2['pure-text'])


In [None]:
with open('total_dataset_asr_v2.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf

# Load the pre-trained model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")


In [None]:
def transcribe_audio_files(audio_file_list):
    """Transcribe each audio file with the wav2vec2 CTC ``model``.

    Args:
        audio_file_list: Paths of audio files readable by ``soundfile.read``.

    Returns:
        A list with one lower-cased transcription per input path, in order.
    """
    transcriptions = []
    for audio_file in tqdm(audio_file_list):
        # Load the audio file
        # NOTE(review): the sample rate returned by sf.read is discarded and
        # 16 kHz is assumed below — confirm the files are 16 kHz.
        audio_input, _ = sf.read(audio_file)
        # Preprocess the audio file
        input_values = processor(audio_input, sampling_rate=16_000, return_tensors="pt").input_values
        # Get the model's prediction. This is inference only, so skip autograd
        # bookkeeping instead of building a graph for every file.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        # Decode the prediction to text
        transcription = processor.batch_decode(predicted_ids)[0]
        transcriptions.append(transcription.lower())
    return transcriptions

In [None]:
# Example usage
audio_file_list = ['dataset_part1/audios/audio_3915.wav','dataset_part1/audios/audio_200.wav']
transcriptions = transcribe_audio_files(audio_file_list)
print(transcriptions)


In [None]:
download_data_part('https://drive.google.com/file/d/1GkXhLbzHrobM4GaGX80PzRHl-TBPg8h4/view?usp=drive_link','dataset_part1.zip')

In [None]:
def add_for_part_asr_text(part_id):
    """Fill in the ASR transcripts for one Brown part and persist the result.

    Loads ``total_dataset_asr_v2.pkl`` and writes a copy of it to
    ``total_dataset_asr_v2_backup.pkl``. It then transcribes every row whose
    ``'source'`` is ``'brown'`` and whose ``'audio_path'`` contains
    ``/part{part_id}/``, stores the transcripts under ``'asr-text'``, and
    saves the dataset back. Finally ``part_id`` is recorded in ``part_id.txt``.
    """
    with open('total_dataset_asr_v2.pkl', 'rb') as f:
        total_dataset_v2 = pickle.load(f)
    # Keep a copy of the state before this part is applied.
    with open('total_dataset_asr_v2_backup.pkl', 'wb') as f:
        pickle.dump(total_dataset_v2, f)

    part_marker = f'/part{part_id}/'
    local_audio_dir = f'dataset_part{part_id}/audios/'
    item_indexes = []
    audio_pathes = []
    for row, remote_path in enumerate(total_dataset_v2['audio_path']):
        is_brown = total_dataset_v2['source'][row] == 'brown'
        if is_brown and part_marker in remote_path:
            item_indexes.append(row)
            # Only the file name is kept; the clip is read from the local part directory.
            audio_pathes.append(local_audio_dir + remote_path.split('/')[-1])
    print(len(item_indexes))

    transcriptions = transcribe_audio_files(audio_pathes)
    for position, item_index in enumerate(item_indexes):
        total_dataset_v2['asr-text'][item_index] = transcriptions[position]

    with open('total_dataset_asr_v2.pkl', 'wb') as f:
        pickle.dump(total_dataset_v2, f)
    # write part_id to file
    with open('part_id.txt', 'w') as f:
        f.write(str(part_id))

In [None]:
#read part_id 
with open('part_id.txt', 'r') as f:
    part_id = f.read()
part_id

In [None]:
add_for_part_asr_text(1)

In [None]:
with open('total_dataset_asr_v2.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['asr-text']):
    if i != '':
        if c == 1 :
            print(i)
            print(total_dataset_v2['pure-text'][j])
        c += 1
print(c)

In [None]:
print(total_dataset_v2['asr-text'])

In [None]:
# part 2
download_data_part('https://drive.google.com/file/d/1n_xxsblXrw5tCVk7ZP6_kRtubjMOuC2l/view?usp=drive_link','dataset_part2.zip')

In [None]:
add_for_part_asr_text(2)

In [None]:
with open('total_dataset_asr_v2.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['asr-text']):
    if i != '':
        if c == 1 :
            print(i)
            print(total_dataset_v2['pure-text'][j])
        c += 1
print(c)

In [None]:
from shutil import rmtree
rmtree('dataset_part1')
rmtree('dataset_part2')

In [None]:
# part 3
download_data_part('https://drive.google.com/file/d/14-Tvd3jvgY2Ge5-cxOsX7Arx6esfUxeN/view?usp=drive_link','dataset_part3.zip')

In [None]:
add_for_part_asr_text(3)

In [None]:
rmtree('dataset_part3')

In [None]:
# part 4
download_data_part('https://drive.google.com/file/d/1fId6jAxD5UzObg4zu30GPpcP-P9F7TSL/view?usp=drive_link','dataset_part4.zip')

In [None]:
add_for_part_asr_text(4)

In [None]:
# part 5
download_data_part('https://drive.google.com/file/d/1VssFVfqgnXy7JpgUKpTWEVg11bqYK2bn/view?usp=drive_link','dataset_part5.zip')

In [None]:
add_for_part_asr_text(5)

In [None]:
from shutil import rmtree
rmtree('dataset_part4')
rmtree('dataset_part5')

In [None]:
# part 6
download_data_part('https://drive.google.com/file/d/11vhiU4lV8w3IYTXebTDr-9j7834ig6OU/view?usp=drive_link','dataset_part6.zip')

In [None]:
add_for_part_asr_text(6)

In [None]:
# part 7
download_data_part('https://drive.google.com/file/d/1FbPz9VnDgU9Bj7sxtxOb7Etx1_dnobmH/view?usp=drive_link','dataset_part7.zip')

In [None]:
add_for_part_asr_text(7)

In [None]:
rmtree('dataset_part6')
rmtree('dataset_part7')

In [None]:
# part 8
download_data_part('https://drive.google.com/file/d/1E_UWK88hWwhwKI8kiND3OCtdo3tnlOu5/view?usp=drive_link','dataset_part8.zip')

In [None]:
add_for_part_asr_text(8)

In [None]:
# part 9
download_data_part('https://drive.google.com/file/d/17PC8U5HYQ8r9wGlpQoeptjMyA7rK6PrZ/view?usp=drive_link','dataset_part9.zip')

In [None]:
add_for_part_asr_text(9)

In [None]:
rmtree('dataset_part8')
rmtree('dataset_part9')

In [None]:
# part 10
download_data_part('https://drive.google.com/file/d/1TQI49qhcDKSX0222sAQxDsQZVbZnjlBP/view?usp=drive_link','dataset_part10.zip')

In [None]:
add_for_part_asr_text(10)

In [None]:
rmtree('dataset_part10')

## Common Voice

In [None]:
url = "https://drive.google.com/file/d/1sNYqSU5lmUiuSlffMcyRYv6ho5p7ra2a/view?usp=sharing"
output = "total_dataset_test_with_text_audio.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1sNYqSU5lmUiuSlffMcyRYv6ho5p7ra2a
From (redirected): https://drive.google.com/uc?id=1sNYqSU5lmUiuSlffMcyRYv6ho5p7ra2a&confirm=t&uuid=5f6f2b8e-e76a-46bb-a4e9-e04b499ff4d9
To: /kaggle/working/total_dataset_test_with_text_audio.pkl
100%|██████████| 847M/847M [00:07<00:00, 107MB/s]  


'total_dataset_test_with_text_audio.pkl'

In [None]:
with open('total_dataset_test_with_text_audio.pkl', 'rb') as f:
    total_dataset = pickle.load(f)

In [None]:
total_dataset_v2 = total_dataset.copy()
total_dataset_v2['asr-text'] = ['' for i in range(len(total_dataset_v2['pure-text']))]

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf

# Load the pre-trained model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def transcribe_audio_files(audio_file_list):
    """Transcribe audio files with the module-level CTC ``model``/``processor``.

    Each file is decoded with pydub, down-mixed to mono, peak-normalised,
    resampled to 16 kHz and greedily decoded (argmax over the logits).

    Args:
        audio_file_list: Iterable of audio file paths readable by
            ``AudioSegment.from_file``.

    Returns:
        list[str]: One lower-cased transcription per input path, in order.
    """
    transcriptions = []
    for file_path in tqdm(audio_file_list):
        audio_file = AudioSegment.from_file(file_path)
        # Convert to float before any arithmetic: np.abs on the integer
        # sample array overflows for the most negative sample value.
        samples = np.array(audio_file.get_array_of_samples(), dtype=np.float64)
        # Samples are interleaved per channel; average the channels so the
        # signal is 1-D. Without this a stereo clip stays (n, 2) and would be
        # resampled along the channel axis instead of the time axis.
        samples = samples.reshape(-1, audio_file.channels).mean(axis=1)
        peak = np.max(np.abs(samples))
        if peak > 0:
            # An all-zero (silent) clip is left as is rather than divided by
            # zero, which would turn every sample into NaN.
            samples = samples / peak

        samples = resampy.resample(samples, audio_file.frame_rate, 16000)
        audio = torch.from_numpy(samples)
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs).logits
        predicted_ids = torch.argmax(outputs, dim=-1)
        # Decode the prediction to text
        transcription = processor.batch_decode(predicted_ids)[0]
        transcriptions.append(transcription.lower())
    return transcriptions

In [None]:
total_dataset_v2['source'][0]

'common_voice'

In [None]:
# Example usage
audio_file_list = ['/kaggle/input/common-voice-dataset-version-4/new-clip/common_voice_en_100040.mp3','/kaggle/input/common-voice-dataset-version-4/new-clip/common_voice_en_19714982.mp3']
transcriptions = transcribe_audio_files(audio_file_list)
print(transcriptions)


100%|██████████| 2/2 [00:00<00:00,  3.20it/s]

['the burning fire had been extinguished', 'this idea have to provide some incight into these farming adoptations']





In [None]:
Audio(audio_file_list[1])

In [None]:
with open('total_dataset_asr_v2.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)

In [None]:
def add_asr_text_common_voice():
    """Fill the ``'asr-text'`` column for every Common Voice item.

    Reads ``total_dataset_asr_v2.pkl``, transcribes the audio of all items
    whose source is ``'common_voice'`` using the paths stored in the dataset,
    and writes the updated dataset back to the same file. The number of
    selected items is printed before transcription starts.
    """
    pickle_path = 'total_dataset_asr_v2.pkl'
    with open(pickle_path, 'rb') as fh:
        data = pickle.load(fh)

    sources = data['source']
    rows = []
    paths = []
    for row, path in enumerate(data['audio_path']):
        if sources[row] == 'common_voice':
            rows.append(row)
            paths.append(path)
    print(len(rows))

    asr_texts = transcribe_audio_files(paths)
    # rows[k] is the dataset index of the k-th transcribed clip.
    for k in range(len(rows)):
        data['asr-text'][rows[k]] = asr_texts[k]

    with open(pickle_path, 'wb') as fh:
        pickle.dump(data, fh)

In [None]:
add_asr_text_common_voice()

2197


100%|██████████| 2197/2197 [12:48<00:00,  2.86it/s]


In [None]:
with open('total_dataset_asr_v2.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['asr-text']):
    if i != '':
        if c == 1 :
            print(i)
            print(total_dataset_v2['pure-text'][j])
        c += 1
print(c)

two young women sit on a stone staircase in front of a store shuttered with a decorative iron grate
two young women sit on a stone staircase in front of a store shuttered with a decorative iron grate
2194


## FLEURS

In [None]:
fleurs_retrieval = load_dataset("google/fleurs", "en_us")

Downloading builder script:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading and preparing dataset fleurs/en_us to /root/.cache/huggingface/datasets/google___fleurs/en_us/2.0.0/af82dbec419a815084fa63ebd5d5a9f24a6e9acdf9887b9e3b8c6bbd64e0b7ac...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/290M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/368k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset fleurs downloaded and prepared to /root/.cache/huggingface/datasets/google___fleurs/en_us/2.0.0/af82dbec419a815084fa63ebd5d5a9f24a6e9acdf9887b9e3b8c6bbd64e0b7ac. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
with open('total_dataset_asr_v2.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['source']):
    if i == 'fleurs':
        print(total_dataset_v2['audio_path'][j])
        c += 1
    if c == 3:
        break

/root/.cache/huggingface/datasets/downloads/extracted/767809e2794806f54865259a100b1774ae3de84135396cf5c5fa2a50bd3bcbef/test/1273817225649957097.wav
/root/.cache/huggingface/datasets/downloads/extracted/767809e2794806f54865259a100b1774ae3de84135396cf5c5fa2a50bd3bcbef/test/1394697283561012104.wav
/root/.cache/huggingface/datasets/downloads/extracted/767809e2794806f54865259a100b1774ae3de84135396cf5c5fa2a50bd3bcbef/test/5196260640941390313.wav


In [None]:
audio_file_list = ['/root/.cache/huggingface/datasets/downloads/extracted/767809e2794806f54865259a100b1774ae3de84135396cf5c5fa2a50bd3bcbef/test/1273817225649957097.wav','/root/.cache/huggingface/datasets/downloads/extracted/767809e2794806f54865259a100b1774ae3de84135396cf5c5fa2a50bd3bcbef/test/1394697283561012104.wav']
transcriptions = transcribe_audio_files(audio_file_list)
print(transcriptions)

100%|██████████| 2/2 [00:00<00:00,  2.36it/s]

['italian is also the every day language used by most of those who work in the state while latin is often used in religious ceremonies', "well we don't know for sure but it may have had a fork tong is diet included turtles larchfis other most sausorers ai may even have been a cannibal"]





In [None]:
with open('total_dataset_asr_v3.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)

In [None]:
def add_asr_text_fleurs():
    """Fill the ``'asr-text'`` column for every FLEURS item.

    Reads ``total_dataset_asr_v3.pkl``, transcribes the audio of all items
    whose source is ``'fleurs'`` using the paths stored in the dataset, and
    writes the updated dataset back to the same file. The number of selected
    items is printed before transcription starts.
    """
    with open('total_dataset_asr_v3.pkl', 'rb') as f:
        total_dataset_v2 = pickle.load(f)
    item_indexes = []
    audio_pathes = []
    for i, audio_path in enumerate(total_dataset_v2['audio_path']):
        if total_dataset_v2['source'][i] != 'fleurs':
            continue
        item_indexes.append(i)
        audio_pathes.append(audio_path)
    print(len(item_indexes))
    transcriptions = transcribe_audio_files(audio_pathes)
    for i, item_index in enumerate(item_indexes):
        total_dataset_v2['asr-text'][item_index] = transcriptions[i]
    with open('total_dataset_asr_v3.pkl', 'wb') as f:
        pickle.dump(total_dataset_v2, f)


# Backward-compatible alias. This cell used to define the FLEURS routine under
# the name `add_asr_text_common_voice`, shadowing the real Common Voice
# function defined earlier in the notebook. The old name is kept so existing
# calls below keep running the FLEURS routine; new code should call
# `add_asr_text_fleurs`.
add_asr_text_common_voice = add_asr_text_fleurs

In [None]:
add_asr_text_common_voice()

647


100%|██████████| 647/647 [02:39<00:00,  4.04it/s]


In [None]:
with open('total_dataset_asr_v3.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['asr-text']):
    if i != '':
        if c == 1 :
            print(i)
            print(total_dataset_v2['pure-text'][j])
        c += 1
print(c)

two young women sit on a stone staircase in front of a store shuttered with a decorative iron grate
two young women sit on a stone staircase in front of a store shuttered with a decorative iron grate
2841


In [None]:
total_dataset_v2.keys()

dict_keys(['audio', 'image', 'text', 'pure-text', 'audio_path', 'id', 'source', 'asr-text'])

In [None]:
# SECURITY NOTE(review): this URL points at a Kaggle Jupyter-proxy session and
# the long `eyJ...` path segment looks like a session token. It should not be
# committed, and it presumably expires, which makes this cell non-reproducible.
# TODO: host the Brown ASR pickle at a stable location and clear the captured
# output of this cell as well (it echoes the same URL).
url = "https://kkb-production.jupyter-proxy.kaggle.net/k/139427160/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2IiwidHlwIjoiSldUIn0..GPPHQYRNvzdQq_kb-qkREQ.HzgmSM8BfyRPRo1SFJujRphEncHC1r3gs5QaT4hinyaoiznna8JR289ZF957iEhvMNRJfAQE0FW79mV6wVwGVnVq5fb-nyRJ_gK8UIlS_zWi2O_jDZ8N6qgayzae8Q-J1uvz9jaBo1bdyKJsh18EJJGzq9faXuqM-uigTY1ybVlDaiXKGq1HGioMK44HKyMfhaMUcuAGX6wIGpKNwUMmjQ.2QQ_AEEggAHc7ppsmvgyXw/proxy/files/total_dataset_asr_v2.pkl"
output = "total_dataset_asr_brown.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From: https://kkb-production.jupyter-proxy.kaggle.net/k/139427160/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2IiwidHlwIjoiSldUIn0..GPPHQYRNvzdQq_kb-qkREQ.HzgmSM8BfyRPRo1SFJujRphEncHC1r3gs5QaT4hinyaoiznna8JR289ZF957iEhvMNRJfAQE0FW79mV6wVwGVnVq5fb-nyRJ_gK8UIlS_zWi2O_jDZ8N6qgayzae8Q-J1uvz9jaBo1bdyKJsh18EJJGzq9faXuqM-uigTY1ybVlDaiXKGq1HGioMK44HKyMfhaMUcuAGX6wIGpKNwUMmjQ.2QQ_AEEggAHc7ppsmvgyXw/proxy/files/total_dataset_asr_v2.pkl
To: /kaggle/working/total_dataset_asr_brown.pkl
848MB [00:26, 32.2MB/s] 


'total_dataset_asr_brown.pkl'

In [None]:
with open('total_dataset_asr_brown.pkl', 'rb') as f:
    brown_asr = pickle.load(f)

In [None]:
with open('total_dataset_asr_v3.pkl', 'rb') as f:
    total_asr_dataset = pickle.load(f)
for j, i in enumerate(total_asr_dataset['source']):
    if i == 'brown':
        total_asr_dataset['asr-text'][j] = brown_asr['asr-text'][j]

In [None]:
c = 0
for j, i in enumerate(total_asr_dataset['asr-text']):
    if i == '':
        print(total_asr_dataset['pure-text'][j])
        c += 1
print(c)

2.
2.
But!!
2.
a young arab also loaded down with baggage entered and greeted the englishman
2.
10.
inverted frames are common for several of the lower values
We were coming to an intersection , turning right , chuffing to a stop.
2.
It was General Burnside's horse running in a circle.
6.
6.
6.
His statistical record that year , when Texas won only one game and lost nine , was far from impressive : he carried the ball three times for a net gain of 10 yards , punted once for 39 yards and caught one pass for 13 yards .
To help prevent orthodontic problems from arising , your dentist can do these things :
2.
Sec. 7.
first impressions are the most lasting
C
Soon as the Burnsides moved on , he'd lead Rex down by the river ; ;
2.
We have recourse to the scientifically-trained specialist in the laboratory.
A new low capacity meter is the key that unlocks the situation at Oakwood Heights.
`` O!!
25


In [None]:
with open('total_dataset_asr_final.pkl', 'wb') as f:
    pickle.dump(total_asr_dataset, f)

## Extract Labse Embeddings of ASR texts

In [None]:
with open('total_dataset_asr_final.pkl', 'rb') as f:
    total_asr_dataset = pickle.load(f)

In [None]:
text_model = SentenceTransformer('sentence-transformers/LaBSE').to(device)

In [None]:
def batch(iterable, n=16):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items.

    Args:
        iterable: A sized, sliceable sequence (``len`` and slicing are used).
        n: Maximum number of items per yielded slice.

    Yields:
        Slices of ``iterable`` in order; the last one may be shorter than ``n``.
    """
    size = len(iterable)
    for start in range(0, size, n):
        stop = start + n
        if stop > size:
            stop = size
        yield iterable[start:stop]
        
batch_size = 16
total_asr_dataset_v2 = total_asr_dataset.copy()
total_asr_dataset_v2['asr-text-embedding'] = []

for s in tqdm(batch(total_asr_dataset['asr-text'], batch_size)):
    total_asr_dataset_v2['asr-text-embedding'].extend(torch.Tensor(text_model.encode(s)))

In [None]:
len(total_asr_dataset_v2['asr-text-embedding'])

11411

In [None]:
len(total_asr_dataset_v2['text'])

11411

In [None]:
with open('total_dataset_asr_final_v2.pkl', 'wb') as f:
    pickle.dump(total_asr_dataset_v2, f)

In [None]:
FileLink('total_dataset_asr_final_v2.pkl')

# Hubert ASR Transcript


## Brown

In [None]:
## Hubert ASR
from transformers import AutoProcessor, HubertForCTC
import soundfile as sf

hubert_processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft").to(device)


In [None]:
def asr_hubert_audio_files(audio_file_list):
    """Transcribe audio files with the module-level HuBERT CTC model.

    Each file is read with soundfile, down-mixed to mono, peak-normalised,
    resampled to 16 kHz and greedily decoded (argmax over the CTC logits).

    Args:
        audio_file_list: Iterable of audio file paths readable by ``sf.read``.

    Returns:
        list[str]: One lower-cased transcription per input path, in order.
    """
    transcriptions = []
    for file_path in tqdm(audio_file_list):
        # Use soundfile to read the audio file
        data, samplerate = sf.read(file_path)
        # sf.read returns (frames, channels) for multi-channel files; average
        # the channels so resampling below runs along the time axis.
        if data.ndim > 1:
            data = data.mean(axis=1)
        # Peak-normalise; a silent (all-zero) clip is left untouched instead
        # of being divided by zero, which would fill it with NaN.
        peak = np.max(np.abs(data))
        if peak > 0:
            data = data / peak
        # Resample the audio to 16kHz
        data = librosa.resample(data, orig_sr=samplerate, target_sr=16000)
        # Convert the audio data to PyTorch tensor
        audio = torch.from_numpy(data)
        # Process the audio data with Hubert processor
        inputs = hubert_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
        # Run the CTC head and pick the most likely token at every frame
        with torch.no_grad():
            logits = hubert_model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        transcription = hubert_processor.batch_decode(predicted_ids)[0]
        transcriptions.append(transcription.lower())
    return transcriptions

In [None]:
def add_for_part_hubert_asr(part_id):
    """Add HuBERT transcripts for the Brown clips of one dataset part.

    Loads ``total_dataset_hubert_asr.pkl``, writes an unmodified copy to
    ``total_dataset_hubert_asr_backup.pkl``, transcribes every ``'brown'``
    item whose stored audio path contains ``/part{part_id}/``, stores the
    text under ``'hubert-asr-text'``, saves the pickle back and records
    ``part_id`` in ``part_id.txt``.

    Args:
        part_id: Identifier of the dataset part. It selects the items and the
            local ``dataset_part{part_id}/audios/`` directory to read from.
    """
    working_file = 'total_dataset_hubert_asr.pkl'
    with open(working_file, 'rb') as reader:
        records = pickle.load(reader)
    # Keep a copy of the state before this part is processed.
    with open('total_dataset_hubert_asr_backup.pkl', 'wb') as writer:
        pickle.dump(records, writer)

    needle = f'/part{part_id}/'
    rows = [
        row
        for row, stored in enumerate(records['audio_path'])
        if records['source'][row] == 'brown' and needle in stored
    ]
    # The clips are read from the extracted part folder, keeping only the
    # file name of the stored path.
    clip_paths = [
        f'dataset_part{part_id}/audios/' + records['audio_path'][row].split('/')[-1]
        for row in rows
    ]
    print(len(rows))

    hubert_texts = asr_hubert_audio_files(clip_paths)
    for position, row in enumerate(rows):
        records['hubert-asr-text'][row] = hubert_texts[position]

    with open(working_file, 'wb') as writer:
        pickle.dump(records, writer)
    with open('part_id.txt', 'w') as writer:
        writer.write(str(part_id))

In [None]:
def download_data_part(url, output):
    """Download a zip archive, extract it and delete the archive.

    Args:
        url: Link handed to ``gdown.download`` (called with ``fuzzy=True``).
        output: Local path the archive is saved to. The contents are
            extracted into a directory named like ``output`` without its
            final extension (``dataset_part1.zip`` -> ``dataset_part1``).
    """
    gdown.download(url, output, quiet=False, fuzzy=True)
    # splitext drops only the final extension, so paths containing other dots
    # (e.g. './data/part1.zip') keep their directory part. Splitting on the
    # first '.' would reduce that example to an empty string.
    target_dir = os.path.splitext(output)[0]
    with ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    # The archive is no longer needed once extracted.
    os.remove(output)

In [None]:
url = "https://drive.google.com/file/d/1-3FIKAyT_5coJqRUvPEsW_NCAbqVZNo1/view?usp=sharing"
output = "total_dataset_hubert_final.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
download_data_part('https://drive.google.com/file/d/1GkXhLbzHrobM4GaGX80PzRHl-TBPg8h4/view?usp=drive_link','dataset_part1.zip')

In [None]:
with open(f'total_dataset_hubert_final.pkl', 'rb') as f:
    total_dataset_hubert_final = pickle.load(f)

In [None]:
total_dataset_hubert_final.keys()

In [None]:
total_dataset_hubert_final['hubert-asr-text'] = ['' for i in range(len(total_dataset_hubert_final['pure-text']))]

In [None]:
total_dataset_hubert_final['audio-raw'] = ['' for i in range(len(total_dataset_hubert_final['audio']))]

In [None]:
with open('total_dataset_hubert_asr.pkl', 'wb') as f:
    pickle.dump(total_dataset_hubert_final, f)

In [None]:
# Delete the old backup only if it is present, so this cell can be re-run
# (and run on a fresh kernel) without raising FileNotFoundError.
stale_backup = 'total_dataset_hubert_v2_backup.pkl'
if os.path.exists(stale_backup):
    os.remove(stale_backup)

In [None]:
add_for_part_hubert_asr(1)

In [None]:
with open('total_dataset_hubert_asr.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['hubert-asr-text']):
    if i != '' and total_dataset_v2['source'][j] == 'brown':
        if c < 2 :
            print(i)
            print(total_dataset_v2['pure-text'][j])
        c += 1
print(c)

In [None]:
# part 2
download_data_part('https://drive.google.com/file/d/1n_xxsblXrw5tCVk7ZP6_kRtubjMOuC2l/view?usp=drive_link','dataset_part2.zip')

In [None]:
add_for_part_hubert_asr(2)

In [None]:
from shutil import rmtree
rmtree('dataset_part1')
rmtree('dataset_part2')

In [None]:
# part 3
download_data_part('https://drive.google.com/file/d/14-Tvd3jvgY2Ge5-cxOsX7Arx6esfUxeN/view?usp=drive_link','dataset_part3.zip')

In [None]:
#read part_id 
with open('part_id.txt', 'r') as f:
    part_id = f.read()
part_id

In [None]:
with open('total_dataset_hubert_asr_backup.pkl', 'rb') as f:                                  
    total_dataset_v2 = pickle.load(f)  

In [None]:
total_dataset_v2.keys()

In [None]:
del total_dataset_v2["audio-raw"]

In [None]:
total_dataset_v2.keys()

In [None]:
gc.collect()

In [None]:
with open('total_dataset_hubert_asr.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)

In [None]:
add_for_part_hubert_asr(3)

In [None]:
rmtree('dataset_part3')

In [None]:
# part 4
download_data_part('https://drive.google.com/file/d/1fId6jAxD5UzObg4zu30GPpcP-P9F7TSL/view?usp=drive_link','dataset_part4.zip')

In [None]:
add_for_part_hubert_asr(4)

In [None]:
# part 5
download_data_part('https://drive.google.com/file/d/1VssFVfqgnXy7JpgUKpTWEVg11bqYK2bn/view?usp=drive_link','dataset_part5.zip')

In [None]:
add_for_part_hubert_asr(5)

In [None]:
from shutil import rmtree
rmtree('dataset_part4')
rmtree('dataset_part5')

In [None]:
# part 6
download_data_part('https://drive.google.com/file/d/11vhiU4lV8w3IYTXebTDr-9j7834ig6OU/view?usp=drive_link','dataset_part6.zip')

In [None]:
add_for_part_hubert_asr(6)

In [None]:
# part 7
download_data_part('https://drive.google.com/file/d/1FbPz9VnDgU9Bj7sxtxOb7Etx1_dnobmH/view?usp=drive_link','dataset_part7.zip')

In [None]:
add_for_part_hubert_asr(7)

In [None]:
rmtree('dataset_part6')
rmtree('dataset_part7')

In [None]:
# part 8
download_data_part('https://drive.google.com/file/d/1E_UWK88hWwhwKI8kiND3OCtdo3tnlOu5/view?usp=drive_link','dataset_part8.zip')

In [None]:
add_for_part_hubert_asr(8)

In [None]:
# part 9
download_data_part('https://drive.google.com/file/d/17PC8U5HYQ8r9wGlpQoeptjMyA7rK6PrZ/view?usp=drive_link','dataset_part9.zip')

In [None]:
add_for_part_hubert_asr(9)

In [None]:
rmtree('dataset_part8')
rmtree('dataset_part9')

In [None]:
# part 10
download_data_part('https://drive.google.com/file/d/1TQI49qhcDKSX0222sAQxDsQZVbZnjlBP/view?usp=drive_link','dataset_part10.zip')

In [None]:
add_for_part_hubert_asr(10)

In [None]:
with open('total_dataset_hubert_asr.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['hubert-asr-text']):
    if i != '' and total_dataset_v2['source'][j] == 'brown':
        if c < 2 :
            print(total_dataset_v2['pure-text'][j])
            print(total_dataset_v2['hubert-asr-text'][j])
        c += 1
print(c)

In [None]:
with open('total_dataset_hubert_asr.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
for j, i in enumerate(total_dataset_v2['hubert-asr-text']):
    if i == '' and total_dataset_v2['source'][j] == 'brown':
        c += 1
print(c)

In [None]:
rmtree('dataset_part10')

In [None]:
FileLink('total_dataset_hubert_asr.pkl')

In [None]:
with open('total_dataset_hubert_asr.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)

In [None]:
total_dataset_v2.keys()

## Common Voice

In [None]:
url = "https://drive.google.com/file/d/1nun6MfobJKK9LJ6zCFINO4-QmZiZsaeJ/view?usp=sharing"
output = "total_dataset_hubert_ast_v1.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1nun6MfobJKK9LJ6zCFINO4-QmZiZsaeJ
From (redirected): https://drive.google.com/uc?id=1nun6MfobJKK9LJ6zCFINO4-QmZiZsaeJ&confirm=t&uuid=132827c7-e31b-4ab4-98b1-e8a6723131b5
To: /kaggle/working/total_dataset_hubert_ast_v1.pkl
100%|██████████| 1.50G/1.50G [00:18<00:00, 82.9MB/s]


'total_dataset_hubert_ast_v1.pkl'

In [None]:
with open(f'total_dataset_hubert_ast_v1.pkl', 'rb') as f:
    total_dataset_hubert_final = pickle.load(f)

In [None]:
with open('total_dataset_hubert_asr_v2.pkl', 'wb') as f:
    pickle.dump(total_dataset_hubert_final, f)

In [None]:
# Delete the leftover pickle only if it is present, so this cell can be
# re-run (and run on a fresh kernel) without raising FileNotFoundError.
leftover_pickle = 'total_dataset_huber.pkl'
if os.path.exists(leftover_pickle):
    os.remove(leftover_pickle)

In [None]:
def asr_hubert_audio_files(audio_file_list):
    """Transcribe audio files with the module-level HuBERT CTC model.

    Each file is read with soundfile, down-mixed to mono, peak-normalised,
    resampled to 16 kHz and greedily decoded (argmax over the CTC logits).

    Args:
        audio_file_list: Iterable of audio file paths readable by ``sf.read``.

    Returns:
        list[str]: One lower-cased transcription per input path, in order.
    """
    transcriptions = []
    for file_path in tqdm(audio_file_list):
        # Use soundfile to read the audio file
        data, samplerate = sf.read(file_path)
        # sf.read returns (frames, channels) for multi-channel files; average
        # the channels so resampling below runs along the time axis.
        if data.ndim > 1:
            data = data.mean(axis=1)
        # Peak-normalise; a silent (all-zero) clip is left untouched instead
        # of being divided by zero, which would fill it with NaN.
        peak = np.max(np.abs(data))
        if peak > 0:
            data = data / peak
        # Resample the audio to 16kHz
        data = librosa.resample(data, orig_sr=samplerate, target_sr=16000)
        # Convert the audio data to PyTorch tensor
        audio = torch.from_numpy(data)
        # Process the audio data with Hubert processor
        inputs = hubert_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
        # Run the CTC head and pick the most likely token at every frame
        with torch.no_grad():
            logits = hubert_model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        transcription = hubert_processor.batch_decode(predicted_ids)[0]
        transcriptions.append(transcription.lower())
    return transcriptions

In [None]:
def add_hubert_asr_to_common_voice():
    """Add HuBERT transcripts for every Common Voice item.

    Loads ``total_dataset_hubert_asr_v2.pkl``, writes an unmodified copy to
    ``total_dataset_hubert_asr_v2_backup.pkl``, transcribes all items whose
    source is ``'common_voice'`` from their stored audio paths, stores the
    text under ``'hubert-asr-text'`` and saves the pickle back.
    """
    target_file = 'total_dataset_hubert_asr_v2.pkl'
    with open(target_file, 'rb') as handle:
        dataset = pickle.load(handle)
    # Backup of the state before any transcript is written.
    with open('total_dataset_hubert_asr_v2_backup.pkl', 'wb') as handle:
        pickle.dump(dataset, handle)

    # Dataset rows that come from Common Voice, in dataset order.
    cv_rows = [
        row
        for row, _ in enumerate(dataset['audio_path'])
        if dataset['source'][row] == 'common_voice'
    ]
    cv_paths = [dataset['audio_path'][row] for row in cv_rows]
    print(len(cv_rows))

    texts = asr_hubert_audio_files(cv_paths)
    for position, row in enumerate(cv_rows):
        dataset['hubert-asr-text'][row] = texts[position]

    with open(target_file, 'wb') as handle:
        pickle.dump(dataset, handle)

In [None]:
add_hubert_asr_to_common_voice()

2197


100%|██████████| 2197/2197 [02:53<00:00, 12.64it/s]


## FLEURS

In [None]:
fleurs_retrieval = load_dataset("google/fleurs", "en_us")

In [None]:
def add_hubert_fleurs():
    """Add HuBERT transcripts for every FLEURS item.

    Loads ``total_dataset_hubert_asr_v2.pkl``, writes an unmodified copy to
    ``total_dataset_hubert_asr_v2_backup.pkl``, transcribes all items whose
    source is ``'fleurs'`` from their stored audio paths, stores the text
    under ``'hubert-asr-text'`` and saves the pickle back.
    """
    store = 'total_dataset_hubert_asr_v2.pkl'
    with open(store, 'rb') as stream:
        table = pickle.load(stream)
    # Backup of the state before any transcript is written.
    with open('total_dataset_hubert_asr_v2_backup.pkl', 'wb') as stream:
        pickle.dump(table, stream)

    origin = table['source']
    fleurs_rows, fleurs_paths = [], []
    for row, clip_path in enumerate(table['audio_path']):
        if origin[row] == 'fleurs':
            fleurs_rows.append(row)
            fleurs_paths.append(clip_path)
    print(len(fleurs_rows))

    decoded = asr_hubert_audio_files(fleurs_paths)
    position = 0
    while position < len(fleurs_rows):
        table['hubert-asr-text'][fleurs_rows[position]] = decoded[position]
        position += 1

    with open(store, 'wb') as stream:
        pickle.dump(table, stream)

In [None]:
add_hubert_fleurs()

647


100%|██████████| 647/647 [01:14<00:00,  8.72it/s]


In [None]:
from IPython.display import FileLink
FileLink('total_dataset_hubert_asr_v2.pkl')

In [None]:
with open('total_dataset_hubert_asr_v2.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
print(len(total_dataset_v2['hubert-asr-text']))
for j, i in enumerate(total_dataset_v2['hubert-asr-text']):
    if i != '':
        if c < 2 :
            print(total_dataset_v2['pure-text'][j])
            print(total_dataset_v2['hubert-asr-text'][j])
        c += 1
print(c)

11411
this area helps to provide some insight into these farming adaptations
this area has to provide some insight into these farming adaptations
During the hottest part of the day , of course , the sun comes straight down and there isn't any shade ''.
during the hottest part of the day of course the sun comes straight down and there isn't any shady
11392


In [None]:
fleurs_retrieval = load_dataset("google/fleurs", "en_us")

In [None]:
def add_hubert_fleurs():
    """Store a HuBERT embedding for every FLEURS item.

    Loads ``total_dataset_hubert_v2.pkl``, writes an unmodified copy to
    ``total_dataset_hubert_backup.pkl``, computes embeddings for all items
    whose source is ``'fleurs'`` via ``hubert_audio_files``, stores them as
    torch tensors under ``'hubert-emb'`` and saves the pickle back. If no
    FLEURS item is found the dataset file is left as it is.

    NOTE(review): an earlier cell defines a function with this same name that
    writes HuBERT *transcripts*; whichever cell ran last wins. Consider giving
    this one a distinct name once its callers can be updated together.
    NOTE(review): ``hubert_audio_files`` is defined in a later section of the
    notebook, so this cell presumably fails on a top-to-bottom run — confirm.
    """
    with open('total_dataset_hubert_v2.pkl', 'rb') as f:
        total_dataset_v2 = pickle.load(f)
    with open('total_dataset_hubert_backup.pkl', 'wb') as f:
        pickle.dump(total_dataset_v2, f)
    item_indexes = []
    audio_pathes = []
    for i, audio_path in enumerate(total_dataset_v2['audio_path']):
        if total_dataset_v2['source'][i] != 'fleurs':
            continue
        item_indexes.append(i)
        audio_pathes.append(audio_path)
    print(len(item_indexes))
    if not item_indexes:
        # np.concatenate raises ValueError on an empty list; with nothing
        # selected there is nothing to add or to re-save.
        return
    embeddings = hubert_audio_files(audio_pathes)
    # Stack the per-file arrays along axis 0 so row i belongs to file i.
    all_embeddings = np.concatenate(embeddings, axis=0)

    for i, item_index in enumerate(item_indexes):
        total_dataset_v2['hubert-emb'][item_index] = torch.from_numpy(all_embeddings[i])
    with open('total_dataset_hubert_v2.pkl', 'wb') as f:
        pickle.dump(total_dataset_v2, f)

In [None]:
add_hubert_fleurs()

647


100%|██████████| 647/647 [01:11<00:00,  8.99it/s]


In [None]:
from IPython.display import FileLink
FileLink('total_dataset_hubert_v2.pkl')

In [None]:
with open('total_dataset_hubert_v2.pkl', 'rb') as f:
    total_dataset_hubert_v2 = pickle.load(f)

In [None]:
# NOTE(review): the preceding cell loads the pickle into
# `total_dataset_hubert_v2`, but this check reads `total_dataset_v2`, i.e.
# whatever an earlier cell left in the kernel. Presumably the freshly loaded
# object was meant — confirm before relying on these counts.
print(len(total_dataset_v2['hubert-asr-text']))
print(len(total_dataset_v2['hubert-emb']))


11411
11411


# Add Hubert Embedding to Data

## Brown

In [None]:
def download_data_part(url, output):
    """Download a zip archive, extract it and delete the archive.

    Args:
        url: Link handed to ``gdown.download`` (called with ``fuzzy=True``).
        output: Local path the archive is saved to. The contents are
            extracted into a directory named like ``output`` without its
            final extension (``dataset_part1.zip`` -> ``dataset_part1``).
    """
    gdown.download(url, output, quiet=False, fuzzy=True)
    # splitext drops only the final extension, so paths containing other dots
    # (e.g. './data/part1.zip') keep their directory part. Splitting on the
    # first '.' would reduce that example to an empty string.
    target_dir = os.path.splitext(output)[0]
    with ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    # The archive is no longer needed once extracted.
    os.remove(output)

In [None]:
download_data_part('https://drive.google.com/file/d/1GkXhLbzHrobM4GaGX80PzRHl-TBPg8h4/view?usp=drive_link','dataset_part1.zip')

In [None]:
url = "https://drive.google.com/file/d/1tNgdESKJFyfIqyRs0BFxZxu8Cdc1hw7D/view?usp=sharing"
output = "total_dataset_v5.pkl"
gdown.download(url, output, quiet=False, fuzzy=True)

In [None]:
with open('total_dataset_v5.pkl', 'rb') as f:
    total_dataset_v5 = pickle.load(f)

In [None]:
from transformers import AutoProcessor, HubertModel
import soundfile as sf

hubert_processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").to(device)


In [None]:
embeddings.shape

In [None]:
total_dataset_v5['train'].keys()

In [None]:
# Create an empty-string placeholder 'hubert-emb' column in every split.
# Each column is sized from its own split's 'audio' column; sizing the test
# and validation columns from the train split gives them the wrong length.
for split_name in ('train', 'test', 'validation'):
    split_size = len(total_dataset_v5[split_name]['audio'])
    total_dataset_v5[split_name]['hubert-emb'] = ['' for _ in range(split_size)]

In [None]:
def hubert_audio_files(audio_file_list):
    """Compute one mean-pooled HuBERT embedding per audio file.

    Each file is read with soundfile, down-mixed to mono, peak-normalised,
    resampled to 16 kHz and passed through the module-level ``hubert_model``;
    its last hidden state is averaged over dim 1.

    Args:
        audio_file_list: Iterable of audio file paths readable by ``sf.read``.

    Returns:
        list: One numpy array per input path, in order. When the model call
        raises, a random ``(1, 1024)`` array is appended instead and the
        error is printed (see the NOTE in the body).
    """
    embeddings = []
    for file_path in tqdm(audio_file_list):
        # Use soundfile to read the audio file
        data, samplerate = sf.read(file_path)
        # sf.read returns (frames, channels) for multi-channel files; average
        # the channels so resampling below runs along the time axis.
        if data.ndim > 1:
            data = data.mean(axis=1)
        # Peak-normalise; a silent (all-zero) clip is left untouched instead
        # of being divided by zero, which would fill it with NaN.
        peak = np.max(np.abs(data))
        if peak > 0:
            data = data / peak
        # Resample the audio to 16kHz
        data = librosa.resample(data, orig_sr=samplerate, target_sr=16000)
        # Convert the audio data to PyTorch tensor
        audio = torch.from_numpy(data)
        # Process the audio data with Hubert processor
        inputs = hubert_processor(audio, sampling_rate=16000, return_tensors="pt").to(device)
        # Get the embeddings from Hubert model
        with torch.no_grad():
            try:
                hidden_states = hubert_model(**inputs).last_hidden_state
                avg_embedding = torch.mean(hidden_states.cpu(), dim=1)
                embeddings.append(avg_embedding.detach().numpy())
            except Exception as e:
                # Report what actually went wrong, not just which file.
                print(f"Error processing file: {file_path} ({type(e).__name__}: {e})")
                # NOTE(review): the fallback is a *random* vector, which ends
                # up in the dataset indistinguishable from a real embedding.
                # It keeps the output aligned with the input list; consider
                # recording the failed paths so they can be filtered later.
                # 1024 presumably matches the model's hidden size — confirm.
                random_tensor = torch.randn(1, 1024)
                embeddings.append(random_tensor.detach().numpy())
    return embeddings

In [None]:
with open('total_dataset_v6.pkl', 'wb') as f:
        pickle.dump(total_dataset_v5, f)

In [None]:
def add_for_part_hubert(part_id):
    """Fill 'hubert-emb' for the Brown items belonging to one downloaded part.

    Loads ``total_dataset_v6.pkl``, writes an untouched copy to
    ``total_dataset_v6_backup.pkl``, then for each split (train, validation,
    test) selects the items whose ``source`` is ``'brown'`` and whose
    ``audio_path`` contains ``/part{part_id}/``. Their audio is read from
    ``dataset_part{part_id}/audios/<basename>``, embedded with
    ``hubert_audio_files`` and stored at the matching index. Finally the
    dataset is written back to ``total_dataset_v6.pkl`` and ``part_id`` is
    recorded in ``part_id.txt``.

    Args:
        part_id: number of the part whose extracted audio is on disk.

    Returns:
        None. Results are persisted to disk.
    """
    # NOTE: pickle.load runs code embedded in the file; only use trusted pickles.
    with open('total_dataset_v6.pkl', 'rb') as f:
        total_dataset = pickle.load(f)
    # Snapshot of the state BEFORE this part is processed.
    with open('total_dataset_v6_backup.pkl', 'wb') as f:
        pickle.dump(total_dataset, f)

    part_marker = f'/part{part_id}/'
    local_audio_dir = f'dataset_part{part_id}/audios/'

    for split in ('train', 'validation', 'test'):
        gc.collect()
        split_data = total_dataset[split]
        item_indexes = []
        audio_pathes = []
        for i, audio_path in enumerate(split_data['audio_path']):
            if split_data['source'][i] != 'brown':
                continue
            if part_marker in audio_path:
                item_indexes.append(i)
                # Only the file name is reused; the directory is the local extract.
                audio_pathes.append(local_audio_dir + audio_path.split('/')[-1])
        print(len(item_indexes))
        if not item_indexes:
            # np.concatenate raises on an empty list; nothing to fill here.
            continue
        embeddings = hubert_audio_files(audio_pathes)
        # Stack the per-file (1, dim) arrays, then hand out one row per item.
        all_embeddings = np.concatenate(embeddings, axis=0)
        for item_index, emb in zip(item_indexes, all_embeddings):
            split_data['hubert-emb'][item_index] = torch.from_numpy(emb)

    gc.collect()

    with open('total_dataset_v6.pkl', 'wb') as f:
        pickle.dump(total_dataset, f)
    with open('part_id.txt', 'w') as f:
        f.write(str(part_id))

In [None]:
# Embed Brown part 1 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(1)

In [None]:
# Sanity check: reload v6 and count Brown items in the test split whose
# 'hubert-emb' slot is no longer the '' placeholder; print the first two.
with open('total_dataset_v6.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
split = 'test'
for j, i in enumerate(total_dataset_v2[split]['hubert-emb']):
    if i != '' and total_dataset_v2[split]['source'][j] == 'brown':
        if c < 2 :
            print(i)
            print(total_dataset_v2[split]['hubert-emb'][j].shape)
        c += 1
print(c)

In [None]:
# Drop the inspection copy so it can be garbage-collected.
del total_dataset_v2

In [None]:
# Reload v6 and, for every filled Brown slot in each split, apply squeeze(0)
# to the stored embedding. The per-split list is rebuilt on a copy and then
# swapped in, so the original list is never mutated while it is being iterated.
with open('total_dataset_v6.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)

for split in ('train', 'test', 'validation'):
    x = total_dataset_v2[split]['hubert-emb'].copy()
    for j, i in enumerate(total_dataset_v2[split]['hubert-emb']):
        if i != '' and total_dataset_v2[split]['source'][j] == 'brown':
            x[j] = total_dataset_v2[split]['hubert-emb'][j].squeeze(0)
    total_dataset_v2[split]['hubert-emb'] = x.copy()



In [None]:
# Persist the squeezed embeddings back to the v6 pickle.
with open('total_dataset_v6.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)

In [None]:
# Release the in-memory copy now that it is on disk.
del total_dataset_v2

In [None]:
# Delete the extracted part-1 audio directory; its embeddings are already in v6.
# (rmtree is also imported in the notebook's top import cell.)
from shutil import rmtree
rmtree('dataset_part1')

In [None]:
# part 2: download the archive and extract it into ./dataset_part2
download_data_part('https://drive.google.com/file/d/1n_xxsblXrw5tCVk7ZP6_kRtubjMOuC2l/view?usp=drive_link','dataset_part2.zip')

In [None]:
# Embed Brown part 2 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(2)

In [None]:
# Sanity check: count filled Brown slots in the validation split that belong
# to part 2 (matched via '/part2/' in the stored audio path); print the first two.
with open('total_dataset_v6.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
split = 'validation'
part_id = 2
for j, i in enumerate(total_dataset_v2[split]['hubert-emb']):
    if i != '' and total_dataset_v2[split]['source'][j] == 'brown' and f'/part{part_id}/' in total_dataset_v2[split]['audio_path'][j]:
        if c < 2 :
            print(i)
            print(total_dataset_v2[split]['hubert-emb'][j].shape)
        c += 1
print(c)
del total_dataset_v2

In [None]:
# Delete the extracted part-2 audio directory to free disk space.
rmtree('dataset_part2')

In [None]:
# part 3: download the archive and extract it into ./dataset_part3
download_data_part('https://drive.google.com/file/d/14-Tvd3jvgY2Ge5-cxOsX7Arx6esfUxeN/view?usp=drive_link','dataset_part3.zip')

In [None]:
# Manual recovery step: overwrite total_dataset_v6.pkl with the backup.
# add_for_part_hubert() writes the backup BEFORE it processes a part, so after a
# successful add_for_part_hubert(2) the backup still lacks the part-2
# embeddings. Running this unconditionally in a top-to-bottom execution would
# therefore discard part 2. It is kept behind an explicit switch; set the flag
# to True only when total_dataset_v6.pkl is known to be damaged.
RESTORE_V6_FROM_BACKUP = False

if RESTORE_V6_FROM_BACKUP:
    with open('total_dataset_v6_backup.pkl', 'rb') as f:
        total_dataset = pickle.load(f)
    with open('total_dataset_v6.pkl', 'wb') as f:
        pickle.dump(total_dataset, f)
    del total_dataset

In [None]:
# Embed Brown part 3 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(3)

In [None]:
# Delete the extracted part-3 audio directory to free disk space.
rmtree('dataset_part3')

In [None]:
# part 4: download the archive and extract it into ./dataset_part4
download_data_part('https://drive.google.com/file/d/1fId6jAxD5UzObg4zu30GPpcP-P9F7TSL/view?usp=drive_link','dataset_part4.zip')

In [None]:
# Embed Brown part 4 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(4)

In [None]:
# Delete the extracted part-4 audio directory to free disk space.
rmtree('dataset_part4')

In [None]:
# part 5: download the archive and extract it into ./dataset_part5
download_data_part('https://drive.google.com/file/d/1VssFVfqgnXy7JpgUKpTWEVg11bqYK2bn/view?usp=drive_link','dataset_part5.zip')

In [None]:
# Embed Brown part 5 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(5)

In [None]:
# Delete the extracted part-5 audio directory to free disk space.
rmtree('dataset_part5')

In [None]:
# part 6: download the archive and extract it into ./dataset_part6
download_data_part('https://drive.google.com/file/d/11vhiU4lV8w3IYTXebTDr-9j7834ig6OU/view?usp=drive_link','dataset_part6.zip')

In [None]:
# Embed Brown part 6 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(6)

In [None]:
# Delete the extracted part-6 audio directory to free disk space.
rmtree('dataset_part6')

In [None]:
# part 7: download the archive and extract it into ./dataset_part7
download_data_part('https://drive.google.com/file/d/1FbPz9VnDgU9Bj7sxtxOb7Etx1_dnobmH/view?usp=drive_link','dataset_part7.zip')

In [None]:
# Embed Brown part 7 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(7)

In [None]:
# Delete the extracted part-7 audio directory to free disk space.
rmtree('dataset_part7')

In [None]:
# part 8: download the archive and extract it into ./dataset_part8
download_data_part('https://drive.google.com/file/d/1E_UWK88hWwhwKI8kiND3OCtdo3tnlOu5/view?usp=drive_link','dataset_part8.zip')

In [None]:
# Embed Brown part 8 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(8)

In [None]:
# Delete the extracted part-8 audio directory, then fetch the next part.
# (Unlike the other parts, cleanup and download share one cell here.)
rmtree('dataset_part8')
# part 9: download the archive and extract it into ./dataset_part9
download_data_part('https://drive.google.com/file/d/17PC8U5HYQ8r9wGlpQoeptjMyA7rK6PrZ/view?usp=drive_link','dataset_part9.zip')

In [None]:
# Embed Brown part 9 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(9)

In [None]:
# Delete the extracted part-9 audio directory to free disk space.
rmtree('dataset_part9')

In [None]:
# part 10: download the archive and extract it into ./dataset_part10
download_data_part('https://drive.google.com/file/d/1TQI49qhcDKSX0222sAQxDsQZVbZnjlBP/view?usp=drive_link','dataset_part10.zip')

In [None]:
# Embed Brown part 10 and persist the result to total_dataset_v6.pkl.
add_for_part_hubert(10)

In [None]:
# Sanity check after the last part: count filled Brown slots in the
# validation split and print the first two embeddings with their shapes.
with open('total_dataset_v6.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
c = 0
split = 'validation'
for j, i in enumerate(total_dataset_v2[split]['hubert-emb']):
    if i != '' and total_dataset_v2[split]['source'][j] == 'brown':
        if c < 2 :
            print(i)
            print(total_dataset_v2[split]['hubert-emb'][j].shape)
        c += 1
print(c)
del total_dataset_v2

In [None]:
# Force a collection after dropping the inspection copy.
gc.collect()

In [None]:
# Delete the extracted part-10 audio directory to free disk space.
rmtree('dataset_part10')

In [None]:
# Remove the backup written by the last add_for_part_hubert() call.
!rm 'total_dataset_v6_backup.pkl'

In [None]:
# Remove the v5 snapshot from disk; v6 was seeded from it above.
!rm 'total_dataset_v5.pkl'

In [None]:
# Copy the finished v6 (Brown embeddings) to v7, the file the Common Voice
# pass reads and rewrites. The copy is done via a pickle load/dump round trip.
with open('total_dataset_v6.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
with open('total_dataset_v7.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)
del total_dataset_v2

## Common Voice

In [None]:
def add_hubert_to_common_voice():
    """Fill 'hubert-emb' for every Common Voice item in all three splits.

    Loads ``total_dataset_v7.pkl``, writes an untouched copy to
    ``total_dataset_v7_backup.pkl``, then for each split (train, validation,
    test) embeds the audio of every item whose ``source`` is
    ``'common_voice'`` using ``hubert_audio_files`` and stores the result at
    the item's index. The stored ``audio_path`` values are used as-is. The
    updated dataset is written back to ``total_dataset_v7.pkl``.

    Returns:
        None. Results are persisted to disk.
    """
    # NOTE: pickle.load runs code embedded in the file; only use trusted pickles.
    with open('total_dataset_v7.pkl', 'rb') as f:
        total_dataset = pickle.load(f)
    # Snapshot of the state before any Common Voice embedding is added.
    with open('total_dataset_v7_backup.pkl', 'wb') as f:
        pickle.dump(total_dataset, f)

    for split in ('train', 'validation', 'test'):
        gc.collect()
        split_data = total_dataset[split]
        item_indexes = []
        audio_pathes = []
        for i, audio_path in enumerate(split_data['audio_path']):
            if split_data['source'][i] != 'common_voice':
                continue
            item_indexes.append(i)
            audio_pathes.append(audio_path)
        print(len(item_indexes))
        if not item_indexes:
            # np.concatenate raises on an empty list; nothing to fill here.
            continue
        embeddings = hubert_audio_files(audio_pathes)
        # Stack the per-file (1, dim) arrays, then hand out one row per item.
        all_embeddings = np.concatenate(embeddings, axis=0)
        for item_index, emb in zip(item_indexes, all_embeddings):
            split_data['hubert-emb'][item_index] = torch.from_numpy(emb)

    with open('total_dataset_v7.pkl', 'wb') as f:
        pickle.dump(total_dataset, f)

In [None]:
# Embed all Common Voice items and persist the result to total_dataset_v7.pkl.
add_hubert_to_common_voice()

In [None]:
# Remove the backup written by add_hubert_to_common_voice().
!rm 'total_dataset_v7_backup.pkl'

In [None]:
# Force a collection before the FLEURS pass.
gc.collect()

## FLEURS

In [None]:
# Load the en_us configuration of google/fleurs via Hugging Face datasets.
# NOTE(review): `fleurs_retrieval` is not referenced again in this section — confirm it is still needed.
fleurs_retrieval = load_dataset("google/fleurs", "en_us")

In [None]:
# NOTE(review): this deletes 'total_dataset_hubert_v8.pkl', yet the very next
# cell opens that file for reading. Run top-to-bottom, the next cell fails with
# FileNotFoundError — the cells were presumably executed in a different order.
!rm 'total_dataset_hubert_v8.pkl'

In [None]:
# Copy 'total_dataset_hubert_v8.pkl' (the file add_hubert_fleurs() writes) to
# 'total_dataset_v8.pkl' (the file add_hubert_fleurs() reads and later cells load).
# NOTE(review): no visible cell creates either v8 file from v7 before this point,
# and the source file is removed by the preceding cell — verify the intended order.
with open('total_dataset_hubert_v8.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)
with open('total_dataset_v8.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)
del total_dataset_v2

In [None]:
def add_hubert_fleurs(output_path='total_dataset_hubert_v8.pkl'):
    """Fill 'hubert-emb' for every FLEURS item in all three splits.

    Loads ``total_dataset_v8.pkl``, writes an untouched copy to
    ``total_dataset_v8_backup.pkl``, then for each split (train, validation,
    test) embeds the audio of every item whose ``source`` is ``'fleurs'``
    using ``hubert_audio_files`` and stores the result at the item's index.
    The stored ``audio_path`` values are used as-is.

    Args:
        output_path: where the updated dataset is pickled. Defaults to
            ``'total_dataset_hubert_v8.pkl'``, which differs from the input
            file. NOTE(review): the cells that follow load
            ``'total_dataset_v8.pkl'``; pass that path here (or copy the
            output) if they are meant to see the FLEURS embeddings.

    Returns:
        None. Results are persisted to disk.
    """
    # NOTE: pickle.load runs code embedded in the file; only use trusted pickles.
    with open('total_dataset_v8.pkl', 'rb') as f:
        total_dataset = pickle.load(f)
    # Snapshot of the state before any FLEURS embedding is added.
    with open('total_dataset_v8_backup.pkl', 'wb') as f:
        pickle.dump(total_dataset, f)

    for split in ('train', 'validation', 'test'):
        gc.collect()
        split_data = total_dataset[split]
        item_indexes = []
        audio_pathes = []
        for i, audio_path in enumerate(split_data['audio_path']):
            if split_data['source'][i] != 'fleurs':
                continue
            item_indexes.append(i)
            audio_pathes.append(audio_path)
        print(len(item_indexes))
        if not item_indexes:
            # np.concatenate raises on an empty list; nothing to fill here.
            continue
        embeddings = hubert_audio_files(audio_pathes)
        # Stack the per-file (1, dim) arrays, then hand out one row per item.
        all_embeddings = np.concatenate(embeddings, axis=0)
        for item_index, emb in zip(item_indexes, all_embeddings):
            split_data['hubert-emb'][item_index] = torch.from_numpy(emb)

    with open(output_path, 'wb') as f:
        pickle.dump(total_dataset, f)

In [None]:
# Embed all FLEURS items; the result is written to total_dataset_hubert_v8.pkl.
add_hubert_fleurs()

In [None]:
# Remove the backup written by add_hubert_fleurs().
! rm "total_dataset_v8_backup.pkl"

In [None]:
# Load the v8 dataset for the completeness checks below.
# NOTE(review): add_hubert_fleurs() wrote its result to
# 'total_dataset_hubert_v8.pkl', not to the file loaded here — confirm this
# file actually contains the FLEURS embeddings.
gc.collect()
with open('total_dataset_v8.pkl', 'rb') as f:
    total_dataset_v2 = pickle.load(f)

In [None]:
# Count train items whose 'hubert-emb' is still the '' placeholder or whose
# first dimension is not 1024; the printed count is the number of such items.
c = 0
split = 'train'
for j, i in enumerate(total_dataset_v2[split]['hubert-emb']):
    if i == '' or total_dataset_v2[split]['hubert-emb'][j].shape[0] != 1024:
        c += 1
print(c)

In [None]:
# Shape of the first validation embedding.
total_dataset_v2['validation']['hubert-emb'][0].shape

In [None]:
# Number of 'hubert-emb' slots in the test split (placeholders included).
len(total_dataset_v2['test']['hubert-emb'])

In [None]:
# Number of 'hubert-emb' slots in the validation split (placeholders included).
len(total_dataset_v2['validation']['hubert-emb'])

In [None]:
# Number of 'hubert-emb' slots in the train split.
len(total_dataset_v2['train']['hubert-emb'])

In [None]:
def keep_until_empty(input_list):
    """Return the leading items of ``input_list`` that precede the first ''.

    Iteration stops at the first element that compares equal to the empty
    string; that element and everything after it are dropped. If no element
    equals '', all items are returned. The result is always a new list.
    """
    def _leading_items():
        # Yield items in order and stop as soon as a '' marker is met.
        for entry in input_list:
            if entry == '':
                return
            yield entry

    return list(_leading_items())

# Cut the validation/test embedding lists at the first '' placeholder so only
# filled slots remain.
# NOTE(review): any filled entry located after an unfilled one is dropped too —
# confirm placeholders only occur at the tail of these lists.
total_dataset_v2['validation']['hubert-emb'] = keep_until_empty(total_dataset_v2['validation']['hubert-emb'])
total_dataset_v2['test']['hubert-emb'] = keep_until_empty(total_dataset_v2['test']['hubert-emb'])

In [None]:
# Re-run the train completeness check, this time indexed by the 'audio' list:
# count items whose embedding is '' or whose first dimension is not 1024.
c = 0
split = 'train'
for j, i in enumerate(total_dataset_v2[split]['audio']):
    if total_dataset_v2[split]['hubert-emb'][j] == '' or total_dataset_v2[split]['hubert-emb'][j].shape[0] != 1024:
        c += 1
print(c)

In [None]:
# Persist the truncated/validated dataset back to the v8 pickle.
with open('total_dataset_v8.pkl', 'wb') as f:
    pickle.dump(total_dataset_v2, f)

In [None]:
# Release the in-memory copy now that it is on disk.
del total_dataset_v2

In [None]:
# Render a download link for the v9 pickle.
# NOTE(review): 'total_dataset_v9.pkl' is only written by the final cell of the
# next section, so on a top-to-bottom run this link points to a file that does
# not exist yet. FileLink is also already imported in the top import cell.
from IPython.display import FileLink
FileLink('total_dataset_v9.pkl')

# Add XLM-R (XLM-RoBERTa) Sentence Embeddings to Data

In [None]:
# Load the multilingual XLM-R sentence encoder and run a two-sentence smoke test.
# (SentenceTransformer is also imported in the notebook's top import cell.)
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

xlmr_model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1').to(device)
# NOTE: this rebinds the notebook-global name `embeddings`.
embeddings = xlmr_model.encode(sentences)

gc.collect()

In [None]:
# Load the v8 dataset (with HuBERT embeddings) to add the text embeddings.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted pickles.
with open('total_dataset_v8.pkl', 'rb') as f:
    total_dataset = pickle.load(f)

In [None]:
# Encode the 'pure-text' field of each split with the XLM-R sentence model and
# store one torch tensor per item under 'xlmr-emb'. Splits are handled in the
# order train, validation, test, so `embeddings` ends up holding the test batch.
for split_name in ('train', 'validation', 'test'):
    embeddings = xlmr_model.encode(total_dataset[split_name]['pure-text'])
    total_dataset[split_name]['xlmr-emb'] = [torch.from_numpy(emb) for emb in embeddings]

In [None]:
# Print the list length of each embedding field per split so the HuBERT and
# XLM-R counts can be compared (order: validation, test, train for each field).
for emb_key in ('hubert-emb', 'xlmr-emb'):
    for split_name in ('validation', 'test', 'train'):
        print(len(total_dataset[split_name][emb_key]))

In [None]:
# Shape of the first validation XLM-R embedding.
total_dataset['validation']['xlmr-emb'][0].shape

In [None]:
# Persist the dataset with both 'hubert-emb' and 'xlmr-emb' as the v9 pickle.
with open('total_dataset_v9.pkl', 'wb') as f:
    pickle.dump(total_dataset, f)