# Imports

In [None]:
%%capture

# Install the Hugging Face `datasets` and `transformers` libraries from their
# GitHub development branches, plus the audio stack used below.
# NOTE(review): the two git installs and torchaudio are unpinned, so a fresh run
# may pick up different versions than the ones that produced the saved outputs;
# only librosa is pinned (0.8.1).
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa==0.8.1

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm
import torch

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Download Dataset

In [None]:
# Download LibriSpeech archives from OpenSLR into the current working directory.
# NOTE(review): only train-clean-100 is fetched here, while the data-preparation
# cell further down scans /content/LibriSpeech/test-clean/ - enable the matching
# download (and extraction) for whichever subset is being processed.
!wget https://www.openslr.org/resources/12/train-clean-100.tar.gz
# !wget https://www.openslr.org/resources/12/test-clean.tar.gz
# !wget https://www.openslr.org/resources/12/dev-clean.tar.gz

In [None]:
# Unpack the downloaded archive(s); the data-preparation cell expects the files
# under /content/LibriSpeech/<subset>/.
# NOTE(review): the dev-clean line uses an absolute /content/ path while the
# other two are relative to the working directory.
!tar -xf train-clean-100.tar.gz
# !tar -xf test-clean.tar.gz
# !tar -xf /content/dev-clean.tar.gz

# Data preparation

In [None]:
# Build an index of every readable .flac utterance in one LibriSpeech subset.
# Each record holds the utterance id ("info"), the file path and the speaker id.
data = []
broken_files = []  # (path, error message) for every file torchaudio could not decode

# librispeech_dir = Path("/content/LibriSpeech/train-clean-100/")
librispeech_dir = Path("/content/LibriSpeech/test-clean/")
if not librispeech_dir.is_dir():
    # Fail early with a clear message instead of silently producing an empty index.
    raise FileNotFoundError(
        f"{librispeech_dir} does not exist - download and extract that LibriSpeech subset first"
    )

for path in tqdm(librispeech_dir.glob("**/**/*.flac")):
    # File name without extension, e.g. "4507-16021-0059".
    name = path.name.split('.')[0]
    # Third path component from the end is the speaker folder, e.g. "4507".
    label = path.parts[-3]

    try:
        # There are some broken files: decoding is only used as a validity check.
        torchaudio.load(path)
    except Exception as e:
        # Remember what was skipped instead of discarding the error silently.
        broken_files.append((str(path), str(e)))
        continue

    data.append({
        "info": name,
        "path": path,
        "speaker": label
    })

if broken_files:
    print(f"Skipped {len(broken_files)} unreadable file(s)")

In [None]:
# Turn the list of utterance records into a table and preview the first rows.
df = pd.DataFrame(data=data)
df.head(5)

Unnamed: 0,info,path,speaker
0,4507-16021-0059,/content/LibriSpeech/test-clean/4507/16021/450...,4507
1,4507-16021-0017,/content/LibriSpeech/test-clean/4507/16021/450...,4507
2,4507-16021-0032,/content/LibriSpeech/test-clean/4507/16021/450...,4507
3,4507-16021-0052,/content/LibriSpeech/test-clean/4507/16021/450...,4507
4,4507-16021-0048,/content/LibriSpeech/test-clean/4507/16021/450...,4507


In [None]:
# Filter broken and non-existent paths, then shuffle the rows.

print(f"Step 0: {len(df)}")

# Keep only rows whose audio file is still on disk.
# The previous version flagged missing files in a temporary "status" column but
# called dropna on "path", so nothing was ever removed; it also passed the axis
# to DataFrame.drop positionally, which newer pandas versions reject.
path_exists = df["path"].apply(os.path.exists).astype(bool)
df = df[path_exists]
print(f"Step 1: {len(df)}")

# Shuffle (unseeded, so the order differs between runs) and renumber the rows.
df = df.sample(frac=1)
df = df.reset_index(drop=True)
df.head()

Step 0: 2620
Step 1: 2620


  df = df.drop("status", 1)


Unnamed: 0,info,path,speaker
0,1221-135767-0001,/content/LibriSpeech/test-clean/1221/135767/12...,1221
1,6930-81414-0015,/content/LibriSpeech/test-clean/6930/81414/693...,6930
2,5142-33396-0007,/content/LibriSpeech/test-clean/5142/33396/514...,5142
3,8463-287645-0009,/content/LibriSpeech/test-clean/8463/287645/84...,8463
4,7176-88083-0005,/content/LibriSpeech/test-clean/7176/88083/717...,7176


In [None]:
# Play one random utterance; re-run the cell a few times to get a feeling for
# the audio and its speaker label.
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
label = sample["speaker"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

speech, sr = torchaudio.load(path)
# Take the first channel as a 1-D numpy array.
speech = speech[0].numpy().squeeze()
# Sample rates are passed by keyword: this works with the pinned librosa 0.8.1
# and with newer releases where these arguments are keyword-only.
speech = librosa.resample(np.asarray(speech), orig_sr=sr, target_sr=16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 121
      Label: 3575



In [None]:
# !mkdir /content/data

In [None]:
# Folder (on the mounted Drive) that holds the utterance index files.
save_path = "/content/drive/MyDrive/Grad project/Data_paths"

# Disabled train/test split.
# NOTE(review): it stratifies on an "emotion" column, but the frame built above
# only has "info", "path" and "speaker".
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])

# train_df = train_df.reset_index(drop=True)
# test_df = test_df.reset_index(drop=True)

# Write the whole index as a tab-separated file without the row index.
df.to_csv(
    save_path + "/test-clean.csv",
    index=False,
    encoding="utf-8",
    sep="\t",
)


print(df.shape)


(2620, 3)


# Load the Dataset




In [None]:
# Load the tab-separated utterance index as a Hugging Face dataset.
# `load_metric` used to be imported here but was never used, and it is no longer
# available in current `datasets` releases (this notebook installs `datasets`
# from git), so the import is dropped to keep the cell runnable.
from datasets import load_dataset


# NOTE(review): this reads "train_clean-100.csv", whereas the save cell above
# writes "test-clean.csv" - make sure the file for the wanted subset exists.
data_files = {
    "data": "/content/drive/MyDrive/Grad project/Data_paths/train_clean-100.csv"
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
# Despite its name this is the single "data" split, iterated row by row below.
dataset_dict = dataset["data"]

print(dataset_dict)


# HUBERT

In [None]:
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import AutoProcessor, HubertModel, Wav2Vec2FeatureExtractor


# Feature extractor that turns raw waveforms into model inputs.
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

# output_hidden_states=True makes the forward pass return `.hidden_states`,
# which the extraction loops below read.
model = HubertModel.from_pretrained(
    "facebook/hubert-base-ls960",
    output_hidden_states=True,
)

# Alternatives that were tried (left disabled):
# processor = AutoProcessor.from_pretrained("facebook/hubert-base-ls960")
# model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
# model = model.to(device)
# processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
# model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


In [None]:
# Move the loaded model to the selected device ("cuda" when available, else "cpu").
model = model.to(device)

In [None]:
# Index of the hidden layer of interest.
# NOTE(review): only referenced from commented-out lines in the extraction
# cells, so changing it currently has no effect.
layer_num = 6

In [None]:
# # create logger
# import logging

# logname = "/content/drive/MyDrive/Grad project/models_hidden_states/logger.log"
# os.makedirs("/content/drive/MyDrive/Grad project/models_hidden_states", exist_ok=True)

# logging.basicConfig(filename=logname,
#                     filemode='a',
#                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
#                     datefmt='%H:%M:%S',
#                     level=logging.INFO,
#                     force=True)

# logging.info("init logger")


In [None]:
# Extract every hidden-state layer of the loaded HuBERT model for each
# utterance and save it as one .npy file per utterance.
counter = 0
model.eval()

# `logging` is only imported by a logger-setup cell; when that cell has not been
# run the name is undefined. Catch Exception (not a bare except) so that a
# kernel interrupt is never swallowed here.
try:
  logging.info("----------------New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # forward pass without gradient tracking; keep the per-layer hidden states
  with torch.no_grad():
    hidden_states = model(input_values).hidden_states
  # stack the per-layer tensors into a single numpy array
  # (the stray per-iteration print() that flooded the output was removed)
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, for the first utterance only
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Hubert").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)


In [None]:
# Save only the final encoder output (`last_hidden_state`) of the loaded model
# for every utterance.
# NOTE(review): the target folder is named "Hubert_large_ASR_last_layer" while
# the model cell above loads "facebook/hubert-base-ls960" - confirm which
# checkpoint is meant to be active when this cell runs.
counter = 0
model.eval()

# one utterance per forward pass (batch size 1)
for index, utterance in enumerate(tqdm(dataset_dict)):
  # decode the audio file and build the model input
  speech, samplerate = sf.read(utterance["path"])
  encoded = processor(speech, return_tensors="pt", sampling_rate=samplerate)
  input_values = encoded.input_values.to(device)
  # inference only, no gradients
  with torch.no_grad():
    last_hidden_state = model(input_values).last_hidden_state
  # to numpy, dropping singleton axes such as the batch axis
  layers = last_hidden_state.detach().cpu().numpy().squeeze()
  # print the shape for the first utterance only
  if counter == 0:
    print(layers.shape)
    counter = 1
  # derive the output file from the input path and write it
  path = (
    utterance["path"]
    .replace("flac", "npy")
    .replace("LibriSpeech", "Hubert_outputs")
    .replace("/content", "/content/drive/MyDrive/Grad project/models_hidden_states/Hubert_outputs/Hubert_large_ASR_last_layer")
  )
  dir_path = path.rpartition("/")[0]
  os.makedirs(dir_path, exist_ok=True)
  np.save(path, layers)


  0%|          | 0/2703 [00:00<?, ?it/s]

(841, 1024)


100%|██████████| 2703/2703 [06:46<00:00,  6.64it/s]


# WAV2VEC2

In [None]:
 from transformers import  AutoFeatureExtractor, Wav2Vec2Processor, AutoModelForCTC
 from datasets import load_dataset
 import torch
 import soundfile as sf

# NOTE(review): the four import lines above carry a stray leading space;
# consider removing it so the cell is also valid as plain Python.

# Replace the global `processor` / `model` with the "facebook/wav2vec2-base-960h"
# checkpoint. output_hidden_states=True makes the forward pass return
# `.hidden_states`, which the extraction loops read.
processor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h",output_hidden_states=True)
model = model.to(device)



Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# create logger
import logging

logname = "/content/drive/MyDrive/Grad project/models_hidden_states/loggerWav2Vec.log"
os.makedirs("/content/drive/MyDrive/Grad project/models_hidden_states", exist_ok=True)

logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.INFO,
                    force=True)

logging.info("init logger")


In [None]:
# Index of the hidden layer of interest.
# NOTE(review): the single-layer extraction cells use the literal
# `hidden_states[12]` rather than this variable, so it currently has no effect.
layer_num = 12

In [None]:
# Extract every hidden-state layer of the loaded Wav2Vec2 model for each
# utterance, save it as .npy and log one line per utterance.
counter = 0
model.eval()

# Catch Exception rather than using a bare except so that a kernel interrupt
# during this long run is not swallowed and reported as a logging problem.
try:
  logging.info("----------------Wav2Vec2: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # produce the hidden states from Wav2Vec2 (no gradients needed)
  with torch.no_grad():
    hidden_states = model(input_values).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, e.g. (13, 1, 671, 768) in the recorded run
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Wav2Vec2").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)
  # progress record; a logging failure must not abort the extraction
  try:
    speaker = utterance["speaker"]
    file_path = utterance["path"]
    logging.info(f"iteration id:{index}, speaker id:{speaker},  file path:{file_path}")
  except Exception:
    print("logger not working")


  0%|          | 1/2620 [00:12<8:53:08, 12.21s/it]

(13, 1, 671, 768)


100%|██████████| 2620/2620 [07:38<00:00,  5.71it/s]


In [None]:
# Save only hidden-state index 12 of the loaded model for every utterance.
# NOTE(review): this writes to the same "Wav2Vec2" tree as the all-layers cell
# above, so running both overwrites one set of files with the other.
counter = 0
model.eval()

# one utterance per forward pass (batch size 1)
for index, utterance in enumerate(tqdm(dataset_dict)):
  # decode the audio file and build the model input
  speech, samplerate = sf.read(utterance["path"])
  encoded = processor(speech, return_tensors="pt", sampling_rate=samplerate)
  input_values = encoded.input_values.to(device)
  # inference only; pick hidden-state entry 12
  with torch.no_grad():
    last_hidden_state = model(input_values).hidden_states[12]
  # to numpy, dropping singleton axes such as the batch axis
  layers = last_hidden_state.detach().cpu().numpy().squeeze()
  # print the shape for the first utterance only
  if counter == 0:
    print(layers.shape)
    counter = 1
  # derive the output file from the input path and write it
  path = (
    utterance["path"]
    .replace("flac", "npy")
    .replace("LibriSpeech", "Wav2Vec2")
    .replace("/content", "/content/drive/MyDrive/Grad project/models_hidden_states")
  )
  dir_path = path.rpartition("/")[0]
  os.makedirs(dir_path, exist_ok=True)
  np.save(path, layers)

  0%|          | 2/28539 [00:14<48:19:53,  6.10s/it] 

(794, 768)


 62%|██████▏   | 17592/28539 [28:53<18:51,  9.68it/s]

In [None]:
# Display the last output path assigned by the most recently run extraction loop (sanity check).
path

'/content/drive/MyDrive/Grad project/models_hidden_states/Hubert/test-clean/908/31957/908-31957-0004.npy'

# Hubert ASR

In [None]:
 from transformers import  AutoFeatureExtractor, Wav2Vec2Processor, AutoModelForCTC, AutoProcessor
 from datasets import load_dataset
 import torch
 import soundfile as sf

# NOTE(review): the four import lines above carry a stray leading space;
# consider removing it so the cell is also valid as plain Python.

# Replace the global `processor` / `model` with the
# "danieleV9H/hubert-base-libri-clean-ft100h-v3" checkpoint.
# output_hidden_states=True makes the forward pass return `.hidden_states`,
# which the extraction loops read.
processor = AutoProcessor.from_pretrained("danieleV9H/hubert-base-libri-clean-ft100h-v3")
model = AutoModelForCTC.from_pretrained("danieleV9H/hubert-base-libri-clean-ft100h-v3",output_hidden_states=True)
model = model.to(device)



Downloading (…)rocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/268 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

In [None]:
# create logger
import logging

logname = "/content/drive/MyDrive/Grad project/models_hidden_states/loggerWav2Vec.log"
os.makedirs("/content/drive/MyDrive/Grad project/models_hidden_states", exist_ok=True)

logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.INFO,
                    force=True)

logging.info("init logger")


In [None]:
# Save only hidden-state index 12 of the loaded model for every utterance.
# NOTE(review): the all-layers cells below write to the same "Hubert_ASR" tree,
# so the files produced here get overwritten when those cells run.
counter = 0
model.eval()

# one utterance per forward pass (batch size 1)
for index, utterance in enumerate(tqdm(dataset_dict)):
  # decode the audio file and build the model input
  speech, samplerate = sf.read(utterance["path"])
  encoded = processor(speech, return_tensors="pt", sampling_rate=samplerate)
  input_values = encoded.input_values.to(device)
  # inference only; pick hidden-state entry 12
  with torch.no_grad():
    last_hidden_state = model(input_values).hidden_states[12]
  # to numpy, dropping singleton axes such as the batch axis
  layers = last_hidden_state.detach().cpu().numpy().squeeze()
  # print the shape for the first utterance only
  if counter == 0:
    print(layers.shape)
    counter = 1
  # derive the output file from the input path and write it
  path = (
    utterance["path"]
    .replace("flac", "npy")
    .replace("LibriSpeech", "Hubert_ASR")
    .replace("/content", "/content/drive/MyDrive/Grad project/models_hidden_states")
  )
  dir_path = path.rpartition("/")[0]
  os.makedirs(dir_path, exist_ok=True)
  np.save(path, layers)


  0%|          | 2/28539 [00:07<24:16:32,  3.06s/it]

(794, 768)


100%|██████████| 28539/28539 [47:37<00:00,  9.99it/s]


In [None]:
# Extract every hidden-state layer of the loaded Hubert ASR model for each
# utterance and save it as one .npy file per utterance.
counter = 0
model.eval()

# The run banner used to say "Wav2Vec2" although this section loads the Hubert
# ASR checkpoint and writes to the "Hubert_ASR" tree; the label now matches.
# Exception (not a bare except) keeps kernel interrupts from being swallowed.
try:
  logging.info("----------------Hubert ASR: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # forward pass without gradient tracking; keep the per-layer hidden states
  with torch.no_grad():
    hidden_states = model(input_values).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, e.g. (13, 1, 794, 768) in the recorded run
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Hubert_ASR").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)


  0%|          | 1/28539 [00:00<2:09:54,  3.66it/s]

(13, 1, 794, 768)


 13%|█▎        | 3649/28539 [21:10<2:24:28,  2.87it/s]


KeyboardInterrupt: ignored

In [None]:
# Extract every hidden-state layer of the loaded Hubert ASR model for each
# utterance, save it as .npy and log one line per utterance.
counter = 0
model.eval()

# Catch Exception rather than using a bare except so that a kernel interrupt
# during this long run is not swallowed and reported as a logging problem.
try:
  logging.info("----------------Hubert ASR: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # forward pass without gradient tracking; keep the per-layer hidden states
  with torch.no_grad():
    hidden_states = model(input_values).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, e.g. (13, 1, 841, 768) in the recorded run
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Hubert_ASR").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)
  # progress record; a logging failure must not abort the extraction
  try:
    speaker = utterance["speaker"]
    file_path = utterance["path"]
    logging.info(f"iteration id:{index}, speaker id:{speaker},  file path:{file_path}")
  except Exception:
    print("logger not working")


  0%|          | 0/2703 [00:00<?, ?it/s]

(13, 1, 841, 768)


100%|██████████| 2703/2703 [07:32<00:00,  5.98it/s]


# Hubert SID

In [None]:
import torch
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor


# Replace the global `processor` / `model` with the
# "superb/hubert-base-superb-sid" checkpoint.
# output_hidden_states=True is passed explicitly, as in the other model-loading
# cells, so the extraction loops can read `.hidden_states` without depending on
# what the checkpoint's own config happens to enable.
model = HubertForSequenceClassification.from_pretrained(
    "superb/hubert-base-superb-sid",
    output_hidden_states=True,
)
processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-sid")
model = model.to(device)


Downloading (…)lve/main/config.json:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

In [None]:
# create logger
import logging

logname = "/content/drive/MyDrive/Grad project/models_hidden_states/loggerWav2Vec.log"
os.makedirs("/content/drive/MyDrive/Grad project/models_hidden_states", exist_ok=True)

logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.INFO,
                    force=True)

logging.info("init logger")


In [None]:
# Extract every hidden-state layer of the loaded Hubert SID model for each
# utterance, save it as .npy and log one line per utterance.
counter = 0
model.eval()

# This cell was copied from the Hubert ASR section: it logged "Hubert ASR" and
# wrote into the "Hubert_ASR" tree, which would overwrite the ASR features with
# SID ones. Both now say "Hubert SID" / "Hubert_SID".
# Exception (not a bare except) keeps kernel interrupts from being swallowed.
try:
  logging.info("----------------Hubert SID: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # Ask for hidden states explicitly so this loop does not depend on how the
  # model was configured at load time.
  with torch.no_grad():
    hidden_states = model(input_values, output_hidden_states=True).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, for the first utterance only
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Hubert_SID").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)
  # progress record; a logging failure must not abort the extraction
  try:
    speaker = utterance["speaker"]
    file_path = utterance["path"]
    logging.info(f"iteration id:{index}, speaker id:{speaker},  file path:{file_path}")
  except Exception:
    print("logger not working")


In [None]:
# Extract every hidden-state layer of the loaded Hubert SID model for each
# utterance, save it under the "Hubert_SID" tree and log one line per utterance.
counter = 0
model.eval()

# The run banner used to say "Hubert ASR" although this cell writes Hubert SID
# features; the label now matches the output folder.
# Exception (not a bare except) keeps kernel interrupts from being swallowed.
try:
  logging.info("----------------Hubert SID: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # Ask for hidden states explicitly so this loop does not depend on how the
  # model was configured at load time.
  with torch.no_grad():
    hidden_states = model(input_values, output_hidden_states=True).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, e.g. (13, 1, 841, 768) in the recorded run
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Hubert_SID").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)
  # progress record; a logging failure must not abort the extraction
  try:
    speaker = utterance["speaker"]
    file_path = utterance["path"]
    logging.info(f"iteration id:{index}, speaker id:{speaker},  file path:{file_path}")
  except Exception:
    print("logger not working")


  0%|          | 1/2703 [00:06<4:57:57,  6.62s/it]

(13, 1, 841, 768)


100%|██████████| 2703/2703 [1:47:58<00:00,  2.40s/it]


# Wav2Vec2 SID

In [None]:
from transformers import AutoProcessor, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
import soundfile as sf
from transformers import Wav2Vec2FeatureExtractor
import torch

# Replace the global `processor` / `model` with the
# "superb/wav2vec2-base-superb-sid" checkpoint.
processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-sid")
# processor = AutoProcessor.from_pretrained("superb/wav2vec2-base-superb-sid")

# output_hidden_states=True is passed explicitly, as in the other model-loading
# cells, so the extraction loops can read `.hidden_states` without depending on
# what the checkpoint's own config happens to enable.
model = AutoModelForAudioClassification.from_pretrained(
    "superb/wav2vec2-base-superb-sid",
    output_hidden_states=True,
)

model = model.to(device)


Downloading (…)rocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/54.9k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

In [None]:
# create logger
import logging

logname = "/content/drive/MyDrive/Grad project/models_hidden_states/loggerWav2Vec.log"
os.makedirs("/content/drive/MyDrive/Grad project/models_hidden_states", exist_ok=True)

logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.INFO,
                    force=True)

logging.info("init logger")


In [None]:
# Save only hidden-state index 12 of the loaded model for every utterance.
# NOTE(review): the output tree is named "Wav2Vec2_ASR", but the model cell of
# this section loads "superb/wav2vec2-base-superb-sid" - confirm which model is
# meant to be active when this cell runs.
counter = 0
model.eval()

# one utterance per forward pass (batch size 1)
for index, utterance in enumerate(tqdm(dataset_dict)):
  # decode the audio file and build the model input
  speech, samplerate = sf.read(utterance["path"])
  encoded = processor(speech, return_tensors="pt", sampling_rate=samplerate)
  input_values = encoded.input_values.to(device)
  # inference only; pick hidden-state entry 12
  with torch.no_grad():
    last_hidden_state = model(input_values).hidden_states[12]
  # to numpy, dropping singleton axes such as the batch axis
  layers = last_hidden_state.detach().cpu().numpy().squeeze()
  # print the shape for the first utterance only
  if counter == 0:
    print(layers.shape)
    counter = 1
  # derive the output file from the input path and write it
  path = (
    utterance["path"]
    .replace("flac", "npy")
    .replace("LibriSpeech", "Wav2Vec2_ASR")
    .replace("/content", "/content/drive/MyDrive/Grad project/models_hidden_states")
  )
  dir_path = path.rpartition("/")[0]
  os.makedirs(dir_path, exist_ok=True)
  np.save(path, layers)


  0%|          | 1/28539 [00:00<2:17:54,  3.45it/s]

(794, 768)


100%|██████████| 28539/28539 [47:47<00:00,  9.95it/s]


In [None]:
# Extract every hidden-state layer of the loaded Wav2Vec2 SID model for each
# utterance and save it as one .npy file per utterance.
counter = 0
model.eval()

# The banner now names the SID model so these runs can be told apart from the
# plain Wav2Vec2 runs that log to the same file.
# Exception (not a bare except) keeps kernel interrupts from being swallowed.
try:
  logging.info("----------------Wav2Vec2 SID: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # Ask for hidden states explicitly so this loop does not depend on how the
  # model was configured at load time.
  with torch.no_grad():
    hidden_states = model(input_values, output_hidden_states=True).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, e.g. (13, 1, 794, 768) in the recorded run
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Wav2Vec2_SID").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)


  0%|          | 0/28539 [00:00<?, ?it/s]

(13, 1, 794, 768)


  4%|▍         | 1283/28539 [16:39<4:50:16,  1.56it/s]

In [None]:
# Extract every hidden-state layer of the loaded Wav2Vec2 SID model for each
# utterance, save it under the "Wav2Vec2_SID" tree and log one line per utterance.
counter = 0
model.eval()

# The run banner used to say "Hubert ASR" although this cell writes Wav2Vec2
# SID features; the label now matches the output folder.
# Exception (not a bare except) keeps kernel interrupts from being swallowed.
try:
  logging.info("----------------Wav2Vec2 SID: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # Ask for hidden states explicitly so this loop does not depend on how the
  # model was configured at load time.
  with torch.no_grad():
    hidden_states = model(input_values, output_hidden_states=True).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, e.g. (13, 1, 671, 768) in the recorded run
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Wav2Vec2_SID").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)
  # progress record; a logging failure must not abort the extraction
  try:
    speaker = utterance["speaker"]
    file_path = utterance["path"]
    logging.info(f"iteration id:{index}, speaker id:{speaker},  file path:{file_path}")
  except Exception:
    print("logger not working")


  0%|          | 1/2620 [00:00<16:45,  2.61it/s]

(13, 1, 671, 768)


100%|██████████| 2620/2620 [10:21<00:00,  4.22it/s]


In [None]:
# Same extraction as the previous cell (all hidden-state layers of the loaded
# Wav2Vec2 SID model, saved under the "Wav2Vec2_SID" tree with per-utterance
# logging); the recorded outputs show it being run over a different number of
# utterances.
counter = 0
model.eval()

# The run banner used to say "Hubert ASR" although this cell writes Wav2Vec2
# SID features; the label now matches the output folder.
# Exception (not a bare except) keeps kernel interrupts from being swallowed.
try:
  logging.info("----------------Wav2Vec2 SID: New RUN-----------------------")
except Exception:
  print("logger not working")

# iterate over the dataset
for index, utterance in enumerate(tqdm(dataset_dict)):
  # read the waveform and convert it to model inputs
  speech, samplerate = sf.read(utterance["path"])
  input_values = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_values.to(device)  # Batch size 1
  # Ask for hidden states explicitly so this loop does not depend on how the
  # model was configured at load time.
  with torch.no_grad():
    hidden_states = model(input_values, output_hidden_states=True).hidden_states
  # stack the per-layer tensors into one numpy array
  layers = np.array([item.detach().cpu().numpy() for item in hidden_states])
  # show the array shape once, e.g. (13, 1, 841, 768) in the recorded run
  if counter < 1:
    print(layers.shape)
    counter += 1
  # save the hidden states, mirroring the LibriSpeech tree under the Drive folder
  path = utterance["path"].replace("flac","npy").replace("LibriSpeech", "Wav2Vec2_SID").replace("/content","/content/drive/MyDrive/Grad project/models_hidden_states")
  dir_path = os.path.dirname(path)
  os.makedirs(dir_path, exist_ok=True)
  np.save(path,layers)
  # progress record; a logging failure must not abort the extraction
  try:
    speaker = utterance["speaker"]
    file_path = utterance["path"]
    logging.info(f"iteration id:{index}, speaker id:{speaker},  file path:{file_path}")
  except Exception:
    print("logger not working")


  0%|          | 1/2703 [00:13<10:25:16, 13.88s/it]

(13, 1, 841, 768)


100%|██████████| 2703/2703 [1:50:13<00:00,  2.45s/it]
