In [None]:
from transformers import WavLMModel

In [None]:
model = WavLMModel.from_pretrained("patrickvonplaten/wavlm-libri-clean-100h-base-plus")

In [None]:
import os

# Look the variable up with .get so an unset HUGGINGFACE_HUB_CACHE shows as None
# instead of aborting the cell with a KeyError.
hub_cache = os.environ.get('HUGGINGFACE_HUB_CACHE')
print(hub_cache)

In [None]:
# Point the Hugging Face home/cache directory at the data disk.
# NOTE(review): `transformers` has already been imported by an earlier cell; the Hugging Face
# libraries appear to resolve their cache paths at import time, so this assignment may only
# take effect if it runs before those imports — confirm.
os.environ['HF_HOME'] = '/data/valerii/.cache/'

In [None]:
from transformers import Wav2Vec2Processor, HubertModel
from datasets import load_dataset
import soundfile as sf
import torchaudio, torch

# processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
# model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


In [None]:
model = HubertModel.from_pretrained("facebook/hubert-base-ls960")

In [None]:
config = model.config

In [None]:
model.training

In [None]:
config

In [None]:
config.gradient_checkpointing = True

In [None]:
model = HubertModel.from_pretrained("facebook/hubert-base-ls960", config = config)

In [None]:
model = model.to(0)

In [None]:
import os

# Keep the project root in its own variable: assigning to `os.curdir` would overwrite the
# standard-library constant (".") that other code reads to mean "current directory".
PROJECT_ROOT = "/data/valerii/AudioCaption/"
os.chdir(PROJECT_ROOT)

In [None]:
from datahandlers.MyDataloader import *
import random
clotho_dataset = ClothoDataset(None, './data/Clotho', 'test', 26, tokens_size = 50,tokenizer_type = None)
caps_dataset = AudioCapsDataset(None, './data/AudioCaps', 'test', 26, tokens_size = 50,tokenizer_type = None)

In [None]:
dataloader = MyDataLoader({"hearts_kaggle": True}, 'train', 4,
                                             is_distributed=False, prefix_size = 80)

## Testing whether the training process works

In [None]:
from models.HubertCaption import *

In [None]:
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# hubert_encoder = HubertModel.from_pretrained("facebook/hubert-base-ls960")
cnn_reshaper = ReshaperCNN(1024, 768)
# gpt = GPT2Model.from_pretrained("gpt2")

In [None]:
torch.cuda.empty_cache()

In [None]:
# del audio
# del tokens
# del mask
torch.cuda.empty_cache()

In [None]:
for audio, tokens, mask, path in dataloader:
    hidden_states = model(audio.to(0)).last_hidden_state
    break

In [None]:
audio = audio[:, :100000]

In [None]:
# The encoder was moved to GPU 0 earlier (`model.to(0)`), so the cropped batch has to be
# sent to that same device; sending it to GPU 1 puts input and weights on different devices.
model(audio.to(0)).last_hidden_state

In [None]:
hidden_states.shape

In [None]:
reshaped_states = hidden_states.permute(0,2,1)

In [None]:
reshaped_states.shape

In [None]:
prefix_vectors = cnn_reshaper(reshaped_states)

In [None]:
prefix_vectors.shape

In [None]:
mappingnetwork = MappingNetwork(dim_embedding = 768,
                                prefix_length = 80, 
                                device = 'cpu') 

In [None]:
mapped_prefix_vectors = mappingnetwork(prefix_vectors)

In [None]:
mapped_prefix_vectors.shape

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt = GPT2Model.from_pretrained("gpt2")

In [None]:
language_header = nn.Linear(768, 50257, bias=False) # 50257 : original vocabulary size of GPT2
header_gpt2_header_params = './AAC_Prefix/PreTrained_GPT2Header.pt'
# use pre-trained header; map_location keeps the checkpoint tensors on CPU, which is where
# this freshly created layer lives, regardless of the device the file was saved from
language_header.load_state_dict(torch.load(header_gpt2_header_params, map_location='cpu'))

In [None]:
embedding_text = gpt.wte(tokens.to('cpu'))
embedding_cat = torch.cat((mapped_prefix_vectors, embedding_text), dim=1)

out = gpt(inputs_embeds=embedding_cat.to('cpu'), attention_mask=mask.to('cpu'))
out_hidden_states = out[0]

logits = language_header(out_hidden_states)

In [None]:
logits.shape

In [None]:
logits.reshape(-1, logits.shape[-1]).shape

In [None]:
tokens.flatten().shape

In [None]:
embedding_text.shape

In [None]:
corrected_logits = logits[:, -embedding_text.shape[1]:, :]

In [None]:
logits[0]

In [None]:
corrected_logits[0]

In [None]:
corrected_logits.reshape(-1, logits.shape[-1]).shape

In [None]:
tokens.flatten()

In [None]:
loss = nnf.cross_entropy(corrected_logits.reshape(-1, logits.shape[-1]), tokens.flatten(), ignore_index=0)

In [None]:
loss

## Testing Hubert Encoder 

In [None]:
audio, caption, path = random.choice(clotho_dataset)

In [None]:
print(audio, audio.shape)

In [None]:
(input_values, input_values.shape)

In [None]:
import IPython
IPython.display.Audio(audio, rate=16000)

In [None]:
IPython.display.Audio(input_values, rate=16000)

In [None]:
hidden_states = model(audio.unsqueeze(0)).last_hidden_state

In [None]:
print(hidden_states, hidden_states.shape)

In [None]:
def map_to_array(batch):
    """Read the audio file named by ``batch["file"]`` and attach its samples.

    The decoded samples are stored under ``batch["speech"]``; the second value
    returned by ``sf.read`` is discarded. The same (mutated) ``batch`` object
    is returned.
    """
    samples, _unused = sf.read(batch["file"])
    batch["speech"] = samples
    return batch

# Load the small LibriSpeech dummy split and decode every file into a "speech" column.
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

# NOTE(review): `processor` is only created in a commented-out line of an earlier cell, so no
# visible cell defines it before this point — it has to be instantiated for this cell to run.
input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
# NOTE(review): `input_values` is not moved to the model's device here — confirm where
# `model` lives when this cell is executed.
hidden_states = model(input_values).last_hidden_state

## Testing caption model 

In [None]:
from pynvml import *
def print_gpu_utilization(device_index = 0):
    """Print how much memory is currently in use on one GPU, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which is the
            device the original version always queried.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
        print("#####")
    finally:
        # Balance nvmlInit() so repeated calls (or a failed query) do not leave
        # NVML initialised.
        nvmlShutdown()

In [None]:
from Trainer import *
from datahandlers.MyDataloader import *
from models.HubertCaption import *

In [None]:
# print_gpu_utilization()
model = HubertCaption(device = 0).to(0)
# print_gpu_utilization()

In [None]:
model.train()

In [None]:
train_dataloader = MyDataLoader({"hearts_kaggle": True}, 'train', 5,
                                             is_distributed=False, prefix_size = 60)
test_dataloader = MyDataLoader({"hearts_kaggle": True}, 'test', 5, is_distributed=False,
                                            prefix_size =60 , test_captions=1)

In [None]:
# Optimiser and learning-rate schedule: the total step count is epochs x batches per epoch,
# and the warm-up covers one fifteenth of it.
total_epochs = 30
steps_per_epoch = len(train_dataloader)
num_training_steps = total_epochs * steps_per_epoch
warmup_steps = int(num_training_steps / 15)

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)

# print("starting initialization of trainer")
trainer = Trainer(
    model,
    "roflotest",
    train_dataloader,
    test_dataloader,
    optimizer,
    scheduler,
    0,
    accum_iter=2,
    is_distributed=False,
)

print("trainer initialized")

trainer.train(total_epochs)

In [None]:
test_dataloader.test_captions

# Extend torch tensor

In [None]:
import torch

# Start from a small tensor of shape (1, 3).
original_tensor = torch.randn(1, 3)

# Add a leading axis and broadcast it to length 7, giving shape (7, 1, 3).
# `expand` returns a view over the same storage rather than seven independent copies;
# use `.repeat(7, 1, 1)` instead if real copies are needed.
new_shape = (7, 1, 3)  # New dimensions for repetition
result_tensor = original_tensor.unsqueeze(0).expand(*new_shape)

# Every slice along the first axis shows the same values as original_tensor
print(result_tensor.shape)  # Prints torch.Size([7, 1, 3])

In [None]:
original_tensor

In [None]:
result_tensor

# Checking semantics of prefix vectors

In [1]:
from models.Semantic import *
from datahandlers.MyDataloader import *
import random
import IPython
clotho_dataset = ClothoDataset(None, './data/Clotho', 'test', 26, tokens_size = 50,tokenizer_type = None)
# caps_dataset = AudioCapsDataset(None, './data/AudioCaps', 'test', 26, tokens_size = 50,tokenizer_type = None)
# dataloader = MyDataLoader({"hearts_kaggle": True}, 'train', 4, is_distributed=False, prefix_size = 80)

  from .autonotebook import tqdm as notebook_tqdm
get dataset...: 100%|██████████| 1045/1045 [00:02<00:00, 494.69it/s]


In [2]:
prefix_size_dict = {"temporal_prefix_size" : 60, "global_prefix_size" : 20}
device = 0
model = SemanticPrefix(prefix_size_dict = prefix_size_dict, device = device)
weights_path = '/home/stud_valery/AudioCaption/data/old_model_recs/bbc_model2/best_model'
params = torch.load(weights_path, map_location='cuda:' + str(device))
# strict=False does not raise on mismatching keys, so report them explicitly: parameters
# missing from the checkpoint keep their freshly initialised values.
load_result = model.load_state_dict(params, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print("missing keys:", load_result.missing_keys)
    print("unexpected keys:", load_result.unexpected_keys)
model.eval().to(device)

  self.melW = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels,


temporal feature ver's mapping network : num_head = 8 num_layers = 4 prefix_vector_length = 60
global feature ver's mapping network : num_head = 8 num_layers = 4 prefix_vector_length = 20
Encoder freezing
GPT2 has been freezed
header is training


SemanticPrefix(
  (audio_encoder): Cnn14(
    (spectrogram_extractor): Spectrogram(
      (stft): STFT(
        (conv_real): Conv1d(1, 257, kernel_size=(512,), stride=(320,), bias=False)
        (conv_imag): Conv1d(1, 257, kernel_size=(512,), stride=(320,), bias=False)
      )
    )
    (logmel_extractor): LogmelFilterBank()
    (spec_augmenter): SpecAugmentation(
      (time_dropper): DropStripes()
      (freq_dropper): DropStripes()
    )
    (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_block1): ConvBlock(
      (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_block2): ConvBlock(
      (conv1): Conv2d(64, 1

In [3]:
# Pick a random item from the Clotho dataset object and play its audio file.
# NOTE(review): every model call below is commented out, so this cell assigns neither
# `semantics` nor any prediction; later cells that read `semantics` depend on one of these
# lines having been run at some point.
dataset = clotho_dataset
with torch.no_grad():
    audio_file, caption, path = random.choice(dataset)
    # predict_beam, semantics = model(audio_file.unsqueeze(0).to(0), beam_search = True)#[0][0]
    # predict_beam_nonsoft = model(audio_file.unsqueeze(0).to(0), beam_search = True, with_softmax = False)
    # predict_beam_softmax = model(audio_file.unsqueeze(0).to(0), beam_search = True, with_softmax = True)
    # predict_greedy_nonsoft = model(audio_file.unsqueeze(0).to(0), beam_search = False, with_softmax = False)[0]
    # predict_greedy_softmax = model(audio_file.unsqueeze(0).to(0), beam_search = False, with_softmax = True)[0]
    # # predict = predict[0][0]
    # print(f"Ground truth caption: {caption}\n")

    # print("beam nonsoft:",  predict_beam_nonsoft)
    # print("beam soft:", predict_beam_softmax)
    # print("greedy nonsoft:", predict_greedy_nonsoft)
    # print("greedy soft:", predict_greedy_softmax)
IPython.display.Audio(path, rate=16000)

In [None]:
print (semantics, semantics.shape)

In [None]:
logits = (
            semantics @ torch.transpose(model.gpt.wte.weight.to(semantics.dtype), 0, 1)
        ).float()

In [None]:
model.gpt.wte.weight

In [None]:
semantics.max()

In [None]:
model.gpt.wte.weight[1000].max()

In [None]:
model.gpt.wte(torch.tensor([1000, 300], device = 0))

In [None]:
logits[0][0]

In [None]:
tokens = torch.argmax(logits, -1)
print(tokens)

In [None]:
model.tokenizer.decode(tokens[0])

In [None]:
embs.shape

In [None]:
embs = model.gpt.wte(tokens)
model.generate_beam(embs)

In [None]:
model.generate_beam(semantics)

In [17]:
prefix.argmax(-1)

torch.Size([1, 80])

In [5]:
model.get_semantic_vectors(prefix)

shape prefix reshaped:  torch.Size([80, 768])
semantic logits after linear shape:  torch.Size([80, 50257])
shape of semantic logits after reshape back:  torch.Size([1, 80, 50257])
after softmax:  torch.Size([1, 80])
after gpt wte:  tensor([[[ 8.9528e-02,  1.3695e-01,  3.4434e-01,  ...,  1.9922e-01,
          -5.5177e-02,  2.6428e-02],
         [ 5.2775e-02, -5.7685e-03,  1.6769e-01,  ...,  2.4477e-01,
          -8.4319e-02, -8.0892e-04],
         [ 8.9528e-02,  1.3695e-01,  3.4434e-01,  ...,  1.9922e-01,
          -5.5177e-02,  2.6428e-02],
         ...,
         [-1.4190e-01,  2.6440e-04,  3.7298e-02,  ...,  2.5052e-01,
           9.0462e-02,  1.3856e-02],
         [ 1.5184e-02, -6.1816e-04,  4.5935e-02,  ...,  9.1113e-02,
          -1.9022e-01,  5.2916e-02],
         [-4.5813e-02, -2.0575e-01,  1.5883e-01,  ..., -1.1458e-01,
           2.0977e-02,  1.5604e-01]]], device='cuda:0')


tensor([[[ 8.9528e-02,  1.3695e-01,  3.4434e-01,  ...,  1.9922e-01,
          -5.5177e-02,  2.6428e-02],
         [ 5.2775e-02, -5.7685e-03,  1.6769e-01,  ...,  2.4477e-01,
          -8.4319e-02, -8.0892e-04],
         [ 8.9528e-02,  1.3695e-01,  3.4434e-01,  ...,  1.9922e-01,
          -5.5177e-02,  2.6428e-02],
         ...,
         [-1.4190e-01,  2.6440e-04,  3.7298e-02,  ...,  2.5052e-01,
           9.0462e-02,  1.3856e-02],
         [ 1.5184e-02, -6.1816e-04,  4.5935e-02,  ...,  9.1113e-02,
          -1.9022e-01,  5.2916e-02],
         [-4.5813e-02, -2.0575e-01,  1.5883e-01,  ..., -1.1458e-01,
           2.0977e-02,  1.5604e-01]]], device='cuda:0')

In [4]:
with torch.no_grad():
    predict, prefix = model(audio_file.unsqueeze(0).to(0), beam_search = False, with_softmax = True, check_prefix = True)
    
    print(f"Ground truth caption: {caption}\n")
    print("pred: ", predict)


Ground truth caption: water drips continuously from the ceiling, never slowing.

pred:  ['someone is dropping a spoon into a container and then placing it on a hard surface.']


In [None]:
print(prefix.shape, prefix)

In [None]:
normalized_prefix = (prefix - prefix.mean())/prefix.std()
print(normalized_prefix)

In [None]:
prefix.mean(dim = 1).shape

In [None]:
# Standardise the prefix with its global mean and standard deviation.
# NOTE(review): `normalized_prefix` is computed but never used — the call below decodes the
# raw `prefix`. Confirm whether the normalized tensor was meant to be passed instead.
normalized_prefix = (prefix - prefix.mean())/prefix.std()
# print(normalized_prefix)
model.generate_beam(prefix, with_softmax=True)

In [None]:
tokens = torch.tensor(model.tokenizer("The cat is ").input_ids)
print(tokens)
embs = model.gpt.wte(tokens.to(0))
print(embs)

In [None]:
# log_softmax computes log(softmax(x)) in one numerically stable step; taking .log() of a
# softmax output can produce -inf when a probability underflows to zero.
embs_soft = torch.log_softmax(embs, -1)

In [None]:
small_prefix = prefix/100
small_prefix

In [None]:
concat_vec = torch.cat((embs, prefix[0]), dim = 0)

In [None]:
embs

In [None]:
embs.unsqueeze(0)/embs.std()

In [None]:
model.generate(embs.unsqueeze(0), with_softmax = True)

In [None]:
small_prefix = prefix/1000.0
small_prefix.std(dim = 1).mean()

In [None]:
embs.std(dim = 1)

In [None]:
model.generate(small_prefix, with_softmax = True)

## Text prompt

In [4]:
def gen_iters(embs, output_list, max_iter):
    """Greedily decode up to ``max_iter`` tokens from a sequence of input embeddings.

    Args:
        embs: embedding sequence without a batch axis, as produced by ``gen_text``
            (``model.gpt.wte`` applied to a 1-D tensor of token ids).
        output_list: list that receives the decoded text of each generated token,
            in generation order (mutated in place).
        max_iter: maximum number of tokens to generate.

    Decoding stops early once token id 13 has been emitted
    (presumably the GPT-2 id of "." — TODO confirm).
    """
    new_embs = embs.clone()
    with torch.no_grad():  # inference only; no autograd graph is needed
        for _ in range(max_iter):
            # Add the batch axis for the transformer call, then drop it again.
            hidden = model.gpt(inputs_embeds=new_embs.unsqueeze(0)).last_hidden_state[0]
            # Only the hidden state of the last position predicts the next token.
            logits = model.language_header(hidden[-1])
            token_id = int(torch.argmax(logits, -1))
            output_list.append(model.tokenizer.decode([token_id]))
            if token_id == 13:
                break
            # Feed back the embedding of the chosen token (not the hidden state), so the
            # extended sequence stays in the input-embedding space and keeps its rank.
            next_emb = model.gpt.wte(torch.tensor([token_id], device=new_embs.device))
            new_embs = torch.cat((new_embs, next_emb), dim=0)
def gen_from_latent(embs, gen_from_embs = False, gen_with_generate = False, max_iter = 5):
    """Generate text from an embedding sequence with one of two decoding paths.

    Args:
        embs: embedding sequence without a batch axis.
        gen_from_embs: when truthy, decode step by step with ``gen_iters``; this
            path takes precedence if both flags are set.
        gen_with_generate: when truthy (and ``gen_from_embs`` is not), delegate to
            ``model.generate`` on the batched embeddings with ``with_softmax=True``.
        max_iter: maximum number of tokens for the ``gen_iters`` path (default 5,
            the value that used to be hard-coded).

    Returns:
        The list of decoded token strings (``gen_iters`` path), whatever
        ``model.generate`` returns, or ``None`` when neither flag is set.
    """
    if gen_from_embs:
        output = []
        gen_iters(embs, output, max_iter)
        return output
    if gen_with_generate:
        return model.generate(embs.unsqueeze(0), with_softmax=True)
    return None
def gen_text(prompt, gen_from_embs = False, gen_with_generate = False):
    """Tokenize ``prompt``, embed it with the model's token embedding and generate text.

    Args:
        prompt: text to condition the generation on.
        gen_from_embs: forwarded to ``gen_from_latent``.
        gen_with_generate: forwarded to ``gen_from_latent``.

    Returns:
        Whatever ``gen_from_latent`` returns for the embedded prompt.
    """
    embedding = model.gpt.wte
    # Build the ids on the device that holds the embedding weights instead of assuming GPU 0,
    # so the helper works wherever the model has been placed.
    token_ids = torch.tensor(model.tokenizer(prompt).input_ids, device=embedding.weight.device)
    return gen_from_latent(embedding(token_ids), gen_from_embs, gen_with_generate)

In [None]:
gen_text("Hello, today we decided that", gen_with_generate = True)

In [None]:
gen_text("Hello, today we decided that", gen_with_generate = True)

## 'Audio' prompt
 

In [None]:
# audio_file, sr = torchaudio.load('/data/valerii/AudioCaption/models/audio.wav')
dataset = clotho_dataset
audio_file, caption, path = random.choice(dataset)
print('ground truth: ', caption)
IPython.display.Audio(path, rate=16000)

In [None]:
with torch.no_grad():
    predict, prefix = model(audio_file.to(0), beam_search = False, with_softmax = False, check_prefix = True)

In [5]:
dataset = clotho_dataset
with torch.no_grad():
    audio_file, caption, path = random.choice(dataset)
print('ground truth: ', caption)
IPython.display.Audio(path, rate=16000)

NameError: name 'clotho_dataset' is not defined

In [6]:
with torch.no_grad():
    predict, prefix = model(audio_file.unsqueeze(0).to(0), beam_search = False, with_softmax = False, check_prefix = True)
    print(f"Ground truth caption: {caption}\n")
    print("pred: ", predict)

NameError: name 'model' is not defined

In [None]:
model.tokenizer.decode(tokens)

In [None]:
# Build the prompt from adjacent string literals. A backslash continuation inside a single
# literal keeps the indentation of the following source lines, which put runs of extra
# spaces into the prompt after each "But".
prompt = (
    "Would you like to taste this syrop? Yes, of course, and you? "
    "It would be great for me to taste that, of course! But "
    "what about the cat? What about him? Maybe he would like to taste that syrop as well? "
    "OOh, no problem, we will share with him. But "
    "i am pretty sure that he doesn't want to taste something sweet like that."
)
tokens = torch.tensor(model.tokenizer(prompt).input_ids)
embs = model.gpt.wte(tokens.to(0))
# Text-prompt embeddings first, then the audio prefix with its batch axis removed.
gen_from_latent(torch.cat((embs, prefix.squeeze(0)), dim = 0), gen_with_generate = True)

In [None]:
embs

In [None]:
header_gpt2_header_params = './models/PreTrained_GPT2Header.pt'
model.language_header.load_state_dict(torch.load(header_gpt2_header_params))

In [None]:
prefix.shape

In [None]:
gen

In [None]:
prefix

## Evaluate using the `Evaluater` class

In [1]:
from Trainer import Evaluater
from models.AAC_Prefix import *
from datahandlers.MyDataloader import *


In [None]:
prefix_size_dict = {"temporal_prefix_size" : 60, "global_prefix_size" : 20}
device = 0
model = AAC_Prefix(prefix_size_dict = prefix_size_dict, device = device)
# Put the model on the evaluation device and in inference mode (disables dropout and
# batch-norm statistic updates) before it is handed to the evaluator.
model.to(device).eval()
weights_path = '/data/valerii/AudioCaption/data/old_model_recs/bbc_model2/best_model'
params = torch.load(weights_path, map_location='cuda:' + str(device))
model.load_state_dict(params)

In [None]:
test_dataloader = MyDataLoader({"audiocaps": True}, 'test', 5, is_distributed=False, prefix_size = 80)
evaluater = Evaluater(model, test_dataloader, device, beam_search=True)
evaluater.eval()