# Preparing Caption Embeddings for AudioCaps Alternative 4 Captions (ACalt4)

Our implementation does not convert captions into sentence (semantic) embeddings on the fly. Instead, the embeddings are precomputed offline and stored in a file.

- Before running this notebook, download ACalt4 from the external DOSHISHA website and save it as `../data/audiocaps_alternative_4.csv`.
- The following will create `../data/capemb_GTEbase_AC_BLIP_Aug.npy` using the GTE base sentence embedding encoder model.

In [1]:
import warnings; warnings.simplefilter('ignore')
import logging; logging.basicConfig(level=logging.INFO)
import numpy as np
import pandas as pd
import torch

INFO:numexpr.utils:Note: detected 80 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
INFO:numexpr.utils:Note: NumExpr detected 80 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [2]:
# https://huggingface.co/thenlper/gte-base

import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool token states over axis 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are replaced by 0.0 before
    summing, and the sum is divided by the number of non-masked positions
    per row.

    Args:
        last_hidden_states: Per-token hidden states; assumed to be
            (batch, tokens, hidden) as produced by the encoder in this notebook.
        attention_mask: Mask with one entry per token (non-zero = keep);
            assumed to be (batch, tokens).

    Returns:
        One pooled vector per batch row. A row whose mask is entirely zero
        divides by zero (no guard is applied).
    """
    # Broadcast the mask over the hidden axis and blank out padded tokens.
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(~keep, 0.0)
    # Number of real tokens per row, shaped to broadcast against the summed states.
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

# Sanity check of the GTE base encoder: embed one query and three candidate
# texts, then print the scaled cosine similarity of the query to each candidate.
input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms"
]

# `tokenizer` and `model` are reused by the embedding cells further down.
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
model = AutoModel.from_pretrained("thenlper/gte-base")

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: disable autograd so the forward pass does not build a graph.
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
# Row 0 is the query; rows 1.. are the candidates it is compared against.
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())

[[69.65808868408203, 88.03551483154297, 68.79684448242188]]


In [3]:
# Load the ACalt4 caption table and key it by YouTube id (one row per clip).
acalt4_table = pd.read_csv('../data/audiocaps_alternative_4.csv')
df = acalt4_table.set_index('youtube_id')
df

Unnamed: 0_level_0,caption1,caption2,caption3,caption4
youtube_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
---1_cCGK4M,A train is moving along the tracks with the rh...,"A train swiftly moving along the tracks, accom...","A train horn blaring in the distance, blending...","The unmistakable sound of a train, with the cl..."
---lTs1dxhU,A racing car speeding past in a virtual race,A car zooming around a track in a video game,The fast-paced sound of a car zooming along a ...,A dynamic sound of a vehicle racing on a track...
--0PQM4-hqg,Water flowing through a river with a gurgling ...,A waterfall cascading down with a rush of water,Gurgling water flowing through a peaceful land...,Natures symphony includes the gentle gurgling ...
--299m5_DdE,Excitement fills the indoor water park as chil...,The joyful sounds of children playing fill the...,Gurgling water and a waterfall fill the indoor...,The air in an indoor water park is filled with...
--2XRMjyizo,"Bird vocalizations, with chirps and tweets, fi...",Two police officers standing in front of a map,Birds chirping and tweeting in the background,Amidst the scene of two police officers studyi...
...,...,...,...,...
zzlfP-snUeY,A bulldozer idling in a rural area,A bulldozer idles and its engine rumbles softl...,An idling engine of a vehicle in an outdoor se...,The engine of a parked bulldozer purrs quietly...
zzm3dwoXY8Y,Birds chirping and cooing in a natural outdoor...,Birds chirping and cooing in an outdoor setting,A soft cooing sound coming from a group of bir...,The cooing of pigeons in an outdoor environment
zzvWbSyZfr0,The snoring in this image is occasionally inte...,There is snoring and occasional speech coming ...,A young girl is peacefully sleeping on a bed i...,"In the background, there is a gentle snoring s..."
zzwBazlj0Oc,The soft sound of pigeons cooing in a confined...,Birds cooing softly in a confined space,Pigeons cooing softly in a confined space,Pigeons cooing softly in a small room


In [4]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

# Batch the caption rows (one row per clip) into groups of 64 clips.
cap_chunks = list(chunks(list(df.values), 64))

In [5]:
from tqdm import tqdm

# Encode every caption chunk with the GTE model and collect the pooled embeddings.
# Use the first GPU when present; otherwise fall back to CPU (slower, same result layout).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# Make inference mode explicit so dropout is disabled however `model` was set up.
model.eval()

emb_chunks = []
for caps in tqdm(cap_chunks):
    # Flatten the chunk clip by clip: clip 0's four captions, then clip 1's, ...
    flat_caps = []
    for cap4 in caps:
        assert len(cap4) == 4  # asserts 4 captions each
        flat_caps.extend(cap4)

    with torch.no_grad():
        batch_dict = tokenizer(flat_caps, max_length=512, padding=True, truncation=True, return_tensors='pt')
        # Move whatever tensors the tokenizer produced, without assuming specific keys
        # (e.g. `token_type_ids` is not emitted by every tokenizer).
        batch_dict = {key: value.to(device) for key, value in batch_dict.items()}
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).cpu()
    # Undo the flattening: (clips * 4, hidden) -> (clips, 4, hidden).
    embeddings = embeddings.reshape(-1, 4, embeddings.shape[-1])
    emb_chunks.append(embeddings)


100%|██████████| 653/653 [01:25<00:00,  7.62it/s]


In [6]:
# Join the per-chunk tensors along the clip axis, then downcast to float16
# (presumably to shrink the saved file — the cast loses precision).
stacked = torch.cat(emb_chunks, dim=0)
embs = stacked.numpy().astype(np.float16)
embs.shape

(41785, 4, 768)

In [7]:
# Map each YouTube id to its (4, hidden) embedding block and persist the mapping.
# `zip` would silently truncate if `df` and `embs` were out of sync (e.g. cells
# executed out of order), so fail loudly instead of writing a partial file.
assert len(df.index) == len(embs), f'{len(df.index)} ids vs {len(embs)} embedding rows'
embdic = dict(zip(df.index.values, embs))
# np.save pickles the dict inside an object array; read it back with
# np.load(path, allow_pickle=True).item().
np.save('../data/capemb_GTEbase_AC_BLIP_Aug.npy', embdic)

In [8]:
# Spot-check one entry: its shape and its values.
sample_emb = embdic['---1_cCGK4M']
sample_emb.shape, sample_emb

((4, 768),
 array([[-0.1776 , -0.2524 ,  0.2241 , ...,  0.568  ,  0.501  , -0.3445 ],
        [-0.1724 , -0.3872 ,  0.0874 , ...,  0.247  ,  0.6016 , -0.3633 ],
        [ 0.1284 , -0.0255 ,  0.1407 , ...,  0.4292 ,  0.4458 , -0.1812 ],
        [-0.04327, -0.3618 ,  0.4766 , ...,  0.3176 ,  0.2566 , -0.4915 ]],
       dtype=float16))