# Preparing Caption Embeddings for Auto-ACD

Our implementation does not convert captions into sentence (semantic) embeddings on the fly. Instead, this notebook computes the embeddings once, offline, and saves them to disk in advance.

- Download the [Auto-ACD](https://auto-acd.github.io/) `train.csv` from [Hugging Face](https://huggingface.co/datasets/Loie/Auto-ACD) in advance and save it as `../data/auto-acd/train.csv`.
- The cells below create `../data/capemb_GTEbase_Audo_A_C_D.npy` using the GTE base sentence-embedding encoder model, and also write the merged captions to `../data/auto_acd_on_label_caption.csv`.

In [1]:
import json
import warnings; warnings.simplefilter('ignore')
import logging; logging.basicConfig(level=logging.INFO)
from pathlib import Path

import numpy as np
import pandas as pd
import torch

In [2]:
# https://huggingface.co/thenlper/gte-base

import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool token states over the sequence axis, ignoring masked positions.

    Args:
        last_hidden_states: Token-level hidden states; dim 1 is the axis that
            gets reduced.
        attention_mask: Mask whose non-zero entries mark the positions to keep.

    Returns:
        Sum of the kept token states along dim 1 divided by the per-row sum of
        ``attention_mask``.
    """
    # Broadcast the mask over the feature axis and zero out masked positions.
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(~keep, 0.0)
    # Divide by the number of kept tokens per row, not the padded length.
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

# Quick sanity check taken from the model card: embed one query and three
# candidate texts, then print the scaled cosine similarities.
input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms"
]

# `tokenizer` and `model` are reused by the later cells that embed the captions.
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
model = AutoModel.from_pretrained("thenlper/gte-base")

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: run the forward pass without autograd so no graph is built
# or kept alive for these activations.
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
# Similarity of the first text against the remaining three, scaled by 100.
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

[[69.65808868408203, 88.03551483154297, 68.79684448242188]]


In [4]:
import urllib.request


def download_segment_csv():
    """Download the AudioSet segment CSV files into ``/tmp`` when they are missing.

    Fetches the eval, balanced-train and unbalanced-train segment lists. A file
    that already exists at its destination is left untouched, so repeated calls
    are cheap. Prints ``Wrote <path>`` for every file that was written.

    Raises:
        urllib.error.URLError: If a download fails. No destination file is
            created in that case, so a later call retries the download.
    """
    EVAL_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv'
    BALANCED_TRAIN_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv'
    UNBALANCED_TRAIN_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv'

    for subset_url in [EVAL_URL, BALANCED_TRAIN_URL, UNBALANCED_TRAIN_URL]:
        subset_path = '/tmp/' + Path(subset_url).name
        if Path(subset_path).is_file():
            continue
        # Fetch first and only then touch the file system: opening the
        # destination before the download would leave an empty file behind on
        # failure, which the is_file() check above would treat as "done".
        with urllib.request.urlopen(subset_url) as response:
            subset_data = response.read().decode()
        # Write to a sibling file and rename it into place so an interrupted
        # write never leaves a truncated CSV at the final path.
        partial_path = Path(subset_path + '.part')
        partial_path.write_text(subset_data)
        partial_path.replace(subset_path)
        print('Wrote', subset_path)


def make_metadata(download=False):
    """Build the AudioSet segment table and the ontology lookup maps.

    Reads the three segment CSV files from ``/tmp`` and fetches the AudioSet
    ontology JSON from GitHub (the ontology is always fetched over the network,
    regardless of ``download``).

    Args:
        download: When True, call ``download_segment_csv()`` first so that any
            missing segment CSV is fetched. When False, the files must already
            exist under ``/tmp``.

    Returns:
        A tuple ``(df, id2name, id2desc)``:
        ``df`` has the columns ``ytid``, ``label`` (comma-separated ontology
        ids with the surrounding double quotes removed) and ``split``; the
        per-file row index is kept as-is, so index values repeat across splits.
        ``id2name`` / ``id2desc`` map an ontology id to its name / description.

    Raises:
        AssertionError: If a label value is not wrapped in double quotes.
    """
    # download the original metadata.
    if download:
        download_segment_csv()

    # load label maps.
    split_frames = []
    for split in ['eval_segments', 'balanced_train_segments', 'unbalanced_train_segments']:
        # Skip the first two lines of each file. Fields are separated by
        # comma + space, which needs the python parser engine.
        split_df = pd.read_csv(f'/tmp/{split}.csv', skiprows=2, sep=', ', engine='python')
        split_df['split'] = split
        split_frames.append(split_df)
    df = pd.concat(split_frames)
    df = df[['# YTID', 'positive_labels', 'split']].copy()
    df.columns = ['ytid', 'label', 'split']

    # Close the HTTP response deterministically instead of leaking it.
    ontology_url = 'https://raw.githubusercontent.com/audioset/ontology/master/ontology.json'
    with urllib.request.urlopen(ontology_url) as response:
        on = json.loads(response.read().decode())
    id2name = {o['id']: o['name'] for o in on}
    id2desc = {o['id']: o['description'] for o in on}

    # clean labels.
    def remove_quotations(s):
        assert s[0] == '"' and s[-1] == '"'
        return s[1:-1]
    df.label = df.label.apply(remove_quotations)

    return df, id2name, id2desc


# Build the AudioSet segment table and the ontology id -> name / description maps.
# download=False expects the three segment CSVs to already exist under /tmp;
# use the commented call below instead to fetch any that are missing first.
df, id2name, id2desc = make_metadata(download=False)
# df, id2name, id2desc = make_metadata(download=True)

Wrote /tmp/eval_segments.csv
Wrote /tmp/balanced_train_segments.csv
Wrote /tmp/unbalanced_train_segments.csv


In [5]:
# Load the Auto-ACD training captions.
acd = pd.read_csv('../data/auto-acd/train.csv')

#
# Some captions are wrapped in a 'could be: "..."' preamble; keep only the quoted caption.
# ex.)
# From: 'The audio caption for the given information could be: "A car engine idles consistently, indicating a car engine knocking, possibly in an engine room."'
# To: 'A car engine idles consistently, indicating a car engine knocking, possibly in an engine room.'
#

import re

def replace_could_be_as_suggested(cap):
    '''Return only the quoted caption when ``cap`` is phrased as a suggestion.

    Example input:
    'The audio caption for the given information could be: "A car engine idles consistently, indicating a car engine knocking, possibly in an engine room."'
    Example output:
    'A car engine idles consistently, indicating a car engine knocking, possibly in an engine room.'

    Args:
        cap: Caption text. Non-string values (e.g. NaN from an empty CSV cell)
            are returned unchanged.

    Returns:
        The text between ``be: "`` and the last ``"`` on the same line, or
        ``cap`` unchanged when it does not have that form.
    '''
    # Missing captions come through pandas as float NaN; `in` would raise on them.
    if not isinstance(cap, str) or 'be: "' not in cap:
        return cap
    match = re.search('be: "(.+)"', cap)
    # The marker can be present without a matching closing quote (truncated
    # text, or the quote sits on a later line); keep the caption as-is rather
    # than failing on `None.group`.
    if match is None:
        return cap
    return match.group(1)
    
# Rewrite every Auto-ACD caption in place, dropping the 'could be: "..."' wrapper where present.
acd.caption = acd.caption.apply(replace_could_be_as_suggested)


In [6]:
def autoacd_get_ytid(ytid):
    """Extract the 11-character YouTube video id from an Auto-ACD ``youtube_id``.

    Two layouts are handled:
      * ``<11-char id>_<6 digits>`` (e.g. ``oPRC3zComfA_000117``) -> first 11 chars.
      * ``Y<11-char id>`` (e.g. ``YPsERnugT1hs``) -> the 11 chars after the ``Y``.

    The suffixed layout is checked first: a video id may itself start with
    ``Y``, and stripping that letter would return a shifted, wrong id.

    Args:
        ytid: The raw ``youtube_id`` string.

    Returns:
        The extracted video id (shorter than 11 characters only if the input
        itself is too short; an empty string yields an empty string).
    """
    # `<id>_<6 digits>` layout: the id is the first 11 characters even when it
    # happens to begin with 'Y'.
    if len(ytid) == 18 and ytid[11] == '_' and ytid[12:].isdigit():
        return ytid[:11]
    # `Y<id>` layout: drop the leading marker letter.
    if ytid.startswith('Y'):
        return ytid[1:1+11]
    return ytid[:11]
# Add a 'ytid' column with the YouTube video id derived from each 'youtube_id', then display the frame.
acd['ytid'] = acd.youtube_id.apply(autoacd_get_ytid)
acd

Unnamed: 0,youtube_id,caption,ytid
0,oPRC3zComfA_000117,Electronic music plays as a person skillfully ...,oPRC3zComfA
1,cwPXoYexXrs_000311,The sound of rustling followed by digital beep...,cwPXoYexXrs
2,ATSbLAiKFXI_000030,"Rain falls onto a hard surface as a rowboat, c...",ATSbLAiKFXI
3,Tdxa6DYIo-Y_000469,The clarinet plays a melodic tune accompanied ...,Tdxa6DYIo-Y
4,eALUwesAXys_000268,Gunshots ring out as a metallic clang echoes i...,eALUwesAXys
...,...,...,...
1921873,ioo3vA8dvZE_000192,A man and child shoot targets with guns while ...,ioo3vA8dvZE
1921874,TbDp9hcY-pc_000030,A dog loudly barks and scratches at a rolling ...,TbDp9hcY-pc
1921875,YmAJAqzo-Y_Y,A woman talks quietly in a small room while a ...,mAJAqzo-Y_Y
1921876,YPsERnugT1hs,A lady in a dress speaks softly on a boat whil...,PsERnugT1hs


In [7]:
def convert_to_simple_caption(comma_separated_labels):
    """Turn a comma-separated list of ontology ids into a plain English caption.

    Each id is looked up in the module-level ``id2name`` map. One label gives
    ``"The sound of X."``; several give ``"The sound of A, B, and C."``.

    Args:
        comma_separated_labels: Ontology ids joined by ``,`` (no spaces).

    Returns:
        The caption string.

    Raises:
        KeyError: If an id is not present in ``id2name``.
    """
    names = [id2name[label_id] for label_id in comma_separated_labels.split(',')]
    if len(names) <= 1:
        return f"The sound of {names[0]}."
    # All names but the last are comma-joined; the last is attached with ", and".
    *leading, final = names
    return "The sound of {}, and {}.".format(", ".join(leading), final)

# Label-derived fallback caption for every AudioSet clip; display the resulting frame.
df['caption'] = df['label'].apply(convert_to_simple_caption)
df

Unnamed: 0,ytid,label,split,caption
0,--4gqARaEJE,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk",eval_segments,"The sound of Domestic animals, pets, Squeak, D..."
1,--BfvyPmVMo,/m/03l9g,eval_segments,The sound of Hammer.
2,--U7joUcTCo,/m/01b_21,eval_segments,The sound of Cough.
3,--i-y1v8Hy8,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005",eval_segments,"The sound of Music, Speech, Female singing, an..."
4,-0BIyqJj9ZU,"/m/07rgt08,/m/07sq110,/t/dd00001",eval_segments,"The sound of Chuckle, chortle, Belly laugh, an..."
...,...,...,...,...
2041784,zzyyleHsxfk,"/m/05tny_,/m/068hy,/m/0bt9lr,/m/0jbk",unbalanced_train_segments,"The sound of Bark, Domestic animals, pets, Dog..."
2041785,zzz-JsGPtxQ,"/m/015lz1,/m/0l14jd",unbalanced_train_segments,"The sound of Singing, and Choir."
2041786,zzz3PZXRQ_8,"/m/030rvx,/m/09x0r",unbalanced_train_segments,"The sound of Buzzer, and Speech."
2041787,zzznDcamMpw,"/m/09ddx,/m/09x0r",unbalanced_train_segments,"The sound of Duck, and Speech."


In [8]:
from collections import OrderedDict

# Start from the label-derived caption of every AudioSet clip, keyed by
# YouTube id and inserted in df row order.
acdic = OrderedDict(zip(df['ytid'].values, df['caption'].values))

# Wherever Auto-ACD has a caption for an id that is already present, it
# replaces the label-derived one; ids unknown to df are ignored.
for ytid, auto_caption in zip(acd['ytid'].values, acd['caption'].values):
    if ytid in acdic:
        acdic[ytid] = auto_caption

In [9]:
# Sanity check: the dictionary keys must follow the same order as df.ytid.
# Any mismatching pair is printed; three known ids are echoed as a sign of life.
spot_check_ids = ('--4gqARaEJE', '--BfvyPmVMo', '--U7joUcTCo')
for dict_ytid, frame_ytid in zip(acdic, df.ytid):
    if dict_ytid != frame_ytid:
        print(dict_ytid, frame_ytid)
    if dict_ytid in spot_check_ids:
        print('OK', dict_ytid, frame_ytid)

OK --4gqARaEJE --4gqARaEJE
OK --BfvyPmVMo --BfvyPmVMo
OK --U7joUcTCo --U7joUcTCo


In [10]:
# acdic was filled in df row order, so its values are assigned positionally, one per row
# (assumes ytid is unique in df; otherwise the lengths would not match).
df['auto_acd_on_label_caption'] = acdic.values()

In [11]:
# Eyeball ten rows starting at position K: label-derived caption vs. final caption.
K = 10000
preview_columns = ['ytid', 'caption', 'auto_acd_on_label_caption']
df[preview_columns][K:K+10].values

array([['RSXUYIi95wo',
        'The sound of Whispering, Music, Crackle, and Speech.',
        'A woman whispers with music playing softly in the background, creating a serene atmosphere.'],
       ['RSryuuvUfDM',
        'The sound of Blues, Guitar, Acoustic guitar, Music, Mandolin, Musical instrument, Steel guitar, slide guitar, and Plucked string instrument.',
        'A melodic acoustic guitar is being played with background music in a music studio.'],
       ['RSxTBSVezfU', 'The sound of Tubular bells.',
        'The sound of Tubular bells.'],
       ['RT234P5SzJo',
        'The sound of Wild animals, Walk, footsteps, Roaring cats (lions, tigers), and Animal.',
        'Lions roar loudly amidst the rustling of leaves, indicating a wild animal encounter in their natural habitat.'],
       ['RTDr2L_OT0M', 'The sound of Printer.',
        'The sound of a ticking clock accompanies the rhythmic noise of a printer as it prints in an office.'],
       ['RTGrNUWT2Sc', 'The sound of Music,

In [12]:
# Persist the merged caption table without the row index column.
df.to_csv('../data/auto_acd_on_label_caption.csv', index=False)

In [13]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Args:
        lst: Any sliceable sequence with a length.
        n: Maximum size of each slice.

    Yields:
        ``lst[0:n]``, ``lst[n:2*n]``, ... in order; the last slice may be shorter.
    """
    total = len(lst)
    yield from (lst[begin:begin + n] for begin in range(0, total, n))

# Group the final captions into batches of 256 for the encoder.
captions = list(df.auto_acd_on_label_caption.values)
cap_chunks = list(chunks(captions, 256))

In [21]:
# Move the encoder to the first CUDA GPU for the bulk embedding pass (requires a CUDA device).
model = model.to('cuda:0')

In [30]:
from tqdm import tqdm

# Encode every caption batch and collect the mean-pooled sentence embeddings on the CPU.
# Inputs are sent to whatever device the model currently lives on, so this
# loop does not depend on a hard-coded device string.
device = next(model.parameters()).device

emb_chunks = []
for caps in tqdm(cap_chunks):
    with torch.no_grad():
        batch_dict = tokenizer(caps, max_length=512, padding=True, truncation=True, return_tensors='pt')
        # Move every tensor the tokenizer returned, rather than a fixed list of
        # keys that a tokenizer may or may not produce.
        batch_dict = {key: value.to(device) for key, value in batch_dict.items()}
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']).detach().cpu()
    emb_chunks.append(embeddings)


100%|██████████| 8142/8142 [37:13<00:00,  3.65it/s] 


In [31]:
# Concatenate the per-batch embeddings along the row axis, convert to NumPy and cast to float16.
embs = torch.cat(emb_chunks, dim=0).numpy().astype(np.float16)
# Show the resulting array shape.
embs.shape

(2084320, 768)

In [32]:
# Map each YouTube id to its embedding row and save the whole dict in one .npy file.
embdic = dict(zip(df.ytid.values, embs))
np.save('../data/capemb_GTEbase_Audo_A_C_D.npy', embdic)

In [33]:
# Display the first embedding vector as a quick visual check.
embs[0]

array([ 9.99613404e-02,  4.34343398e-01,  1.33894801e-01,  2.50875980e-01,
        9.74929333e-01,  4.29515004e-01,  9.52278614e-01,  6.12851858e-01,
       -3.82169485e-01, -1.05268466e+00, -2.49101609e-01,  1.67118013e-01,
       -9.08356428e-01,  2.73141295e-01, -1.98690295e-01,  1.12705946e+00,
        6.97766364e-01,  1.66415870e-01,  5.90664566e-01, -5.95370680e-02,
       -5.06446719e-01, -5.86260080e-01, -2.00126931e-01,  5.48775375e-01,
        2.73360133e-01,  3.00086532e-02,  6.87165409e-02,  6.79675281e-01,
       -1.51699400e+00,  2.00590771e-02,  3.40286791e-01,  4.85813711e-03,
        1.41574219e-01, -4.14322525e-01, -1.45870492e-01,  1.52382016e-01,
        1.94605529e-01, -9.57792819e-01,  1.13982752e-01, -2.38529876e-01,
       -2.95700312e-01, -5.99810034e-02,  3.01841736e-01,  3.30573209e-02,
       -8.68972778e-01, -6.47516787e-01, -4.76893365e-01,  8.97218168e-01,
       -3.44504297e-01, -1.94447771e-01, -5.41380167e-01,  4.86705780e-01,
       -2.01902032e-01, -