In [None]:
from ego4d.ego4d_dataset import Ego4D_Narration
import soundfile as sf
import numpy as np
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from sentence_transformers import SentenceTransformer
import pandas as pd
import re
import tqdm

In [None]:
# Sentence-embed the cleaned capture24 labels and cache the result on disk.
model = SentenceTransformer('all-MiniLM-L6-v2')
capture24 = pd.read_csv('./resources/capture24_label_count.csv')

# Each raw annotation is a ';'-separated string: keep its second-to-last field
# and strip every run of digits from it. The count column is read alongside the
# annotation column but is not used for the embedding itself.
clean_annotation = [
    re.sub(r'\d+', '', raw_label.split(';')[-2])
    for raw_label, _count in zip(capture24['annotation'], capture24['count'])
]

embeddings = model.encode(clean_annotation)
np.save('./resources/capture24_label_embedding.npy', embeddings)

In [None]:
# Encode every Ego4D window (IMU, audio and narration text) with ImageBind and
# cache the per-modality embeddings, plus the embeddings of a small set of
# motion labels, as .npy files under ./resources/.
from imagebind import data
import torch
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

dataset = Ego4D_Narration(modal=['audio', 'imu'], window_sec=10)
all_embeddings = {'audio': [], 'imu': [], 'text': []}
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Instantiate model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)
print("Model loaded successfully.")

# Every window is encoded (no early exit): the saved arrays are later indexed
# by window index, so they need exactly one row per dataset item.
for idx in tqdm.tqdm(range(len(dataset)), desc='imagebind'):
    data_sample = dataset[idx]
    # Add a leading batch axis of size 1 to both signals.
    imu = torch.from_numpy(data_sample['imu'])[None, :]
    audio = torch.from_numpy(data_sample['audio'])[None, :]
    text = data_sample['text']
    inputs = {
        ModalityType.IMU: imu.float().to(device),
        # NOTE(review): the loader is handed an in-memory [waveform, sample_rate]
        # pair rather than a file path, and 16000 is presumably the dataset's
        # audio sample rate -- confirm both against the installed imagebind.data.
        ModalityType.AUDIO: data.load_and_transform_audio_data([[audio, 16000]], device),
        ModalityType.TEXT: data.load_and_transform_text([text], device),
    }
    with torch.no_grad():
        embeddings = model(inputs)
    imu_embedding = embeddings[ModalityType.IMU].cpu().numpy()
    audio_embedding = embeddings[ModalityType.AUDIO].cpu().numpy()
    text_embedding = embeddings[ModalityType.TEXT].cpu().numpy()
    all_embeddings['audio'].append(audio_embedding)
    all_embeddings['imu'].append(imu_embedding)
    all_embeddings['text'].append(text_embedding)

# Join the per-window results along the first axis before saving.
np.save('./resources/ego4d_imagebind_audio_embedding.npy', np.concatenate(all_embeddings['audio'], axis=0))
np.save('./resources/ego4d_imagebind_imu_embedding.npy', np.concatenate(all_embeddings['imu'], axis=0))
np.save('./resources/ego4d_imagebind_text_embedding.npy', np.concatenate(all_embeddings['text'], axis=0))

# Text embeddings of the candidate motion labels, saved separately.
motion_candidate = ['walking', 'standing', 'sports', 'lying down', 'sitting', 'transportation']
inputs = {
        ModalityType.TEXT: data.load_and_transform_text(motion_candidate, device),
}
with torch.no_grad():
    embeddings = model(inputs)
np.save('./resources/ego4d_imagebind_motion_embedding.npy', embeddings[ModalityType.TEXT].cpu().numpy())

In [None]:
# embedding visualization
# Draw a random subset of scenarios, sample a fixed number of windows from
# each, project their ImageBind embeddings to 2-D with t-SNE and save one
# scatter plot per modality plus one for the audio+IMU feature concatenation.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

N_SCENARIOS = 10        # number of scenarios shown in each plot
N_PER_SCENARIO = 50     # windows sampled (without replacement) per scenario
rng = np.random.default_rng(42)  # seeded so the sampled subset is reproducible


def _save_scatter(points_2d, labels, path):
    """Scatter the 2-D `points_2d` coloured by `labels`, hide the ticks, save to `path`."""
    fig, axs = plt.subplots(1, 1, figsize=(6, 4))
    plt.scatter(points_2d[:, 0], points_2d[:, 1], c=labels)
    plt.yticks([])
    plt.xticks([])
    plt.tight_layout()
    plt.savefig(path)


dataset = Ego4D_Narration(modal=['audio', 'imu'], window_sec=10)
# Map every scenario id to the indices of the windows tagged with it.
scenario_embeddings = {}; scenario_labels = []
for i in range(len(dataset)):
    for s in dataset.window_idx[i]['scenario']:
        scenario_embeddings.setdefault(s, []).append(i)

audio_embeddings = np.load('./resources/ego4d_imagebind_audio_embedding.npy')
imu_embeddings = np.load('./resources/ego4d_imagebind_imu_embedding.npy')
text_embeddings = np.load('./resources/ego4d_imagebind_text_embedding.npy')

# Sampling without replacement needs at least N_PER_SCENARIO windows, so only
# scenarios that have that many are eligible.
eligible_scenarios = [s for s, idxs in scenario_embeddings.items() if len(idxs) >= N_PER_SCENARIO]
if not eligible_scenarios:
    raise ValueError(f'no scenario has at least {N_PER_SCENARIO} windows to sample from')
# Pick positions rather than the keys themselves so the keys keep their
# original Python type for the dictionary lookup below.
picked = rng.choice(len(eligible_scenarios), min(N_SCENARIOS, len(eligible_scenarios)), replace=False)
random_scenario = [eligible_scenarios[int(k)] for k in picked]

all_embeddings = {'audio': [], 'imu': [], 'text': []}; all_labels = []
for i, scenario in enumerate(random_scenario):
    random_indices = rng.choice(scenario_embeddings[scenario], N_PER_SCENARIO, replace=False)
    all_embeddings['audio'].append(audio_embeddings[random_indices])
    all_embeddings['imu'].append(imu_embeddings[random_indices])
    all_embeddings['text'].append(text_embeddings[random_indices])
    all_labels.extend([i] * N_PER_SCENARIO)

# t-SNE needs one 2-D (n_samples, dim) array, so join the per-scenario chunks
# along the sample axis; the rows then line up one-to-one with all_labels.
stacked_embeddings = {
    modality: np.concatenate(chunks, axis=0) for modality, chunks in all_embeddings.items()
}

os.makedirs('./figs', exist_ok=True)  # savefig does not create missing folders
tsne = TSNE(n_components=2, random_state=42)
for modality in stacked_embeddings:
    embeddings_2d = tsne.fit_transform(stacked_embeddings[modality])
    _save_scatter(embeddings_2d, all_labels, f'./figs/ego4d_{modality}_scenario_mapping.png')

# Multimodal view: audio and IMU features of the same window side by side
# (concatenated along the feature axis, not the sample axis).
embeddings_2d = tsne.fit_transform(
    np.concatenate([stacked_embeddings['audio'], stacked_embeddings['imu']], axis=1)
)
_save_scatter(embeddings_2d, all_labels, './figs/ego4d_multimodal_scenario_mapping.png')


In [None]:
# scenario similarity
# For every scenario, compare the mean pairwise dot product among a sample of
# its own narration embeddings (intra) with the mean dot product between that
# sample and a random draw from the whole dataset (inter). One
# "intra inter" row per scenario is written to a text file.
N_SAMPLES_PER_SCENARIO = 500
rng = np.random.default_rng(42)  # seeded so the reported numbers are reproducible

dataset = Ego4D_Narration(modal=['audio', 'imu'], window_sec=10)
text_embeddings = np.load('./resources/ego4d_narration_embedding.npy')

# Map every scenario id to the indices of the windows tagged with it.
scenario_embeddings = {}
for i in range(len(dataset)):
    for s in dataset.window_idx[i]['scenario']:
        scenario_embeddings.setdefault(s, []).append(i)
scenario_names = list(scenario_embeddings.keys())

all_idxs = np.arange(len(dataset))  # identical for every scenario, so built once
scenario_similarities = []
for s in scenario_names:
    # Drawn with replacement, so a scenario with fewer windows than the sample
    # size still yields N_SAMPLES_PER_SCENARIO rows (with repeats).
    idxs = rng.choice(scenario_embeddings[s], N_SAMPLES_PER_SCENARIO)
    text_embeddings_scenario = text_embeddings[idxs]
    # The reference draw is without replacement, so it cannot exceed the
    # number of windows in the dataset.
    random_idxs = rng.choice(all_idxs, min(len(idxs), len(all_idxs)), replace=False)
    random_embeddings = text_embeddings[random_idxs]

    # NOTE(review): a plain dot product equals cosine similarity only when the
    # rows are unit-norm -- confirm the saved narration embeddings are normalized.
    # NOTE(review): the intra mean also averages each sample with itself (and
    # with its repeats from the with-replacement draw) -- confirm this is intended.
    intra_cosine_similarity = np.dot(text_embeddings_scenario, text_embeddings_scenario.T)
    intra_cosine_similarity = np.mean(intra_cosine_similarity)

    inter_cosine_similarity = np.dot(text_embeddings_scenario, random_embeddings.T)
    inter_cosine_similarity = np.mean(inter_cosine_similarity)
    scenario_similarities.append([intra_cosine_similarity, inter_cosine_similarity])
np.savetxt('./resources/ego4d_scenario_similarity.txt', scenario_similarities, fmt='%f')

In [None]:
# given activity_set, find the top 50 samples for each activity
# Narration embeddings are scored against CLAP text embeddings of the activity
# names; for each activity the best-scoring windows above a threshold are
# exported as a .wav (audio) / .npy (IMU) pair into a per-activity folder.
import json
import laion_clap

top_k = 50
SIMILARITY_THRESHOLD = 0.7  # windows scoring at or below this are dropped

dataset = Ego4D_Narration(modal=['audio', 'imu'], window_sec=10)
text_embeddings = np.load('./resources/ego4d_narration_embedding.npy')
audio_embeddings = np.load('./resources/ego4d_audio_embedding.npy')  # loaded but not used in this cell

# Mapping: activity name -> list of motion attributes describing it.
with open('./resources/ego4d_activity_basic.json') as activity_file:
    activity_motion_attributes = json.load(activity_file)
activity_candidates = list(activity_motion_attributes.keys())

model = laion_clap.CLAP_Module(enable_fusion=False)
model.load_ckpt()
activity_embeddings = model.get_text_embedding(activity_candidates)
# One row per window, one column per activity.
# NOTE(review): a dot product is a cosine similarity only for unit-norm rows --
# confirm both embedding sets are normalized.
cosine_similarity = np.dot(text_embeddings, activity_embeddings.T)

# for each activity, find the top 50 samples
dataset_folder = '../dataset/ego4d/sampled/'  # not used below; files go to ../dataset/ego4d/mini/
for i, argmax_activity in enumerate(activity_candidates):
    scores = cosine_similarity[:, i]
    idxs = np.argsort(scores)[-top_k:]
    average_similarity = np.mean(scores[idxs])
    print(average_similarity)
    # only keep the idxs with the > 0.7
    idxs = idxs[scores[idxs] > SIMILARITY_THRESHOLD]
    print(argmax_activity, len(idxs))
    dataset_folder_activity = f'../dataset/ego4d/mini/{argmax_activity}'
    os.makedirs(dataset_folder_activity, exist_ok=True)
    # Same for every window of this activity, so computed once per activity.
    imu_activity = ', '.join(activity_motion_attributes[argmax_activity])
    for j, idx in enumerate(idxs):
        data_sample = dataset.window_idx[idx]
        scenario = data_sample['scenario']
        if len(scenario) > 3: # meaningless scenario
            continue
        scenario = ', '.join([dataset.scenario_map[s].replace('/', ' or ') for s in scenario])
        text = data_sample['text']
        # The stem embeds free-form text; a '/' in any part would be read as a
        # directory separator, so it is replaced the same way as in the
        # scenario names.
        file_stem = f'{j}_{scenario}_{argmax_activity}_{text}_{imu_activity}'.replace('/', ' or ')
        audio_name = f'{dataset_folder_activity}/{file_stem}.wav'
        # Built from the stem so that only the extension differs from the audio file.
        imu_name = f'{dataset_folder_activity}/{file_stem}.npy'

        data_sample = dataset[idx]
        audio = data_sample['audio']
        imu = data_sample['imu']
        np.save(imu_name, imu)
        sf.write(audio_name, audio, 16000)