In [1]:
import os
import os.path
# Restrict this process to the GPU with index 1. This runs before torch is
# imported in the next cell, so the variable is already set when CUDA starts up.
# NOTE(review): with a single visible device, 'cuda:0' inside this notebook
# presumably refers to physical GPU 1 — confirm against the machine's layout.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
"""
Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py
"""
import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr

from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
import decord
# Select decord's 'torch' bridge (presumably so decoded frames are returned as
# torch tensors — confirm against the decord documentation).
decord.bridge.set_bridge('torch')

from tqdm import tqdm

#%%
# imports modules for registration
from video_llama.datasets.builders import *
from video_llama.models import *
from video_llama.processors import *
from video_llama.runners import *
from video_llama.tasks import *
import pandas as pd



In [3]:
def parse_args():
    """Build the demo CLI parser and parse the current ``sys.argv``.

    ``parse_known_args`` is used instead of ``parse_args`` so that arguments
    this parser does not define are collected rather than causing an exit.

    Returns:
        tuple: ``(known, leftover)`` where ``known`` is the populated
        ``argparse.Namespace`` (``cfg_path``, ``gpu_id``, ``model_type``,
        ``options``) and ``leftover`` is the list of unrecognised argument
        strings.
    """
    cli = argparse.ArgumentParser(description="Demo")

    cli.add_argument(
        "--cfg-path",
        default='eval_configs/video_llama_eval_withaudio.yaml',
        help="path to configuration file.",
    )
    cli.add_argument(
        "--gpu-id",
        type=int,
        default=0,
        help="specify the gpu to load the model.",
    )
    cli.add_argument(
        "--model_type",
        type=str,
        default='vicuna',
        help="The type of LLM",
    )

    # Help text kept verbatim from the upstream demo script.
    options_help = (
        "override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead."
    )
    cli.add_argument("--options", nargs="+", help=options_help)

    known, leftover = cli.parse_known_args()
    return known, leftover

In [4]:
from typing import List
import torch
import torch.nn.functional as F

def compute_llama_sentence_embeddings(llama, tokenizer, texts: str | List[str], avg=True, normalize=True):
    inps = tokenizer(texts, return_tensors="pt", padding=True).to(llama.device)

    with torch.no_grad():
        embs = llama(**inps)["last_hidden_state"]

    
    return embs


In [5]:
def _pool_token_embeddings(embeddings, first_element, max_pooling, average_pooling, sum_pooling):
    """Reduce ``embeddings[0]`` along its first axis using the first enabled strategy.

    Flags are checked in the order first_element > max_pooling >
    average_pooling > sum_pooling. If none is set, ``embeddings`` is returned
    unchanged.
    """
    # assumes embeddings is (1, seq_len, hidden), i.e. one caption per call — TODO confirm
    tokens = embeddings[0]
    if first_element:
        return tokens[0]
    if max_pooling:
        return np.max(tokens, axis=0)
    if average_pooling:
        return np.mean(tokens, axis=0)
    if sum_pooling:
        return np.sum(tokens, axis=0)
    return embeddings


def _save_in_first_free_slot(base_path, array, max_slots):
    """Save ``array`` to the first non-existing ``<base_path>-<j>.npy`` with ``j < max_slots``; return whether it was saved."""
    for slot in range(max_slots):
        candidate = f"{base_path}-{slot}.npy"
        if not os.path.isfile(candidate):
            np.save(candidate, array)
            return True
    return False


def main(
    df_dir: str = "/home/ahmadi/video-ir/dataset/filtered_captions.csv",
    save_dir_path: str = "/home/ahmadi/video-ir/dataset/llama_data/after_pooling/trainVal_embeddings/text/20cap/llama_txt_embedding_AveragePooling",
    First_element: bool = False,
    Max_pooling: bool = False,
    Average_pooling: bool = True,
    Sum: bool = False,
    max_captions_per_video: int = 20,
    ):
    """Embed every caption in a CSV with the LLaMA backbone and save one ``.npy`` per caption.

    The model is built from the config selected by ``parse_args()``. Each row
    of the CSV is embedded separately, pooled over tokens, and written to
    ``<save_dir_path>/<video_id>-<j>.npy``, where ``j`` is the first index in
    ``range(max_captions_per_video)`` whose file does not exist yet.

    Args:
        df_dir: Path to a CSV with ``video_id`` and ``sentence`` columns.
        save_dir_path: Output directory; created if it does not exist.
        First_element: Keep only the first token's hidden state.
        Max_pooling: Element-wise maximum over tokens.
        Average_pooling: Element-wise mean over tokens.
        Sum: Element-wise sum over tokens.
        max_captions_per_video: Number of ``-<j>`` slots available per video.

    The pooling flags are checked in the order listed above and the first one
    that is true wins. If none is true, the un-pooled array is saved.

    Note:
        Files are only ever added, never overwritten, so re-running against a
        populated directory fills further free slots.
    """
    import warnings

    # Model Initialization
    print('Initializing Chat')
    args, _ = parse_args()
    cfg = Config(args)

    model_config = cfg.model_cfg
    model_config.device_8bit = args.gpu_id

    model_cls = registry.get_model_class(model_config.arch)
    model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
    model.eval()

    df = pd.read_csv(df_dir)

    # np.save does not create parent directories.
    os.makedirs(save_dir_path, exist_ok=True)

    tokenizer = model.llama_tokenizer
    llama = model.llama_model.model

    # Iterate by position so a non-default DataFrame index cannot break lookups.
    rows = zip(df["video_id"], df["sentence"])
    for video_id, caption in tqdm(rows, total=len(df)):
        hidden = compute_llama_sentence_embeddings(llama, tokenizer, caption)
        hidden = hidden.float().cpu().detach().numpy()

        pooled = _pool_token_embeddings(
            hidden, First_element, Max_pooling, Average_pooling, Sum
        )

        base_path = os.path.join(save_dir_path, str(video_id))
        if not _save_in_first_free_slot(base_path, pooled, max_captions_per_video):
            # Every slot is taken: report it instead of silently dropping the caption.
            warnings.warn(
                "All {} slots for {} already exist; embedding not saved.".format(
                    max_captions_per_video, video_id
                )
            )

In [6]:
# Run the full extraction with main()'s defaults: the default CSV path, the
# default output directory, and Average_pooling=True.
main()

Initializing Chat


Loading VIT
Loading VIT Done
Loading Q-Former


You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Using pad_token, but it is not set yet.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Initializing audio encoder from /home/ahmadi/video-ir/video-LLAMA/Image-bind ...
audio encoder initialized.
Load first Checkpoint: /home/ahmadi/video-ir/video-LLAMA/Video-LLaMA/finetune-vicuna7b-v2.pth
Load second Checkpoint: /home/ahmadi/video-ir/video-LLAMA/Video-LLaMA/finetune_vicuna7b_audiobranch.pth


100%|██████████| 180000/180000 [1:24:33<00:00, 35.48it/s]


In [7]:
# First_element pooling is OK.
# Sum pooling is OK.

In [7]:
# Load the captions CSV (the same path as main()'s default `df_dir`) and
# display it for inspection.
p = '/home/ahmadi/video-ir/dataset/filtered_captions.csv'
df_2 = pd.read_csv(p)
df_2


Unnamed: 0,video_id,sentence
0,video2960,a cartoon animals runs through an ice cave in ...
1,video2960,a cartoon character runs around inside of a vi...
2,video2960,a character is running in the snow
3,video2960,a person plays a video game centered around ic...
4,video2960,a person plays online and records themselves
...,...,...
179995,video8600,shows a man in a red sweeter and white shirt ...
179996,video8600,a man explains how to save money using careful...
179997,video8600,a person with maroon tshirt speaks in the news...
179998,video8600,the man in a purple sweater is giving a news s...


In [7]:
import numpy as np 
# Array stored under the before_pooling directory for video2960.
made_now = np.load('/home/ahmadi/video-ir/dataset/llama_data/before_pooling/trainVal_embeddings/text/video2960.npy')

# Array stored under after_pooling/.../llama_txt_embedding_AveragePooling for the same video.
# NOTE(review): neither file name carries the "-<j>" suffix that main() appends,
# so these files were presumably written by a different version of the code — confirm.
made_bef = np.load('/home/ahmadi/video-ir/dataset/llama_data/after_pooling/trainVal_embeddings/text/llama_txt_embedding_AveragePooling/video2960.npy')

In [9]:
# Check that the two loaded arrays have the same shape and identical elements;
# equal_nan=True makes NaNs in matching positions compare as equal.
np.array_equal(made_now, made_bef, equal_nan=True)

True