In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [1]:
"""
Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py
"""
import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr

from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
import decord
decord.bridge.set_bridge('torch')

from tqdm import tqdm

#%%
# imports modules for registration
from video_llama.datasets.builders import *
from video_llama.models import *
from video_llama.processors import *
from video_llama.runners import *
from video_llama.tasks import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_args():
    parser = argparse.ArgumentParser(description="Demo")
    parser.add_argument("--cfg-path", default='eval_configs/video_llama_eval_withaudio.yaml', help="path to configuration file.")
    parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
    parser.add_argument("--model_type", type=str, default='vicuna', help="The type of LLM")
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
    )
    # args = parser.parse_args()
    args, remaining_args = parser.parse_known_args()
    return args, remaining_args

In [3]:
# Model Initialization
print('Initializing Chat')
args, remaining_args = parse_args()
cfg = Config(args)

model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id

model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
model.eval();

Initializing Chat
Loading VIT
Loading VIT Done
Loading Q-Former


You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Using pad_token, but it is not set yet.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.02s/it]


Initializing audio encoder from /home/ahmadi/video-ir/video-LLAMA/Image-bind ...
audio encoder initialized.
Load first Checkpoint: /home/ahmadi/video-ir/video-LLAMA/Video-LLaMA/finetune-vicuna7b-v2.pth
Load second Checkpoint: /home/ahmadi/video-ir/video-LLAMA/Video-LLaMA/finetune_vicuna7b_audiobranch.pth


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.69 GiB total capacity; 18.80 GiB already allocated; 20.69 MiB free; 19.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
from typing import List
import torch
import torch.nn.functional as F

def compute_llama_sentence_embeddings(llama, tokenizer, texts: str | List[str], avg=True, normalize=True):
    inps = tokenizer(texts, return_tensors="pt", padding=True).to(llama.device)

    with torch.no_grad():
        embs = llama(**inps)["last_hidden_state"]

    
    return embs


In [6]:
import pandas as pd 
df_dir = "/home/ahmadi/video-ir/dataset/1KA/MSRVTT_JSFUSION_test.csv"
save_dir_path = "/home/ahmadi/video-ir/dataset/1KA/llama_txt_embedding_sum"
First_element = False
Max_pooling = False
Average_pooling = False
Sum = True

df = pd.read_csv(df_dir)
    
vid_names = df["video_id"]
captions = df['sentence']

In [7]:
print(captions[10])
tokenizer = model.llama_tokenizer
embeddings = compute_llama_sentence_embeddings(model.llama_model.model, tokenizer, captions[10])
embeddings = embeddings.float()
embeddings = embeddings.cpu().detach().numpy()
embeddings = embeddings[0][0]
print(embeddings)

a man is singing and standing in the road
[ 0.6640625   0.67578125 -0.54296875 ...  0.12011719  0.421875
  0.71484375]


In [8]:
import numpy as np 
k = np.load("/home/ahmadi/video-ir/dataset/1KA/llama_txt_embedding_firstElement/"+ vid_names[10] + ".npy")
print(k)

[ 0.6640625   0.67578125 -0.54296875 ...  0.12011719  0.421875
  0.71484375]


In [11]:
from pprint import pprint

In [None]:
print()

In [12]:
for i in tqdm(range(len(captions))):
        
    embeddings = compute_llama_sentence_embeddings(model.llama_model.model, tokenizer, captions[i])
    embeddings = embeddings.float()
    embeddings = embeddings.cpu().detach().numpy()
    
    print(captions[i])
    print(embeddings.shape)
    
    if First_element:
        embeddings = embeddings[0][0]
            
    elif Max_pooling:
        embeddings = np.max(embeddings[0], axis=0)
            
    elif Average_pooling:
        embeddings = np.min(embeddings[0], axis=0)
            
    elif Sum:
        embeddings = np.sum(embeddings[0], axis=0)
    
    print(embeddings.shape)
        
    break
            
            
    # np.save(save_dir_path + "/" + vid_names[i] + ".npy", embeddings)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

a person is connecting something to system
(1, 8, 4096)
(4096,)





In [3]:
import os
import numpy as np

def compare_npy_files(dir1, dir2):
    # Get the list of files in both directories
    files1 = [f for f in os.listdir(dir1) if f.endswith('.npy')]
    files2 = [f for f in os.listdir(dir2) if f.endswith('.npy')]

    # Find the common set of file names
    common_files = set(files1) & set(files2)

    # Compare the content of files with the same names
    for filename in common_files:
        path1 = os.path.join(dir1, filename)
        path2 = os.path.join(dir2, filename)

        # Load the data from the .npy files
        data1 = np.load(path1, allow_pickle=True)
        data2 = np.load(path2, allow_pickle=True)

        # Compare the data
        if np.array_equal(data1, data2):
            print()
        else:
            print(data1.shape)
            print(data2.shape)
            print(f"Files {filename} are different.")

# Example usage
dir1 = '/home/ahmadi/video-ir/dataset/1KA/check_txt_sum'
dir2 = '/home/ahmadi/video-ir/dataset/1KA/llama_txt_embedding_sum'
compare_npy_files(dir1, dir2)



(4096,)
(4096,)
Files video7158.npy are different.

(4096,)
(4096,)
Files video9524.npy are different.
(4096,)
(4096,)
Files video9010.npy are different.

(4096,)
(4096,)
Files video8300.npy are different.


























(4096,)
(4096,)
Files video9039.npy are different.


(4096,)
(4096,)
Files video8022.npy are different.


(4096,)
(4096,)
Files video7613.npy are different.






(4096,)
(4096,)
Files video7357.npy are different.








(4096,)
(4096,)
Files video7200.npy are different.







(4096,)
(4096,)
Files video9351.npy are different.



(4096,)
(4096,)
Files video8489.npy are different.






(4096,)
(4096,)
Files video8865.npy are different.















(4096,)
(4096,)
Files video7501.npy are different.




(4096,)
(4096,)
Files video7159.npy are different.


(4096,)
(4096,)
Files video8783.npy are different.








(4096,)
(4096,)
Files video7549.npy are different.


(4096,)
(4096,)
Files video7845.npy are different.

(4096,)
(4096,)
Files video9409.npy ar