In [1]:
# !pip3 install torch --index-url https://download.pytorch.org/whl/cu118
# !pip3 install transformers
# !pip3 install flash-attn --no-build-isolation
# !pip3 install scikit-learn

In [2]:
from transformers import AutoModel, AutoTokenizer
import torch
import json
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the contrastive embedding checkpoint in bfloat16 with FlashAttention-2.
# `attn_implementation` replaces the deprecated `use_flash_attention_2` flag.
# NOTE(review): trust_remote_code runs modelling code shipped with the
# checkpoint repository; only keep it enabled for sources you trust.
model = AutoModel.from_pretrained(
    'mesolitica/llama2-embedding-2b-8k-contrastive',
    attn_implementation = 'flash_attention_2',
    torch_dtype = torch.bfloat16,
    trust_remote_code = True,
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [4]:
# Move the model to the GPU; binding the result to `_` keeps the cell from
# echoing the module repr.
_ = model.to('cuda')

In [5]:
# Tokenizer published under the same checkpoint name as the embedding model.
tokenizer = AutoTokenizer.from_pretrained(
    'mesolitica/llama2-embedding-2b-8k-contrastive'
)

In [6]:
# Build index-aligned (context, question) pairs from the JSONL dump.
# Each line parses to a sequence: element 0 supplies the context via 'content',
# element 1 supplies the question via 'content_ms'. Rows whose 'content_ms'
# is empty are skipped so both lists stay the same length.
contexts, questions = [], []

# Explicit UTF-8: the default text encoding is platform-dependent and would
# garble or reject non-ASCII text on some systems.
with open('ultrachat-lom-agc.jsonl', encoding = 'utf-8') as fopen:
    for line in fopen:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing in json.loads.
            continue
        row = json.loads(line)
        if row[1]['content_ms']:
            contexts.append(row[0]['content'].strip())
            questions.append(row[1]['content_ms'].strip())

len(contexts), len(questions)

(8034, 8034)

In [7]:
# Embed every context, one text per forward pass, keeping each result as a
# NumPy array on the host.
contexts_v = []
# Inference only: no_grad stops autograd from recording a graph for every
# forward pass, which saves GPU memory and time without changing the values.
with torch.no_grad():
    for context in tqdm(contexts):
        input_ids = tokenizer([context],
            return_tensors = 'pt',
            padding = True
        )
        v = model.encode(input_ids.to('cuda')).detach().cpu().numpy()
        contexts_v.append(v)

100%|██████████| 8034/8034 [03:46<00:00, 35.48it/s]


In [8]:
# Embed every question, one text per forward pass, keeping each result as a
# NumPy array on the host.
questions_v = []
# Iterate over `questions` itself rather than borrowing the length of the
# context embeddings, so this cell does not depend on that list's size.
# no_grad stops autograd from recording a graph during inference.
with torch.no_grad():
    for question in tqdm(questions):
        input_ids = tokenizer([question],
            return_tensors = 'pt',
            padding = True
        )
        v = model.encode(input_ids.to('cuda')).detach().cpu().numpy()
        questions_v.append(v)

100%|██████████| 8034/8034 [00:56<00:00, 142.29it/s]


In [9]:
# Stack the per-text embedding arrays and take index 0 along the second axis,
# yielding one array per collection (contexts first, then questions).
contexts_v_np, questions_v_np = (
    np.array(vectors)[:, 0] for vectors in (contexts_v, questions_v)
)

In [10]:
# Recall@k bookkeeping: tops[k] counts the questions whose paired context
# (same index) appears among the k most cosine-similar contexts.
tops = dict.fromkeys((1, 3, 5, 10), 0)

for i in tqdm(range(len(questions_v_np))):
    # Similarity of question i against every context, as a 1-D score vector.
    scores = cosine_similarity(questions_v_np[i].reshape(1, -1), contexts_v_np)[0]
    # Context indices ordered from most to least similar.
    ranked = np.argsort(scores)[::-1]
    for k in tops:
        if i in ranked[:k]:
            tops[k] += 1

100%|██████████| 8034/8034 [03:22<00:00, 39.73it/s]


In [11]:
# Spot check: indices of the 10 most similar contexts for the final question.
# The index is derived explicitly instead of relying on the loop variable `i`
# left behind by the evaluation cell, so this cell does not depend on that
# leftover state.
query_idx = len(questions_v_np) - 1
np.argsort(cosine_similarity(questions_v_np[query_idx].reshape(1, -1), contexts_v_np)[0])[::-1][:10]

array([ 959,  578,  952, 5255, 1079, 1912, 5249,  961, 5247,  944])

In [13]:
# Report recall@k: the fraction of questions whose paired context ranked
# within the top k.
total = len(questions_v_np)
for k, hits in tops.items():
    print(k, hits / total)

1 0.1549663928304705
3 0.29113766492407267
5 0.3725416977844162
10 0.4886731391585761
