In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
import json
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
model = AutoModel.from_pretrained('mesolitica/llama2-embedding-600m-8k', 
                                  trust_remote_code = True,
                                 use_flash_attention_2 = True, torch_dtype = torch.bfloat16)

Downloading config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

Downloading modeling.py:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mesolitica/llama2-embedding-600m-8k:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


[2023-11-26 05:06:25,234] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Downloading model.safetensors:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [3]:
_ = model.cuda()

In [4]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/llama2-embedding-600m-8k')

Downloading tokenizer_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [7]:
contexts, questions = [], []

with open('ultrachat-crossref-melayu-malay.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        if l[1]['content_ms']:
            contexts.append(l[0]['content'].strip())
            questions.append(l[1]['content_ms'].strip())
            
len(contexts), len(questions)

(9959, 9959)

In [8]:
contexts_v = []
for i in tqdm(range(len(contexts))):
    input_ids = tokenizer([contexts[i]], 
        return_tensors = 'pt',
        padding = True
    )
    v = model.encode(input_ids.to('cuda')).detach().cpu().numpy()
    contexts_v.append(v)

100%|██████████| 9959/9959 [03:08<00:00, 52.82it/s]


In [9]:
questions_v = []
for i in tqdm(range(len(questions))):
    input_ids = tokenizer([questions[i]], 
        return_tensors = 'pt',
        padding = True
    )
    v = model.encode(input_ids.to('cuda')).detach().cpu().numpy()
    questions_v.append(v)

100%|██████████| 9959/9959 [00:28<00:00, 345.60it/s]


In [10]:
contexts_v_np = np.array(contexts_v)[:,0]
questions_v_np = np.array(questions_v)[:,0]

In [11]:
contexts_v_np.shape, questions_v_np.shape

((9959, 1536), (9959, 1536))

In [12]:
contexts_v_np[0]

array([ 0.02002746,  0.02448317, -0.0335461 , ...,  0.00952441,
        0.00889908, -0.03066978], dtype=float32)

In [13]:
tops = {
    1: 0,
    3: 0,
    5: 0,
    10: 0,
}

for i in tqdm(range(len(questions_v_np))):
    argsort = np.argsort(cosine_similarity(questions_v_np[i].reshape(1, -1), contexts_v_np)[0])[::-1]
    for k in tops.keys():
        if i in argsort[:k]:
            tops[k] += 1

100%|██████████| 9959/9959 [04:35<00:00, 36.10it/s]


In [14]:
for k, v in tops.items():
    print(k, v / len(questions_v_np))

1 0.09549151521237072
3 0.1834521538307059
5 0.23375840947886334
10 0.3098704689225826
