In [1]:
from arguments import DataArguments
from transformers import AutoConfig, AutoTokenizer
import torch

In [2]:
# Dataset settings for this check: where the training pairs live, how many
# passages are grouped per query, and the query/passage length limits.
# NOTE(review): train_data is an absolute, machine-specific path.
data_args = DataArguments(
    train_data="/home/ubuntu/mosaic-embedding-pairs",
    train_group_size=3,
    query_max_len=8192,
    passage_max_len=8192,
)

In [3]:
# Load the tokenizer published with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/llama-600m-hf-32768-fpf')
# Reuse the unknown token as the padding token
# (presumably the checkpoint defines no dedicated pad token -- TODO confirm).
tokenizer.pad_token = tokenizer.unk_token
# Pad at the end of each sequence; the collated batch printed further down shows
# the zero ids / zero attention-mask entries trailing each padded row.
tokenizer.padding_side = "right"

In [4]:
from data_triplet import TrainDatasetForEmbedding, EmbedCollator

In [5]:
# Instantiate the project's training dataset from the data arguments and tokenizer.
dataset = TrainDatasetForEmbedding(data_args, tokenizer)

In [6]:
# Take the first two dataset items as a tiny batch to inspect and collate.
b = [dataset[0], dataset[1]]
b

[(['Pelaburan Syarikat China di Malaysia Tingkat Hubungan Dua Hala\n\nUntuk mendapatkan maklumat terkini, ikuti kami melalui Telegram\nKuala Lumpur –\xa0 Menteri Perdagangan Antarabangsa dan Industri, Tengku Datuk Seri Utama Tengku Zafrul Aziz berkata peningkatan minat syarikat China melabur di Malaysia memberi petanda baik kepada negara dan telah meningkatkan hubungan dua hala antara Malaysia dan China serta telah disokong oleh keyakinan terhadap kerajaan Perpaduan negara.\nBeliau berkata menerusi satu kenyataan yang dikeluarkan oleh Lembaga Pembangunan Pelaburan Malaysia (MIDA), Kementerian Perdagangan Antarabangsa dan Industri\xa0 (MITI) akan terus membantu memudahkan urusan para pelabur untuk menjalankan perniagaan di negara ini bagi menunjukkan bahawa Malaysia adalah sebuah negara sentiasa menyokong industri, perdagangan. Beliau juga menzahirkan ucapan tahniah kepada MIDA dan semua agensi berkaitan dalam membantu mendapatkan pelaburan berpotensi yang bernilai RM 170 bilion. “MITI,

In [8]:
# Batch collator configured with the same query/passage length limits
# that were given to the dataset arguments.
collactor = EmbedCollator(
    tokenizer,
    query_max_len=data_args.query_max_len,
    passage_max_len=data_args.passage_max_len,
)

In [9]:
# Collate the two sampled items into one batch. The result is a dict with
# 'query', 'pos' and 'neg' entries; the output below shows 'input_ids' and
# 'attention_mask' tensors inside them.
b_ = collactor(b)
b_

{'query': {'input_ids': tensor([[    1, 15549,   370,  ...,  3480,  2518, 29889],
         [    1, 15549,   370,  ...,  3480,  2518, 29889],
         [    1,   317,   423,  ...,     0,     0,     0],
         [    1,   317,   423,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])},
 'pos': {'input_ids': tensor([[    1,  2292,   481,  ...,     0,     0,     0],
         [    1,  6225,   557,  ...,     0,     0,     0],
         [    1, 15549,   370,  ...,  3480,  2518, 29889],
         [    1, 15549,   370,  ...,  3480,  2518, 29889]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]])},
 'neg': {'input_ids': tensor([[    1,  6225,   557,  ...,     0,     0,     0],
         [    1,  6225,   557,  ...,     0,     0,     0],
         [    1,   38

In [11]:
# Row count of each triplet component (query, positive, negative); they should agree.
tuple(len(b_[part]['input_ids']) for part in ('query', 'pos', 'neg'))

(4, 4, 4)

In [12]:
from modeling import LlamaModelEmbedding
from transformers import LlamaConfig

# Start from the base checkpoint's Llama configuration, passing num_labels=1
# as an override at load time.
config = LlamaConfig.from_pretrained(
    'mesolitica/llama-600m-hf-32768-fpf',
    num_labels=1,
)

# Extra attributes attached to the config object after loading.
# NOTE(review): presumably read by LlamaModelEmbedding (loss temperature,
# embedding normalisation, pooling strategy) -- confirm in modeling.py.
config.temperature = 0.02
config.normalized = True
config.sentence_pooling_method = 'mean'

In [13]:
# Load the base checkpoint into the project's embedding model with the config
# built above. The warning printed below reports that model.dense_layer.weight
# and model.dense_layer.bias are absent from the checkpoint and are newly
# initialized, so any numbers that depend on that layer come from untrained weights.
model = LlamaModelEmbedding.from_pretrained('mesolitica/llama-600m-hf-32768-fpf', config = config)

Some weights of LlamaModelEmbedding were not initialized from the model checkpoint at mesolitica/llama-600m-hf-32768-fpf and are newly initialized: ['model.dense_layer.bias', 'model.dense_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Encode the query side of the batch; used as the anchor in the triplet loss below.
rep_anchor = model.encode(b_['query'])

In [15]:
# Encode the positive side of the batch.
rep_pos = model.encode(b_['pos'])

In [16]:
# Encode the negative side of the batch.
# NOTE(review): named n_reps, unlike the rep_anchor / rep_pos pattern used for the other two.
n_reps = model.encode(b_['neg'])

In [17]:
import torch.nn.functional as F


def EUCLIDEAN(x, y, eps=1e-6):
    """Row-wise Euclidean (L2) distance between two batches of vectors.

    Thin wrapper around ``F.pairwise_distance`` with ``p=2``. It is defined as
    a named function rather than a lambda bound to a name, so tracebacks and
    ``repr`` show ``EUCLIDEAN`` instead of ``<lambda>``.

    Args:
        x: Tensor of vectors, e.g. shape (batch, dim) as used in this notebook.
        y: Tensor with a shape compatible with ``x``.
        eps: Small value added to the difference for numerical stability.
            The default equals ``F.pairwise_distance``'s own default, so
            existing two-argument calls behave exactly as before. Pass
            ``eps=1e-7`` to match the ``TripletMarginLoss(eps=1e-7)`` used
            as the reference in this notebook.

    Returns:
        Tensor of distances, one per row pair.
    """
    return F.pairwise_distance(x, y, p=2, eps=eps)

In [18]:
# Shape of the anchor embeddings (4 x 1536 in the recorded output).
rep_anchor.shape

torch.Size([4, 1536])

In [19]:
# Shape of the positive embeddings; it matches the anchor shape in the recorded output.
rep_pos.shape

torch.Size([4, 1536])

In [20]:
# Anchor-to-positive and anchor-to-negative distances, one value per triplet row.
distance_pos, distance_neg = (
    EUCLIDEAN(rep_anchor, other) for other in (rep_pos, n_reps)
)

In [32]:
# Built-in triplet margin loss, used as the reference implementation that the
# hand-written distance-based computation in this notebook is compared against.
loss = torch.nn.TripletMarginLoss(
    margin=1.0,  # required gap between the positive and negative distances
    p=2,         # Euclidean norm
    eps=1e-7,
)

In [33]:
# Reference value from the built-in loss. The recorded result (0.9278) equals
# the mean of the manually computed per-row losses shown in this notebook.
loss(rep_anchor, rep_pos, n_reps)

tensor(0.9278, grad_fn=<MeanBackward0>)

In [26]:
# Hand-written triplet margin loss per row: max(d(a, p) - d(a, n) + margin, 0).
triplet_margin = 1.0
losses = (distance_pos - distance_neg + triplet_margin).relu()
losses

tensor([0.8989, 0.9589, 0.9267, 0.9267], grad_fn=<ReluBackward0>)

In [27]:
# Reduce the per-row losses to a single scalar (mean reduction).
# NOTE: this rebinds `loss`, the name that also holds the TripletMarginLoss
# module elsewhere in this notebook, so the value depends on execution order.
loss = torch.mean(losses)
loss

tensor(0.9278, grad_fn=<MeanBackward0>)

In [28]:
# Inspect the anchor-to-positive distances.
distance_pos

tensor([0.7172, 0.7390, 0.8828, 0.8828], grad_fn=<NormBackward1>)

In [29]:
# Inspect the anchor-to-negative distances.
distance_neg

tensor([0.8183, 0.7801, 0.9560, 0.9560], grad_fn=<NormBackward1>)

In [30]:
# Gap between positive and negative distances. In the recorded output every
# value is negative (positives are closer) but far smaller in magnitude than
# the margin of 1.0, which is why each per-row loss stays close to 1.
distance_pos - distance_neg

tensor([-0.1011, -0.0411, -0.0733, -0.0733], grad_fn=<SubBackward0>)