In [None]:
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch
from torch import Tensor, nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel


In [None]:
%pip install datasets==2.21.0

In [None]:
from datasets import load_dataset

# Load the train and test splits of the "bentrevett/multi30k" dataset.
train_dataset = load_dataset("bentrevett/multi30k", split="train")
test_dataset = load_dataset("bentrevett/multi30k", split="test")

In [None]:
def filter_dataset(dataset, minlen: int, maxlen: int) -> list[str]:
    """Return the English sentences whose word count lies in [minlen, maxlen].

    Args:
        dataset: Collection supporting ``len()`` and integer indexing whose
            items are mappings with an ``"en"`` string entry.
        minlen: Minimum number of space-separated words (inclusive).
        maxlen: Maximum number of space-separated words (inclusive).

    Returns:
        The ``"en"`` strings, in dataset order, that pass the length filter.
    """
    kept = []
    for i in range(len(dataset)):
        # Index the dataset once per row instead of once per use.
        sentence = dataset[i]["en"]
        if minlen <= len(sentence.split(" ")) <= maxlen:
            kept.append(sentence)
    return kept

In [None]:
# Keep only sentences with between `minlen` and `maxlen` space-separated words (inclusive).
maxlen = 30
minlen = 5
train_filtered = filter_dataset(train_dataset, minlen, maxlen)
test_filtered = filter_dataset(test_dataset, minlen, maxlen)
# Number of sentences kept in each split.
print(len(train_filtered), len(test_filtered))

28945 997


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Seed PyTorch's global random number generator.
torch.manual_seed(42)

<torch._C.Generator at 0x7bb81fcb2870>

In [None]:
from sentence_transformers import SentenceTransformer, util
paraph_model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2').to(device)
# Model that outputs a sentence embedding; sentences with similar meaning have cosine similarity close to 1.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Two input sentences and, at the same index in `out_sentences`, a paraphrase of each.
# NOTE(review): the first input sentence starts with "toyan", which looks like a truncated
# "Stoyan" (compare the first output sentence); left as is because it feeds the recorded outputs.
in_sentences = [
    "toyan Mihaylovski’s works are a remarkable representation of early modern philosophical thought, expressed in a poetic manner.",
    "Huge valves in the driers regulate the movement of water, which is facilitated by the force of gravity."
]
out_sentences = [
    "A poetic example of early modern philosophical thought can be found in the surprising works of the renowned intellectual Stoyan Mihaylovski.",
    "The water is moved by the gravity and is controlled by huge valves in the driers."
]


# Compute the sentence embeddings
in_embeddings = paraph_model.encode(in_sentences, convert_to_tensor=True)
out_embeddings = paraph_model.encode(out_sentences, convert_to_tensor=True)
# Cosine similarity of each matching (in, out) pair.
# NOTE(review): this value is discarded; only the last expression of a cell is displayed.
(F.normalize(in_embeddings,dim=-1)*F.normalize(out_embeddings,dim=-1)).sum(-1)
# Full pairwise cosine-similarity matrix (the displayed cell output).
util.pytorch_cos_sim(in_embeddings, out_embeddings)

tensor([[0.8765, 0.0614],
        [0.0634, 0.9607]], device='cuda:0')

In [None]:
# Load the tokenizer and the bare transformer of the same checkpoint to get at its word-embedding table.
paraphrase_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")
paraphrase_model = AutoModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2").to(device)
# Weight of the word-embedding layer (one row per vocabulary entry).
paraphrase_embedding_matrix = paraphrase_model.embeddings.word_embeddings.weight
paraphrase_embedding_matrix.shape

torch.Size([30527, 768])

In [None]:
def collate_fn(
    tokenizer: AutoTokenizer, batch: list[str]
) -> tuple[Tensor, Tensor]:
    """Tokenize a batch of sentences and move the tensors to ``device``.

    Sentences are padded to the longest sentence in the batch.

    Args:
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        batch: Sentences to encode.

    Returns:
        ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoding = tokenizer.batch_encode_plus(
        batch,
        return_attention_mask=True,
        return_tensors="pt",
        padding="longest",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    return input_ids, attention_mask

In [None]:
# Token ids and attention masks of the example sentences under the paraphrase tokenizer.
input1,mask1 = collate_fn(paraphrase_tokenizer, in_sentences)
input2,mask2 = collate_fn(paraphrase_tokenizer, out_sentences)

In [None]:
# Frozen embedding layer built from the paraphrase model's word-embedding matrix.
embed =nn.Embedding.from_pretrained(paraphrase_embedding_matrix, freeze=True)

# Look up the token embeddings of both example batches.
input_embeds1 =embed(input1)
input_embeds2 =embed(input2)

input_embeds1.shape

torch.Size([2, 27, 768])

In [None]:
# paraph_model takes "input_ids" and "attention_mask" as input. To feed token embeddings instead, we patch its embedding layer with a forward hook.
# This only works for token embeddings from AutoModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2");
# for other pretrained embeddings we try to build a model that maps token embeddings to sentence embeddings like those of paraph_model.

def replace_embeddings(module, input, output, input_embeds):
    """Forward-hook helper: ignore the layer's own output and return ``input_embeds``.

    ``module``, ``input`` and ``output`` follow the forward-hook signature and
    are not used.
    """
    return input_embeds

def get_sent_emb(input_embeds, attention_mask):
    """Encode sentences with ``paraph_model`` from token embeddings instead of ids.

    A temporary forward hook on the word-embedding layer replaces its output
    with ``input_embeds``; the model is then called with dummy ``input_ids``.

    Autograd is tracked only when the caller has gradients enabled AND
    ``input_embeds`` requires grad. This lets a loss computed on the returned
    embedding backpropagate into whatever produced ``input_embeds`` (e.g.
    Gumbel-softmax samples), while plain inference calls stay graph-free.

    Args:
        input_embeds: Token embeddings used as the word-embedding layer output.
        attention_mask: Mask passed to ``paraph_model``; it also provides the
            shape of the dummy ``input_ids``.

    Returns:
        The ``"sentence_embedding"`` entry of the ``paraph_model`` output.
    """
    word_embeddings = paraph_model._first_module().auto_model.embeddings.word_embeddings
    hook = word_embeddings.register_forward_hook(
        lambda module, input, output: replace_embeddings(module, input, output, input_embeds)
    )

    # NOTE(review): when gradients are tracked, paraph_model's own parameters also
    # receive gradients unless the caller has frozen them.
    track_grad = torch.is_grad_enabled() and input_embeds.requires_grad
    try:
        with torch.set_grad_enabled(track_grad):
            emb = paraph_model({
                # Dummy ids: the hook discards the embeddings looked up for them.
                "input_ids": torch.zeros_like(attention_mask, dtype=torch.long, device=device),
                "attention_mask": attention_mask,
            })["sentence_embedding"]
    finally:
        # Always remove the hook so it does not affect later calls.
        hook.remove()

    return emb


In [None]:
# Example: sentence embeddings obtained from token embeddings via the hook.
emb1 = get_sent_emb(input_embeds1,mask1)
emb2 = get_sent_emb(input_embeds2,mask2)
util.pytorch_cos_sim(emb1,emb2)

tensor([[0.8765, 0.0614],
        [0.0634, 0.9607]], device='cuda:0')

In [None]:
# A second, unrelated pretrained model ("prajjwal1/bert-tiny") that supplies different token embeddings.
bert_tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
bert_model = BertModel.from_pretrained("prajjwal1/bert-tiny").to(device)
# Weight of its word-embedding layer.
embedding_matrix = bert_model.embeddings.word_embeddings.weight
embedding_matrix.shape

In [None]:
def collate_fn_lin(
    tokenizer: AutoTokenizer, batch: list[str]
) -> tuple[Tensor, Tensor, Tensor]:
    """Tokenize a batch and attach the ``paraph_model`` embedding of each sentence.

    Args:
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        batch: Sentences to encode.

    Returns:
        ``(input_ids, attention_mask, paraphrase_emb)``, all moved to ``device``.
        ``paraphrase_emb`` holds the sentence embeddings that ``paraph_model``
        produces for the raw sentences.
    """
    encoded_batch = tokenizer.batch_encode_plus(
        batch, padding="longest", return_tensors="pt", return_attention_mask=True)
    # Ask for a tensor directly instead of converting a numpy array afterwards.
    paraphrase_emb = paraph_model.encode(batch, convert_to_tensor=True)
    return (
        encoded_batch["input_ids"].to(device),
        encoded_batch["attention_mask"].to(device),
        paraphrase_emb.to(device),
    )

In [None]:
# Shuffled batches of 32 training sentences: BERT-tiny token ids, masks, and target paraph_model embeddings.
loader = DataLoader(train_filtered, batch_size=32, shuffle=True, collate_fn=lambda batch:collate_fn_lin(bert_tokenizer,batch))

In [None]:
# Pull one batch to inspect the tensor shapes.
tokens, mask, paraphrase_emb = next(iter(loader))
tokens.shape,mask.shape, paraphrase_emb.shape

(torch.Size([32, 25]), torch.Size([32, 25]), torch.Size([32, 768]))

In [None]:
# These two models try to map the pretrained token embeddings of another model (BertModel.from_pretrained("prajjwal1/bert-tiny")) to the sentence embeddings of paraph_model

class SentenceEmbeddingModel(nn.Module):
    """Masked mean pooling over frozen token embeddings followed by a linear projection.

    Args:
        token_dim: Input size of the projection (the token-embedding width).
        output_dim: Size of the produced sentence embedding.
        pretrained_emb_matrix: Embedding table used for the frozen lookup layer.
    """

    def __init__(self, token_dim=128, output_dim=768, pretrained_emb_matrix=embedding_matrix):
        super().__init__()
        self.embed = nn.Embedding.from_pretrained(pretrained_emb_matrix, freeze=True)
        # Linear layer projecting the pooled token embedding to the target space.
        self.projection = nn.Linear(token_dim, output_dim)

    def forward(self, tokens, flag=0):
        """Embed ``tokens`` and pool them; token id 0 is treated as padding.

        ``flag`` is unused and kept only for call compatibility.
        """
        attention_mask = tokens != 0
        return self.emb_forward(self.embed(tokens), attention_mask)

    def emb_forward(self, emb, attention_mask):
        """Mean-pool already-embedded tokens under ``attention_mask`` and project.

        Args:
            emb: Token embeddings; dimension 1 is reduced (sequence axis).
            attention_mask: Mask with one entry per token; zero/False entries
                are excluded from the mean.
        """
        mask = attention_mask.unsqueeze(-1)
        # Zero out masked tokens, then sum along the sequence.
        summed = torch.sum(emb * mask, dim=1)
        # Count of kept tokens; clamped so an all-padding row yields zeros, not NaN.
        counts = torch.sum(mask, dim=1).clamp(min=1)
        return self.projection(summed / counts)


In [None]:
class TransformerSentenceEmbeddingModel(nn.Module):
    """Transformer encoder over frozen token embeddings; the first position is projected.

    Args:
        token_dim: Width of the token embeddings (``d_model`` of the encoder).
        output_dim: Size of the produced sentence embedding.
        num_layers: Number of encoder layers.
        num_heads: Attention heads per layer.
        dim_feedforward: Hidden size of each layer's feed-forward block.
        dropout: Dropout probability inside the encoder layers.
        pretrained_emb_matrix: Embedding table for the frozen lookup layer.
            Defaults to the module-level ``embedding_matrix`` when ``None``.
    """

    def __init__(self, token_dim=128, output_dim=768, num_layers=3, num_heads=4,
                 dim_feedforward=1024, dropout=0.1, pretrained_emb_matrix=None):
        super().__init__()
        if pretrained_emb_matrix is None:
            # Previous behaviour: fall back to the notebook-level BERT-tiny table.
            pretrained_emb_matrix = embedding_matrix
        self.embed = nn.Embedding.from_pretrained(pretrained_emb_matrix, freeze=True)
        # NOTE(review): no positional encoding is added to the embeddings before the
        # encoder, so token order is not visible to it; confirm this is intended.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=token_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True
            ),
            num_layers=num_layers
        )
        self.projection = nn.Linear(token_dim, output_dim)

    def forward(self, tokens, attention_mask):
        """Return the projected encoder output at sequence position 0.

        Args:
            tokens: Token ids, indexed as (batch, position).
            attention_mask: Same layout as ``tokens``; entries equal to 0 mark
                padding positions that attention must ignore.
        """
        token_embeddings = self.embed(tokens)

        # True where attention should skip the position.
        src_key_padding_mask = attention_mask == 0

        transformer_output = self.transformer(token_embeddings, src_key_padding_mask=src_key_padding_mask)

        # Encoder state of the first token of every sequence.
        cls_embedding = transformer_output[:, 0, :]

        return self.projection(cls_embedding)

In [None]:
# Train the transformer-based model to reproduce paraph_model's sentence embeddings.
sentence_embedding_model = TransformerSentenceEmbeddingModel().to(device)

optimizer = torch.optim.Adam(sentence_embedding_model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

for epoch in range(10):
    # Running sum of batch losses and number of batches, for the epoch average.
    epoch_loss = 0
    n=0
    for tokens, mask, paraphrase_emb in loader:
      predicted_embeddings = sentence_embedding_model(tokens,mask)


      # Loss = (1 - mean cosine similarity to the target) + MSE to the target.
      loss = torch.tensor(1,device=device)-(F.normalize(predicted_embeddings,dim=-1)*F.normalize(paraphrase_emb,dim=-1)).sum(-1).mean() + loss_fn(predicted_embeddings, paraphrase_emb.to(device))

      epoch_loss+=loss.item()
      n+=1

      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    print(f"epoch:{epoch}, Loss: {epoch_loss/n:.4f}")

In [None]:
# BERT-tiny token ids and masks of the example sentences, plus their reference paraph_model embeddings.
encoded_batch1,mask1,in_embeddings=collate_fn_lin(bert_tokenizer,in_sentences)
encoded_batch2,mask2,out_embeddings=collate_fn_lin(bert_tokenizer,out_sentences)

In [None]:
# Reference pairwise similarities, computed from paraph_model's own sentence embeddings.
util.pytorch_cos_sim(in_embeddings, out_embeddings)

tensor([[0.8765, 0.0614],
        [0.0634, 0.9607]], device='cuda:0')

In [None]:
# For TransformerSentenceEmbeddingModel: similarity was learned, dissimilarity not quite.
# Evaluate with dropout disabled and without building an autograd graph.
sentence_embedding_model.eval()
with torch.no_grad():
    in_emb = sentence_embedding_model(encoded_batch1,mask1)
    out_emb = sentence_embedding_model(encoded_batch2,mask2)
util.pytorch_cos_sim(in_emb, out_emb)

tensor([[0.8482, 0.1844],
        [0.2960, 0.9401]], device='cuda:0', grad_fn=<MmBackward0>)

In [None]:
# For SentenceEmbeddingModel: similarity was learned, dissimilarity not quite.
# NOTE(review): this cell reuses in_emb/out_emb from the previous cell, and no visible cell
# trains a SentenceEmbeddingModel, so the recorded output is not reproducible top to bottom.
util.pytorch_cos_sim(in_emb, out_emb)

tensor([[0.7314, 0.2295],
        [0.3689, 0.8723]], device='cuda:0', grad_fn=<MmBackward0>)

In [None]:
# Loaders for the autoencoder: batches of 32 sentences tokenized with the paraphrase tokenizer.
# NOTE(review): the test loader is shuffled as well; confirm that is intended for evaluation.
train_loader = DataLoader(train_filtered, batch_size=32, shuffle=True, collate_fn=lambda batch:collate_fn(paraphrase_tokenizer,batch))
test_loader = DataLoader(test_filtered, batch_size=32, shuffle=True, collate_fn=lambda batch:collate_fn(paraphrase_tokenizer,batch))

# One training batch, reused by the example cells below.
tokens,mask = next(iter(train_loader))

In [None]:
class RNNCell(nn.Module):
    """
    One recurrent step: (x_{t}, h_{t-1}) -> h_{t}

    The input and the previous hidden state are concatenated, passed through
    a single linear layer and squashed with tanh.
    """
    def __init__(self, input_dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.linear = nn.Linear(input_dim + hidden_dim, hidden_dim)
        # Re-initialise the weight with Kaiming-normal scaling for tanh.
        nn.init.kaiming_normal_(self.linear.weight, nonlinearity="tanh")

    def forward(self, x: Tensor, h: Tensor) -> Tensor:
        # x: B x input_dim, h: B x hidden_dim -> B x hidden_dim
        joined = torch.cat((x, h), dim=1)
        return torch.tanh(self.linear(joined))

In [None]:
# Shape of the paraphrase model's word-embedding matrix.
paraphrase_embedding_matrix.shape

torch.Size([30527, 768])

In [None]:
class RNN_encoder(nn.Module):
    """Recurrent encoder that folds a token sequence into its final hidden state.

    Args:
        vocab_size: Unused; kept for interface compatibility.
        pretrained_emb_matrix: Initial embedding table (rows = tokens). It is
            copied, so fine-tuning this encoder does not modify the caller's tensor.
        hidden_dim: Size of the hidden state.
        cell: Class of the recurrent cell, constructed as ``cell(input_dim, hidden_dim)``.
    """

    def __init__(self, vocab_size: int, pretrained_emb_matrix, hidden_dim: int = 1024, cell: type[nn.Module] = RNNCell) -> None:
        super().__init__()
        # Learnable initial hidden state, shared across the batch.
        self.init_h = nn.Parameter(data=torch.randn(1, hidden_dim))
        # from_pretrained wraps the given tensor without copying it; with freeze=False
        # the optimizer would then update the caller's matrix in place. Clone first.
        self.embed = nn.Embedding.from_pretrained(pretrained_emb_matrix.detach().clone(), freeze=False)
        self.rnn = cell(pretrained_emb_matrix.size(1), hidden_dim)


    def forward(self, x: Tensor, gen=0) -> Tensor:
        """Run the cell over every time step and return the last hidden state.

        Args:
            x: Token ids (B x T) when ``gen == 0``; otherwise already-embedded
                inputs (B x T x input_dim), used as they are.
            gen: Set to a non-zero value to skip the embedding lookup.

        Returns:
            Final hidden state, B x hidden_dim.
        """
        B, T = (x.size(0), x.size(1))

        if gen == 0:
            x = self.embed(x)  # B x T x input_dim

        h = self.init_h.expand((B, -1))  # B x hidden_dim

        for t in range(T):
            h = self.rnn(x[:, t, :], h)  # B x hidden_dim

        return h

In [None]:
class RNN_decoder(nn.Module):
    """Recurrent decoder that unrolls a sentence embedding into per-step vocabulary logits.

    The same sentence embedding is fed to the cell at every time step.
    """

    def __init__(self, vocab_size: int, hidden_dim: int = 1024, cell: nn.Module = RNNCell) -> None:
        super().__init__()
        # Learnable initial hidden state, shared across the batch.
        self.init_h = nn.Parameter(data=torch.randn(1, hidden_dim))
        self.rnn = cell(hidden_dim, hidden_dim)
        self.lm_head = nn.Linear(hidden_dim, vocab_size)


    def forward(self, x: Tensor, T: int) -> Tensor:
        """Produce ``T`` steps of logits from sentence embeddings ``x`` (B x S).

        Returns:
            Logits of shape B x T x vocab_size.
        """
        batch_size, _ = x.shape
        state = self.init_h.expand((batch_size, -1))  # B x hidden_dim

        step_logits = []
        for _ in range(T):
            # Every step sees the same sentence embedding as input.
            state = self.rnn(x, state)  # B x hidden_dim
            step_logits.append(self.lm_head(state))  # B x vocab_size

        # Stack along a new time axis: B x T x vocab_size
        return torch.stack(step_logits, dim=1)

In [None]:
class SentenceAutoEncoder(nn.Module):
    """Encode a token sequence into one vector and decode it back to token logits.

    Pipeline: RNN_encoder -> linear layer -> dropout -> RNN_decoder.
    """

    def __init__(self, vocab_size, pretrained_emb_matrix, hidden_dim:int=1024)-> None:
        super().__init__()
        # Sub-modules are created in this fixed order so seeded initialisation is unchanged.
        self.encoder = RNN_encoder(vocab_size, pretrained_emb_matrix, hidden_dim)
        self.l = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(0.1)
        self.decoder = RNN_decoder(vocab_size, hidden_dim)

    def forward(self, x: Tensor)-> Tensor:
        """Return reconstruction logits with one step per input position.

        Args:
            x: Token ids, B x T.
        """
        _, seq_len = x.shape
        bottleneck = self.dropout(self.l(self.encoder(x)))
        return self.decoder(bottleneck, seq_len)



In [None]:
# Smoke test: run an untrained autoencoder on one batch and compare the shapes.
autoencoder = SentenceAutoEncoder(vocab_size=len(paraphrase_tokenizer), pretrained_emb_matrix=paraphrase_embedding_matrix).to(device)
logits = autoencoder.forward(tokens)
logits.shape, tokens.shape

(torch.Size([32, 26, 30527]), torch.Size([32, 26]))

In [None]:
# Per-position probability distribution over the vocabulary.
probs = F.softmax(logits, dim=-1)

In [None]:
# Sampling from the logits with the Gumbel-softmax trick: X ~ [p1, p2, ..., pk] is obtained as argmax_i(g_i + log p_i), with g_i ~ Gumbel

def generate(probs, n_samples=10, pretrained_embedding_matrix=paraphrase_embedding_matrix, temperature=0.5):
    """Draw relaxed (Gumbel-softmax) token samples and map them to embeddings.

    For every sample, Gumbel noise is added to ``log(probs)``, a softmax with
    the given temperature turns the result into soft one-hot weights, and the
    weights are multiplied with the embedding matrix.

    Args:
        probs: Probabilities over the vocabulary on the last axis.
        n_samples: Number of independent samples to draw.
        pretrained_embedding_matrix: Embedding table (vocab x dim) the soft
            one-hot weights are multiplied with.
        temperature: Softmax temperature of the relaxation.

    Returns:
        Tensor with a new leading axis of size ``n_samples``; the vocabulary
        axis of ``probs`` is replaced by the embedding dimension.
    """
    # Clamp before the log: a probability that underflowed to 0 would give -inf
    # here and NaN gradients in the backward pass.
    log_probs = torch.log(probs.clamp_min(1e-20))

    data = []
    for _ in range(n_samples):
        # Gumbel noise from U(0, 1) by inverse-CDF sampling: if X is exponential
        # with mean 1, then -log(X) follows the standard Gumbel distribution.
        uniform = torch.rand(probs.shape, device=probs.device)
        gumbel = -torch.log(-torch.log(uniform + 1e-20) + 1e-20)
        alpha = F.softmax((gumbel + log_probs) / temperature, dim=-1)
        data.append(torch.matmul(alpha, pretrained_embedding_matrix))

    return torch.stack(data, 0)

In [None]:
# Draw 5 relaxed samples for the example batch and check the resulting shape.
gen_samples = generate(probs, 5)
gen_samples.shape

torch.Size([5, 32, 26, 768])

In [None]:
get_sent_emb(gen_samples[0],mask) # Example: paraph_model sentence embeddings for the generated token embeddings

tensor([[-0.0372,  0.0619, -0.0489,  ...,  0.0058,  0.1252, -0.1128],
        [-0.1124, -0.0154, -0.0595,  ...,  0.1633, -0.0281, -0.0782],
        [-0.2400,  0.0436, -0.0118,  ...,  0.1230, -0.0065,  0.0726],
        ...,
        [-0.1957,  0.3017, -0.0052,  ...,  0.1163, -0.1161, -0.0472],
        [ 0.1102, -0.1743, -0.0635,  ...,  0.1053, -0.0612, -0.1274],
        [ 0.0087, -0.0664, -0.0526,  ...,  0.0788, -0.0624, -0.1118]],
       device='cuda:0')

In [None]:
def learned_parapharase_loss(n_samples, logits, paraphrase_emb, attention_mask):
    """Cosine-distance loss between generated sentences and target embeddings.

    ``n_samples`` relaxed token-embedding samples are generated from
    ``logits``, each is encoded with ``get_sent_emb``, and the result is
    compared with ``paraphrase_emb``.

    Returns:
        ``1 - mean cosine similarity`` over all samples and sentences; it
        approaches 0 as the generated embeddings align with the targets.
    """
    token_probs = F.softmax(logits, dim=-1)
    sentence_embs = torch.stack(
        [get_sent_emb(sample, attention_mask) for sample in generate(token_probs, n_samples)],
        0,
    )
    # Dot product of unit vectors = cosine similarity; we want it close to 1.
    cosine = (F.normalize(sentence_embs, dim=-1) * F.normalize(paraphrase_emb, dim=-1)).sum(-1)
    return torch.tensor(1.0, device=device) - cosine.mean()

In [None]:
paraph_emb = paraph_model({"input_ids": tokens,"attention_mask": mask})["sentence_embedding"] # sentence embeddings for the dataset tokens

In [None]:
# Example loss for the untrained autoencoder (the target is `paraph_emb` from the cell above).
learned_parapharase_loss(5,logits,paraph_emb,mask)

tensor(0.9244, device='cuda:0')

In [None]:
def train_epoch(dataloader: DataLoader,model: nn.Module,optimizer: torch.optim.Optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    The loss is the token-level cross-entropy of the reconstruction plus the
    paraphrase loss against ``paraph_model``'s embedding of the input.

    Side effects:
        Appends the epoch loss and accuracy to the module-level lists
        ``train_loss`` and ``train_acc``.

    Returns:
        ``(loss, acc)``: the token-weighted mean loss and the fraction of
        positions whose argmax prediction equals the input token.
    """
    model.train()
    loss_total = 0
    n_total = 0
    n_correct = 0

    for tokens, mask in dataloader:
        tokens = tokens.to(device)
        logits = model(tokens)

        # NOTE(review): padding positions are included in both the
        # cross-entropy and the accuracy; confirm this is intended.
        entropy_loss = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            tokens.reshape(-1),
        )

        # The target embedding is a constant: computing it without autograd avoids
        # a needless backward pass through the whole of paraph_model.
        with torch.no_grad():
            target_emb = paraph_model({"input_ids":tokens,"attention_mask":mask})['sentence_embedding']

        loss = entropy_loss + learned_parapharase_loss(10, logits, target_emb, mask)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Weight each batch by its number of token positions.
        b_size = tokens.size(0)*tokens.size(1)
        n_total += b_size
        loss_total +=  b_size*loss.item()
        n_correct += (tokens == logits.argmax(dim=-1)).sum().item()

    loss=loss_total / n_total
    acc = n_correct / n_total

    train_loss.append(loss)
    train_acc.append(acc)
    return loss, acc

In [None]:
@torch.no_grad()
def test_epoch(dataloader: DataLoader,model: nn.Module):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Uses the same loss as training: reconstruction cross-entropy plus the
    paraphrase loss against ``paraph_model``'s embedding of the input.

    Side effects:
        Appends the loss and accuracy to the module-level lists ``val_loss``
        and ``val_acc``.

    Returns:
        ``(loss, acc)``: the token-weighted mean loss and the fraction of
        positions whose argmax prediction equals the input token.
    """
    model.eval()
    loss_total = 0
    n_total = 0
    n_correct = 0

    for tokens, mask in dataloader:
        tokens = tokens.to(device)
        logits = model(tokens)

        # NOTE(review): padding positions are included in both the
        # cross-entropy and the accuracy; confirm this is intended.
        entropy_loss = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            tokens.reshape(-1),
        )

        target_emb = paraph_model({"input_ids":tokens,"attention_mask":mask})['sentence_embedding']
        loss = entropy_loss + learned_parapharase_loss(10, logits, target_emb, mask)

        # Weight each batch by its number of token positions.
        b_size = tokens.size(0)*tokens.size(1)
        n_total += b_size
        loss_total +=  b_size*loss.item()
        n_correct += (tokens == logits.argmax(dim=-1)).sum().item()

    loss=loss_total / n_total
    acc = n_correct / n_total

    val_loss.append(loss)
    val_acc.append(acc)
    return loss, acc

In [None]:
# Training is fairly slow because the paraph_model embedding layer is patched on every call to feed it the generated token embeddings (but accuracy is already 0.58 in epoch 0)
torch.manual_seed(42)
model1 = SentenceAutoEncoder(vocab_size=len(paraphrase_tokenizer), hidden_dim = 1024, pretrained_emb_matrix=paraphrase_embedding_matrix).to(device)
optimizer = torch.optim.Adam(model1.parameters(), lr=0.0001, weight_decay=1e-5)
EPOCHS = 5
# Metric histories; train_epoch / test_epoch append one value per epoch.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

for epoch in range(EPOCHS):
    print(f"EPOCH {epoch}")
    train_epoch(train_loader,model1,optimizer)
    test_epoch(test_loader,model1)
    print(f"Train_loss={train_loss[-1]}, Val_loss={val_loss[-1]}")
    print(f"Train_accuracy={train_acc[-1]}, Val_accuracy={val_acc[-1]}")

EPOCH 0
Train_loss=3.9825067124612827, Val_loss=3.5035101814313196
Train_accuracy=0.5269569252265309, Val_accuracy=0.5848975848364415
EPOCH 1
Train_loss=3.5836005684249033, Val_loss=3.4577849089587565
Train_accuracy=0.5630687829303984, Val_accuracy=0.589560874437412
EPOCH 2
Train_loss=3.4741484010859254, Val_loss=3.4179283087107444
Train_accuracy=0.5817850585227651, Val_accuracy=0.592112368872111
EPOCH 3


In [None]:
# For other token embeddings, use TransformerSentenceEmbeddingModel to obtain paraph_model-style sentence embeddings (or train your own paraph_model-like model on your own tokens)