In [41]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tokenizers import Tokenizer, models, trainers, processors
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


def train_tokenizer(texts):
    tokenizer = Tokenizer(models.BPE())
    trainer = trainers.BpeTrainer(vocab_size=50000, special_tokens=["<pad>", "<sos>", "<eos>"])
    tokenizer.train_from_iterator(texts, trainer)
    tokenizer.post_processor = processors.TemplateProcessing(
        single="<sos> $A <eos>",
        pair="<sos> $A <eos> $B:1 <eos>",
        special_tokens=[("<sos>", 1), ("<eos>", 2)]
    )
    return tokenizer


class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.texts = dataframe['text'].tolist()
        self.summaries = dataframe['summary'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = torch.tensor(self.tokenizer.encode(self.texts[idx]).ids)
        summary = torch.tensor(self.tokenizer.encode(self.summaries[idx]).ids)
        return text, summary


def collate_fn(batch):
    texts, summaries = zip(*batch)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=0)
    summaries_padded = pad_sequence(summaries, batch_first=True, padding_value=0)
    return texts_padded, summaries_padded


class Seq2Seq(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(Seq2Seq, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, trg):
        embedded_src = self.embedding(src)
        encoder_outputs, (hidden, cell) = self.encoder(embedded_src)

        embedded_trg = self.embedding(trg)
        decoder_outputs, _ = self.decoder(embedded_trg, (hidden, cell))

        output = self.fc(decoder_outputs)
        return output


def train_model(model, dataloader, tokenizer, num_epochs=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for src, trg in tqdm(dataloader):
            src = src.to(device)
            trg = trg.to(device)

            optimizer.zero_grad()

            output = model(src, trg[:, :-1])
            loss = criterion(output.view(-1, output.shape[-1]), trg[:, 1:].reshape(-1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}')

# Функция для генерации резюме
def generate_summary(model, tokenizer, text, max_len=100):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    with torch.no_grad():
        input_tensor = torch.tensor([tokenizer.encode(text).ids]).to(device)

        embedded = model.embedding(input_tensor)
        encoder_outputs, (hidden, cell) = model.encoder(embedded)

        trg_indexes = [tokenizer.token_to_id('<sos>')]

        for _ in range(max_len):
            trg_tensor = torch.tensor([trg_indexes]).to(device)
            embedded_trg = model.embedding(trg_tensor)

            output, (hidden, cell) = model.decoder(embedded_trg, (hidden, cell))
            prediction = model.fc(output[:, -1, :]).argmax(1).item()

            if prediction == tokenizer.token_to_id('<eos>'):
                break

            trg_indexes.append(prediction)

        return tokenizer.decode(trg_indexes[1:])

In [42]:

df = pd.read_csv("/content/train_data20000.csv")

texts = df['text'].tolist()
summaries = df['summary'].tolist()

tokenizer = train_tokenizer(texts)

In [84]:
dataset = TextDataset(df, tokenizer)

batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

vocab_size = len(tokenizer.get_vocab())
embedding_dim = 128
hidden_size = 128
model = Seq2Seq(vocab_size, embedding_dim, hidden_size)

In [85]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [86]:
model = model.to(device)

In [87]:
train_model(model, dataloader, tokenizer, num_epochs=10)

100%|██████████| 625/625 [02:07<00:00,  4.91it/s]


Epoch [1/10], Loss: 9.5212


100%|██████████| 625/625 [02:07<00:00,  4.92it/s]


Epoch [2/10], Loss: 9.0271


100%|██████████| 625/625 [02:05<00:00,  4.98it/s]


Epoch [3/10], Loss: 8.4633


100%|██████████| 625/625 [02:08<00:00,  4.87it/s]


Epoch [4/10], Loss: 7.9181


100%|██████████| 625/625 [02:06<00:00,  4.94it/s]


Epoch [5/10], Loss: 7.4330


100%|██████████| 625/625 [02:08<00:00,  4.88it/s]


Epoch [6/10], Loss: 7.0104


100%|██████████| 625/625 [02:08<00:00,  4.87it/s]


Epoch [7/10], Loss: 6.6447


100%|██████████| 625/625 [02:08<00:00,  4.86it/s]


Epoch [8/10], Loss: 6.3237


100%|██████████| 625/625 [02:07<00:00,  4.89it/s]


Epoch [9/10], Loss: 6.0413


100%|██████████| 625/625 [02:07<00:00,  4.91it/s]

Epoch [10/10], Loss: 5.7895





In [104]:
reference_summary = "По информации ведомства, лодки прибыли ранним утром в среду. Это первый случай в ходе нынешнего кризиса с мигрантами в Европе, когда группы беженцев прибывают на территорию, подконтрольную Британии. База Акротири, расположенная на южном берегу острова, используется для боевых вылетов британской авиации, наносящей удары по позициям боевиков экстремистского движения ""Исламское государство"" в Ираке. Ранее мигранты высаживались в основном на островах Греции и Италии, обходя Кипр стороной. С начала года в европейские страны морским путем прибыли около 600 тысяч мигрантов. Основная часть беженцев прибывает из Афганистана, Сирии и Ирака. Многие идут пешком из Турции через Грецию, Македонию и Сербию."
generated_summary = generate_summary(model, tokenizer, reference_summary)
print(f"Generated summary: {generated_summary}")

Generated summary: В  Москве  задержан  главный  санитар ный  врач  РФ  Анна  Попова .


In [105]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')


def calculate_bleu(reference, candidate):
    reference_tokens = [nltk.word_tokenize(reference.lower())]
    candidate_tokens = nltk.word_tokenize(candidate.lower())
    return sentence_bleu(reference_tokens, candidate_tokens)


def calculate_meteor(reference, candidate):
    reference_tokens = nltk.word_tokenize(reference.lower())
    candidate_tokens = nltk.word_tokenize(candidate.lower())
    return meteor_score([reference_tokens], candidate_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [106]:
!pip install rouge_score



In [107]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def calculate_rouge(reference, candidate):
    # Вычисление ROUGE
    scores = scorer.score(reference, candidate)
    return scores

In [108]:
bleu_score = calculate_bleu(reference_summary, generated_summary)
print(f"BLEU score: {bleu_score:.4f}")

BLEU score: 0.0000


In [109]:
meteor_score_value = calculate_meteor(reference_summary, generated_summary)
print(f"METEOR score: {meteor_score_value:.4f}")

METEOR score: 0.0097


In [110]:
rouge_scores = calculate_rouge(reference_summary, generated_summary)
for key, value in rouge_scores.items():
    print(f"{key}: precision={value.precision:.4f}, recall={value.recall:.4f}, fmeasure={value.fmeasure:.4f}")

rouge1: precision=0.0000, recall=0.0000, fmeasure=0.0000
rouge2: precision=0.0000, recall=0.0000, fmeasure=0.0000
rougeL: precision=0.0000, recall=0.0000, fmeasure=0.0000


In [111]:
import torch.nn.functional as F

def calculate_perplexity(model, tokenizer, text, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.eval()
    model.to(device)

    tokens = tokenizer.encode(text).ids
    input_ids = torch.tensor([tokens]).to(device)

    with torch.no_grad():
        outputs = model(input_ids[:, :-1], input_ids[:, 1:])
        logits = outputs
        log_probs = F.log_softmax(logits, dim=-1)

    nll = 0
    for i in range(len(tokens) - 1):
        nll += -log_probs[0, i, tokens[i + 1]].item()

    perplexity = torch.exp(torch.tensor(nll / (len(tokens) - 1)))
    return perplexity.item()

In [112]:
perplexity = calculate_perplexity(model, tokenizer, reference_summary)
print(f"Perplexity: {perplexity:.4f}")

Perplexity: 337824.8438
