Импорты из скриптов и библиотек

In [None]:
import torch
import torch.nn as nn
import pandas as pd
#from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer
import mlflow

from src.model import Seq2Seq
from src.dataset import TextDataset, collate_fn, train_tokenizer
from src.train_eval import set_seed, train_model, generate_summary
from src.metrics import calculate_bleu, calculate_meteor, calculate_rouge, calculate_perplexity

# Fix the seed once, right after the imports, so that everything below starts
# from the same state on a fresh "Restart & Run All".
# NOTE(review): set_seed is imported from src.train_eval; presumably it seeds
# the random number generators used during training — confirm in that module.
set_seed(42)

Открытие датасета и подготовка токенайзера

In [None]:
# Load the training table; the path is relative to the notebook's working directory.
df = pd.read_csv("train_data.csv")

# Pull both text columns out as plain Python lists (source texts first, then
# their summaries).
texts, summaries = (df[column].tolist() for column in ("text", "summary"))

# The tokenizer is trained on the source texts only; `summaries` is not passed in.
# NOTE(review): confirm that leaving the summaries out of tokenizer training is intended.
tokenizer = train_tokenizer(texts)

Выбор гиперпараметров, подготовка загрузчика данных (DataLoader) и модели

In [None]:
# Hyperparameters; these three values are the ones logged to MLflow in the next cell.
batch_size = 32
embedding_dim = 128
hidden_size = 128

# Wrap the dataframe in the project dataset and batch it with the project collate function.
dataset = TextDataset(df, tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    # NOTE(review): batches are served in dataset order on every epoch; confirm
    # that training without shuffling is intended.
    shuffle=False,
    collate_fn=collate_fn,
)

# The model's vocabulary size is taken from the trained tokenizer.
vocab_size = len(tokenizer.get_vocab())

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The network is wrapped in DataParallel first and then moved to the target device.
model = nn.DataParallel(Seq2Seq(vocab_size, embedding_dim, hidden_size))
model = model.to(device)

Логирование гиперпараметров

In [None]:
# Keep MLflow runs in a local, file-based store next to the notebook.
mlflow.set_tracking_uri("file:./mlruns")  # Local storage

# Make the cell safe to re-run: mlflow.start_run() raises if a run is still
# active (e.g. this cell was executed twice, or training failed before the run
# was closed), so close any leftover run first.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run()

# Record the hyperparameters chosen in the setup cell.
mlflow.log_param("embedding_dim", embedding_dim)
mlflow.log_param("hidden_size", hidden_size)
mlflow.log_param("batch_size", batch_size)
print("Логирование начато")

Запуск обучения модели

In [None]:
# Re-seed immediately before training so this cell gives the same result no
# matter which cells were executed in between.
set_seed(42)

num_epochs = 1
mlflow.log_param("num_epochs", num_epochs)

try:
    train_model(model, dataloader, tokenizer, num_epochs=num_epochs)

    # Log the underlying network rather than the DataParallel wrapper, so the
    # stored artifact can be loaded without re-creating the wrapper.
    model_to_log = model.module if isinstance(model, nn.DataParallel) else model
    mlflow.pytorch.log_model(model_to_log, "model")
except KeyboardInterrupt:
    # Manual interruption of a long training run: close the run as killed.
    mlflow.end_run(status="KILLED")
    raise
except Exception:
    # Never leave the run active after a failure; otherwise the next
    # mlflow.start_run() would raise.
    mlflow.end_run(status="FAILED")
    raise
else:
    mlflow.end_run()

Оценка модели: генерация резюме для одного примера

In [None]:
# Pick one example from the loaded dataframe. The article ('text') is the model
# input; the human-written 'summary' is the reference that the metrics cell
# below compares the generated output against. Previously the article itself
# was stored in `reference_summary`, so the metrics scored the generated
# summary against the model's own input.
# NOTE(review): this row comes from the training data, so the scores are not a
# held-out estimate.
# NOTE(review): the metrics cell also passes `reference_summary` to
# calculate_perplexity, which now receives the summary rather than the article —
# confirm which input that function expects.
sample_idx = 3
sample = df.iloc[sample_idx]
source_text = sample['text']
reference_summary = sample['summary']

generated_summary = generate_summary(model, tokenizer, source_text)
print(f"Generated summary: {generated_summary}")

Вычисление метрик

In [None]:
# All metrics below score `generated_summary` against `reference_summary`,
# both produced by the evaluation cell above.

# BLEU
bleu_score = calculate_bleu(reference_summary, generated_summary)
print(f"BLEU score: {bleu_score:.4f}")

# METEOR
meteor_score_value = calculate_meteor(reference_summary, generated_summary)
print(f"METEOR score: {meteor_score_value:.4f}")

# ROUGE-1 / ROUGE-2 / ROUGE-L with stemming enabled; each entry exposes
# precision, recall and fmeasure.
rouge_variants = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
rouge_scores = calculate_rouge(reference_summary, generated_summary, scorer)
for metric_name in rouge_scores:
    score = rouge_scores[metric_name]
    print(f"{metric_name}: precision={score.precision:.4f}, recall={score.recall:.4f}, fmeasure={score.fmeasure:.4f}")

# Perplexity, computed by the project helper from the model, tokenizer and the
# same reference string.
perplexity = calculate_perplexity(model, tokenizer, reference_summary)
print(f"Perplexity: {perplexity:.4f}")