# Генерация заголовков научных статей: слабый baseline

Источник: https://github.com/bentrevett/pytorch-seq2seq

In [1]:
! pip install transformers
! pip install sentencepiece
! pip install rouge_score
! pip install datasets==1.12.0
! pip install tabulate

In [2]:
import torch
import numpy as np
import pandas as pd
import datasets
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

import nltk

In [3]:
# Name of the pretrained checkpoint; the model and its tokenizer are both loaded from it.
model_name = "sshleifer/distilbart-xsum-12-3"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization limits (in tokens); longer sequences are truncated when tokenizing.
# encoder_max_length is applied to source abstracts, decoder_max_length to target titles.
encoder_max_length = 256
decoder_max_length = 64

In [4]:
# Load the training pairs and drop fully identical rows (the original index labels are kept).
train_df = pd.read_csv('../input/title-generation/train.csv')
print(f'Shape with duplicates: {train_df.shape}')
train_df = train_df.drop_duplicates()
print(f'Shape without duplicates: {train_df.shape}')

In [5]:
data = datasets.Dataset.from_pandas(train_df)
# from_pandas stores a non-default DataFrame index as an extra
# '__index_level_0__' column. That column is absent when the index is still a
# plain RangeIndex (e.g. no duplicate rows were dropped), and removing a
# missing column raises, so only drop it when it is actually present.
if '__index_level_0__' in data.column_names:
    data = data.remove_columns('__index_level_0__')

def flatten(example):
    """Rename the fields of one record to the summarization convention.

    Args:
        example: mapping with an 'abstract' and a 'title' entry.

    Returns:
        A dict with 'document' (the abstract) and 'summary' (the title).
    """
    return dict(document=example['abstract'], summary=example['title'])

# Rename abstract/title to document/summary and drop the original columns.
dataset = data.map(flatten, remove_columns=['abstract', 'title'])
print(dataset.shape)

# Hold out 10% for validation. The seed is fixed so that the split (and thus
# every metric and sample shown later) is the same on each run of the notebook.
train_data_txt, validation_data_txt = dataset.train_test_split(
    test_size=0.1, seed=42
).values()

In [6]:
# Peek at the first raw record: each field name followed by its value
for field_name, field_value in data[0].items():
    print(field_name, field_value, sep="\n")

In [7]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of document/summary pairs into model features.

    Args:
        batch: mapping with "document" and "summary" entries (batched texts).
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        max_source_length: pad/truncate length for the documents.
        max_target_length: pad/truncate length for the summaries.

    Returns:
        A dict holding every field of the tokenized documents plus "labels":
        the summary token ids with padding positions replaced by -100.
    """
    documents, summaries = batch["document"], batch["summary"]

    def _encode(texts, limit):
        # Pad to a fixed length and cut anything longer than `limit`.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=limit)

    encoded_documents = _encode(documents, max_source_length)
    encoded_summaries = _encode(summaries, max_target_length)

    features = dict(encoded_documents.items())
    # -100 marks positions to be ignored by the loss (here: the padding).
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in summary_ids]
        for summary_ids in encoded_summaries["input_ids"]
    ]
    return features


def _tokenize_batch(batch):
    """Tokenize one batch with the notebook-level tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# Both splits go through the same preprocessing; the raw text columns are
# dropped so that only the tokenized features remain.
train_data = train_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

## Обучение модели

### Метрики

In [8]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

# Sentence-tokenizer data for nltk.sent_tokenize, which postprocess_text calls below.
nltk.download("punkt", quiet=True)

# ROUGE metric object; compute_metrics calls metric.compute(...) on decoded texts.
metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line (rougeLSum expects a newline after each sentence).

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of lists with the reformatted texts.
    """

    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels


def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used. Positions
            equal to -100 are treated as padding in both arrays.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure in percent, plus
        "gen_len" (mean number of non-padding prediction tokens); all values
        are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 cannot be decoded; it can show up in the predictions as well as in
    # the labels, so map it back to the pad token in both before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Length statistics use the cleaned predictions, so -100 padding is not
    # counted as generated tokens.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

### Параметры обучения

In [9]:
# Fine-tuning hyperparameters. No learning rate is passed (the line is
# commented out below), so the library default applies.
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=5,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    # learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # Evaluate on generated sequences, which compute_metrics decodes to text.
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=2000,
    save_total_limit=3,
)

# Collator that assembles the tokenized examples into batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer wired with the tokenized train/validation splits and the
# ROUGE-based compute_metrics defined earlier.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Обучение

In [10]:
# Fine-tune `model` on train_data using the arguments configured above.
trainer.train()

In [None]:
# Evaluate on validation_data (the trainer's eval_dataset); the cell output is the metrics dict.
trainer.evaluate()

In [None]:
def epoch_time(start_time, end_time):
    """Split the time between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated toward zero.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

Finally, we take the fine-tuned model from the training run above and look at the titles it generates.

# Evaluation

### Generate summaries from the fine-tuned model and compare them with those generated from the original, pre-trained one.

In [None]:
def generate_summary(test_samples, model, is_get_document=False):
    """Generate summaries for one document or a batch of documents.

    Args:
        test_samples: the text(s) to summarize when ``is_get_document`` is
            True; otherwise a mapping/dataset whose "document" entry holds
            the texts.
        model: seq2seq model exposing ``device`` and ``generate``.
        is_get_document: whether ``test_samples`` already is the raw text.

    Returns:
        ``(outputs, output_str)``: the generated token ids and a list with
        one whitespace-normalized summary string per input document.
    """
    documents = test_samples if is_get_document else test_samples["document"]
    inputs = tokenizer(
        documents,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model.
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # Collapse runs of whitespace in every generated summary, so a batch of N
    # documents yields N summaries (not only the first one).
    output_str = [' '.join(text.split()) for text in decoded]
    return outputs, output_str

In [None]:
# Fresh copy of the pretrained checkpoint, used as the "before fine-tuning" reference.
model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# First 16 validation examples (raw text, not tokenized).
test_samples = validation_data_txt.select(range(16))

# generate_summary returns (token ids, decoded strings); [1] keeps the decoded strings.
summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(test_samples, model)[1]

In [None]:
from tabulate import tabulate

In [None]:
# Generated summaries after vs. before fine-tuning, side by side
comparison_rows = zip(
    range(len(summaries_after_tuning)),
    summaries_after_tuning,
    summaries_before_tuning,
)
comparison_table = tabulate(
    comparison_rows, headers=["Id", "Summary after", "Summary before"]
)
print(comparison_table)

# Reference titles for the same samples
print("\nTarget summaries:\n")
target_rows = list(enumerate(test_samples["summary"]))
print(tabulate(target_rows, headers=["Id", "Target summary"]))

# The abstracts the summaries were generated from
print("\nSource documents:\n")
document_rows = list(enumerate(test_samples["document"]))
print(tabulate(document_rows, headers=["Id", "Document"]))

### Делаем submission в Kaggle

In [None]:
# Test set for the submission; only its 'abstract' column is used for generation.
submission_data = pd.read_csv('../input/title-generation/test.csv')
abstracts = submission_data['abstract'].values

Объединяем тестовые данные и данные из обучения, т.к. некоторые пересекаются

In [None]:
# Abstracts present in both the training data and the submission (test) data
test_samples_from_train = set(train_df['abstract']) & set(submission_data['abstract'])
# Training rows whose abstract also occurs in the test set
wtf_df = train_df[train_df['abstract'].isin(test_samples_from_train)]
wtf_df.describe()

In [None]:
# Most frequent abstract among the overlapping rows (despite the name, this
# variable holds an abstract, not a title).
bugged_title = wtf_df.abstract.mode()[0]
# https://arxiv.org/pdf/1410.0163.pdf
# Show every overlapping row that carries this abstract.
wtf_df[wtf_df['abstract'] == bugged_title]

In [None]:
# Drop the rows inspected above, then look at the next most frequent abstract
# (again an abstract, despite the variable name).
wtf_df = wtf_df[wtf_df['abstract'] != bugged_title]
uncertain_title = wtf_df.abstract.mode()[0]
wtf_df[wtf_df['abstract'] == uncertain_title]

In [None]:
# Drop those rows as well and renumber the index from 0, which the row-append
# by position in the generation loop below relies on.
wtf_df = wtf_df[wtf_df['abstract'] != uncertain_title].reset_index(drop=True)

Генерация заголовков для тестовых данных:

In [None]:
from tqdm import tqdm

# Abstracts that already have a title (taken over from the training data).
# A set gives constant-time membership checks inside the loop.
known_abstracts = set(wtf_df['abstract'])
generated_rows = []
for abstract in tqdm(abstracts):
    if abstract in known_abstracts:
        continue
    _, title = generate_summary(abstract, model, True)
    # generate_summary returns a list of strings; store the title text itself
    # so the 'title' column holds plain strings, like the rows from train_df.
    generated_rows.append({'abstract': abstract, 'title': title[0]})
    # Repeated test abstracts are generated only once.
    known_abstracts.add(abstract)

# Append all generated rows in one go instead of growing the frame row by row.
if generated_rows:
    wtf_df = pd.concat(
        [wtf_df, pd.DataFrame(generated_rows, columns=wtf_df.columns)],
        ignore_index=True,
    )

Проверяем, что wtf_df.shape НЕ совпадает с shape изначальных тестовых данных (не должно, т.к. мы выкинули дублирующиеся данные)

In [None]:
# Displays whether the number of titled abstracts equals the number of test rows.
wtf_df.shape[0] == submission_data.shape[0]

In [None]:
# Attach a title to every test row by matching on the abstract text; the left
# join keeps all test rows.
# NOTE(review): if wtf_df still holds the same abstract more than once, the
# merge will emit extra rows — the row-count check in the next cell covers this.
submission_df = pd.merge(submission_data, wtf_df, on='abstract', how='left')
# Displays True if any test row ended up without a title.
submission_df['title'].isnull().values.any()

In [None]:
# Displays whether the merge kept exactly one row per test row.
submission_df.shape[0] == submission_data.shape[0]

Download python script.

In [None]:
! wget -q https://raw.githubusercontent.com/Samsung-IT-Academy/stepik-dl-nlp/master/task11_kaggle/create_submission.py

Generate serialized .csv

In [None]:
from create_submission import generate_csv

# Make sure the output directory exists before writing into it; nothing
# earlier in the notebook is guaranteed to have created it.
os.makedirs('./logs', exist_ok=True)

# Plain CSV of abstract/title pairs, then the serialized Kaggle submission
# produced from it by the downloaded helper script.
submission_df.to_csv('./logs/predicted_titles.csv', index=False)
generate_csv('./logs/predicted_titles.csv', './logs/kaggle_pred.csv', '../input/title-generation/vocs.pkl')

In [None]:
!wc -l ./logs/kaggle_pred.csv

In [None]:
!head ./logs/kaggle_pred.csv