In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install pytorch_lightning==1.2.2
!pip install sentencepiece==0.1.85
!pip install torch==1.7.1
!pip install torchvision==0.8.2
!pip install torchtext==0.8.0
!pip install fairseq
!pip install tensorflow_datasets -U
!pip install datasets

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
!tar -xf gazeta_jsonl.tar.gz

In [None]:
from transformers import (
    AdamW,
    MT5ForConditionalGeneration, 
    AutoTokenizer,
    get_linear_schedule_with_warmup
)

# training

In [None]:
def set_seed(seed):
    """Seed the stdlib, NumPy and PyTorch random number generators.

    Args:
        seed: Seed value handed unchanged to every generator.
    """
    # Same order as before: random, numpy, torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # CUDA generators are only touched when CUDA is reported as available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
from fairseq.optim.adafactor import Adafactor
class MT5FineTuner(pl.LightningModule):
    """LightningModule wrapping ``MT5ForConditionalGeneration`` for fine-tuning.

    Args:
        hparams: Namespace-like object. The class reads ``model_name_or_path``,
            ``tokenizer_name_or_path``, ``learning_rate`` (optional),
            ``train_batch_size``, ``eval_batch_size``, ``n_gpu``,
            ``gradient_accumulation_steps``, ``num_train_epochs`` and
            ``warmup_steps`` from it.
    """

    def __init__(self, hparams):
        super(MT5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = MT5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        """Tell ``LoggingCallback`` whether this module should report metrics (always True)."""
        return True

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Forward all arguments to the wrapped model and return its output."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch.

        Args:
            batch: Mapping with ``source_ids``, ``source_mask``, ``target_ids``
                and ``target_mask`` tensors.

        Returns:
            The first element of the model output (the loss when labels are given).
        """
        # Mask a copy: the caller's batch keeps its original pad ids.
        labels = batch["target_ids"].clone()
        # Padding positions are replaced with -100 before being used as labels.
        labels[labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )

        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Log the batch loss under ``loss`` and return it."""
        loss = self._step(batch)
        self.log('loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the batch loss under ``val_loss``."""
        loss = self._step(batch)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        """Create the Adafactor optimizer.

        The learning rate comes from ``hparams.learning_rate`` and falls back to
        1e-3 (the previously hard-coded value) when that field is absent. No
        scheduler is returned from here.
        """
        learning_rate = getattr(self.hparams, "learning_rate", 1e-3)
        optimizer = Adafactor(
            self.model.parameters(), lr=learning_rate, scale_parameter=False, relative_step=False
        )
        # Kept on the instance because train_dataloader() reads it.
        self.opt = optimizer
        return [optimizer]

    def train_dataloader(self):
        """Build the shuffled training DataLoader (and the ``lr_scheduler`` attribute)."""
        train_dataset = get_dataset(tokenizer=self.tokenizer, data_type="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)

        # Only build the scheduler when the optimizer already exists, so this
        # hook no longer fails if it runs before configure_optimizers().
        optimizer = getattr(self, "opt", None)
        if optimizer is not None:
            # examples // (batch size * GPUs) // accumulation steps * epochs
            t_total = (
                    (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                    // self.hparams.gradient_accumulation_steps
                    * float(self.hparams.num_train_epochs)
            )
            # NOTE(review): nothing in this class steps or returns this
            # scheduler, so it only changes the learning rate if external code
            # calls self.lr_scheduler.step(). With warmup_steps > 0 the
            # optimizer is presumably left at the schedule's step-0 rate --
            # confirm before raising warmup_steps.
            self.lr_scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
            )
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader over the ``val`` split."""
        val_dataset = get_dataset(tokenizer=self.tokenizer, data_type="val", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)


In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
    """Callback that reports ``trainer.callback_metrics`` after validation and test runs."""

    @staticmethod
    def _metric_lines(metrics):
        """Yield one ``"key = value\\n"`` line per metric, sorted by key, skipping bookkeeping keys."""
        for name in sorted(metrics):
            if name in ("log", "progress_bar"):
                continue
            yield "{} = {}\n".format(name, str(metrics[name]))

    def on_validation_end(self, trainer, pl_module):
        """Log every callback metric when validation ends."""
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return
        for line in self._metric_lines(trainer.callback_metrics):
            logger.info(line)

    def on_test_end(self, trainer, pl_module):
        """Log every callback metric and write it to ``<output_dir>/test_results.txt``."""
        logger.info("***** Test results *****")
        if not pl_module.is_logger():
            return

        results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(results_path, "w") as writer:
            # Each metric is logged first and then appended to the results file.
            for line in self._metric_lines(trainer.callback_metrics):
                logger.info(line)
                writer.write(line)

Let's define the hyperparameters and other arguments. You can override this dict for a specific task as needed, though in most cases you'll only need to change `data_dir` and `output_dir`.

Here the batch size is 1 and `gradient_accumulation_steps` is 8, so the effective batch size is 8.

In [None]:
# Hyperparameters and paths for the run. The dict is kept under its own name so
# it can be expanded into the attribute-style `args` namespace below.
args_dict = dict(
    data_dir="./", # path for data files
    output_dir="./", # path to save the checkpoints
    model_name_or_path='google/mt5-small',
    tokenizer_name_or_path='google/mt5-large',
    max_seq_length=512,
    learning_rate=1e-3,
    weight_decay=0.0,
    warmup_steps=0,
    train_batch_size=1,
    eval_batch_size=1,
    num_train_epochs=20,
    gradient_accumulation_steps=8,
    n_gpu=1,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    # max_grad_norm=0.5, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)
# Attribute-style access (args.output_dir, ...) is what the rest of the notebook uses.
args = argparse.Namespace(**args_dict)

In [None]:
try:
  from datasets import load_dataset
except:
  !pip install datasets
  from datasets import load_dataset

import copy
class ru_sum_dataset(Dataset):
    """Summarization examples read from ``gazeta_<data_type>.jsonl``.

    Each non-blank line of the file is one JSON object; its ``text`` field is
    the article and its ``summary`` field is the target.

    Args:
        tokenizer: Object exposing ``batch_encode_plus``.
        data_type: Split name substituted into the file name.
        max_len: Length the source is padded/truncated to.
        target_max_len: Length the target is padded/truncated to (default 512,
            the value that used to be hard-coded).
    """

    def __init__(self, tokenizer, data_type, max_len=30, target_max_len=512):
        self.max_len = max_len
        self.target_max_len = target_max_len
        self.tokenizer = tokenizer

        # Explicit UTF-8 so reading does not depend on the platform default.
        with open(f'gazeta_{data_type}.jsonl', 'r', encoding='utf-8') as json_file:
            # Raw lines are kept and parsed lazily; blank lines carry no record.
            self.data = [line for line in json_file if line.strip()]

    def __len__(self):
        """Number of records in the split."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize one record.

        Returns:
            Dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_mask`` tensors, each with the batch axis removed.
        """
        import json  # function-scope import keeps the class self-contained

        # Parse the line as JSON: splitting on '"' breaks on escaped quotes
        # inside the text and leaves JSON escapes undecoded.
        record = json.loads(self.data[index])
        input_text, output_text = record["text"], record["summary"]
        input_ = "summarize: %s" % (input_text)
        target = "%s </s>" % (output_text)

        # tokenize inputs
        tokenized_inputs = self.tokenizer.batch_encode_plus(
            [input_], max_length=self.max_len, padding="max_length",
            truncation=True, return_tensors="pt"
        )
        # tokenize targets
        tokenized_targets = self.tokenizer.batch_encode_plus(
            [target], max_length=self.target_max_len, padding="max_length",
            truncation=True, return_tensors="pt"
        )

        # Drop only the leading batch axis of the single-example encoding.
        source_ids = tokenized_inputs["input_ids"].squeeze(0)
        target_ids = tokenized_targets["input_ids"].squeeze(0)
        src_mask = tokenized_inputs["attention_mask"].squeeze(0)
        target_mask = tokenized_targets["attention_mask"].squeeze(0)

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

In [None]:
# Checkpointing monitors `val_loss` (logged by MT5FineTuner.validation_step) and
# keeps only the single best checkpoint in `args.output_dir`.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    period=1, dirpath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
)

# Keyword arguments for pl.Trainer, derived from `args`.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    precision=16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    # gradient_clip_val=args.max_grad_norm,
    # amp_backend='native',
    # The checkpoint callback is registered through `callbacks`; passing a
    # ModelCheckpoint instance via `checkpoint_callback=` is deprecated.
    callbacks=[LoggingCallback(), checkpoint_callback],
)




In [None]:
def get_dataset(tokenizer, data_type, args):
    """Create the ``ru_sum_dataset`` for one split.

    Args:
        tokenizer: Tokenizer handed to the dataset.
        data_type: Split name handed to the dataset.
        args: Object whose ``max_seq_length`` becomes the dataset's ``max_len``.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "data_type": data_type,
        "max_len": args.max_seq_length,
    }
    return ru_sum_dataset(**dataset_kwargs)

In [None]:
# Build the LightningModule; its constructor loads the pretrained model and
# tokenizer named in `args`.
print ("Initialize model")
model = MT5FineTuner(args)

# The trainer is configured entirely from the `train_params` dict.
trainer = pl.Trainer(**train_params)

Initialize model


GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
print ("Training model")
# No dataloaders are passed here: the model supplies them through its
# train_dataloader()/val_dataloader() methods.
trainer.fit(model)
print ("training finished")

print ("Saving model")
# Saves the wrapped transformers model (model.model), not the Lightning
# wrapper, into the current directory.
model.model.save_pretrained("./")
print ("Saved model")

Training model



  | Name  | Type                        | Params
------------------------------------------------------
0 | model | MT5ForConditionalGeneration | 300 M 
------------------------------------------------------
300 M     Trainable params
0         Non-trainable params
300 M     Total params
1,200.707 Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




training finished
Saving model


KeyboardInterrupt: ignored

In [None]:
!nvidia-smi

Sun Apr  4 12:21:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    43W / 250W |    935MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# inference 

In [None]:
!unzip mt5_large_gazeta.zip

Archive:  /content/drive/MyDrive/mt5_large_gazeta.zip
   creating: mt5_large/
  inflating: mt5_large/config.json   
  inflating: mt5_large/pytorch_model.bin  


In [None]:
article_text = '''
Руководство Сбербанка планирует запустить продажу лекарств в отделениях кредитной организации. Об этом сообщает РИА Новости со ссылкой на первого зампредседателя правления банка Льва Хасиса.
Мы планируем сейчас провести пилоты по открытию физических «СберЕАптек» на площадях наших отделений банков, — приводит агентство его слова.
Хасис отметил, что если клиентам понравится такое совмещение, то проект получит дальнейшее развитие.
Он добавил, что помещения в отделениях Сбербанка будут предоставляться аптеке на весьма льготных условиях, поэтому сможет устанавливать низкие цены на лекарства.
Кроме того, предлагается обговорить с производителями возможность оптимизации фасовки препаратов. В случае достижения договорённости, у покупателей будет возможность приобретать точно предписанное врачом количество таблеток, а не платить за всю упаковку, пояснил Хасис.
Помимо всего прочего, у «СберЕАптеки» есть планы по запуску собственной линии дешёвых дженериков.
Ранее сообщалось, что компания «Российские железные дороги» (РЖД) планирует привлечь на вокзалы Москвы сетевые компании, в том числе магазины одежды, аптеки и кафе. Это будет предусмотрено единым стандартом развития вокзальных комплексов, который будет разработан в текущем году.
'''

In [None]:
from transformers import MT5ForConditionalGeneration, AutoTokenizer


# Load the fine-tuned weights from the unpacked archive and the stock mT5 tokenizer.
model_name = "./mt5_large"
model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained('google/mt5-large')

# NOTE(review): the training section of this notebook prefixes every input with
# "summarize: "; this cell feeds the raw article. Confirm how the loaded
# checkpoint was trained before adding or omitting the prefix.
#
# Call the tokenizer directly instead of the deprecated prepare_seq2seq_batch().
# The `src_lang` argument was dropped as well: it was only a leftover from an
# mBART-style example.
# A single article needs no padding, so it is only truncated to 1024 tokens.
encoded = tokenizer(
    [article_text],
    return_tensors="pt",
    truncation=True,
    max_length=1024
)
input_ids = encoded["input_ids"]

# Beam search with 5 beams. The attention mask is passed explicitly rather than
# left to be inferred, and the former `top_k=0` is gone because no sampling is
# requested.
output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=encoded["attention_mask"],
    max_length=256,
    no_repeat_ngram_size=3,
    num_beams=5
)[0]

summary = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(summary)



Руководство Сбербанка планирует запустить продажу лекарств в отделениях банков. По словам первого зампредседателя правления банка Льва Хасиса, если клиентам понравится такое совмещение, то проект получит дальнейшее развитие. Ранее Сбербанк планировал привлечь на вокзалы Москвы сетевые компании, в том числе магазины одежды, аптеки и кафе. 
