In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss

from tqdm.notebook import tqdm

import json
import re
from collections import defaultdict
import datetime
import os

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

import sys
sys.path.append('..')
from eval_utils import get_parser, compute_test_metrics
from utils import Format, get_to_string_processor

[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
ls configs

config_cointegrated-st.json
config_fredT5-large-st.json
config_fredT5-large-st-no-shuf.json
config_fredT5-large-wn.json
config_fredT5-xl-st.json
config_ruT5-base-st.json
config_ruT5-large-jj.json
config_ruT5-large-jj-no-shuffle.json
config_ruT5-large-jj-overfit.json
config_ruT5-large-jj-overfit-no-shuffle.json
config_ruT5-large-st.json
config_ruT5-large-st-no-shuffle.json
config_ruT5-large-st-overfit.json
config_ruT5-large-st-overfit-no-shuffle.json
config_ruT5-large-st-overfit-no-shuffle-nl.json


In [17]:
# configs = ["-jj-overfit-no-shuffle", "-st-overfit"]
configs = ["ruT5-large-jj-overfit-no-shuffle", "ruT5-large-st-overfit", "fredT5-large-st", "fredT5-large-wn", "fredT5-xl-st"]
params = {}

for conf in configs:
    with open(f"configs/config_{conf}.json", "rb") as config:
        params[conf.strip("-")] = json.load(config)
    
len(params)

5

In [18]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [19]:
data = pd.read_csv("~/data/ads_test_100.csv")
data.head()

Unnamed: 0,Text,bundles,n_bundles
0,продаются паллеты поддоны\n25 лари/шт\nдоставк...,"[{""Title"": ""\u043f\u0430\u043b\u043b\u0435\u04...",1
1,сковородка блинница 20 см состояние отличное ц...,"[{""Title"": ""\u0441\u043a\u043e\u0432\u043e\u04...",1
2,Комбинезон Зима р 68 \nОчень тёплый \nРучки но...,"[{""Title"": ""\u041a\u043e\u043c\u0431\u0438\u04...",1
3,"Кожа ,, Снежная королева,,, раз 42-44,4000 руб...","[{""Title"": ""\u041a\u043e\u0436\u0430 \u0421\u0...",2
4,Рюкзак - 400 руб.\nBlumarine Kiss Me On The Li...,"[{""Title"": ""\u0420\u044e\u043a\u0437\u0430\u04...",3


In [20]:
repeat_times = 5
results = defaultdict(lambda: list())

for conf in params:
    print(conf)
    
    out_format = Format(params[conf]["format"])
    ckpt = params[conf]["save_folder"]
    model_checkpoint = f"../good_checkpoints/{ckpt}"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

    parser = get_parser(tokenizer, out_format)
    to_string_processor = get_to_string_processor(out_format)

    model.to(device)
    
    bs = 8

    for _ in range(repeat_times):
        start_time = datetime.datetime.now()
        responses = []
        for ind in tqdm(range(0, len(data), bs), total=(len(data) + bs - 1) // bs):
            tokenized = tokenizer([data.loc[data.index[i], "Text"] + (tokenizer.eos_token if params.get("add_eos_token", False) else "")
                           for i in range(ind, min(ind + bs, len(data)))], max_length=512, padding=True, truncation=True, return_tensors="pt")["input_ids"]
            preds = model.generate(
                input_ids=tokenized.to(device),
                max_length=512,
                num_beams=4,
                early_stopping=True,
                eos_token_id=tokenizer.eos_token_id
            ).cpu()
            
            preds = torch.where(preds == -100, tokenizer.eos_token_id, preds)
            preds = tokenizer.batch_decode(preds, ignore_special_tokens=True)
            responses += [re.sub(tokenizer.pad_token, "", pred) for pred in preds]
        results[conf].append((datetime.datetime.now() - start_time) / datetime.timedelta(seconds=1))

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


ruT5-large-jj-overfit-no-shuffle


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

ruT5-large-st-overfit


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fredT5-large-st


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fredT5-large-wn


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fredT5-xl-st


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [21]:
data = pd.DataFrame(results)
data.head()

Unnamed: 0,ruT5-large-jj-overfit-no-shuffle,ruT5-large-st-overfit,fredT5-large-st,fredT5-large-wn,fredT5-xl-st
0,142.851747,106.033536,351.740312,99.39152,62.161764
1,146.683439,106.032624,352.544853,98.578071,61.561672
2,145.276384,105.323788,348.726243,99.596139,62.252137
3,149.006137,105.904221,352.116137,98.93962,62.305308
4,146.613533,105.816778,350.285621,98.90666,62.14214


In [22]:
path = "/home/vlad/leonya/bench_results/time_benchmarking.csv"

if os.path.exists(path):
    orig_data = pd.read_csv(path)
    data = pd.concat([orig_data, data], axis=1)

In [23]:
data.to_csv(path, index=False)

In [24]:
z = ss.t(4).ppf(0.975) / np.sqrt(5)

for col in data.columns:
    print(col)
    vals = data[col].values
    print(f"\t({vals.mean() - z * vals.std()}, {vals.mean() + z * vals.std()})")

ruT5-large-jj-overfit-no-shuffle
	(143.58570226306594, 148.58679373693403)
ruT5-large-st-overfit
	(105.49645072223335, 106.14792807776665)
fredT5-large-st
	(349.3419494518929, 352.82331694810716)
fredT5-large-wn
	(98.6295430015078, 99.53526099849218)
fredT5-xl-st
	(61.75166247193706, 62.41754592806294)


In [25]:
data.head()

Unnamed: 0,ruT5-large-jj-overfit-no-shuffle,ruT5-large-st-overfit,fredT5-large-st,fredT5-large-wn,fredT5-xl-st
0,142.851747,106.033536,351.740312,99.39152,62.161764
1,146.683439,106.032624,352.544853,98.578071,61.561672
2,145.276384,105.323788,348.726243,99.596139,62.252137
3,149.006137,105.904221,352.116137,98.93962,62.305308
4,146.613533,105.816778,350.285621,98.90666,62.14214


In [26]:
os.path.exists(path)

True