In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import json
import re

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

import sys
from eval_utils import get_parser, compute_test_metrics
from utils import Format, get_to_string_processor



In [2]:
# Read the experiment configuration (output format, model name, save folder,
# train/eval settings) from its JSON file and echo it for the reader.
config_path = "configs/config_ruT5-base-st.json"
with open(config_path, "rb") as config:
    params = json.load(config)

params

{'format': 'SpecTokens',
 'max_bundles': 5,
 'model': 'ai-forever/ruT5-base',
 'add_nl_token': False,
 'add_eos_token': False,
 'change_pad_to_eos': False,
 'shuffle_bundles': True,
 'save_folder': 'ruT5-base',
 'train': {'n_epochs': 10,
  'lr': 5e-05,
  'batch_size': 16,
  'weight_decay': 0.01,
  'scheduler': 'cosine',
  'warmup_steps': 500,
  'fp16': True},
 'eval': {'batch_size': 16, 'show': 5}}

In [3]:
# Checkpoint name from the config's "save_folder" entry. It is reused below to
# build the predictions CSV path, the checkpoint directory and the metrics CSV path.
ckpt = params["save_folder"]

In [4]:
# Pick the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
device

'cpu'

In [5]:
# Map the config's "format" string onto the Format enum: "SpecTokens" selects
# Format.SpecTokens, and every other value falls back to Format.JustJson.
if params["format"] == "SpecTokens":
    out_format = Format.SpecTokens
else:
    out_format = Format.JustJson
out_format

<Format.SpecTokens: 0>

In [6]:
# Load the saved model predictions for this checkpoint and preview the first rows.
preds_path = f"~/work/resources/bench_results/{ckpt}_preds.csv"
data = pd.read_csv(preds_path)
data.head()

Unnamed: 0,Text,bundles,n_bundles,Responses
0,"самокат hudora, в отличном состоянии, от 5+ и ...","[{""Title"": ""\u0441\u0430\u043c\u043e\u043a\u04...",1,<BOB> <BOT> самокат hudora <EOT> <BOP> 65 <EOP...
1,2 мяча и корзина 5€ лимассол,"[{""Title"": ""\u043d\u0430\u0431\u043e\u0440 \u0...",1,<BOB> <BOT> 2 мяча и корзина <EOT> <BOP> 5 <EO...
2,принимаются предзаказы на 100% органическое ма...,"[{""Title"": ""100% \u043e\u0440\u0433\u0430\u043...",1,<BOB> <BOT> Органическое масло миндаля 100 мл ...
3,"колонки, в рабочем состоянии! использовались р...","[{""Title"": ""\u043a\u043e\u043b\u043e\u043d\u04...",1,"<BOB> <BOT> колонки, в рабочем состоянии <EOT>..."
4,гироскутер 100 евро с зарядным,"[{""Title"": ""\u0433\u0438\u0440\u043e\u0441\u04...",1,<BOB> <BOT> гироскутер с зарядным устройством ...


In [7]:
# Directory of the saved checkpoint, located via the config's save-folder name.
model_checkpoint = f"../good_checkpoints/{ckpt}"
# Load the tokenizer stored with that checkpoint. Below it is passed to
# get_parser and supplies the eos_token appended to each reference string.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Parser built from the tokenizer and the configured output format. The
# evaluation loop calls it on a response string and unpacks an
# (is_valid, bundles) pair; it is also handed to compute_test_metrics.
parser = get_parser(tokenizer, out_format)
# Callable applied to each row's "bundles" value to build the reference string.
# NOTE(review): presumably serialises the bundles into the same output format
# as the model responses -- confirm against utils.get_to_string_processor.
to_string_processor = get_to_string_processor(out_format)

In [None]:
# Per-row evaluation of the saved responses against the reference bundles.
# Each list in `responses` receives exactly one entry per row of `data`, in
# row order, so the dict can later be turned into a frame aligned with `data`.

# Names of the metrics copied out of compute_test_metrics' result. Listing them
# explicitly replaces filtering the dict keys inside the loop.
METRIC_KEYS = (
    "valid_structure",
    "pred_n_bundles",
    "valid_bundles",
    "delta_bundles",
    "mean_bleu",
    "mean_chrf",
    "mean_price_match",
    "mean_currency_match",
    "mean_count_match",
)

# "pred_bundles" comes first so the resulting column order is unchanged.
responses = {key: [] for key in ("pred_bundles", *METRIC_KEYS)}

# Collapses a run of repeated "</s>" markers into a single one; compiled once
# because it does not depend on the row being processed.
eos_run = re.compile(r'(</s>)+')

for raw_pred, raw_bundles in tqdm(zip(data["Responses"], data["bundles"]), total=len(data)):
    # An empty response is read back from CSV as NaN (a float), which would make
    # the regex substitution raise TypeError; treat it as an empty prediction.
    pred = "" if pd.isna(raw_pred) else str(raw_pred)
    preds = [pred]
    labels = [to_string_processor(raw_bundles) + tokenizer.eos_token]

    # Store the parsed bundles as a string, or None when the parser rejects the response.
    is_valid, bundles = parser(eos_run.sub('</s>', pred))
    responses["pred_bundles"].append(str(bundles) if is_valid else None)

    # NOTE(review): the metrics receive the prediction without the "</s>"
    # collapsing applied above -- confirm compute_test_metrics normalises it itself.
    metrics = compute_test_metrics(preds, labels, parser)
    for key in METRIC_KEYS:
        # .get keeps the lists aligned: a metric missing from the result is recorded as None.
        responses[key].append(metrics.get(key))

  0%|          | 0/1000 [00:00<?, ?it/s]


Downloading builder script: 100%|██████████| 8.15k/8.15k [00:00<00:00, 4.08MB/s]

Downloading builder script: 100%|██████████| 9.01k/9.01k [00:00<00:00, 3.24MB/s]


In [None]:
# Attach the per-row metrics to the predictions frame.
# Building the metrics frame on `data.index` guarantees row alignment, and
# dropping any metric columns already present makes this cell safe to re-run:
# a second run replaces the columns instead of appending duplicates.
metrics_frame = pd.DataFrame(responses, index=data.index)
data = pd.concat(
    [data.drop(columns=list(metrics_frame.columns), errors="ignore"), metrics_frame],
    axis=1,
)
data.head()

In [None]:
# Persist the predictions together with their metrics, without writing the index.
metrics_path = f"~/work/resources/bench_results/{ckpt}_metrics.csv"
data.to_csv(metrics_path, index=False)