In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import json
import re

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

import sys
sys.path.append('..')
from eval_utils import get_parser, compute_test_metrics
from utils import Format, get_to_string_processor

[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
with open("configs/config_ruT5-large-jj-overfit-no-shuffle.json", "rb") as config:
    params = json.load(config)

params

{'format': 'JustJson',
 'max_bundles': 5,
 'max_eval_size': 100,
 'model': 'ai-forever/ruT5-large',
 'add_nl_token': False,
 'add_eos_token': False,
 'change_pad_to_eos': False,
 'shuffle_bundles': False,
 'save_folder': 'ruT5-large-JJ-overfit-no-shuffle',
 'train': {'n_epochs': 10,
  'lr': 5e-05,
  'batch_size': 8,
  'weight_decay': 0.01,
  'scheduler': 'cosine',
  'warmup_steps': 500,
  'fp16': True},
 'eval': {'batch_size': 8, 'show': 5}}

In [26]:
ckpt = params["save_folder"]
ckpt

'ruT5-large-JJ-overfit-no-shuffle'

In [27]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [28]:
out_format = Format.SpecTokens if params["format"] == "SpecTokens" else Format.JustJson
out_format

<Format.JustJson: 2>

In [29]:
data = pd.read_csv(f"~/leonya/bench_results/{ckpt}_preds.csv")
data.head()

Unnamed: 0,Text,bundles,n_bundles,Responses
0,"самокат hudora, в отличном состоянии, от 5+ и ...","[{""Title"": ""\u0441\u0430\u043c\u043e\u043a\u04...",1,"[{ 'title': 'самокат hudora, от 5+ и старше',..."
1,2 мяча и корзина 5€ лимассол,"[{""Title"": ""\u043d\u0430\u0431\u043e\u0440 \u0...",1,"[{ 'title': 'мячи, корзина', 'price': '5', 'c..."
2,принимаются предзаказы на 100% органическое ма...,"[{""Title"": ""100% \u043e\u0440\u0433\u0430\u043...",1,[{ 'title':'100% органическое масло из миндал...
3,"колонки, в рабочем состоянии! использовались р...","[{""Title"": ""\u043a\u043e\u043b\u043e\u043d\u04...",1,"[{ 'title': 'колонки в рабочем состоянии', 'p..."
4,гироскутер 100 евро с зарядным,"[{""Title"": ""\u0433\u0438\u0440\u043e\u0441\u04...",1,[{ 'title': 'гироскутер с зарядным устройство...


In [30]:
model_checkpoint = f"../good_checkpoints/{ckpt}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
parser = get_parser(tokenizer, out_format)
to_string_processor = get_to_string_processor(out_format)

In [32]:
responses = []
keys_set = set()

for ind in tqdm(data.index, total=len(data)):
    preds = [data.loc[ind, "Responses"]]
    labels = [to_string_processor(data.loc[ind, "bundles"]) + tokenizer.eos_token]
    
    is_valid, bundles = parser(re.sub(r'(</s>)+', '</s>', preds[0]))
    responses.append(compute_test_metrics(preds, labels, parser))
    responses[-1]["pred_bundles"] = str(bundles) if is_valid else None
    for key in responses[-1]:
        keys_set.add(key)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [33]:
metrics = {key: [] for key in keys_set}
for resp in responses:
    for key in metrics:
        metrics[key].append(resp.get(key))

In [34]:
data = pd.concat([data, pd.DataFrame(metrics)], axis=1)
data.head()

Unnamed: 0,Text,bundles,n_bundles,Responses,count_match,f1,pred_bundles,n_duplicates,exact_match,delta_bundles,valid_bundles,pred_n_bundles,bleu_score,match,valid_structure,precision,chrf_score,price_match,currency_match,recall
0,"самокат hudora, в отличном состоянии, от 5+ и ...","[{""Title"": ""\u0441\u0430\u043c\u043e\u043a\u04...",1,"[{ 'title': 'самокат hudora, от 5+ и старше',...",1.0,0.75,"[{'title': 'самокат hudora, от 5+ и старше', '...",0.0,0.0,0.0,1.0,1.0,31.024517,0.75,True,1.0,47.946269,1.0,1.0,0.6
1,2 мяча и корзина 5€ лимассол,"[{""Title"": ""\u043d\u0430\u0431\u043e\u0440 \u0...",1,"[{ 'title': 'мячи, корзина', 'price': '5', 'c...",0.0,0.0,"[{'title': 'мячи, корзина', 'price': '5', 'cur...",0.0,0.0,0.0,1.0,1.0,0.0,0.0,True,0.0,33.777482,1.0,1.0,0.0
2,принимаются предзаказы на 100% органическое ма...,"[{""Title"": ""100% \u043e\u0440\u0433\u0430\u043...",1,[{ 'title':'100% органическое масло из миндал...,1.0,1.0,[{'title': '100% органическое масло из миндаля...,0.0,1.0,0.0,1.0,1.0,100.0,1.0,True,1.0,100.0,1.0,1.0,1.0
3,"колонки, в рабочем состоянии! использовались р...","[{""Title"": ""\u043a\u043e\u043b\u043e\u043d\u04...",1,"[{ 'title': 'колонки в рабочем состоянии', 'p...",0.0,1.0,"[{'title': 'колонки в рабочем состоянии', 'pri...",0.0,1.0,0.0,1.0,1.0,100.0,0.0,True,1.0,100.0,1.0,1.0,1.0
4,гироскутер 100 евро с зарядным,"[{""Title"": ""\u0433\u0438\u0440\u043e\u0441\u04...",1,[{ 'title': 'гироскутер с зарядным устройство...,1.0,0.857143,[{'title': 'гироскутер с зарядным устройством'...,0.0,0.0,0.0,1.0,1.0,59.460356,0.857143,True,0.75,88.167996,1.0,1.0,1.0


In [35]:
data.to_csv(f"~/leonya/bench_results/{ckpt}_metrics.csv", index=False)

In [None]:
agg_metrics = {}

In [None]:
data.isna().sum()

In [None]:
agg_metrics["valid_structure"] = data["valid_structure"].mean()
agg_metrics["valid_structure"]

In [None]:
data_empty = data[data["n_bundles"] == 0].copy()
data = data[data["n_bundles"] > 0]

Data with invalid ads

In [None]:
data_empty.shape

In [None]:
# Способность блокировать мусор
(data_empty["pred_n_bundles"] == 0).mean()

In [None]:
agg_metrics["gurbage_recongning_accuracy"] = (data_empty["pred_n_bundles"] == 0).mean()

Data with normal ads

In [None]:
data.isna().sum()

In [None]:
data[data["recall"].isna()].head()

In [None]:
data.dropna(inplace=True)

In [None]:
agg_metrics["recall"] = data["pred_n_bundles"].sum() / data["n_bundles"].sum()
# agg_metrics

In [None]:
data["perfect_match"] = (data["valid_bundles"] == data["pred_n_bundles"]) \
                        & (data["mean_price_match"] == 1.) \
                        & (data["mean_currency_match"] == 1.) \
                        & (data["mean_count_match"] == 1.) \
                        & (data["delta_bundles"] == 0.)

In [None]:
agg_metrics["prefect_match"] = data["perfect_match"].mean()
# agg_metrics

In [31]:
agg_metrics["1b_perfect_match"] = data.loc[data["n_bundles"] == 1, "perfect_match"].mean()
agg_metrics["mb_perfect_match"] = data.loc[data["n_bundles"] > 1, "perfect_match"].mean()

In [32]:
for feature in ("price", "currency", "count"):
    agg_metrics[f"1b_mean_{feature}_match"] = data.loc[data["n_bundles"] == 1, f"mean_{feature}_match"].mean()
    agg_metrics[f"mb_mean_{feature}_match"] = data.loc[data["n_bundles"] > 1, f"mean_{feature}_match"].mean()
    agg_metrics[f"mean_{feature}_match"] = data[f"mean_{feature}_match"].mean()
# agg_metrics

In [33]:
for feature in ("bleu", "chrf"):
    agg_metrics[f"1b_mean_{feature}"] = data.loc[data["n_bundles"] == 1, f"mean_{feature}"].mean()
    agg_metrics[f"mb_mean_{feature}"] = data.loc[data["n_bundles"] > 1, f"mean_{feature}"].mean()
    agg_metrics[f"mean_{feature}"] = data[f"mean_{feature}"].mean()
# agg_metrics

KeyError: 'mean_bleu'

In [None]:
agg_metrics["too_many_bundles_ratio"] = data["delta_bundles"].apply(lambda x: max(x, 0)).mean()
agg_metrics["not_enough_bundles_ratio"] = data["delta_bundles"].apply(lambda x: -min(x, 0)).mean()
agg_metrics["n_bundles_mae"] = data["delta_bundles"].apply(lambda x: abs(x)).mean()
agg_metrics["model"] = ckpt
# agg_metrics

In [None]:
agg_metrics

In [33]:
agg_metrics = {key: [val] for key, val in agg_metrics.items()}
compare_data = pd.DataFrame(agg_metrics)
compare_data.head()

Unnamed: 0,valid_structure,gurbage_recongning_accuracy,recall,prefect_match,1b_perfect_match,mb_perfect_match,1b_mean_price_match,mb_mean_price_match,mean_price_match,1b_mean_currency_match,...,1b_mean_bleu,mb_mean_bleu,mean_bleu,1b_mean_chrf,mb_mean_chrf,mean_chrf,too_many_bundles_ratio,not_enough_bundles_ratio,n_bundles_mae,model
0,0.986,0.752809,0.910049,0.721788,0.84022,0.213018,0.958678,0.781164,0.925158,0.931129,...,37.048364,30.224089,35.759758,76.98801,69.593539,75.591736,0.032402,0.175419,0.207821,ruT5-large-JJ-no-shuffle


In [38]:
comp_data = pd.read_csv("~/work/resources/bench_results/compare.csv")
comp_data = pd.concat([comp_data, compare_data])
comp_data.head()

Unnamed: 0,valid_structure,gurbage_recongning_accuracy,recall,prefect_match,1b_perfect_match,mb_perfect_match,1b_mean_price_match,mb_mean_price_match,mean_price_match,1b_mean_currency_match,mb_mean_currency_match,mean_currency_match,1b_mean_count_match,mb_mean_count_match,mean_count_match,1b_mean_bleu,mb_mean_bleu,mean_bleu,1b_mean_chrf,mb_mean_chrf,mean_chrf,too_many_bundles_ratio,not_enough_bundles_ratio,n_bundles_mae,model
0,0.998,0.483146,0.89397,0.694505,0.820408,0.165714,0.952381,0.682571,0.900495,0.922449,0.868571,0.912088,0.936054,0.886571,0.926538,37.595213,29.048852,35.951682,77.02925,65.41845,74.796404,0.051648,0.227473,0.279121,ruT5-base
0,0.989,0.730337,0.896879,0.712222,0.835391,0.187135,0.958848,0.72115,0.913685,0.931413,0.859649,0.917778,0.943759,0.897271,0.934926,36.149039,30.05038,34.990294,76.443918,65.974156,74.454663,0.036667,0.205556,0.242222,ruT5-large-JJ-overfit


In [39]:
comp_data.to_csv("~/work/resources/bench_results/compare.csv", index=False)

In [42]:
data.loc[(data["n_bundles"] > 1) & (data["mean_price_match"] < 1), "Text"].values[:5]

array(['#БНИ_ПродамБУ_Владивосток\n#БНИ_id295710053\n#БНИ_почта \n#БНИ_авито \n\nВсе игры в отличном состоянии. \n\n1. Живой лес. 2000р.\n2. Hamlet. 2700р.\n3. Секигахара. 2900р.\n4. Волки. 2600р.\n5. Тираны подземья + 3 фанатские колоды. 5000р.\n6. Зверь. 4000р. \n7. Прослушка. 1400р.\n8. Трон кубов. 9000р.\n9. Hoplomachus. 16000р.\n10. Cloudspire. 17000р.',
       'Сапожки зима, Demar, р-р 20-21, отлично подойдут для первой зимней обуви вашему малышу, очень теплые, 500₽.\nПолукомбенизон р-р 80, демисезон, на прохладную осень, непромокаемые, 150₽\nНовосиньково\n\n#БД_обувь',
       '1. плащ классический черный\nразмер 42\nторжковская швейная фабрика\n70 лари\n2. бежевый топ-лапша\nразмер m\n20 лари\nзолотая цепь\n20 лари\n3. красная вельветовая юбка\nразмер m\n20 лари\n4. футболка с ахегао\nразмер xxl\n20 лари\n5. настолка fluxx\n20 лари\n6. бижутерия и значки по 5 лари\n\n📍м. технический университет/центр',
       'Продаю:\n1) Коврик 100х60 см, безворсовый — 300 руб\n2) Зеркало 60х50

In [43]:
data.loc[(data["n_bundles"] > 1) & (data["mean_price_match"] < 1), "Responses"].values[:5]

array([" [{ 'title': 'Тираны подземья + 3 фанатские колоды', 'price': '5000', 'currency': 'RUB', 'count': '1'}, { 'title': 'Тираны подземья + 3 фанатские колоды', 'price': '5000', 'currency': 'RUB', 'count': '1'}, { 'title': 'Hamlet', 'price': '2700', 'currency': 'RUB', 'count': '1'}, { 'title': 'Cloudspire', 'price': '17000', 'currency': 'RUB', 'count': '1'}, { 'title': 'Тираны подземья + 3 фанатские колоды', 'price': '5000', 'currency': 'RUB', 'count': '1'}, { 'title': 'Hamlet', 'price': '2700', 'currency': 'RUB', 'count': '1'} ]</s>",
       " [{ 'title': 'Сапожки зима Demar, размер 20-21', 'price': '500', 'currency': 'RUB', 'count': '1'}, { 'title': 'Сапожки зима Demar, размер 20-21', 'price': '500', 'currency': 'RUB', 'count': '1'} ]</s>",
       " [{ 'title': 'футболка с ахегао размер xxl', 'price': '20', 'currency': 'лари', 'count': '1'}, { 'title': 'бежевый топ-лапша размер m', 'price': '20', 'currency': 'лари', 'count': '1'}, { 'title': 'бежевый топ-лапша размер m', 'price': '