In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime

from tqdm.notebook import tqdm

import json
import re

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

import sys
sys.path.append('..')
from eval_utils import get_parser, compute_test_metrics
from utils import Format, get_to_string_processor, clean_text

[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the run configuration. The file is opened in binary mode and the JSON
# decoder works out the text encoding from the raw bytes.
with open("configs/config_fredT5-xl-lt.json", "rb") as config:
    raw_config = config.read()

params = json.loads(raw_config)
params

{'format': 'LightTokens',
 'max_bundles': 10,
 'singles_to_mults_ratio': 2,
 'max_eval_size': 100,
 'model': 'ai-forever/FRED-T5-1.7B',
 'add_nl_token': False,
 'add_eos_token': True,
 'change_pad_to_eos': False,
 'shuffle_bundles': True,
 'add_lm_token': True,
 'save_folder': 'fredT5-xl-lt',
 'train': {'n_epochs': 10,
  'lr': 0.0003,
  'batch_size': 4,
  'weight_decay': 0.0,
  'scheduler': 'cosine',
  'warmup_steps': 2000},
 'eval': {'batch_size': 4, 'show': 5},
 'lora': {'rank': 64}}

In [3]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
device

'cuda:0'

In [4]:
# Turn the format name stored in the config into its Format enum member.
format_name = params["format"]
out_format = Format(format_name)
out_format

<Format.LightTokens: 'LightTokens'>

In [5]:
# Load the test set and preview its first rows.
test_csv_path = "~/data/ads_test_1000.csv"
data = pd.read_csv(test_csv_path)
data.head(5)

Unnamed: 0,Text,bundles,n_bundles
0,"самокат hudora, в отличном состоянии, от 5+ и ...","[{""Title"": ""\u0441\u0430\u043c\u043e\u043a\u04...",1
1,2 мяча и корзина 5€ лимассол,"[{""Title"": ""\u043d\u0430\u0431\u043e\u0440 \u0...",1
2,принимаются предзаказы на 100% органическое ма...,"[{""Title"": ""100% \u043e\u0440\u0433\u0430\u043...",1
3,"колонки, в рабочем состоянии! использовались р...","[{""Title"": ""\u043a\u043e\u043b\u043e\u043d\u04...",1
4,гироскутер 100 евро с зарядным,"[{""Title"": ""\u0433\u0438\u0440\u043e\u0441\u04...",1


In [6]:
# How many rows have each value of `n_bundles`.
n_bundles_counts = data["n_bundles"].value_counts()
n_bundles_counts

n_bundles
1     735
0      89
2      54
3      34
4      25
5      18
6      16
7      10
10      7
9       7
8       2
36      1
14      1
13      1
Name: count, dtype: int64

In [7]:
# The config's save folder doubles as the checkpoint name: it is used below to
# locate the checkpoint directory and to name the predictions file.
ckpt = params["save_folder"]
ckpt

'fredT5-xl-lt'

In [8]:
# Tokenizer and model are both restored from the same checkpoint directory.
model_checkpoint = f"../good_checkpoints/{ckpt}"

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Build the format-specific helpers from the project's utility modules.
# NOTE(review): presumably `parser` turns generated text back into structured
# bundles and `to_string_processor` serialises bundles to the target format;
# confirm in eval_utils / utils. Neither is used in the cells visible here.
parser = get_parser(tokenizer, out_format)
to_string_processor = get_to_string_processor(out_format)

In [10]:
# Batched beam-search inference over every row of `data["Text"]`.
#
# Each input is wrapped as `prefix + cleaner(text) + postfix` according to the
# run config, tokenized in batches of `bs`, generated with 4 beams and decoded
# back to text. Decoded strings (with pad tokens removed) are collected in
# `responses`, one per input row and in row order. The cell's value is the
# elapsed wall-clock time in minutes.
responses = []

model.to(device)
bs = 8
log_every = 5  # print a progress line every `log_every` batches

start_time = datetime.datetime.now()

# Input decoration is loop-invariant, so resolve it once from the config.
postfix = tokenizer.eos_token if params.get("add_eos_token", False) else ""
prefix = "<LM>" if params.get("add_lm_token", False) else ""
cleaner = clean_text if params.get("clean_text", False) else (lambda x: x)

# Build all model inputs positionally in one pass; this does not depend on the
# frame's index labels being unique.
texts = [prefix + cleaner(text) + postfix for text in data["Text"]]
n_steps = (len(texts) + bs - 1) // bs

print("Total steps:", n_steps)
for step, ind in enumerate(tqdm(range(0, len(texts), bs), total=n_steps)):
    if step % log_every == 0:
        print(f"{step} steps made")

    encoded = tokenizer(
        texts[ind:ind + bs],
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    # Pass the tokenizer's attention mask explicitly so padded positions are
    # masked regardless of how pad/eos ids are configured.
    preds = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_length=512,
        num_beams=4,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,
    ).cpu()

    # Defensive: map any -100 (the label ignore index) to EOS before decoding.
    # NOTE(review): generated ids should never contain -100; kept as a no-op guard.
    preds = torch.where(preds == -100, tokenizer.eos_token_id, preds)

    # Special tokens must be kept: the output format is built from added
    # tokens such as <BOB>/<BOT>, so only padding is stripped below.
    preds = tokenizer.batch_decode(preds, skip_special_tokens=False)

    # Literal removal: the pad token is plain text, not a regex pattern.
    responses += [pred.replace(tokenizer.pad_token, "") for pred in preds]

(datetime.datetime.now() - start_time) / datetime.timedelta(minutes=1)

Total steps: 125


  0%|          | 0/125 [00:00<?, ?it/s]

0 steps made
5 steps made
10 steps made
15 steps made
20 steps made
25 steps made
30 steps made
35 steps made
40 steps made
45 steps made
50 steps made
55 steps made
60 steps made
65 steps made
70 steps made
75 steps made
80 steps made
85 steps made
90 steps made
95 steps made
100 steps made
105 steps made
110 steps made
115 steps made
120 steps made


21.635578483333333

In [11]:
# Attach the generated strings to the frame as a new column (one per row, in
# the order the rows were fed to the model) and preview the result.
data["Responses"] = responses
data.head()

Unnamed: 0,Text,bundles,n_bundles,Responses
0,"самокат hudora, в отличном состоянии, от 5+ и ...","[{""Title"": ""\u0441\u0430\u043c\u043e\u043a\u04...",1,1<BOB>0<BOT>самокат hudora<BOP>65<BOC1>eur<BOC...
1,2 мяча и корзина 5€ лимассол,"[{""Title"": ""\u043d\u0430\u0431\u043e\u0440 \u0...",1,1<BOB>0<BOT>мячи и корзина<BOP>5<BOC1>eur<BOC2...
2,принимаются предзаказы на 100% органическое ма...,"[{""Title"": ""100% \u043e\u0440\u0433\u0430\u043...",1,1<BOB>0<BOT>100% органическое масло из миндаля...
3,"колонки, в рабочем состоянии! использовались р...","[{""Title"": ""\u043a\u043e\u043b\u043e\u043d\u04...",1,1<BOB>0<BOT>колонки<BOP>80<BOC1>eur<BOC2>4 пар...
4,гироскутер 100 евро с зарядным,"[{""Title"": ""\u0433\u0438\u0440\u043e\u0441\u04...",1,1<BOB>0<BOT>гироскутер с зарядным<BOP>100<BOC1...


In [12]:
# Write inputs and predictions to a CSV named after the checkpoint; the frame
# index is not written.
preds_csv_path = f"~/leonya/bench_results/{ckpt}_preds.csv"
data.to_csv(preds_csv_path, index=False)

In [13]:
# Echo the checkpoint name that the predictions file was saved under.
ckpt

'fredT5-xl-lt'

In [14]:
# Sanity check: (rows, columns) of the frame after adding "Responses".
data.shape

(1000, 4)