## Импорт библиотек

In [None]:
!pip install transformers evaluate rouge_score sacrebleu
!pip install sacremoses openpyxl scikit-learn ipywidgets

In [1]:
import os
import re
import json

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import evaluate
from datasets import load_dataset, Dataset

from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split
import torch

---

## Загрузка Spider датасета (подготовка spider_dataset.json в Making Spider Dataset.ipynb)

In [5]:
with open("datasets/spider_dataset.json") as f:
    spider_data = json.load(f)

for sample in spider_data:
    sample["query"] = sample["query"].lower()
    sample["query"] = sample["query"].replace('(', ' ').replace(')', ' ').replace(' ,', ',')
    sample["query"] = re.sub("\s+,", ',', sample["query"])
    sample["query"] = re.sub('\s{2,}', ' ', sample["query"])

    sample["input"] = sample.pop("question")
    sample["target"] = sample.pop("query")
    

train_spider, test_spider = train_test_split(spider_data, train_size=0.85, random_state = 123)

In [6]:
train_data = Dataset.from_list(train_spider)
test_data = Dataset.from_list(test_spider)

## Модель

### Обучение на Spider

In [4]:
CKPT = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(CKPT)
model = T5ForConditionalGeneration.from_pretrained(CKPT)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
# tokenize the examples
MAX_LENGHT_INPUT = 256
MAX_LENGHT_TARGET = 128
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['input'], pad_to_max_length=True, max_length=MAX_LENGHT_INPUT)
    
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target'], pad_to_max_length=True, max_length=MAX_LENGHT_TARGET)

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [6]:
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)

columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

  0%|          | 0/6 [00:00<?, ?ba/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/2 [00:00<?, ?ba/s]

In [7]:
# set training arguments - Feel free to adapt it

### TODO data_collator

training_args = Seq2SeqTrainingArguments(
    output_dir="models/t5-base-finetuned-only-spider",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    # evaluation_strategy="epoch",
    evaluation_strategy="no",
    do_train=True,
    # do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    #save_steps=1000,
    #eval_steps=1000,
    overwrite_output_dir=True,
    save_total_limit=3,
    # load_best_model_at_end=True,
    # push_to_hub=True,
    fp16=True,
)

In [8]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")
accuracy = evaluate.load("accuracy")

In [9]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # print(decoded_preds, '\n', decoded_labels)

    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    accuracy_result = sum([
        decoded_preds[i] in decoded_labels[i] for i in range(len(decoded_preds))
                                                        ]) / len(decoded_preds)


    return {
        "accuracy" :  accuracy_result,
        "bleu":       bleu_result["score"],
        "rouge1":     rouge_result["rouge1"],
        "rouge2":     rouge_result["rouge2"],
        "rougeL":     rouge_result["rougeL"],
        "rougeLsum" : rouge_result["rougeLsum"],
        }

In [10]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    # eval_dataset=test_data,
)

Using cuda_amp half precision backend


### Evaluation

In [11]:
trainer.evaluate()

ValueError: Trainer: evaluation requires an eval_dataset.

### Обучение

In [13]:
trainer.train()

***** Running training *****
  Num examples = 5950
  Num Epochs = 7
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 20825
  Number of trainable parameters = 222903552


Step,Training Loss
500,0.6053
1000,0.1754
1500,0.1325
2000,0.1208
2500,0.1056
3000,0.0901
3500,0.0761
4000,0.0703
4500,0.0686
5000,0.0679


Saving model checkpoint to models/t5-base-finetuned-only-spider/checkpoint-2975
Configuration saved in models/t5-base-finetuned-only-spider/checkpoint-2975/config.json
Model weights saved in models/t5-base-finetuned-only-spider/checkpoint-2975/pytorch_model.bin
Saving model checkpoint to models/t5-base-finetuned-only-spider/checkpoint-5950
Configuration saved in models/t5-base-finetuned-only-spider/checkpoint-5950/config.json
Model weights saved in models/t5-base-finetuned-only-spider/checkpoint-5950/pytorch_model.bin
Saving model checkpoint to models/t5-base-finetuned-only-spider/checkpoint-8925
Configuration saved in models/t5-base-finetuned-only-spider/checkpoint-8925/config.json
Model weights saved in models/t5-base-finetuned-only-spider/checkpoint-8925/pytorch_model.bin
Saving model checkpoint to models/t5-base-finetuned-only-spider/checkpoint-11900
Configuration saved in models/t5-base-finetuned-only-spider/checkpoint-11900/config.json
Model weights saved in models/t5-base-finetu

TrainOutput(global_step=20825, training_loss=0.06151302669657951, metrics={'train_runtime': 5607.3692, 'train_samples_per_second': 7.428, 'train_steps_per_second': 3.714, 'total_flos': 1.2681548070912e+16, 'train_loss': 0.06151302669657951, 'epoch': 7.0})

### Сохранение модели

In [12]:
trainer.save_model('models/t5-base-finetuned-only-spider')
tokenizer.save_pretrained('models/t5-base-finetuned-only-spider')

Saving model checkpoint to models/t5-base-finetuned-only-spider-v3
Configuration saved in models/t5-base-finetuned-only-spider-v3/config.json
Model weights saved in models/t5-base-finetuned-only-spider-v3/pytorch_model.bin
tokenizer config file saved in models/t5-base-finetuned-only-spider-v3/tokenizer_config.json
Special tokens file saved in models/t5-base-finetuned-only-spider-v3/special_tokens_map.json


('models/t5-base-finetuned-only-spider-v3/tokenizer_config.json',
 'models/t5-base-finetuned-only-spider-v3/special_tokens_map.json',
 'models/t5-base-finetuned-only-spider-v3/tokenizer.json')

### Загрузка модели

In [2]:
CKPT = 'models/t5-base-finetuned-only-spider-v3'
tokenizer = AutoTokenizer.from_pretrained(CKPT)
model = T5ForConditionalGeneration.from_pretrained(CKPT)

In [3]:
# tokenize the examples
MAX_LENGHT_INPUT = 256
MAX_LENGHT_TARGET = 128
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['input'], pad_to_max_length=True, max_length=MAX_LENGHT_INPUT)
    
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target'], pad_to_max_length=True, max_length=MAX_LENGHT_TARGET)

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [7]:
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)

columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

  0%|          | 0/6 [00:00<?, ?ba/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/2 [00:00<?, ?ba/s]

In [8]:
# set training arguments - Feel free to adapt it

### TODO data_collator

training_args = Seq2SeqTrainingArguments(
    output_dir="models/t5-base-finetuned-only-spider",
    per_device_train_batch_size=2,
    num_train_epochs=7,
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    # evaluation_strategy="epoch",
    evaluation_strategy="no",
    do_train=True,
    # do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    #save_steps=1000,
    #eval_steps=1000,
    overwrite_output_dir=True,
    save_total_limit=3,
    # load_best_model_at_end=True,
    # push_to_hub=True,
    fp16=True,
)

In [9]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")
accuracy = evaluate.load("accuracy")

In [10]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # print(decoded_preds, '\n', decoded_labels)

    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    accuracy_result = sum([
        decoded_preds[i] in decoded_labels[i] for i in range(len(decoded_preds))
                                                        ]) / len(decoded_preds)


    return {
        "accuracy" :  accuracy_result,
        "bleu":       bleu_result["score"],
        "rouge1":     rouge_result["rouge1"],
        "rouge2":     rouge_result["rouge2"],
        "rougeL":     rouge_result["rougeL"],
        "rougeLsum" : rouge_result["rougeLsum"],
        }

### Тест (Evaluation)

In [11]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

Using cuda_amp half precision backend


In [12]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1050
  Batch size = 32


{'eval_loss': 0.025174934417009354,
 'eval_accuracy': 0.32476190476190475,
 'eval_bleu': 32.865668478264794,
 'eval_rouge1': 0.76403104355185,
 'eval_rouge2': 0.708718414127314,
 'eval_rougeL': 0.7507727421939794,
 'eval_rougeLsum': 0.7509353595300937,
 'eval_runtime': 39.3143,
 'eval_samples_per_second': 26.708,
 'eval_steps_per_second': 0.839}

---

### Тест (для выявления ошибок)

In [11]:
test_data = Dataset.from_list(test_spider)

In [12]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)

def translate_to_sql(text):
    inputs = tokenizer(text, padding='longest', max_length=64, return_tensors='pt').to(device)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=64, )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [13]:
for i in tqdm(range(len(test_data))):
    predict = translate_to_sql(test_data[i]["input"])
    expected = test_data[i]['target']

    if predict != expected:
        print(predict + '\n' + expected)
        print('=================================\n')

  0%|          | 0/1050 [00:00<?, ?it/s]

select distinct driverid, stop from pitstops where duration > select max duration from pitstops where raceid = 841
select distinct driverid, stop from pitstops where duration > select min duration from pitstops where raceid = 841 

select t2.name, t2.year from qualifying as t1 join races as t2 on t1.raceid = t2.raceid join drivers as t3 on t1.driverid = t3.driverid where t3.surname = "le
select t2.name, t2.year from results as t1 join races as t2 on t1.raceid = t2.raceid join drivers as t3 on t1.driverid = t3.driverid where t3.forename = "lewis"

select sum t1.attendance from home_game as t1 join team as t2 on t1.team_id_attendance = t2.team_id_br where t2.name = 'boston red stockings' and t1.
select sum t1.attendance from home_game as t1 join team as t2 on t1.team_id = t2.team_id_br where t2.name = 'boston red stockings' and t1.year between 2000 and 2010;

select sum t2.order_quantity from customer_orders as t1 join order_items as t2 on t1.order_id = t2.order_id where t1.order_date  "

KeyboardInterrupt: 