In [None]:
# MOUNTING GOOGLE DRIVE
from google.colab import drive
drive.mount('/content/drive')

import os
print(os.getcwd())

wd = '/content/drive/MyDrive/CS 685/cs685_project/notebooks'
print(os.listdir(wd))
os.chdir(wd)
print(os.getcwd())

In [None]:
# !pip install tokenizers
# !pip install transformers
# !pip install sentencepiece

In [None]:
import os
import json
import torch
import argparse
from tqdm import trange
import torch.optim as optim

import transformers

from tqdm import tqdm
from tokenizers import AddedToken

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers.optimization import Adafactor
from transformers.trainer_utils import set_seed
# from utils.spider_metric.evaluator import EvaluateTool
# from utils.load_dataset import Text2SQLDataset
from load_dataset import Text2SQLDataset
# from utils.text2sql_decoding_utils import decode_sqls, decode_natsqls

In [None]:
 #'file path of test2sql training set.')
# train_filepath = "/Users/aishwarya/Downloads/spring23/cs685-NLP/project/spider/baselines/seq2seq_attention_copy/data/datasets/data_final/spider_combined_train.json"
# train_filepath = "/Users/aishwarya/Downloads/spring23/cs685-NLP/project/data/resdsql_pre/preprocessed_dataset.json" 
train_filepath = "../data/resdsql_pre/preprocessed_dataset_train.json"
batch_size = 2 #'input batch size.')


In [None]:
train_dataset = Text2SQLDataset(
        dir_ = train_filepath,
        mode = "train")

train_dataloder = DataLoader(
        train_dataset, 
        batch_size = batch_size, 
        shuffle = True,
        collate_fn = lambda x: x,
        drop_last = True
    )

In [None]:
for batch in train_dataloder:
    batch_inputs = [data[0] for data in batch]
    batch_sqls = [data[1] for data in batch]
    break

In [None]:
batch_inputs, batch_sqls

In [None]:
max_encoder_len = 43
max_decoder_len = 127

# max_encoder_len += 2
# max_decoder_len += 2

gradient_descent_step = 4 #'perform gradient descent per "gradient_descent_step" steps.')
# device = "2" #'the id of used GPU device.')
learning_rate = 3e-5 #'learning rate.')
epochs = 1 #'training epochs.')
seed = 42 #'random seed.')
save_path = "models/text2sql" #'save path of best fine-tuned text2sql model.')
tensorboard_save_path= "tb/text2sql" #'save path of tensorboard log.')
'''
pre-trained model name. 
options: 
    t5-base, https://huggingface.co/t5-base;
    t5-large, https://huggingface.co/t5-large;
    t5-3b, https://huggingface.co/t5-3b;
)'''

model_name_or_path = "t5-small" #"t5-3b",
use_adafactor = True #'whether to use adafactor optimizer.')
mode = "train" #'trian, eval or test.')
# dev_filepath = "data/preprocessed_data/resdsql_dev.json" #'file path of test2sql dev set.')
# original_dev_filepath = "data/spider/dev.json" #'file path of the original dev set (for registing evaluator).')
db_path = "database" #file path of database.')
# tables_for_natsql = "NatSQL/NatSQLv1_6/tables_for_natsql.json" #'file path of tables_for_natsql.json.')
num_beams = 8 #'beam size in model.generate() function.')
num_return_sequences = 8 #'the number of returned sequences in model.generate() function (num_return_sequences <= num_beams).')

output = "predicted_sql.txt" #"save file of the predicted sqls.")
    

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

set_seed(seed)
writer = SummaryWriter(tensorboard_save_path)

device

In [None]:
text2sql_tokenizer = T5TokenizerFast.from_pretrained(
    model_name_or_path,
    add_prefix_space = True
)

In [None]:
if isinstance(text2sql_tokenizer, T5TokenizerFast):
    text2sql_tokenizer.add_tokens([AddedToken(" <="), AddedToken(" <")])

train_dataset = Text2SQLDataset(
    dir_ = train_filepath,
    mode = "train")

train_dataloder = DataLoader(
    train_dataset, 
    batch_size = batch_size, 
    shuffle = True,
    collate_fn = lambda x: x,
    drop_last = True
)

In [None]:
print("initializing text2sql model.")
# initialize model
model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
model.resize_token_embeddings(len(text2sql_tokenizer))
if torch.cuda.is_available():
    model = model.cuda()
print("finished.")

In [None]:
# warm up steps (10% training step)
num_warmup_steps = int(0.1*epochs*len(train_dataset)/batch_size)
# total training steps
num_training_steps = int(epochs*len(train_dataset)/batch_size)
# save checkpoint
num_checkpoint_steps = 500

print("Let's use Adafactor!")
optimizer = Adafactor(
    model.parameters(), 
    lr=learning_rate, 
    scale_parameter=False, 
    relative_step=False, 
    clip_threshold = 1.0,
    warmup_init=False)

#     print("Let's use AdamW!")
#     optimizer = optim.AdamW(
#         model.parameters(), 
#         lr = learning_rate)

scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = num_warmup_steps,
    num_training_steps = num_training_steps
)

In [None]:
model.train()
train_step = 0
# initialize array of losses 
losses = {'train': {}, "val": {}}

# for epoch in range(epochs):
with trange(epochs) as tr:
    for epoch in tr:
#         print(f"This is epoch {epoch+1}.")
        batch_loss = 0
    
        for idx, batch in enumerate(train_dataloder):
            train_step += 1

            batch_inputs = [data[0] for data in batch]
            batch_sqls = [data[1] for data in batch]
    #             batch_db_ids = [data[2] for data in batch] # unused
    #             batch_tc_original = [data[3] for data in batch] # unused

    #         if epoch == 0 and idx == 0:
    #             for batch_id in range(len(batch_inputs)):
    #                 print(f"batch_inputs - {batch_inputs[batch_id]}")
    #                 print(f"batch_sqls - {batch_sqls[batch_id]}")
    # #                 print("----------------------")

            tokenized_inputs = text2sql_tokenizer(
                batch_inputs, 
                padding = "max_length",
                return_tensors = "pt",
                max_length = max_encoder_len, #512,
                truncation = True
            )

            with text2sql_tokenizer.as_target_tokenizer():
                tokenized_outputs = text2sql_tokenizer(
                    batch_sqls, 
                    padding = "max_length", 
                    return_tensors = 'pt',
                    max_length = max_decoder_len, #256,
                    truncation = True
                )

            encoder_input_ids = tokenized_inputs["input_ids"]
            encoder_input_attention_mask = tokenized_inputs["attention_mask"]

            decoder_labels = tokenized_outputs["input_ids"]
            # replace padding token id's of the labels by -100 so it's ignored by the loss
            decoder_labels[decoder_labels == text2sql_tokenizer.pad_token_id] = -100
            decoder_attention_mask = tokenized_outputs["attention_mask"]

    #         if idx == 0:
    #             print(f"tokenized_inputs - {tokenized_inputs}")
    #             print(f"tokenized_outputs - {tokenized_outputs}")
    # #             print(f"encoder_input_ids - {encoder_input_ids}")
    #             print(f"encoder_input_attention_mask - {encoder_input_attention_mask}")
    #             print(f"decoder_labels - {decoder_labels}")
    #             print(f"decoder_attention_mask - {decoder_attention_mask}")

            if torch.cuda.is_available():
                encoder_input_ids = encoder_input_ids.cuda()
                encoder_input_attention_mask = encoder_input_attention_mask.cuda()
                decoder_labels = decoder_labels.cuda()
                decoder_attention_mask = decoder_attention_mask.cuda()

            model_outputs = model(
                input_ids = encoder_input_ids,
                attention_mask = encoder_input_attention_mask,
                labels = decoder_labels,
                decoder_attention_mask = decoder_attention_mask,
                return_dict = True
            )

            loss = model_outputs["loss"]
            loss.backward()
            
            batch_loss += loss

#             if scheduler is not None:
#                 scheduler.step()

            if writer is not None:
                # record training loss (tensorboard)
                writer.add_scalar('train loss', loss.item(), train_step)
                # record learning rate (tensorboard)
                writer.add_scalar('train lr', optimizer.state_dict()['param_groups'][0]['lr'], train_step)

            if train_step % gradient_descent_step == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

            if train_step % num_checkpoint_steps == 0 and epoch >= 6:
                print(f"At {train_step} training step, save a checkpoint.")
                os.makedirs(save_path, exist_ok = True)
                model.save_pretrained(save_directory = save_path + "/checkpoint-{}".format(train_step))
                text2sql_tokenizer.save_pretrained(save_directory = save_path + "/checkpoint-{}".format(train_step))

        batch_loss /= len(train_dataloder) 
        losses['train'][epoch] = f"{batch_loss:.3f}"
        #progress bar 
        tr.set_postfix({"epoch_num":epoch,
                        "loss":f"{batch_loss:.10f}"})
    #         break


In [None]:
for token_id in [42, 363, 1, 0, 58, 1738, 3,  9,  208,  122,   41, 4668,  834, 6254,  3,   61,   45, 6407]:
    vocab_word = text2sql_tokenizer.convert_ids_to_tokens(token_id)
    print(f"{token_id} - {vocab_word}")

In [1]:
import os
import time
import torch
from text2sql_decoding_utils import decode_sqls

from tokenizers import AddedToken
from transformers import T5TokenizerFast, T5ForConditionalGeneration
# from transformers.optimization import Adafactor
from transformers.trainer_utils import set_seed

from load_dataset import Text2SQLDataset
from torch.utils.data import DataLoader

from spider_metric.evaluator import EvaluateTool

from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
def _test(mode,
          dev_filepath,
          original_dev_filepath,
          save_path,
          db_path,
          batch_size,
          num_beams, num_return_sequences,
          output,
          seed, device):
    
    set_seed(seed)

    start_time = time.time()
    
    os.environ["CUDA_VISIBLE_DEVICES"] = device

    # initialize tokenizer
    tokenizer = T5TokenizerFast.from_pretrained(
        save_path,
        add_prefix_space = True
    )
    
    if isinstance(tokenizer, T5TokenizerFast):
        tokenizer.add_tokens([AddedToken(" <="), AddedToken(" <")])
    
    dev_dataset = Text2SQLDataset(
        dir_ = dev_filepath,
        mode = mode
    )

    dev_dataloder = DataLoader(
        dev_dataset, 
        batch_size = batch_size, 
        shuffle = False,
        collate_fn = lambda x: x,
        drop_last = False
    )

    # initialize model
    model = T5ForConditionalGeneration.from_pretrained(save_path)
    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()
    predict_sqls = []
    for batch in tqdm(dev_dataloder):
        batch_inputs = [data[0] for data in batch]
        batch_db_ids = [data[1] for data in batch]
        batch_tc_original = [data[2] for data in batch]

        tokenized_inputs = tokenizer(
            batch_inputs, 
            return_tensors="pt",
            padding = "max_length",
            max_length = 512,
            truncation = True
        )
        
        encoder_input_ids = tokenized_inputs["input_ids"]
        encoder_input_attention_mask = tokenized_inputs["attention_mask"]
        if torch.cuda.is_available():
            encoder_input_ids = encoder_input_ids.cuda()
            encoder_input_attention_mask = encoder_input_attention_mask.cuda()

        with torch.no_grad():
            model_outputs = model.generate(
                input_ids = encoder_input_ids,
                attention_mask = encoder_input_attention_mask,
                max_length = 256,
                decoder_start_token_id = model.config.decoder_start_token_id,
                num_beams = num_beams,
                num_return_sequences = num_return_sequences
            )

            model_outputs = model_outputs.view(len(batch_inputs), num_return_sequences, model_outputs.shape[1])

            predict_sqls += decode_sqls(
                db_path, 
                model_outputs, 
                batch_db_ids, 
                batch_inputs, 
                tokenizer, 
                batch_tc_original
            )
            
        break

    new_dir = "/".join(output.split("/")[:-1]).strip()
    if new_dir != "":
        os.makedirs(new_dir, exist_ok = True)
    
    # save results
    with open(output, "w", encoding = 'utf-8') as f:
        for pred in predict_sqls:
            f.write(pred + "\n")
    
    end_time = time.time()
    print("Text-to-SQL inference spends {}s.".format(end_time-start_time))
    
    if mode == "eval":
        # initialize evaluator
        evaluator = EvaluateTool()
        evaluator.register_golds(original_dev_filepath, db_path)
        spider_metric_result = evaluator.evaluate(predict_sqls)
        print('exact_match score: {}'.format(spider_metric_result["exact_match"]))
        print('exec score: {}'.format(spider_metric_result["exec"]))
    
        return spider_metric_result["exact_match"], spider_metric_result["exec"]

In [6]:
_test(mode='eval',
      dev_filepath="../data/resdsql_pre/preprocessed_dataset_test.json",
      original_dev_filepath="../data/split/spider_test.json",
      db_path = "../spider_data/database",
      save_path="models/text2sql/checkpoint-6500",
      batch_size=1,
      num_beams=2,
      num_return_sequences=2,
      output = "predicted_sql.txt",
      seed=42,
      device="cpu")

  0%|                                                                                                                                                                                 | 0/696 [00:00<?, ?it/s]


Text-to-SQL inference spends 1.879492998123169s.


RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
os.getcwd()