In [1]:
# # MOUNTING GOOGLE DRIVE
# from google.colab import drive
# drive.mount('/content/drive')


In [2]:
# import os
# print(os.getcwd())
# print(os.listdir("/content/drive/MyDrive"))
# # https://drive.google.com/drive/folders/1SWEp8TLKBX7NfjxO77tEkmofNNQLr6GM?usp=share_link
# wd = '/content/drive/MyDrive/CS 685/cs685_project/notebooks'
# print(os.listdir(wd))
# os.chdir(wd)
# print(os.getcwd())

In [3]:
# !pip install tokenizers
# !pip install transformers

In [4]:
import os
import json
import torch
import transformers
import argparse
import torch.optim as optim

import torch.optim as optim

from tqdm import tqdm
from copy import deepcopy
from tokenizers import AddedToken
from utils.classifier_metric.evaluator import cls_metric, auc_metric
from torch.utils.data import DataLoader
from transformers import RobertaTokenizerFast
from utils.classifier_model import MyClassifier
from utils.classifier_loss import ClassifierLoss
from transformers.trainer_utils import set_seed
from torch.utils.tensorboard import SummaryWriter
from load_dataset import ColumnAndTableClassifierDataset
# from schema_classifier import run

torch.cuda.is_available(), torch.device("cuda" if torch.cuda.is_available() else "cpu")

ModuleNotFoundError: No module named 'sklearn'

In [None]:

def parse_option(args=None):
    """Parse the options used to fine-tune / run the schema item classifier.

    Args:
        args: optional list of argument strings to parse. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which is the behaviour of the original
            zero-argument call.

    Returns:
        argparse.Namespace holding one attribute per option declared below
        (plus ``f``, see the note on the ``-f`` option).
    """
    # The text is passed as `description`; as first positional argument it would be
    # taken as the program name (`prog`) and printed in the usage line.
    parser = argparse.ArgumentParser(
        description="command line arguments for fine-tuning schema item classifier.")

    parser.add_argument('--batch_size', type=int, default=2,
                        help='input batch size.')
    parser.add_argument('--gradient_descent_step', type=int, default=4,
                        help='perform gradient descent per "gradient_descent_step" steps.')
    parser.add_argument('--device', type=str, default="3",
                        help='the id of used GPU device.')
    parser.add_argument('--learning_rate', type=float, default=3e-5,
                        help='learning rate.')
    parser.add_argument('--gamma', type=float, default=1.0,
                        help='gamma parameter in the focal loss. Recommended: [0.0-2.0].')
    parser.add_argument('--alpha', type=float, default=1.0,
                        help='alpha parameter in the focal loss. Must be between [0.0-1.0].')
    parser.add_argument('--epochs', type=int, default=50,
                        help='training epochs.')
    parser.add_argument('--patience', type=int, default=32,
                        help='patience step in early stopping. -1 means no early stopping.')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed.')
    parser.add_argument('--save_path', type=str, default="models/schema_item_classifier",
                        help='save path of best fine-tuned model on validation set.')
    parser.add_argument('--tensorboard_save_path', type=str, default=None,
                        help='save path of tensorboard log.')
    parser.add_argument('--train_filepath', type=str, default="../data/resdsql_pre/preprocessed_dataset_train.json",
                        help='path of pre-processed training dataset.')
    parser.add_argument('--dev_filepath', type=str, default="../data/resdsql_pre/preprocessed_dataset_test.json",
                        help='path of pre-processed development dataset.')
    parser.add_argument('--output_filepath', type=str, default="data/resdsql_pre/dataset_with_pred_probs.json",
                        help='path of the output dataset (used in eval mode).')
    parser.add_argument('--model_name_or_path', type=str, default="roberta-large",
                        help='''pre-trained model name.''')
    parser.add_argument('--use_contents', action='store_true',
                        help='whether to integrate db contents into input sequence')
    parser.add_argument('--add_fk_info', action='store_true',
                        help='whether to add [FK] tokens into input sequence')
    parser.add_argument('--mode', type=str, default="train",
                        help='train, eval or test.')
    # Accept (and otherwise ignore) a "-f <path>" argument so that parsing sys.argv does not
    # fail when the process was started with one, as the notebook kernel in the recorded
    # output was (f='.../kernel-....json').
    parser.add_argument("-f", required=False)

    return parser.parse_args(args)


def prepare_batch_inputs_and_labels(batch, tokenizer):
    """Turn a batch of raw examples into encoder inputs, labels and token alignments.

    Args:
        batch: sequence of examples; each example is indexed as
            ``[0]`` question string, ``[1]`` list of table names, ``[2]`` table labels
            (one per table), ``[3]`` list (per table) of column-info strings and
            ``[4]`` column labels, flat, in the order the columns are appended to the input.
        tokenizer: callable invoked with the pre-split word lists and
            ``is_split_into_words=True``; its result must provide ``word_ids(batch_index=...)``
            and the keys ``"input_ids"`` and ``"attention_mask"``.

    Returns:
        An 8-tuple:
        ``(encoder_input_ids, encoder_input_attention_mask,
        batch_aligned_column_labels, batch_aligned_table_labels,
        batch_aligned_question_ids, batch_aligned_column_info_ids,
        batch_aligned_table_name_ids, batch_column_number_in_each_table)``.
        Labels are lists of ``torch.LongTensor`` (one per example); the ``*_ids`` entries are
        nested Python lists of token positions. Tensors are moved to CUDA when it is available.

    Schema items whose word does not appear in the tokenizer output (truncated away) are
    dropped together with their labels, and the per-table column counts are shrunk to match.
    """
    batch_size = len(batch)

    batch_input_tokens, batch_column_info_ids, batch_table_name_ids = [], [], []
    batch_column_number_in_each_table = []
    for data in batch:
        question, table_names_in_one_db, column_infos_in_one_db = data[0], data[1], data[3]

        batch_column_number_in_each_table.append(
            [len(column_infos_in_one_table) for column_infos_in_one_table in column_infos_in_one_db])

        # word sequence: question | table : col , col | table : col ...
        input_tokens = [question]
        column_info_ids, table_name_ids = [], []
        for table_id, table_name in enumerate(table_names_in_one_db):
            input_tokens.append("|")
            input_tokens.append(table_name)
            table_name_ids.append(len(input_tokens) - 1)
            input_tokens.append(":")

            for column_info in column_infos_in_one_db[table_id]:
                input_tokens.append(column_info)
                column_info_ids.append(len(input_tokens) - 1)
                input_tokens.append(",")

            # drop the separator appended last for this table
            input_tokens.pop()

        batch_input_tokens.append(input_tokens)
        batch_column_info_ids.append(column_info_ids)
        batch_table_name_ids.append(table_name_ids)

    # notice: the truncation operation will discard some tables and columns that exceed the max length
    tokenized_inputs = tokenizer(
        batch_input_tokens,
        return_tensors="pt",
        is_split_into_words=True,
        padding="max_length",
        max_length=512,
        truncation=True
    )

    def _align(word_positions, labels, token_ids_by_word):
        # Keep only the items that still own at least one token, paired with their labels.
        kept_token_ids, kept_labels = [], []
        for item_id, word_position in enumerate(word_positions):
            token_ids = token_ids_by_word.get(word_position)
            if token_ids:
                kept_token_ids.append(token_ids)
                kept_labels.append(labels[item_id])
        return kept_token_ids, kept_labels

    batch_aligned_question_ids, batch_aligned_column_info_ids, batch_aligned_table_name_ids = [], [], []
    batch_aligned_table_labels, batch_aligned_column_labels = [], []

    # align question, table-name and column-info words with their token positions
    for batch_id in range(batch_size):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_id)

        # One pass over the tokens builds word index -> token positions, instead of
        # rescanning the whole token list for every table and every column.
        token_ids_by_word = {}
        for token_id, word_id in enumerate(word_ids):
            if word_id is not None:
                token_ids_by_word.setdefault(word_id, []).append(token_id)

        # the question is always word 0
        aligned_question_ids = list(token_ids_by_word.get(0, []))
        aligned_table_name_ids, aligned_table_labels = _align(
            batch_table_name_ids[batch_id], batch[batch_id][2], token_ids_by_word)
        aligned_column_info_ids, aligned_column_labels = _align(
            batch_column_info_ids[batch_id], batch[batch_id][4], token_ids_by_word)

        batch_aligned_question_ids.append(aligned_question_ids)
        batch_aligned_table_name_ids.append(aligned_table_name_ids)
        batch_aligned_column_info_ids.append(aligned_column_info_ids)
        batch_aligned_table_labels.append(aligned_table_labels)
        batch_aligned_column_labels.append(aligned_column_labels)

    # update column number in each table (because some tables and columns are discarded)
    for batch_id in range(batch_size):
        kept_table_num = len(batch_aligned_table_labels[batch_id])
        kept_column_num = len(batch_aligned_column_labels[batch_id])

        if len(batch_column_number_in_each_table[batch_id]) > kept_table_num:
            batch_column_number_in_each_table[batch_id] = \
                batch_column_number_in_each_table[batch_id][:kept_table_num]

        # the missing columns are charged to the last kept table
        truncated_column_number = sum(batch_column_number_in_each_table[batch_id]) - kept_column_num
        if truncated_column_number > 0:
            batch_column_number_in_each_table[batch_id][-1] -= truncated_column_number

    encoder_input_ids = tokenized_inputs["input_ids"]
    encoder_input_attention_mask = tokenized_inputs["attention_mask"]
    batch_aligned_column_labels = [torch.LongTensor(column_labels) for column_labels in batch_aligned_column_labels]
    batch_aligned_table_labels = [torch.LongTensor(table_labels) for table_labels in batch_aligned_table_labels]

    if torch.cuda.is_available():
        encoder_input_ids = encoder_input_ids.cuda()
        encoder_input_attention_mask = encoder_input_attention_mask.cuda()
        batch_aligned_column_labels = [column_labels.cuda() for column_labels in batch_aligned_column_labels]
        batch_aligned_table_labels = [table_labels.cuda() for table_labels in batch_aligned_table_labels]

    return encoder_input_ids, encoder_input_attention_mask, \
        batch_aligned_column_labels, batch_aligned_table_labels, \
        batch_aligned_question_ids, batch_aligned_column_info_ids, \
        batch_aligned_table_name_ids, batch_column_number_in_each_table



In [None]:
# PREDICTION
# Runs `_test(opt)` on `opt.dev_filepath`, stores the returned probabilities on every example
# of the dev dataset, re-runs `_test` on the schema items that are still missing a prediction,
# and finally writes the annotated dataset to `opt.output_filepath`.
# NOTE(review): neither `_test` nor `opt` is defined in a cell above this one; the cell relies
# on names created elsewhere in the kernel session — confirm before Restart & Run All.
# NOTE(review): `_test` is presumably returning, per example, a list of table probabilities and
# a list (per table) of column probabilities; the code below concatenates them with Python
# lists, so they must at least be lists — confirm against its definition.

total_table_pred_probs, total_column_pred_probs = _test(opt)

with open(opt.dev_filepath, "r") as f:
    dataset = json.load(f)

# record predicted probability
# -1 is the placeholder for "no prediction returned for this table / column".
# truncated_data_info collects [example index, [table indices needing another pass]].
truncated_data_info = []
for data_id, data in enumerate(dataset):
    table_num = len(data["table_labels"])
    if table_num == len(total_table_pred_probs[data_id]):
        table_pred_probs = total_table_pred_probs[data_id]
    else:
        # fewer table predictions than tables: pad the tail with -1
        table_pred_probs = total_table_pred_probs[data_id] + [-1 for _ in range(
            table_num - len(total_table_pred_probs[data_id]))]

    truncated_table_ids = []
    column_pred_probs = []
    for table_id in range(table_num):
        # no column predictions at all for this table
        if table_id >= len(total_column_pred_probs[data_id]):
            truncated_table_ids.append(table_id)
            column_pred_probs.append([-1 for _ in range(len(data["column_labels"][table_id]))])
            continue
        if len(total_column_pred_probs[data_id][table_id]) == len(data["column_labels"][table_id]):
            column_pred_probs.append(total_column_pred_probs[data_id][table_id])
        else:
            # only some columns of this table were predicted: pad the rest with -1
            truncated_table_ids.append(table_id)
            truncated_column_num = len(data["column_labels"][table_id]) - len(
                total_column_pred_probs[data_id][table_id])
            column_pred_probs.append(
                total_column_pred_probs[data_id][table_id] + [-1 for _ in range(truncated_column_num)])

    data["column_pred_probs"] = column_pred_probs
    data["table_pred_probs"] = table_pred_probs

    if len(truncated_table_ids) > 0:
        truncated_data_info.append([data_id, truncated_table_ids])

# additionally, we need to consider and predict discarded tables and columns
# Each pass builds a reduced dataset containing only the unpredicted tables, runs `_test` on it
# and copies the results back, until no -1 placeholder is left.
# NOTE(review): if a pass cannot reduce the set of -1 placeholders (e.g. a single table that
# never gets all of its columns predicted), this loop would presumably not terminate — confirm.
while len(truncated_data_info) != 0:
    truncated_dataset = []
    for truncated_data_id, truncated_table_ids in truncated_data_info:
        print(dataset[truncated_data_id]["question"])
        # deepcopy so that slicing the per-table fields does not touch `dataset`
        truncated_data = deepcopy(dataset[truncated_data_id])
        truncated_data["db_schema"] = [truncated_data["db_schema"][table_id] for table_id in
                                        truncated_table_ids]
        truncated_data["table_labels"] = [truncated_data["table_labels"][table_id] for table_id in
                                          truncated_table_ids]
        truncated_data["column_labels"] = [truncated_data["column_labels"][table_id] for table_id in
                                            truncated_table_ids]
        truncated_data["table_pred_probs"] = [truncated_data["table_pred_probs"][table_id] for table_id in
                                              truncated_table_ids]
        truncated_data["column_pred_probs"] = [truncated_data["column_pred_probs"][table_id] for table_id in
                                                truncated_table_ids]

        truncated_dataset.append(truncated_data)

    # the reduced dataset is handed to `_test` through a temporary JSON file
    with open("./data/resdsql_pre/truncated_dataset.json", "w") as f:
        f.write(json.dumps(truncated_dataset, indent=2))

    # note: opt.dev_filepath is overwritten here and is not restored afterwards
    opt.dev_filepath = "./data/resdsql_pre/truncated_dataset.json"
    total_table_pred_probs, total_column_pred_probs = _test(opt)

    for data_id, data in enumerate(truncated_dataset):
        table_num = len(data["table_labels"])
        if table_num == len(total_table_pred_probs[data_id]):
            table_pred_probs = total_table_pred_probs[data_id]
        else:
            table_pred_probs = total_table_pred_probs[data_id] + [-1 for _ in range(
                table_num - len(total_table_pred_probs[data_id]))]

        column_pred_probs = []
        for table_id in range(table_num):
            if table_id >= len(total_column_pred_probs[data_id]):
                column_pred_probs.append([-1 for _ in range(len(data["column_labels"][table_id]))])
                continue
            if len(total_column_pred_probs[data_id][table_id]) == len(data["column_labels"][table_id]):
                column_pred_probs.append(total_column_pred_probs[data_id][table_id])
            else:
                truncated_column_num = len(data["column_labels"][table_id]) - len(
                    total_column_pred_probs[data_id][table_id])
                column_pred_probs.append(
                    total_column_pred_probs[data_id][table_id] + [-1 for _ in range(truncated_column_num)])

        # fill the predicted probability into the dataset
        # truncated_dataset[i] was built from truncated_data_info[i], so the same index maps the
        # reduced example back to its original example and original table indices.
        truncated_data_id = truncated_data_info[data_id][0]
        truncated_table_ids = truncated_data_info[data_id][1]
        for idx, truncated_table_id in enumerate(truncated_table_ids):
            dataset[truncated_data_id]["table_pred_probs"][truncated_table_id] = table_pred_probs[idx]
            dataset[truncated_data_id]["column_pred_probs"][truncated_table_id] = column_pred_probs[idx]

    # check if there are tables and columns in the new dataset that have not yet been predicted
    truncated_data_info = []
    for data_id, data in enumerate(dataset):
        table_num = len(data["table_labels"])

        truncated_table_ids = []
        for table_id in range(table_num):
            # the current table is not predicted
            if data["table_pred_probs"][table_id] == -1:
                truncated_table_ids.append(table_id)
            # some columns in the current table are not predicted
            # (the `!= -1` guard keeps a table from being appended twice)
            if data["table_pred_probs"][table_id] != -1 and -1 in data["column_pred_probs"][table_id]:
                truncated_table_ids.append(table_id)

        if len(truncated_table_ids) > 0:
            truncated_data_info.append([data_id, truncated_table_ids])

    # the temporary file is removed at the end of every pass (not if the pass raised)
    os.remove("./data/resdsql_pre/truncated_dataset.json")

with open(opt.output_filepath, "w") as f:
    f.write(json.dumps(dataset, indent=2))


In [6]:
def _train(opt):
    """Fine-tune the schema item classifier with the options in ``opt``.

    Args:
        opt: namespace of options; this function reads ``seed``, ``patience``,
            ``tensorboard_save_path``, ``device``, ``model_name_or_path``, ``train_filepath``,
            ``use_contents``, ``add_fk_info``, ``batch_size``, ``mode``, ``epochs``,
            ``learning_rate``, ``alpha``, ``gamma`` and ``gradient_descent_step``.

    Returns:
        None. Progress is printed, and loss / learning rate are logged to TensorBoard when
        ``opt.tensorboard_save_path`` is set.

    NOTE(review): ``best_score`` and ``early_stop_step`` are never updated in this function, so
    the early-stopping checks cannot trigger and the final message always reports 0. No
    evaluation is run and no model weights are written (``opt.save_path`` is not used here).
    """
    print(opt)
    set_seed(opt.seed)

    # a non-positive patience disables early stopping
    patience = opt.patience if opt.patience > 0 else float('inf')

    os.environ["CUDA_VISIBLE_DEVICES"] = opt.device
    print(opt.device)

    tokenizer = RobertaTokenizerFast.from_pretrained(
        opt.model_name_or_path,
        add_prefix_space=True
    )
    tokenizer.add_tokens(AddedToken("[FK]"))

    train_dataset = ColumnAndTableClassifierDataset(
        dir_=opt.train_filepath,
        use_contents=opt.use_contents,
        add_fk_info=opt.add_fk_info
    )

    # collate_fn is the identity: batches stay lists of raw examples and are tokenized
    # later by prepare_batch_inputs_and_labels
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        collate_fn=lambda x: x
    )

    # initialize model (vocab_size accounts for the added "[FK]" token)
    model = MyClassifier(
        model_name_or_path=opt.model_name_or_path,
        vocab_size=len(tokenizer),
        mode=opt.mode
    )

    if torch.cuda.is_available():
        print("Model is running on cuda")
        model = model.cuda()

    # warm up steps (10% training step)
    num_warmup_steps = int(0.1 * opt.epochs * len(train_dataset) / opt.batch_size)
    # total training steps (counted in batches, not in optimizer updates)
    num_training_steps = int(opt.epochs * len(train_dataset) / opt.batch_size)

    optimizer = optim.AdamW(
        params=model.parameters(),
        lr=opt.learning_rate
    )

    scheduler = transformers.get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_score, early_stop_step, train_step = 0, 0, 0
    encoder_loss_func = ClassifierLoss(alpha=opt.alpha, gamma=opt.gamma)

    # Created right before the loop and closed in `finally`, so pending events are flushed
    # even when training is interrupted (e.g. KeyboardInterrupt) or raises.
    writer = SummaryWriter(opt.tensorboard_save_path) if opt.tensorboard_save_path is not None else None

    try:
        for epoch in range(opt.epochs):
            print(f"This is epoch {epoch + 1}.")
            for batch in tqdm(train_dataloader):
                model.train()
                train_step += 1

                encoder_input_ids, encoder_input_attention_mask, \
                    batch_column_labels, batch_table_labels, batch_aligned_question_ids, \
                    batch_aligned_column_info_ids, batch_aligned_table_name_ids, \
                    batch_column_number_in_each_table = prepare_batch_inputs_and_labels(batch, tokenizer)

                model_outputs = model(
                    encoder_input_ids,
                    encoder_input_attention_mask,
                    batch_aligned_question_ids,
                    batch_aligned_column_info_ids,
                    batch_aligned_table_name_ids,
                    batch_column_number_in_each_table
                )

                loss = encoder_loss_func.compute_loss(
                    model_outputs["batch_table_name_cls_logits"],
                    batch_table_labels,
                    model_outputs["batch_column_info_cls_logits"],
                    batch_column_labels
                )

                # gradients accumulate across batches until the optimizer step below
                loss.backward()

                # update lr
                # NOTE(review): the scheduler is stepped on every batch, before (and more often
                # than) optimizer.step(); kept as-is to preserve the LR schedule — confirm intended.
                if scheduler is not None:
                    scheduler.step()

                if writer is not None:
                    # record training loss (tensorboard)
                    writer.add_scalar('train loss', loss.item(), train_step)
                    # record learning rate (tensorboard)
                    writer.add_scalar('train lr', optimizer.state_dict()['param_groups'][0]['lr'], train_step)

                # apply the accumulated gradients every `gradient_descent_step` batches
                if train_step % opt.gradient_descent_step == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    optimizer.zero_grad()

                if early_stop_step >= patience:
                    break

            if early_stop_step >= patience:
                print("Classifier training process triggers early stopping.")
                break
    finally:
        if writer is not None:
            writer.close()

    print("best auc score:", best_score)



In [7]:
# Parse the options (defaults, plus the kernel's own "-f" argument) and start training.
# `opt` is kept as a notebook-level name because other cells read it.
opt = parse_option()
_train(opt)

Namespace(batch_size=2, gradient_descent_step=4, device='3', learning_rate=3e-05, gamma=1.0, alpha=1.0, epochs=50, patience=32, seed=42, save_path='models/schema_item_classifier', tensorboard_save_path=None, train_filepath='../data/resdsql_pre/preprocessed_dataset_train.json', dev_filepath='../data/resdsql_pre/preprocessed_dataset_test.json', output_filepath='data/resdsql_pre/dataset_with_pred_probs.json', model_name_or_path='roberta-large', use_contents=False, add_fk_info=False, mode='train', f='/root/.local/share/jupyter/runtime/kernel-8b44a98d-b4e0-4a32-9462-cb2c5825b46a.json')
3


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model is running on cuda
This is epoch 1.


  2%|▏         | 74/3152 [00:57<39:44,  1.29it/s]


KeyboardInterrupt: ignored

In [None]:
# Script-style entry point: parse the options and dispatch on `opt.mode`.
# "train" calls `_train`; "eval" / "test" run `_test`, attach the predicted probabilities to the
# dev dataset, re-predict schema items that are still missing a prediction and write the result
# to `opt.output_filepath`. The eval/test branch repeats the statements of the `# PREDICTION`
# cell earlier in this notebook.
# NOTE(review): `_test` is not defined in any cell of this notebook as shown; the eval/test
# branch relies on it being defined elsewhere — confirm.
# assert torch.cuda.is_available()
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# if __name__ == "__main__":
opt = parse_option()
if opt.mode == "train":
    _train(opt)
elif opt.mode in ["eval", "test"]:
    total_table_pred_probs, total_column_pred_probs = _test(opt)

    with open(opt.dev_filepath, "r") as f:
        dataset = json.load(f)

    # record predicted probability
    # -1 is the placeholder for "no prediction returned for this table / column".
    # truncated_data_info collects [example index, [table indices needing another pass]].
    truncated_data_info = []
    for data_id, data in enumerate(dataset):
        table_num = len(data["table_labels"])
        if table_num == len(total_table_pred_probs[data_id]):
            table_pred_probs = total_table_pred_probs[data_id]
        else:
            # fewer table predictions than tables: pad the tail with -1
            table_pred_probs = total_table_pred_probs[data_id] + [-1 for _ in range(
                table_num - len(total_table_pred_probs[data_id]))]

        truncated_table_ids = []
        column_pred_probs = []
        for table_id in range(table_num):
            # no column predictions at all for this table
            if table_id >= len(total_column_pred_probs[data_id]):
                truncated_table_ids.append(table_id)
                column_pred_probs.append([-1 for _ in range(len(data["column_labels"][table_id]))])
                continue
            if len(total_column_pred_probs[data_id][table_id]) == len(data["column_labels"][table_id]):
                column_pred_probs.append(total_column_pred_probs[data_id][table_id])
            else:
                # only some columns of this table were predicted: pad the rest with -1
                truncated_table_ids.append(table_id)
                truncated_column_num = len(data["column_labels"][table_id]) - len(
                    total_column_pred_probs[data_id][table_id])
                column_pred_probs.append(
                    total_column_pred_probs[data_id][table_id] + [-1 for _ in range(truncated_column_num)])

        data["column_pred_probs"] = column_pred_probs
        data["table_pred_probs"] = table_pred_probs

        if len(truncated_table_ids) > 0:
            truncated_data_info.append([data_id, truncated_table_ids])

    # additionally, we need to consider and predict discarded tables and columns
    # Each pass builds a reduced dataset containing only the unpredicted tables, runs `_test`
    # on it and copies the results back, until no -1 placeholder is left.
    # NOTE(review): if a pass cannot reduce the set of -1 placeholders, this loop would
    # presumably not terminate — confirm.
    while len(truncated_data_info) != 0:
        truncated_dataset = []
        for truncated_data_id, truncated_table_ids in truncated_data_info:
            print(dataset[truncated_data_id]["question"])
            # deepcopy so that slicing the per-table fields does not touch `dataset`
            truncated_data = deepcopy(dataset[truncated_data_id])
            truncated_data["db_schema"] = [truncated_data["db_schema"][table_id] for table_id in
                                            truncated_table_ids]
            truncated_data["table_labels"] = [truncated_data["table_labels"][table_id] for table_id in
                                              truncated_table_ids]
            truncated_data["column_labels"] = [truncated_data["column_labels"][table_id] for table_id in
                                                truncated_table_ids]
            truncated_data["table_pred_probs"] = [truncated_data["table_pred_probs"][table_id] for table_id in
                                                  truncated_table_ids]
            truncated_data["column_pred_probs"] = [truncated_data["column_pred_probs"][table_id] for table_id in
                                                    truncated_table_ids]

            truncated_dataset.append(truncated_data)

        # the reduced dataset is handed to `_test` through a temporary JSON file
        with open("./data/resdsql_pre/truncated_dataset.json", "w") as f:
            f.write(json.dumps(truncated_dataset, indent=2))

        # note: opt.dev_filepath is overwritten here and is not restored afterwards
        opt.dev_filepath = "./data/resdsql_pre/truncated_dataset.json"
        total_table_pred_probs, total_column_pred_probs = _test(opt)

        for data_id, data in enumerate(truncated_dataset):
            table_num = len(data["table_labels"])
            if table_num == len(total_table_pred_probs[data_id]):
                table_pred_probs = total_table_pred_probs[data_id]
            else:
                table_pred_probs = total_table_pred_probs[data_id] + [-1 for _ in range(
                    table_num - len(total_table_pred_probs[data_id]))]

            column_pred_probs = []
            for table_id in range(table_num):
                if table_id >= len(total_column_pred_probs[data_id]):
                    column_pred_probs.append([-1 for _ in range(len(data["column_labels"][table_id]))])
                    continue
                if len(total_column_pred_probs[data_id][table_id]) == len(data["column_labels"][table_id]):
                    column_pred_probs.append(total_column_pred_probs[data_id][table_id])
                else:
                    truncated_column_num = len(data["column_labels"][table_id]) - len(
                        total_column_pred_probs[data_id][table_id])
                    column_pred_probs.append(
                        total_column_pred_probs[data_id][table_id] + [-1 for _ in range(truncated_column_num)])

            # fill the predicted probability into the dataset
            # truncated_dataset[i] was built from truncated_data_info[i], so the same index maps
            # the reduced example back to its original example and original table indices.
            truncated_data_id = truncated_data_info[data_id][0]
            truncated_table_ids = truncated_data_info[data_id][1]
            for idx, truncated_table_id in enumerate(truncated_table_ids):
                dataset[truncated_data_id]["table_pred_probs"][truncated_table_id] = table_pred_probs[idx]
                dataset[truncated_data_id]["column_pred_probs"][truncated_table_id] = column_pred_probs[idx]

        # check if there are tables and columns in the new dataset that have not yet been predicted
        truncated_data_info = []
        for data_id, data in enumerate(dataset):
            table_num = len(data["table_labels"])

            truncated_table_ids = []
            for table_id in range(table_num):
                # the current table is not predicted
                if data["table_pred_probs"][table_id] == -1:
                    truncated_table_ids.append(table_id)
                # some columns in the current table are not predicted
                # (the `!= -1` guard keeps a table from being appended twice)
                if data["table_pred_probs"][table_id] != -1 and -1 in data["column_pred_probs"][table_id]:
                    truncated_table_ids.append(table_id)

            if len(truncated_table_ids) > 0:
                truncated_data_info.append([data_id, truncated_table_ids])

        # the temporary file is removed at the end of every pass (not if the pass raised)
        os.remove("./data/resdsql_pre/truncated_dataset.json")

    with open(opt.output_filepath, "w") as f:
        f.write(json.dumps(dataset, indent=2))


Namespace(batch_size=2, gradient_descent_step=4, device='3', learning_rate=3e-05, gamma=1.0, alpha=1.0, epochs=50, patience=32, seed=42, save_path='models/schema_item_classifier', tensorboard_save_path=None, train_filepath='data/resdsql_pre/preprocessed_dataset_train.json', dev_filepath='data/resdsql_pre/preprocessed_dataset_test.json', output_filepath='data/resdsql_pre/dataset_with_pred_probs.json', model_name_or_path='roberta-large', use_contents=False, add_fk_info=False, mode='train', f='/root/.local/share/jupyter/runtime/kernel-f12ce7a3-ce04-43e2-b6e7-cb4f46f8d7a7.json')
3


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


This is epoch 1.




This is epoch 2.
At 4502 training step, start an evaluation.


ValueError: ignored