In [1]:
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertConfig, BertPreTrainedModel, BertTokenizer
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from model_gcn import GAT, GCN, Rel_GAT
from model_utils import LinearAttention, DotprodAttention, RelationAttention, Highway, mask_logits
from tree import *
# used reference from : https://github.com/shenwzh3/RGAT-ABSA
# inspired from https://aclanthology.org/2020.acl-main.295.pdf
import os

from transformers import AdamW
from transformers import BertTokenizer

import random

from tensorboardX import SummaryWriter
import logging
logger = logging.getLogger(__name__)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from sklearn.metrics import f1_score, matthews_corrcoef
'''
Trying to add RGAT layers in initial graphix based approach

Used some parts from natsql repo
'''

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)



In [None]:
class R_GATLayer(nn.Module):
    def __init__(self, in_dim, out_dim, num_relations, num_heads=1):
        super(R_GATLayer, self).__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.num_relations = num_relations
        self.num_heads = num_heads

        self.W = nn.Parameter(torch.Tensor(num_heads, in_dim, out_dim))
        nn.init.xavier_uniform_(self.W)

        self.a = nn.Parameter(torch.Tensor(num_heads, 2*out_dim, 1))
        nn.init.xavier_uniform_(self.a)

        self.relations = nn.Parameter(torch.Tensor(num_relations, out_dim, out_dim))
        nn.init.xavier_uniform_(self.relations)

    def forward(self, x, edge_lists):
        h = torch.matmul(x, self.W)

        heads = []
        for i in range(self.num_heads):
            attention_input = torch.cat([h[edge_lists[:, 0]], h[edge_lists[:, 1]], self.relations[edge_lists[:, 2]]], dim=-1)
            attention_logits = torch.matmul(attention_input, self.a[i])
            attention_weights = F.softmax(attention_logits, dim=0)
            head = torch.sum(attention_weights * h[edge_lists[:, 1]], dim=0)
            heads.append(head)

        output = torch.mean(torch.stack(heads), dim=0)

        return output


In [None]:
class R_GAT(nn.Module):
    def __init__(self, in_dim, out_dim, num_relations, num_heads=1, num_layers=1):
        super(R_GAT, self).__init__()
        self.num_layers = num_layers

        self.layers = nn.ModuleList()
        for i in range(num_layers):
            if i == 0:
                self.layers.append(R_GATLayer(in_dim, out_dim, num_relations, num_heads))
            else:
                self.layers.append(R_GATLayer(out_dim, out_dim, num_relations, num_heads))

    def forward(self, x, edge_lists):
        for i in range(self.num_layers):
            x = self.layers[i](x, edge_lists)

        return x

In [2]:
class Aspect_Text_GAT_only(nn.Module):
    """
    reshape tree in GAT only
    """
    def __init__(self, num_embeddings, embed_dim,dropout, highway, num_layers, embedding_dim, hidden_size, gat_attention_type, final_hidden_size, num_mlps, num_heads, num_classes ):
        super(Aspect_Text_GAT_only, self).__init__()

        self.embed = nn.Embedding(num_embeddings, embed_dim)
        # self.embed.weight = nn.Parameter(args.glove_embedding, requires_grad=False)

        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()

        if highway:
            self.highway = Highway(num_layers, embedding_dim)

        self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size,
                                  bidirectional=True, batch_first=True, num_layers=num_layers)
        gcn_input_dim = hidden_size * 2

        # if args.gat:
        if gat_attention_type == 'linear':
            self.gat = [LinearAttention(in_dim = gcn_input_dim, mem_dim = gcn_input_dim).to(args.device) for i in range(args.num_heads)] # we prefer to keep the dimension unchanged
        elif gat_attention_type == 'dotprod':
            self.gat = [DotprodAttention().to(device) for i in range(num_heads)]
        else:
            # reshaped gcn
            self.gat = nn.Linear(gcn_input_dim, gcn_input_dim)


        last_hidden_size = hidden_size * 2

        layers = [
            nn.Linear(last_hidden_size, final_hidden_size), nn.ReLU()]
        for _ in range(num_mlps-1):
            layers += [nn.Linear(final_hidden_size,
                                 final_hidden_size), nn.ReLU()]
        self.fcs = nn.Sequential(*layers)
        self.fc_final = nn.Linear(final_hidden_size, num_classes)

    def forward(self, sentence, aspect, pos_class, dep_tags, text_len, aspect_len, dep_rels, dep_heads, aspect_position, dep_dirs):
        '''
        Forward takes:
            sentence: sentence_id of size (batch_size, text_length)
            aspect: aspect_id of size (batch_size, aspect_length)
            pos_class: pos_tag_id of size (batch_size, text_length)
            dep_tags: dep_tag_id of size (batch_size, text_length)
            text_len: (batch_size,) length of each sentence
            aspect_len: (batch_size, ) aspect length of each sentence
            dep_rels: (batch_size, text_length) relation
            dep_heads: (batch_size, text_length) which node adjacent to that node
            aspect_position: (batch_size, text_length) mask, with the position of aspect as 1 and others as 0
            dep_dirs: (batch_size, text_length) the directions each node to the aspect
        '''
        fmask = (torch.zeros_like(sentence) != sentence).float()  # (N，L)
        dmask = (torch.zeros_like(dep_tags) != dep_tags).float()  # (N ,L)

        feature = self.embed(sentence)  # (N, L, D)
        aspect_feature = self.embed(aspect) # (N, L', D)
        feature = self.dropout(feature)
        aspect_feature = self.dropout(aspect_feature)


        if self.args.highway:
            feature = self.highway(feature)
            aspect_feature = self.highway(aspect_feature)

        feature, _ = self.bilstm(feature) # (N,L,D)
        aspect_feature, _ = self.bilstm(aspect_feature) #(N,L,D)

        aspect_feature = aspect_feature.mean(dim = 1) # (N, D)

        ############################################################################################

        if self.args.gat_attention_type == 'gcn':
            gat_out = self.gat(feature) # (N, L, D)
            fmask = fmask.unsqueeze(2)
            gat_out = gat_out * fmask
            gat_out = F.relu(torch.sum(gat_out, dim = 1)) # (N, D)

        else:
            gat_out = [g(feature, aspect_feature, fmask).unsqueeze(1) for g in self.gat]
            gat_out = torch.cat(gat_out, dim=1)
            gat_out = gat_out.mean(dim=1)

        feature_out = gat_out # (N, D')
        # feature_out = gat_out
        #############################################################################################
        x = self.dropout(feature_out)
        x = self.fcs(x)
        logit = self.fc_final(x)
        return logit


In [3]:
# get dataset
from load_dataset import Text2SQLDataset
from torch.utils.data import DataLoader

train_filepath = "../data/resdsql_pre/preprocessed_dataset_test.json"
batch_size = 2 #'input batch size.')

train_dataset = Text2SQLDataset(
        dir_ = train_filepath,
        mode = "train")

train_dataloder = DataLoader(
        train_dataset,
        batch_size = batch_size,
        shuffle = True,
        collate_fn = lambda x: x,
        drop_last = True
    )


# def load_datasets_and_vocabs(args):
#     train, test = get_dataset(args.dataset_name)
#
#     # Our model takes unrolled data, currently we don't consider the MAMS cases(future experiments)
#     _, train_all_unrolled, _, _ = get_rolled_and_unrolled_data(train, args)
#     _, test_all_unrolled, _, _ = get_rolled_and_unrolled_data(test, args)
#
#     logger.info('****** After unrolling ******')
#     logger.info('Train set size: %s', len(train_all_unrolled))
#     logger.info('Test set size: %s,', len(test_all_unrolled))
#
#     # Build word vocabulary(part of speech, dep_tag) and save pickles.
#     word_vecs, word_vocab, dep_tag_vocab, pos_tag_vocab = load_and_cache_vocabs(
#         train_all_unrolled+test_all_unrolled, args)
#     if args.embedding_type == 'glove':
#         embedding = torch.from_numpy(np.asarray(word_vecs, dtype=np.float32))
#         args.glove_embedding = embedding
#
#     train_dataset = ASBA_Depparsed_Dataset(
#         train_all_unrolled, args, word_vocab, dep_tag_vocab, pos_tag_vocab)
#     test_dataset = ASBA_Depparsed_Dataset(
#         test_all_unrolled, args, word_vocab, dep_tag_vocab, pos_tag_vocab)
#
#     return train_dataset, test_dataset, word_vocab, dep_tag_vocab, pos_tag_vocab



In [None]:


def get_bert_optimizer(args, model):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate, eps=args.adam_epsilon)
    # scheduler = WarmupLinearSchedule(
    #     optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    return optimizer

In [4]:
from transformers import T5TokenizerFast


def train_2( model, num_epochs, train_dataloder, optimizer, scheduler ) :
    # Train the model
    # Load the data and tokenizer
    # tokenizer = T5Tokenizer.from_pretrained('t5-small')
    text2sql_tokenizer = T5TokenizerFast.from_pretrained(
        't5-small',
        add_prefix_space = True
    )
    model.train()
    for epoch in range(num_epochs):
        for idx, batch in enumerate(train_dataloder):

            batch_inputs = [data[0] for data in batch]
            batch_sqls = [data[1] for data in batch]

            if epoch == 0 and idx == 0:
                for batch_id in range(len(batch_inputs)):
                    print(f"batch_inputs - {batch_inputs[batch_id]}")
                    print(f"batch_sqls - {batch_sqls[batch_id]}")
    #                 print("----------------------")

            tokenized_inputs = text2sql_tokenizer(
                batch_inputs,
                padding = "max_length",
                return_tensors = "pt",
                max_length = 512, #512, max_encoder_len
                truncation = True
            )

            with text2sql_tokenizer.as_target_tokenizer():
                tokenized_outputs = text2sql_tokenizer(
                    batch_sqls,
                    padding = "max_length",
                    return_tensors = 'pt',
                    max_length = 256, #256, max_decoder_len
                    truncation = True
                )

            encoder_input_ids = tokenized_inputs["input_ids"].to(device)
            encoder_input_attention_mask = tokenized_inputs["attention_mask"].to(device)

            decoder_input_ids = tokenized_outputs["input_ids"].to(device)
            decoder_attention_mask = tokenized_outputs["attention_mask"].to(device)
            labels = None #tokenized_outputs["attention_mask"].to(device)

            optimizer.zero_grad()
            loss = model(input_ids=encoder_input_ids,
                         attention_mask=encoder_input_attention_mask,
                         decoder_input_ids=decoder_input_ids,
                         decoder_attention_mask=decoder_attention_mask,
                         labels=labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            break

    # Save the trained model
    torch.save(model.state_dict(), 'RGAT_model.pt')



In [None]:
# from sklearn.metrics import f1_score, matthews_corrcoef
#
# def train(args, train_dataset, model, test_dataset):
#     '''Train the model'''
#     tb_writer = SummaryWriter()
#
#     args.train_batch_size = args.per_gpu_train_batch_size
#     train_sampler = RandomSampler(train_dataset)
#
#     train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
#                                   batch_size=args.train_batch_size)
#
#     if args.max_steps > 0:
#         t_total = args.max_steps
#         args.num_train_epochs = args.max_steps // (
#             len(train_dataloader) // args.gradient_accumulation_steps) + 1
#     else:
#         t_total = len(
#             train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
#
#
#     if args.embedding_type == 'bert':
#         optimizer = get_bert_optimizer(args, model)
#     else:
#         parameters = filter(lambda param: param.requires_grad, model.parameters())
#         optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)
#
#     # Train
#     logger.info("***** Running training *****")
#     logger.info("  Num examples = %d", len(train_dataset))
#     logger.info("  Num Epochs = %d", args.num_train_epochs)
#     logger.info("  Instantaneous batch size per GPU = %d",
#                 args.per_gpu_train_batch_size)
#     logger.info("  Gradient Accumulation steps = %d",
#                 args.gradient_accumulation_steps)
#     logger.info("  Total optimization steps = %d", t_total)
#
#
#     global_step = 0
#     tr_loss, logging_loss = 0.0, 0.0
#     all_eval_results = []
#     model.zero_grad()
#     train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
#     set_seed(args)
#     for _ in train_iterator:
#         # epoch_iterator = tqdm(train_dataloader, desc='Iteration')
#         for step, batch in enumerate(train_dataloader):
#             model.train()
#             batch = tuple(t.to(args.device) for t in batch)
#
#             inputs, labels = get_input_from_batch(args, batch)
#             logit = model(**inputs)
#             loss = F.cross_entropy(logit, labels)
#
#             if args.gradient_accumulation_steps > 1:
#                 loss = loss / args.gradient_accumulation_steps
#
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(
#                 model.parameters(), args.max_grad_norm)
#
#             tr_loss += loss.item()
#             if (step + 1) % args.gradient_accumulation_steps == 0:
#                 # scheduler.step()  # Update learning rate schedule
#                 optimizer.step()
#                 model.zero_grad()
#                 global_step += 1
#
#                 # Log metrics
#                 if args.logging_steps > 0 and global_step % args.logging_steps == 0:
#                     results, eval_loss = evaluate(args, test_dataset, model)
#                     all_eval_results.append(results)
#                     for key, value in results.items():
#                         tb_writer.add_scalar(
#                             'eval_{}'.format(key), value, global_step)
#                     tb_writer.add_scalar('eval_loss', eval_loss, global_step)
#                     # tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
#                     tb_writer.add_scalar(
#                         'train_loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
#                     logging_loss = tr_loss
#
#                 # Save model checkpoint
#
#             if args.max_steps > 0 and global_step > args.max_steps:
#                 # epoch_iterator.close()
#                 break
#         if args.max_steps > 0 and global_step > args.max_steps:
#             # epoch_iterator.close()
#             break
#
#     tb_writer.close()
#     return global_step, tr_loss/global_step, all_eval_results
#
#
#
# # def evaluate(args, eval_dataset, model):
# #     results = {}
# #
# #     args.eval_batch_size = args.per_gpu_eval_batch_size
# #     eval_sampler = SequentialSampler(eval_dataset)
# #     # collate_fn = get_collate_fn(args)
# #     eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
# #                                  batch_size=args.eval_batch_size) #,
# #                                  # collate_fn=collate_fn)
# #
# #     # Eval
# #     logger.info("***** Running evaluation *****")
# #     logger.info("  Num examples = %d", len(eval_dataset))
# #     logger.info("  Batch size = %d", args.eval_batch_size)
# #     eval_loss = 0.0
# #     nb_eval_steps = 0
# #     preds = None
# #     out_label_ids = None
# #     for batch in eval_dataloader:
# #     # for batch in tqdm(eval_dataloader, desc='Evaluating'):
# #         model.eval()
# #         batch = tuple(t.to(args.device) for t in batch)
# #         with torch.no_grad():
# #             inputs, labels = get_input_from_batch(args, batch)
# #
# #             logits = model(**inputs)
# #             tmp_eval_loss = F.cross_entropy(logits, labels)
# #
# #             eval_loss += tmp_eval_loss.mean().item()
# #         nb_eval_steps += 1
# #         if preds is None:
# #             preds = logits.detach().cpu().numpy()
# #             out_label_ids = labels.detach().cpu().numpy()
# #         else:
# #             preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
# #             out_label_ids = np.append(
# #                 out_label_ids, labels.detach().cpu().numpy(), axis=0)
# #
# #     eval_loss = eval_loss / nb_eval_steps
# #     preds = np.argmax(preds, axis=1)
# #     # print(preds)
# #     result = compute_metrics(preds, out_label_ids)
# #     results.update(result)
# #
# #     output_eval_file = os.path.join(args.output_dir, 'eval_RGAT_results.txt')
# #     with open(output_eval_file, 'a+') as writer:
# #         logger.info('***** Eval results *****')
# #         logger.info("  eval loss: %s", str(eval_loss))
# #         for key in sorted(result.keys()):
# #             logger.info("  %s = %s", key, str(result[key]))
# #             writer.write("  %s = %s\n" % (key, str(result[key])))
# #             writer.write('\n')
# #         writer.write('\n')
# #     return results, eval_loss
#
#
# def simple_accuracy(preds, labels):
#     return (preds == labels).mean()
#
#
# def acc_and_f1(preds, labels):
#     acc = simple_accuracy(preds, labels)
#     f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
#     return {
#         "acc": acc,
#         "f1": f1
#     }
#
# def compute_metrics(preds, labels):
#     return acc_and_f1(preds, labels)

In [20]:
import argparse


def parse_args():
    parser = argparse.ArgumentParser()

    # Required parameters
    # parser.add_argument('--dataset_name', type=str, default='rest',
    #                     choices=['rest', 'laptop', 'twitter'],
    #                     help='Choose absa dataset.')
    # parser.add_argument('--output_dir', type=str, default='/data1/SHENWZH/ABSA_online/data/output-gcn',
    #                     help='Directory to store intermedia data, such as vocab, embeddings, tags_vocab.')
    # parser.add_argument('--num_classes', type=int, default=3,
    #                     help='Number of classes of ABSA.')
    #
    #
    # parser.add_argument('--cuda_id', type=str, default='3',
    #                     help='Choose which GPUs to run')
    # parser.add_argument('--seed', type=int, default=2019,
    #                     help='random seed for initialization')

    # Model parameters
    parser.add_argument('--glove_dir', type=str, default='/data1/SHENWZH/wordvec',
                        help='Directory storing glove embeddings')
    parser.add_argument('--bert_model_dir', type=str, default='/data1/SHENWZH/models/bert_base',
                        help='Path to pre-trained Bert model.')
    parser.add_argument('--pure_bert', action='store_true',
                        help='Cat text and aspect, [cls] to predict.')
    parser.add_argument('--gat_bert', action='store_true',
                        help='Cat text and aspect, [cls] to predict.')

    parser.add_argument('--highway', action='store_true',
                        help='Use highway embed.')

    parser.add_argument('--num_layers', type=int, default=2,
                        help='Number of layers of bilstm or highway or elmo.')


    parser.add_argument('--add_non_connect',  type= bool, default=True,
                        help='Add a sepcial "non-connect" relation for aspect with no direct connection.')
    parser.add_argument('--multi_hop',  type= bool, default=True,
                        help='Multi hop non connection.')
    parser.add_argument('--max_hop', type = int, default=4,
                        help='max number of hops')


    parser.add_argument('--num_heads', type=int, default=6,
                        help='Number of heads for gat.')

    parser.add_argument('--dropout', type=float, default=0,
                        help='Dropout rate for embedding.')


    parser.add_argument('--num_gcn_layers', type=int, default=1,
                        help='Number of GCN layers.')
    parser.add_argument('--gcn_mem_dim', type=int, default=300,
                        help='Dimension of the W in GCN.')
    parser.add_argument('--gcn_dropout', type=float, default=0.2,
                        help='Dropout rate for GCN.')
    # GAT
    parser.add_argument('--gat', action='store_true',
                        help='GAT')
    parser.add_argument('--gat_our', action='store_true',
                        help='GAT_our')
    parser.add_argument('--gat_attention_type', type = str, choices=['linear','dotprod','gcn'], default='dotprod',
                        help='The attention used for gat')

    parser.add_argument('--embedding_type', type=str,default='glove', choices=['glove','bert'])
    parser.add_argument('--embedding_dim', type=int, default=300,
                        help='Dimension of glove embeddings')
    parser.add_argument('--dep_relation_embed_dim', type=int, default=300,
                        help='Dimension for dependency relation embeddings.')

    parser.add_argument('--hidden_size', type=int, default=300,
                        help='Hidden size of bilstm, in early stage.')
    parser.add_argument('--final_hidden_size', type=int, default=300,
                        help='Hidden size of bilstm, in early stage.')
    parser.add_argument('--num_mlps', type=int, default=2,
                        help='Number of mlps in the last of model.')

    # Training parameters
    parser.add_argument("--per_gpu_train_batch_size", default=16, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=2,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=1e-3, type=float,
                        help="The initial learning rate for Adam.")

    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")

    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=30.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps(that update the weights) to perform. Override num_train_epochs.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")

    return parser.parse_args()


def check_args(args):
    '''
    eliminate confilct situations

    '''
    logger.info(vars(args))

In [21]:


# Parse args
# args = parse_args()
# check_args(args)

# Bert, load pretrained model and tokenizer, check if neccesary to put bert here
if args.embedding_type == 'bert':
    tokenizer = BertTokenizer.from_pretrained(args.bert_model_dir)
    args.tokenizer = tokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load datasets and vocabs
# train_dataset, test_dataset, word_vocab, dep_tag_vocab, pos_tag_vocab
# Build Model
model = Aspect_Text_GAT_only(args) #, dep_tag_vocab['len'], pos_tag_vocab['len'])

batch_size = 128

model.to(device)
# Train
num_epochs = 10
# Define the optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)


# _, _,  all_eval_results = \
train_2( model, num_epochs, train_dataloder, optimizer, scheduler) # test_dataset

# if len(all_eval_results):
#     best_eval_result = max(all_eval_results, key=lambda x: x['acc'])
#     for key in sorted(best_eval_result.keys()):
#         logger.info("  %s = %s", key, str(best_eval_result[key]))



usage: ipykernel_launcher.py [-h] [--glove_dir GLOVE_DIR]
                             [--bert_model_dir BERT_MODEL_DIR] [--pure_bert]
                             [--gat_bert] [--highway]
                             [--num_layers NUM_LAYERS]
                             [--add_non_connect ADD_NON_CONNECT]
                             [--multi_hop MULTI_HOP] [--max_hop MAX_HOP]
                             [--num_heads NUM_HEADS] [--dropout DROPOUT]
                             [--num_gcn_layers NUM_GCN_LAYERS]
                             [--gcn_mem_dim GCN_MEM_DIM]
                             [--gcn_dropout GCN_DROPOUT] [--gat] [--gat_our]
                             [--gat_attention_type {linear,dotprod,gcn}]
                             [--embedding_type {glove,bert}]
                             [--embedding_dim EMBEDDING_DIM]
                             [--dep_relation_embed_dim DEP_RELATION_EMBED_DIM]
                             [--hidden_size HIDDEN_SIZE]
                 

SystemExit: 2