In [1]:
import os
import time
import numpy as np
import pickle as pkl
import pandas as pd
import warnings
warnings.filterwarnings("ignore")


import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer
from torch.optim import AdamW

from utils import *
from model_vgcn_bert import VGCN_BERT

# Restrict CUDA to the device with index 1; the single visible GPU is then addressed as "cuda:0".
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Fall back to the CPU when CUDA is not available.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

2023-04-23 17:11:26.495248: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-23 17:11:28.135728: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/lib/oracle/12.2/client64/lib/lib:/usr/local/lib::.
2023-04-23 17:11:28.136411: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/lib/oracle/12.2/client64/lib/lib:/usr/local/lib::.


In [2]:
# Dataset Config
# DATASET_NUM_CLASSES stays a string because it is spliced into paths;
# cells that need the count cast it with int().
DATASET_NUM_CLASSES = "3"
DATASET_SIZE = "small"
DATA_TYPE = "text_comments"
DATA_DIR = f"./processed_data/{DATASET_NUM_CLASSES}_{DATASET_SIZE}/{DATA_TYPE}"
DUMP_DIR = f"{DATA_DIR}/dumped_data"

# BERT Config
PRE_TRAINED_MODEL_NAME = "bert-base-cased"

# Model Config
GCN_EMBEDDING_DIM = 16
LEARNING_RATE = 1e-5
L2_DECAY = 0.01
VOCAB_ADJ = "npmi" # npmi / tf / all
NPMI_THRESHOLD = 0.1
TF_THRESHOLD = 0.1
# 450 text tokens plus the slots reserved for the graph embeddings.
MAX_SEQ_LENGTH = GCN_EMBEDDING_DIM + 450
TRAIN_EPOCH = 5
BATCH_SIZE = 8
# Despite its name, OUTPUT_FILE is the directory that receives the model and the plot.
OUTPUT_FILE = f"./trained_models/{DATASET_NUM_CLASSES}_{DATASET_SIZE}"
MODEL_FILE = f"{OUTPUT_FILE}/vgcn_bert_models_{DATA_TYPE}.pth"
CONFUSION_MATRIX_PATH = f"{OUTPUT_FILE}/confusion_matrix_{DATA_TYPE}.png"

In [3]:
# Create the output directory (and any missing parents). exist_ok=True replaces
# the separate existence check, so there is no window between "check" and
# "create", and the cell can be re-run safely.
os.makedirs(OUTPUT_FILE, exist_ok=True)

### Extract Prepared Data

#### Vocab Map & Adjacency Matrix

In [4]:
# Load the three pickled artefacts from DUMP_DIR: the vocabulary map and the
# TF / NPMI vocabulary adjacency matrices.
# NOTE: unpickling can execute arbitrary code; only load dumps you produced yourself.
file_names = ["vocab_map", "vocab_adj_tf", "vocab_adj_npmi"]
objects = []

for dump_name in file_names:
    with open("%s/data.%s" % (DUMP_DIR, dump_name), 'rb') as dump_file:
        objects.append(pkl.load(dump_file, encoding='latin1'))

# The order of file_names fixes the order of the unpacked objects.
gcn_vocab_map, gcn_vocab_adj_tf, gcn_vocab_adj_npmi = objects

In [5]:
# Sparsify each adjacency matrix in place: stored values that are not strictly
# greater than the threshold are multiplied by False (i.e. zeroed) and then
# dropped from the sparse structure.
if TF_THRESHOLD > 0:
    gcn_vocab_adj_tf.data *= (gcn_vocab_adj_tf.data > TF_THRESHOLD)
    gcn_vocab_adj_tf.eliminate_zeros()
if NPMI_THRESHOLD > 0:
    gcn_vocab_adj_npmi.data *= (gcn_vocab_adj_npmi.data > NPMI_THRESHOLD)
    gcn_vocab_adj_npmi.eliminate_zeros()

# Pick which graph(s) feed the model.
if VOCAB_ADJ == "npmi":
    gcn_vocab_adj_list = [gcn_vocab_adj_npmi]
elif VOCAB_ADJ == "tf":
    gcn_vocab_adj_list = [gcn_vocab_adj_tf]
elif VOCAB_ADJ == "all":
    gcn_vocab_adj_list = [gcn_vocab_adj_tf, gcn_vocab_adj_npmi]
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(
        "VOCAB_ADJ must be one of 'npmi', 'tf' or 'all', got %r" % (VOCAB_ADJ,)
    )

In [6]:
# Report how sparse each selected adjacency matrix is, normalise it with the
# project helper, and move it to the compute device as a torch sparse tensor.
norm_gcn_vocab_adj_list = []
for adj_idx, vocab_adj in enumerate(gcn_vocab_adj_list):
    n_rows, n_cols = vocab_adj.shape
    zero_pct = 100 * (1 - vocab_adj.count_nonzero() / (n_rows * n_cols))
    print('Zero Ratio for %dth Vocab Adjacency Matrix : %.8f' % (adj_idx, zero_pct))

    normalized_adj = normalize_adj(vocab_adj)
    norm_gcn_vocab_adj_list.append(sparse_scipy2torch(normalized_adj.tocoo()).to(device))

gcn_adj_list = norm_gcn_vocab_adj_list

Zero Ratio for 0th Vocab Adjacency Matrix : 92.21295569


In [7]:
# Number of entries in the vocabulary map; passed to the model as gcn_adj_dim.
gcn_vocab_size = len(gcn_vocab_map)

#### Texts & Labels

In [8]:
# Read the pre-processed train and test splits (train first, then test).
train_data, test_data = [
    pd.read_csv("%s/processed_%s_data.csv" % (DATA_DIR, split_name))
    for split_name in ("train", "test")
]

In [9]:
# Drop, in place, every row that has a missing value in any column.
for split_frame in (train_data, test_data):
    split_frame.dropna(axis = 0, how = "any", inplace = True)

### Use DataLoader to Load Data

In [10]:
# Tokenizer for PRE_TRAINED_MODEL_NAME; the dataset uses it to convert tokens to ids.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [11]:
class FakedditDataset(Dataset):
    """Map-style dataset that turns pre-tokenised texts into VGCN-BERT inputs.

    Args:
        texts: Indexable collection of strings whose tokens are separated by whitespace.
        labels: Indexable collection of class ids, aligned with ``texts``.
        gcn_vocab_map: Mapping from token string to its index in the vocabulary graph.
        gcn_embedding_dim: Number of trailing slots reserved for graph embeddings.
        tokenizer: Object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_len: Length budget. The text is truncated to
            ``max_seq_len - 1 - gcn_embedding_dim`` tokens; with ``[CLS]`` and the
            ``gcn_embedding_dim + 1`` trailing ``[SEP]`` tokens the encoded sequence
            can therefore be up to ``max_seq_len + 1`` ids long.
    """

    def __init__(self, texts, labels, gcn_vocab_map, gcn_embedding_dim, tokenizer, max_seq_len):
        super().__init__()
        self.texts = texts
        self.labels = labels
        self.gcn_vocab_map = gcn_vocab_map
        self.gcn_embedding_dim = gcn_embedding_dim
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sample ``idx`` as plain Python lists (unpadded).

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``segment_ids``,
            ``gcn_vocab_ids`` (graph index of every text token) and ``label``.

        Raises:
            KeyError: if a token is missing from ``gcn_vocab_map``.
        """
        max_text_tokens = self.max_seq_len - 1 - self.gcn_embedding_dim
        tokens = self.texts[idx].split()[:max_text_tokens]

        gcn_vocab_ids = [self.gcn_vocab_map[t] for t in tokens]

        # [CLS] + text + one [SEP] per reserved graph-embedding slot + the closing [SEP].
        tokens = ["[CLS]"] + tokens + ["[SEP]"] * (self.gcn_embedding_dim + 1)
        # Use the tokenizer injected through the constructor, not a notebook global.
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        segment_ids = [0] * len(tokens)
        attention_mask = [1] * len(input_ids)
        return {
            "input_ids" : input_ids,
            "attention_mask" : attention_mask,
            "segment_ids" : segment_ids,
            "gcn_vocab_ids" : gcn_vocab_ids,
            "label" : self.labels[idx],
        }

    def pad(self, batch):
        """Collate function: pad a list of samples to the longest one in the batch.

        ``input_ids``, ``attention_mask`` and ``segment_ids`` are right-padded
        with 0. ``gcn_swop_eye`` has shape (batch, gcn_vocab_size, padded_len):
        column ``p`` is the one-hot graph index of the token at position ``p``,
        and is all zeros at [CLS], [SEP] and padding positions.
        """
        gcn_vocab_size = len(self.gcn_vocab_map)
        max_input_len = max(len(sample["input_ids"]) for sample in batch)

        def pad_right(key):
            return [sample[key] + [0] * (max_input_len - len(sample[key])) for sample in batch]

        batch_input_ids = torch.tensor(pad_right("input_ids"), dtype = torch.long)
        batch_attention_mask = torch.tensor(pad_right("attention_mask"), dtype = torch.long)
        batch_segment_ids = torch.tensor(pad_right("segment_ids"), dtype = torch.long)
        batch_label = torch.tensor([sample["label"] for sample in batch], dtype = torch.long)

        # Graph ids are shifted right by one so they line up with the text tokens
        # that follow [CLS]; -1 marks every position that carries no graph id.
        padded_gcn_ids = [
            [-1] + sample["gcn_vocab_ids"] + [-1] * (max_input_len - len(sample["gcn_vocab_ids"]) - 1)
            for sample in batch
        ]
        flat_gcn_ids = torch.tensor(padded_gcn_ids, dtype = torch.long).reshape(-1)

        # Fill the one-hot rows directly instead of gathering rows from a dense
        # (V+1) x (V+1) identity matrix rebuilt for every batch. Index -1 lands
        # in the extra last column, which is sliced away, leaving a zero row.
        one_hot = torch.zeros(flat_gcn_ids.numel(), gcn_vocab_size + 1)
        one_hot[torch.arange(flat_gcn_ids.numel()), flat_gcn_ids] = 1.0
        batch_gcn_swop_eye = one_hot[:, :-1]
        batch_gcn_swop_eye = batch_gcn_swop_eye.view(len(batch), -1, gcn_vocab_size).transpose(1, 2)

        return {
            "input_ids" : batch_input_ids,
            "attention_mask" : batch_attention_mask,
            "segment_ids" : batch_segment_ids,
            "gcn_swop_eye" : batch_gcn_swop_eye,
            "label" : batch_label,
        }

In [12]:
def create_dataloader(df, gcn_vocab_map, gcn_embedding_dim, tokenizer, max_seq_len, batch_size,
                      shuffle = False):
    """Wrap a data frame in a FakedditDataset and return a DataLoader over it.

    Args:
        df: Data frame with a ``cleaned_tokens`` column (whitespace-separated
            tokens) and a ``label`` column.
        gcn_vocab_map: Token-to-graph-index mapping forwarded to the dataset.
        gcn_embedding_dim: Number of graph-embedding slots per sequence.
        tokenizer: Tokenizer forwarded to the dataset.
        max_seq_len: Sequence length budget forwarded to the dataset.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            False, which keeps the file order (the previous hard-coded behaviour).

    Returns:
        A DataLoader that collates batches with the dataset's own ``pad`` method.
    """
    ds = FakedditDataset(
        texts = df["cleaned_tokens"].to_numpy(),
        labels = df["label"].to_numpy(),
        gcn_vocab_map = gcn_vocab_map,
        gcn_embedding_dim = gcn_embedding_dim,
        tokenizer = tokenizer,
        max_seq_len = max_seq_len
    )
    return DataLoader(ds, batch_size = batch_size, shuffle = shuffle, num_workers = 0, collate_fn = ds.pad)

In [13]:
# Both splits share every loader setting; only the data frame differs.
shared_loader_args = (gcn_vocab_map, GCN_EMBEDDING_DIM, tokenizer, MAX_SEQ_LENGTH, BATCH_SIZE)
train_dataloader = create_dataloader(train_data, *shared_loader_args)
test_dataloader = create_dataloader(test_data, *shared_loader_args)

In [14]:
# Pull the first collated batch from the training loader as a sanity check.
sample_data = next(iter(train_dataloader))

In [15]:
# Shape of the padded token-id tensor: (batch size, longest sequence in the batch).
sample_data["input_ids"].shape

torch.Size([8, 467])

### Define Training and Evaluating Functions

In [16]:
# Graph-specific settings derived from the loaded vocabulary graph and the config cell.
vgcn_bert_kwargs = dict(
    gcn_adj_dim = gcn_vocab_size,
    gcn_adj_num = len(gcn_adj_list),
    gcn_embedding_dim = GCN_EMBEDDING_DIM,
    num_labels = int(DATASET_NUM_CLASSES),
)
model = VGCN_BERT.from_pretrained(PRE_TRAINED_MODEL_NAME, **vgcn_bert_kwargs)

# Move the model to the compute device; as the last expression this also displays it.
model.to(device)

VGCN_BERT(
  (embeddings): VGCNBertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
    (vocab_gcn): VocabGraphConvolution(
      (fc_hc): Linear(in_features=128, out_features=16, bias=True)
      (act_func): ReLU()
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bi

In [17]:
# AdamW over every model parameter; learning rate and weight decay come from the config cell.
optimizer = AdamW(model.parameters(), lr = LEARNING_RATE, weight_decay = L2_DECAY)

In [18]:
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def evaluate(model, gcn_adj_list, dataloader, isTraining = True):
    """Run the model over ``dataloader`` without gradients and report metrics.

    Prints a classification report. When ``isTraining`` is False it also draws
    a confusion matrix, saves it to CONFUSION_MATRIX_PATH and shows it.

    Args:
        model: Model called as ``model(adj_list, swop_eye, input_ids, segment_ids,
            attention_mask)``; the second element of its return value is used as logits.
        gcn_adj_list: Adjacency tensors forwarded to the model unchanged.
        dataloader: Iterable of batches as produced by ``FakedditDataset.pad``.
        isTraining: True for in-training evaluation (no confusion matrix).

    Returns:
        Tuple ``(total_loss, f1_metrics)``: the sum of the per-batch
        cross-entropy losses and the weighted F1 score.
    """
    batch_keys = ("input_ids", "attention_mask", "segment_ids", "gcn_swop_eye", "label")
    num_classes = int(DATASET_NUM_CLASSES)

    model.eval()
    all_labels, all_predicts = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            # Relies on the notebook-level `device`.
            inputs = {key: batch[key].to(device) for key in batch_keys}

            _, logits = model(gcn_adj_list, inputs["gcn_swop_eye"], inputs["input_ids"],
                              inputs["segment_ids"], inputs["attention_mask"])

            batch_loss = F.cross_entropy(logits.view(-1, num_classes), inputs["label"])
            total_loss += batch_loss.item()

            predicted = torch.max(logits, -1)[1]
            all_predicts.extend(predicted.tolist())
            all_labels.extend(inputs["label"].tolist())

    y_true = np.array(all_labels).reshape(-1)
    y_pred = np.array(all_predicts).reshape(-1)
    f1_metrics = f1_score(y_true, y_pred, average='weighted')
    print("Evaluation Report:\n" + classification_report(y_true, y_pred, digits = 5))

    if isTraining:
        return total_loss, f1_metrics

    ConfusionMatrixDisplay.from_predictions(all_labels, all_predicts, cmap = "GnBu")
    plt.savefig(CONFUSION_MATRIX_PATH)
    plt.show()
    return total_loss, f1_metrics

In [19]:
def train_epoch(model, gcn_adj_list, dataloader, optimizer, device, epoch, start_time = None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Model called as ``model(adj_list, swop_eye, input_ids, segment_ids,
            attention_mask)``; the second element of its return value is used as logits.
        gcn_adj_list: Adjacency tensors forwarded to the model unchanged.
        dataloader: Sized iterable of batches as produced by ``FakedditDataset.pad``.
        optimizer: Optimizer that is stepped once per batch.
        device: Device the batch tensors are moved to.
        epoch: Epoch index, used only in the progress log.
        start_time: Reference ``time.time()`` value for the "Cumulated Time" log.
            When omitted, the notebook-level ``train_start`` is used if it
            exists, otherwise the moment this function was entered.

    Returns:
        Sum of the per-batch losses over the epoch.
    """
    if start_time is None:
        # Keeps existing callers working: they set a global `train_start`
        # before the first call instead of passing it in.
        start_time = globals().get("train_start", time.time())

    model.train()
    optimizer.zero_grad()
    train_loss = 0
    # Iterate over the loader that was passed in, not a notebook global.
    for idx, data in enumerate(dataloader):
        inputs = {"input_ids": data["input_ids"].to(device),
                  "attention_mask": data["attention_mask"].to(device),
                  "segment_ids": data["segment_ids"].to(device),
                  "gcn_swop_eye": data["gcn_swop_eye"].to(device),
                  "label": data["label"].to(device)
                 }

        _, logits = model(gcn_adj_list, inputs["gcn_swop_eye"], inputs["input_ids"],
                          inputs["segment_ids"], inputs["attention_mask"])
        loss = F.cross_entropy(logits, inputs["label"])

        loss.backward()

        train_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        # Log every 40th batch to keep the cell output readable.
        if idx % 40 == 0:
            print("Epoch:{}-{}/{}, Train Loss: {}, Cumulated Time: {}m ".format(epoch, idx,
                  len(dataloader), loss.item(), (time.time() - start_time)/60.0))
    return train_loss

In [None]:
# Reference time for all "minutes elapsed" log lines; train_epoch also reads this global.
train_start = time.time()
perform_metrics_best = 0
separator = '*' * 50

for epoch in range(TRAIN_EPOCH):
    train_loss = train_epoch(model, gcn_adj_list, train_dataloader, optimizer, device, epoch)

    print(separator)
    # NOTE(review): the checkpoint is selected by F1 on the test split; a separate
    # validation split would keep the final test score unbiased.
    test_loss, curr_metrics = evaluate(model, gcn_adj_list, test_dataloader)

    improved = curr_metrics > perform_metrics_best
    if improved:
        perform_metrics_best = curr_metrics
        # Saves the whole pickled module (not just a state_dict) to MODEL_FILE.
        torch.save(model, MODEL_FILE)

    print(separator)
    elapsed_minutes = (time.time() - train_start) / 60.0
    print(f"Epoch:{epoch} Completed, Total Train Loss:{train_loss}, Test Loss:{test_loss}, Spend {elapsed_minutes}m ")


Epoch:0-0/2152, Train Loss: 1.0407068729400635, Cumulated Time: 0.043645608425140384m 
Epoch:0-40/2152, Train Loss: 0.7782796025276184, Cumulated Time: 1.5262845913569132m 
Epoch:0-80/2152, Train Loss: 0.7041869759559631, Cumulated Time: 2.9910873969395957m 
Epoch:0-120/2152, Train Loss: 1.0055947303771973, Cumulated Time: 4.463466068108876m 
Epoch:0-160/2152, Train Loss: 0.4494379460811615, Cumulated Time: 5.904240417480469m 
Epoch:0-200/2152, Train Loss: 0.550123393535614, Cumulated Time: 7.354006246725718m 
Epoch:0-240/2152, Train Loss: 0.2801685333251953, Cumulated Time: 8.77365856965383m 
Epoch:0-280/2152, Train Loss: 0.3546294569969177, Cumulated Time: 10.205514502525329m 
Epoch:0-320/2152, Train Loss: 0.5370128750801086, Cumulated Time: 11.69358864625295m 
Epoch:0-360/2152, Train Loss: 0.42022159695625305, Cumulated Time: 13.15066343943278m 
Epoch:0-400/2152, Train Loss: 0.264009565114975, Cumulated Time: 14.628824810187021m 
Epoch:0-440/2152, Train Loss: 0.6001460552215576, Cum

In [None]:
# Reload the best checkpoint onto the active device (`device` is already a torch.device).
# NOTE(review): the checkpoint is a fully pickled module; recent PyTorch releases are
# reported to default torch.load to weights_only=True, which rejects such files —
# confirm against the installed version.
e_model = torch.load(MODEL_FILE, map_location = device)

In [None]:
# Final evaluation of the reloaded checkpoint; isTraining = False also draws and saves the confusion matrix.
evaluate(e_model, gcn_adj_list, test_dataloader, isTraining = False)