In [1]:
import os
import time
import numpy as np
import pickle as pkl
import argparse
import pandas as pd

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader

from tqdm import tqdm, trange
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from pytorch_pretrained_bert.modeling import BertModel, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam  # , warmup_linear


from utils import *
from model_vgcn_bert import VGCN_Bert
from model_vgcn_bert_ablation import VGCN_Bert_Ablation

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#### Step 1:   Configurations for Evaluating VGCN_BERT Model

In [2]:
args = {"ds": "pheme", "load": 1, "sw": 1,
        "lr": 1e-5, "l2": 0.01}

config_dataset = args["ds"]
config_load_model_from_checkpoint = True if args["load"] == 1 else False
config_use_stopwords = True if args["sw"] == 1 else False
config_learning_rate0 = args["lr"]
config_l2_decay = args["l2"]

config_will_do_ablation_test = True
config_ablation_choice = "vgcn_only" # no_attention / vgcn_only / bert_only
config_model_type = 'VGCN_BERT' if not config_will_do_ablation_test else 'VGCN_BERT_Ablation'
config_data_type = "text_comments" # text_comments / text_only / comments_only


config_gcn_embedding_dim = 16 if not config_will_do_ablation_test else 1
config_warmup_proportion = 0.1
config_vocab_adj = 'all'  # pmi / tf / all
config_adj_npmi_threshold = 0.2
config_adj_tf_threshold = 0
config_loss_criterion = 'cross_entropy'
config_output_num_features = 768
MAX_SEQ_LENGTH = 200 + config_gcn_embedding_dim if not config_will_do_ablation_test else 250

if config_will_do_ablation_test and config_ablation_choice == "no_attention":
    config_output_num_features = 768 * 2

total_train_epochs = 9
batch_size = 16  # 12
gradient_accumulation_steps = 1
if config_dataset == 'pheme':
    bert_model_scale = 'bert-base-uncased'
elif config_dataset == 'weibo':
    bert_model_scale = 'bert-base-chinese'

do_lower_case = True
perform_metrics_str = ['weighted avg', 'f1-score']
do_softmax_before_mse = True

data_dir = './prepared_data/' + config_dataset + '_' + config_data_type
output_dir = './model_output/' if not config_will_do_ablation_test else './model_output/ablation_tests/'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)


model_load_file = 'Model_' + config_model_type + '_' \
                + config_dataset + '_' + config_data_type + '.pt'

print('----------STEP 1: CONFIGURATIONS FOR VGCN_BERT MODEL--------')
print('Dataset: ', config_dataset)
print('Will Load Model from Checkpoint: ', config_load_model_from_checkpoint)
print('Will Do Ablation Tests: ', config_will_do_ablation_test)
if config_will_do_ablation_test:
    print('Ablation Choice: ', config_ablation_choice)
print('Will Delete Stop Words: ', config_use_stopwords)
print('Vocab GCN Hidden Dim: vocab_size -> 128 -> ' + str(config_gcn_embedding_dim))
print('Learning Rate0: ', config_learning_rate0)
print('Weight Decay: ', config_l2_decay)
print('Loss Criterion: ', config_loss_criterion)
print('Will Perform Softmax before MSE: ', do_softmax_before_mse)
print('Vocab Adjcent: ', config_vocab_adj)
print('MAX_SEQ_LENGTH: ', MAX_SEQ_LENGTH)
print('Perform Metrics: ', perform_metrics_str)
print('Load Model File Name: ', model_load_file)

----------STEP 1: CONFIGURATIONS FOR VGCN_BERT MODEL--------
Dataset:  pheme
Will Load Model from Checkpoint:  True
Will Do Ablation Tests:  True
Ablation Choice:  vgcn_only
Will Delete Stop Words:  True
Vocab GCN Hidden Dim: vocab_size -> 128 -> 1
Learning Rate0:  1e-05
Weight Decay:  0.01
Loss Criterion:  cross_entropy
Will Perform Softmax before MSE:  True
Vocab Adjcent:  all
MAX_SEQ_LENGTH:  250
Perform Metrics:  ['weighted avg', 'f1-score']
Load Model File Name:  Model_VGCN_BERT_Ablation_pheme_text_comments.pt


#### Step 2.1: Prepare Dataset & Load Vocabulary Adjacent Matrix

In [3]:
print('----------STEP 2: PREPARE DATASET & LOAD VOCABULARY ADJACENT MATRIX----------')
print(' Load and seperate', config_dataset, 'dataset, with vocabulary graph adjacent matrix')

objects = []
names = ['index_label', 'train_label', 'train_label_prob', 'test_label',
         'test_label_prob', 'clean_docs', 'vocab_adj_tf', 'vocab_adj_pmi', 'vocab_map']

for i in range(len(names)):
    datafile = data_dir + "/data_%s.%s" % (config_dataset, names[i])
    with open(datafile, 'rb') as f:
        objects.append(pkl.load(f, encoding='latin1'))

index_labels_list, train_label, train_label_prob, test_label, test_label_prob, shuffled_clean_docs, gcn_vocab_adj_tf, gcn_vocab_adj_pmi, gcn_vocab_map = tuple(objects)

label2idx = index_labels_list[0]
idx2label = index_labels_list[1]

all_labels = np.hstack((train_label, test_label))
all_labels_prob = np.vstack((train_label_prob, test_label_prob))

examples = []
for i, text in enumerate(shuffled_clean_docs):
    example = InputExample(i, text.strip(), confidence=all_labels_prob[i], label=all_labels[i])
    examples.append(example)

num_classes = len(label2idx)
gcn_vocab_size = len(gcn_vocab_map)
train_size = len(train_label)
test_size = len(test_label)

indexs = np.arange(0, len(examples))
train_examples = [examples[i] for i in indexs[:train_size]]
test_examples = [examples[i] for i in indexs[train_size:train_size + test_size]]

----------STEP 2: PREPARE DATASET & LOAD VOCABULARY ADJACENT MATRIX----------
 Load and seperate pheme dataset, with vocabulary graph adjacent matrix


In [4]:
if config_adj_tf_threshold > 0:
    gcn_vocab_adj_tf.data *= (gcn_vocab_adj_tf.data > config_adj_tf_threshold)
    gcn_vocab_adj_tf.eliminate_zeros()
if config_adj_npmi_threshold > 0:
    gcn_vocab_adj_pmi.data *= (gcn_vocab_adj_pmi.data > config_adj_npmi_threshold)
    gcn_vocab_adj_pmi.eliminate_zeros()

if config_vocab_adj == 'pmi':
    gcn_vocab_adj_list = [gcn_vocab_adj_pmi]
elif config_vocab_adj == 'tf':
    gcn_vocab_adj_list = [gcn_vocab_adj_tf]
elif config_vocab_adj == 'all':
    gcn_vocab_adj_list = [gcn_vocab_adj_tf, gcn_vocab_adj_pmi]

norm_gcn_vocab_adj_list = []
for i in range(len(gcn_vocab_adj_list)):
    adj = gcn_vocab_adj_list[i]

    print('Zero ratio for vocab adj %dth: %.8f' %
          (i, 100 * (1 - adj.count_nonzero() / (adj.shape[0] * adj.shape[1]))))

    adj = normalize_adj(adj)
    norm_gcn_vocab_adj_list.append(sparse_scipy2torch(adj.tocoo()).to(device))

gcn_adj_list = norm_gcn_vocab_adj_list


train_classes_num, train_classes_weight = get_class_count_and_weight(train_label, len(label2idx))
loss_weight = torch.tensor(train_classes_weight).to(device)
loss_weight = torch.tensor(loss_weight, dtype=torch.float32).to(device)

tokenizer = BertTokenizer.from_pretrained(bert_model_scale, do_lower_case=do_lower_case)

Zero ratio for vocab adj 0th: 79.40479542
Zero ratio for vocab adj 1th: 95.22829647


#### Step 2.2:   Prepare PyTorch DataLoader

In [5]:
def get_pytorch_dataloader(examples, tokenizer, batch_size):
    dataset = CorpusDataset(examples, tokenizer, gcn_vocab_map, MAX_SEQ_LENGTH, config_gcn_embedding_dim)
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, num_workers=4, collate_fn=dataset.pad)


train_dataloader = get_pytorch_dataloader(train_examples, tokenizer, batch_size)
test_dataloader = get_pytorch_dataloader(test_examples, tokenizer, batch_size)

total_train_steps = int(len(train_dataloader) / gradient_accumulation_steps * total_train_epochs)

print('Train Classes Count: ', train_classes_num)
print('Batch size: ', batch_size)
print('Num steps: ', total_train_steps)
print('Number of Examples for Training: ', len(train_examples))
print('Number of Examples for Training After Dataloader: ', len(train_dataloader) * batch_size)
print('Number of Examples for Test: ', len(test_examples))

Train Classes Count:  [5341, 2368]
Batch size:  16
Num steps:  4338
Number of Examples for Training:  7709
Number of Examples for Training After Dataloader:  7712
Number of Examples for Test:  3174


#### Step 3.1 Ablation Tests (May Skip)

In [6]:
if config_load_model_from_checkpoint and os.path.exists(os.path.join(output_dir, model_load_file)):
    checkpoint = torch.load(os.path.join(output_dir, model_load_file), map_location='cpu')
    if 'step' in checkpoint:
        prev_save_step = checkpoint['step']
        start_epoch = checkpoint['epoch']
    else:
        prev_save_step = -1
        start_epoch = checkpoint['epoch'] + 1

    test_acc_prev = checkpoint['test_acc']
    perform_metrics_prev = checkpoint['perform_metrics']
    model = VGCN_Bert_Ablation.from_pretrained(bert_model_scale, state_dict=checkpoint['model_state'], gcn_adj_dim=gcn_vocab_size, 
        gcn_adj_num=len(gcn_adj_list), gcn_embedding_dim=config_gcn_embedding_dim, num_labels=len(label2idx))

    pretrained_dict = checkpoint['model_state']
    net_state_dict = model.state_dict()
    pretrained_dict_selected = {
        k: v for k, v in pretrained_dict.items() if k in net_state_dict}
    net_state_dict.update(pretrained_dict_selected)
    model.load_state_dict(net_state_dict)

    print('Loaded the pretrain model:', model_load_file, ', epoch:', checkpoint['epoch'], 'step:', prev_save_step, 'test acc:',
          checkpoint['test_acc'], ' '.join(perform_metrics_str)+'_test:', checkpoint['perform_metrics'])

model.to(device)

Loaded the pretrain model: Model_VGCN_BERT_Ablation_pheme_text_comments.pt , epoch: 7 step: -1 test acc: 0.9505356017643353 weighted avg f1-score_test: 0.9504216946195111


VGCN_Bert_Ablation(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )


In [7]:
def get_pooled_out_ablation(model, gcn_adj_list, predict_dataloader, ablation_choice):
    
    outputs = None
    model.eval()
    with torch.no_grad():
        for batch in tqdm(predict_dataloader, desc="Evaluating", colour='green'):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, y_prob, label_ids, gcn_swop_eye = batch
            vgcn_out, pooled_output, cat_out, _= model(gcn_adj_list, gcn_swop_eye, input_ids, segment_ids, input_mask)
            
            if ablation_choice == 'no_attention':
                output = cat_out
            elif ablation_choice == 'vgcn_only':
                output = vgcn_out
            elif ablation_choice == 'bert_only':
                output = pooled_output
            
            if outputs is None:
                outputs = output.detach().cpu().numpy()
            else:
                outputs = np.append(outputs, output.detach().cpu().numpy(), axis=0)
    
    return outputs

In [8]:
train_pooled_outputs = get_pooled_out_ablation(model, gcn_adj_list, train_dataloader, config_ablation_choice)

Evaluating: 100%|[32m████████████████████████████████████████████████████████████████████████████████[0m| 482/482 [20:46<00:00,  2.59s/it][0m


In [9]:
test_pooled_outputs = get_pooled_out_ablation(model, gcn_adj_list, test_dataloader, config_ablation_choice)

Evaluating: 100%|[32m████████████████████████████████████████████████████████████████████████████████[0m| 199/199 [08:37<00:00,  2.60s/it][0m


In [10]:
print(train_pooled_outputs.shape, test_pooled_outputs.shape)

(7709, 768) (3174, 768)


#### Step 3.2:   Load Trained VGCN_BERT Model

In [11]:
# if config_load_model_from_checkpoint and os.path.exists(os.path.join(output_dir, model_load_file)):
#     checkpoint = torch.load(os.path.join(output_dir, model_load_file), map_location='cpu')
#     if 'step' in checkpoint:
#         prev_save_step = checkpoint['step']
#         start_epoch = checkpoint['epoch']
#     else:
#         prev_save_step = -1
#         start_epoch = checkpoint['epoch'] + 1

#     test_acc_prev = checkpoint['test_acc']
#     perform_metrics_prev = checkpoint['perform_metrics']
#     model = VGCN_Bert.from_pretrained(bert_model_scale, state_dict=checkpoint['model_state'], gcn_adj_dim=gcn_vocab_size, 
#         gcn_adj_num=len(gcn_adj_list), gcn_embedding_dim=config_gcn_embedding_dim, num_labels=len(label2idx))

#     pretrained_dict = checkpoint['model_state']
#     net_state_dict = model.state_dict()
#     pretrained_dict_selected = {
#         k: v for k, v in pretrained_dict.items() if k in net_state_dict}
#     net_state_dict.update(pretrained_dict_selected)
#     model.load_state_dict(net_state_dict)

#     print('Loaded the pretrain model:', model_load_file, ', epoch:', checkpoint['epoch'], 'step:', prev_save_step, 'test acc:',
#           checkpoint['test_acc'], ' '.join(perform_metrics_str)+'_test:', checkpoint['perform_metrics'])

# model.to(device)

#### Step 3.3: Evaluate VGCN_BERT Model

In [12]:
# def evaluate(model, gcn_adj_list,predict_dataloader, batch_size):
#     # print("***** Running prediction *****")
#     model.eval()
#     predict_out = []
#     all_label_ids = []
#     ev_loss=0
#     total = 0
#     correct = 0
#     with torch.no_grad():
#         for batch in tqdm(predict_dataloader, desc="Evaluating", colour='green'):
#             batch = tuple(t.to(device) for t in batch)
#             input_ids, input_mask, segment_ids, y_prob, label_ids, gcn_swop_eye = batch
#             _, logits = model(gcn_adj_list, gcn_swop_eye,input_ids, segment_ids, input_mask)

#             if config_loss_criterion=='mse':
#                 if do_softmax_before_mse:
#                     logits=F.softmax(logits,-1)
#                 loss = F.mse_loss(logits, y_prob)
#             else:
#                 if loss_weight is None:
#                     loss = F.cross_entropy(logits.view(-1, num_classes), label_ids)
#                 else:
#                     loss = F.cross_entropy(logits.view(-1, num_classes), label_ids)
                    
#             ev_loss+=loss.item()
            
#             _, predicted = torch.max(logits, -1)
            
#             predict_out.extend(predicted.tolist())
#             all_label_ids.extend(label_ids.tolist())
#             eval_accuracy = predicted.eq(label_ids).sum().item()
#             total += len(label_ids)
#             correct += eval_accuracy

#         f1_metrics=f1_score(np.array(all_label_ids).reshape(-1),
#             np.array(predict_out).reshape(-1), average='weighted')
#         print("Report:\n"+classification_report(np.array(all_label_ids).reshape(-1),
#             np.array(predict_out).reshape(-1),digits=4))

#     ev_acc = correct / total
#     return ev_loss, ev_acc, f1_metrics

In [13]:
# evaluate(model, gcn_adj_list, test_dataloader, batch_size)

#### Step 4.1: Get VGCN_BERT Model Pooled Output

In [14]:
# def get_pooled_out(model, gcn_adj_list, predict_dataloader):
    
#     outputs = None
#     model.eval()
#     with torch.no_grad():
#         for batch in tqdm(predict_dataloader, desc="Evaluating", colour='green'):
#             batch = tuple(t.to(device) for t in batch)
#             input_ids, input_mask, segment_ids, y_prob, label_ids, gcn_swop_eye = batch
#             pooled_output, _= model(gcn_adj_list, gcn_swop_eye, input_ids, segment_ids, input_mask)
            
#             if outputs is None:
#                 outputs = pooled_output.detach().cpu().numpy()
#             else:
#                 outputs = np.append(outputs, pooled_output.detach().cpu().numpy(), axis=0)
    
#     return outputs

In [15]:
# train_pooled_outputs = get_pooled_out(model, gcn_adj_list, train_dataloader)

In [16]:
# test_pooled_outputs = get_pooled_out(model, gcn_adj_list, test_dataloader)

In [17]:
# print(train_pooled_outputs.shape, test_pooled_outputs.shape)

#### Step 4.2: Reorganize Datasets

In [18]:
train_index_path = './data/' + config_dataset.upper() + '-SEG/' + config_data_type + '/train_index_list.txt'
test_index_path = './data/' + config_dataset.upper() + '-SEG/' + config_data_type + '/test_index_list.txt'

train_index_list = []
with open(train_index_path, 'r') as file:
    lines = file.readlines()
    for each_line in lines:
        train_index_list.append(int(each_line))
        
test_index_list = []
with open(test_index_path, 'r') as file:
    lines = file.readlines()
    for each_line in lines:
        test_index_list.append(int(each_line))

In [19]:
train_label_path = './data/' + config_dataset.upper() + '-SEG/'+ config_data_type +'/train_label.txt'
test_label_path = './data/' + config_dataset.upper() + '-SEG/'+ config_data_type +'/test_label.txt'

train_label = []
with open(train_label_path, 'r') as file:
    lines = file.readlines()
    for each_line in lines:
        train_label.append(int(each_line))

test_label = []
with open(test_label_path, 'r') as file:
    lines = file.readlines()
    for each_line in lines:
        test_label.append(int(each_line))
        

In [20]:
train_x = {}

for l, emb in zip(train_index_list,train_pooled_outputs):
    if l in train_x.keys():
        # np.vstack on lists represents features concatenation 
        train_x[l]  =np.vstack([train_x[l], emb])
    else:
        train_x[l] = [emb]

train_l_final = []
label_l_final = []

for k in train_x.keys():
    train_l_final.append(train_x[k])
    label_l_final.append(train_label[k])

df_train = pd.DataFrame({'emb': train_l_final, 'label': label_l_final})
df_train.head(10)

Unnamed: 0,emb,label
0,"[[0.01680187, 0.013852032, 0.01778804, 0.03349...",0
1,"[[0.00043393672, 0.016871663, 0.044724338, 0.0...",1
2,"[[0.04059963, 0.06517336, 0.034778416, 0.05385...",1
3,"[[0.019191245, 0.023653356, 0.02521983, 0.0310...",0
4,"[[0.0085446965, 0.011816889, 0.006936282, 0.01...",0
5,"[[-0.015042536, -0.011104598, -0.0142199565, -...",0
6,"[[0.004592642, 0.003820967, 0.00013222545, 0.0...",0
7,"[[-0.015112469, -0.010178502, -0.008307333, -0...",0
8,"[[0.012637218, 0.02790481, 0.01790919, 0.03383...",1
9,"[[-0.009839478, 0.03426922, 0.038110718, 0.075...",0


In [21]:
test_x = {}

for l, emb in zip(test_index_list,test_pooled_outputs):
    if l in test_x.keys():
        # np.vstack on lists represents features concatenation 
        test_x[l]  =np.vstack([test_x[l], emb])
    else:
        test_x[l] = [emb]

test_l_final = []
tlabel_l_final = []
for k in test_x.keys():
    test_l_final.append(test_x[k])
    tlabel_l_final.append(test_label[k])

df_test = pd.DataFrame({'emb': test_l_final, 'label': tlabel_l_final})
df_test.head(10)

Unnamed: 0,emb,label
0,"[[0.0009316914, 0.03501807, 0.024133163, 0.050...",1
1,"[[0.015521595, 0.022322385, 0.03313172, 0.0583...",0
2,"[[0.01658486, 0.05144207, 0.03161218, 0.062799...",0
3,"[[0.010414546, 0.023666153, 0.0037200954, 0.03...",0
4,"[[0.043209784, 0.0474843, 0.019516794, 0.08808...",0
5,"[[0.02552834, 0.03534136, 0.00903167, 0.077379...",0
6,"[[-0.00944491, -0.0048265615, 0.0017341878, 0....",1
7,"[[0.031745307, 0.051066287, 0.07357788, 0.0641...",0
8,"[[0.013758717, 0.02846297, 0.023216037, 0.0509...",0
9,"[[0.002276061, 0.007911013, 0.0038150363, 0.00...",0


In [22]:
from sklearn.model_selection import train_test_split

df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=35)

In [23]:
print(df_train.shape,df_val.shape,df_test.shape)

(5221, 2) (1074, 2) (1074, 2)


#### Step 4.3: Generate Data Generator for Joint Model

In [24]:
batch_dict = {
#     "text_comments": [[7,663], [3, 232], [5, 93]],
    "pheme_text_comments_refined": [[23, 227], [6, 179], [6, 179]],
    "pheme_text_only_refined": [[23, 227], [3, 271], [3, 271]],
    "pheme_comments_only_refined": [[23, 227], [2, 341], [1, 683]],
    "weibo_text_comments_refined": [[3, 1399], [1, 863], [2, 432]],
    "weibo_text_only_refined": [[3, 1399], [1, 653], [2, 327]],
    "weibo_comments_only_refined": [[3, 1399], [4, 137], [9, 61]],
}

batch = batch_dict[config_dataset + '_' + config_data_type + '_refined']

In [25]:
def train_generator(df, batch_size, batches_per_epoch):
    num_sequences = len(df['emb'].to_list())
    assert batch_size * batches_per_epoch == num_sequences
    num_features= config_output_num_features

    x_list= df['emb'].to_list()
    y_list =  df.label.to_list()
    # Generate batches
    while True:
        for b in range(batches_per_epoch):
            longest_index = (b + 1) * batch_size - 1
            timesteps = len(max(df['emb'].to_list()[:(b + 1) * batch_size][-batch_size:], key=len))
            x_train = np.full((batch_size, timesteps, num_features), -99.)
            y_train = np.zeros((batch_size,  1))
            for i in range(batch_size):
                li = b * batch_size + i
                x_train[i, 0:len(x_list[li]), :] = x_list[li]
                y_train[i] = y_list[li]
            yield x_train, y_train
            
def val_generator(df,batch_size_val,batches_per_epoch_val):
    num_sequences_val = len(df['emb'].to_list())
    assert batch_size_val * batches_per_epoch_val == num_sequences_val
    num_features= config_output_num_features

    x_list= df['emb'].to_list()
    y_list =  df.label.to_list()
    # Generate batches
    while True:
        for b in range(batches_per_epoch_val):
            longest_index = (b + 1) * batch_size_val - 1
            timesteps = len(max(df['emb'].to_list()[:(b + 1) * batch_size_val][-31:], key=len))
            x_val = np.full((batch_size_val, timesteps, num_features), -99.)
            y_val = np.zeros((batch_size_val,  1))
            for i in range(batch_size_val):
                li = b * batch_size_val + i
                x_val[i, 0:len(x_list[li]), :] = x_list[li]
                y_val[i] = y_list[li]
            yield x_val, y_val
            
def test_generator(df,batch_size_test, batches_per_epoch_test):
    num_sequences_test = len(df['emb'].to_list())
    assert batch_size_test * batches_per_epoch_test == num_sequences_test
    num_features= config_output_num_features

    x_list= df['emb'].to_list()
    y_list =  df.label.to_list()
    # Generate batches
    while True:
        for b in range(batches_per_epoch_test):
            longest_index = (b + 1) * batch_size_test - 1
            timesteps = len(max(df['emb'].to_list()[:(b + 1) * batch_size_test][-31:], key=len))
            # print(len(df_train['emb'].to_list()[:b+batch_size][-7:]))
            x_test = np.full((batch_size_test, timesteps, num_features), -99.)
            y_test = np.zeros((batch_size_test,  1))
            for i in range(batch_size_test):
                li = b * batch_size_test + i
                x_test[i, 0:len(x_list[li]), :] = x_list[li]
                y_test[i] = y_list[li]
            yield x_test, y_test            

In [26]:
train_data = train_generator(df_train, batch[0][0], batch[0][1])
val_data = val_generator(df_val, batch[1][0], batch[1][1])
test_data = test_generator(df_test, batch[2][0], batch[2][1])

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def cul_all_metrics(y_true, y_pred, pos_label=1):
    return {"accuracy": float("%.5f" % accuracy_score(y_true=y_true, y_pred=y_pred)),
            "precision": float("%.5f" % precision_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label, average="weighted")),
            "recall": float("%.5f" % recall_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label, average="weighted")),
            "f1-score": float("%.5f" % f1_score(y_true=y_true, y_pred=y_pred, average="weighted")),
           }

#### Step 5: Build "Recurrence over VGCN_BERT" Model

In [28]:
import tensorflow as tf
from tensorflow import keras
import h5py

text_input = keras.Input(shape=(None,config_output_num_features,), dtype='float32', name='features')
l_mask = keras.layers.Masking(mask_value=-99.)(text_input) 
encoded_text = keras.layers.LSTM(100,)(l_mask)
out_dense = keras.layers.Dense(30, activation='relu')(encoded_text)
out = keras.layers.Dense(2, activation='softmax')(out_dense)
R_Model = keras.Model(text_input, out)
R_Model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
R_Model.summary()

2022-04-06 11:35:18.482729: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-04-06 11:35:18.517168: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2099965000 Hz
2022-04-06 11:35:18.521123: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5571cc7279d0 executing computations on platform Host. Devices:
2022-04-06 11:35:18.521180: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): Host, Default Version


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
features (InputLayer)        [(None, None, 768)]       0         
_________________________________________________________________
masking (Masking)            (None, None, 768)         0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               347600    
_________________________________________________________________
dense (Dense)                (None, 30)                3030      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 62        
Total params: 350,692
Trainable params: 350,692
Non-trainable params: 0
_________________________________________________________________


In [29]:
call_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=3, verbose=2,
                                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)

In [30]:
batches_per_epoch = batch[0][1]

batches_per_epoch_val= batch[1][1]

R_Model.fit(train_data, steps_per_epoch=batches_per_epoch, epochs=10,
                    validation_data=val_data, validation_steps=batches_per_epoch_val, callbacks =[call_reduce] )

Train for 227 steps, validate for 179 steps
Epoch 1/10


2022-04-06 11:35:25.517135: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_cudnn_lstm_with_fallback_5844_7301' and '__inference___backward_standard_lstm_7406_8003_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_8125' both implement 'lstm_5a7fd80a-4385-4cb6-94ca-153bed2a49a2' but their signatures do not match.




2022-04-06 11:35:32.536233: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_cudnn_lstm_with_fallback_9377' and '__inference_standard_lstm_9036_specialized_for_model_lstm_StatefulPartitionedCall_at___inference_distributed_function_10907' both implement 'lstm_2669f5cf-bbd9-440a-9f96-e0e2ea664434' but their signatures do not match.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0009500000451225787.


<tensorflow.python.keras.callbacks.History at 0x7f287e8a05d0>

In [31]:
batches_per_epoch_test = batch[2][1]

test_data = test_generator(df_test, batch[2][0], batch[2][1])
r_score = R_Model.predict_generator(test_data, steps=batches_per_epoch_test)

In [32]:
r_pred = np.argmax(r_score, axis=1).tolist()
label = df_test.label.to_list()

cul_all_metrics(label, r_pred)

{'accuracy': 0.78864,
 'precision': 0.78556,
 'recall': 0.78864,
 'f1-score': 0.78027}

#### Step 6: Build "Transformer over VGCN_BERT" Model

In [33]:
class MultiHeadSelfAttention(keras.layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        assert (
            embed_dim % num_heads == 0
        ), "embedding dimension not divisible by num heads"
        self.projection_dim = embed_dim // num_heads
        self.wq = keras.layers.Dense(embed_dim)
        self.wk = keras.layers.Dense(embed_dim)
        self.wv = keras.layers.Dense(embed_dim)
        self.combine_heads = keras.layers.Dense(embed_dim)

    def attention(self, q, k, v):
        score = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dk)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, v)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, x):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(x)[0]
        q = self.wq(x)  # (batch_size, seq_len, embed_dim)
        k = self.wk(x)  # (batch_size, seq_len, embed_dim)
        v = self.wv(x)  # (batch_size, seq_len, embed_dim)
        q = self.separate_heads(
            q, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        k = self.separate_heads(
            k, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        v = self.separate_heads(
            v, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(q, k, v)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output
    
class TransformerLayer(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerLayer, self).__init__()

        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )

        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training):
        attn_output = self.att(x)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

In [34]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

text_input = keras.Input(shape=(None,config_output_num_features,), dtype='float32', name='features')
l_mask = keras.layers.Masking(mask_value=-99.)(text_input) 
transformer_encodings = TransformerLayer(embed_dim=config_output_num_features, num_heads=1, ff_dim=32)(l_mask)
encoded_texts = keras.layers.LSTM(100,)(transformer_encodings)
out_dense = keras.layers.Dense(30, activation='relu')(encoded_texts)
out = keras.layers.Dense(2, activation='softmax')(out_dense)
T_Model = keras.Model(text_input, out)
T_Model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['acc'])
T_Model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
features (InputLayer)        [(None, None, 768)]       0         
_________________________________________________________________
masking_1 (Masking)          (None, None, 768)         0         
_________________________________________________________________
transformer_layer (Transform (None, None, 768)         2415392   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               347600    
_________________________________________________________________
dense_8 (Dense)              (None, 30)                3030      
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 62        
Total params: 2,766,084
Trainable params: 2,766,084
Non-trainable params: 0
_________________________________________________

In [35]:
call_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=3, verbose=2,
                                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)

In [36]:
batches_per_epoch = batch[0][1]

batches_per_epoch_val= batch[1][1]

T_Model.fit(train_data, steps_per_epoch=batches_per_epoch, epochs=10,
                    validation_data=val_data, validation_steps=batches_per_epoch_val, callbacks =[call_reduce] )

Train for 227 steps, validate for 179 steps
Epoch 1/10


2022-04-06 11:36:32.872598: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_cudnn_lstm_with_fallback_94593_94775' and '__inference___backward_standard_lstm_94880_95363_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_95941' both implement 'lstm_5cefef48-f6aa-4c43-9249-c90a2f054e2c' but their signatures do not match.




2022-04-06 11:36:43.506183: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_standard_lstm_97086' and '__inference_standard_lstm_97086_specialized_for_model_1_lstm_1_StatefulPartitionedCall_at___inference_distributed_function_97452' both implement 'lstm_201ff56b-b679-4b86-8de7-c773c793c34c' but their signatures do not match.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2644642850>

In [37]:
batches_per_epoch_test = batch[2][1]

test_data = test_generator(df_test, batch[2][0], batch[2][1])
t_score = T_Model.predict_generator(test_data, steps=batches_per_epoch_test)

In [38]:
t_pred = np.argmax(t_score, axis=1).tolist()
label = df_test.label.to_list()

cul_all_metrics(label, t_pred)

{'accuracy': 0.76164,
 'precision': 0.76664,
 'recall': 0.76164,
 'f1-score': 0.74002}