# install

In [1]:
!pip install --pre dgl-cu102
!pip install --upgrade gdown
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dgl-cu102
  Downloading dgl_cu102-0.6.1-cp38-cp38-manylinux1_x86_64.whl (36.8 MB)
[K     |████████████████████████████████| 36.8 MB 135 kB/s 
Installing collected packages: dgl-cu102
Successfully installed dgl-cu102-0.6.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 6.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>

In [4]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Oct_23_19:24:38_PDT_2019
Cuda compilation tools, release 10.2, V10.2.89


# source

### import packet

In [5]:
import argparse
import re
import torch
import random
import scipy.sparse as sp
import sys
import dgl
from torch.nn import Module, Dropout
from dgl.nn.pytorch import GraphConv
from torch.nn.functional import relu, softmax
from transformers import AutoModel, AutoTokenizer
from torch.nn import Module, Linear
import os
import numpy as np
import pickle as pkl
import scipy.sparse as sp
from math import log
from sklearn import svm
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from torch.utils.data import DataLoader
from random import shuffle
import copy
from tqdm import tqdm
import collections
import math
from scipy.sparse import coo_matrix
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import nll_loss,cross_entropy
import torch
from numpy import mean
from torch import log
from torch.nn.functional import softmax
from torch.optim import lr_scheduler,AdamW

from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


### set data_set name 

In [6]:
# Vion VSMEC   ViCTSD 
dataet_name = 'ViCTSD'
f1_type = 'binary'
if dataet_name == 'VSMEC' or dataet_name == 'ViCTSD':
  f1_type = 'macro'
elif dataet_name == 'Vion' :
  f1_type = 'weighted'

### util 

In [7]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()



def get_file(choice):
    file_dict = {"R8": 'R8.txt', 'ohsumed': 'ohsumed.txt', '20news': "20ng.txt", 'mr': 'mr.txt','SVMC': 'SVMC.txt'}
    if choice not in file_dict.keys():
        raise FileNotFoundError
    return file_dict[choice]


def clean_str(string):
    # string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # string = re.sub(r"\'s", " \'s", string)
    # string = re.sub(r"\'ve", " \'ve", string)
    # string = re.sub(r"n\'t", " n\'t", string)
    # string = re.sub(r"\'re", " \'re", string)
    # string = re.sub(r"\'d", " \'d", string)
    # string = re.sub(r"\'ll", " \'ll", string)
    # string = re.sub(r",", " , ", string)
    # string = re.sub(r"!", " ! ", string)
    # string = re.sub(r"\(", " \( ", string)
    # string = re.sub(r"\)", " \) ", string)
    # string = re.sub(r"\?", " \? ", string)
    # string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index



def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)


def load_corpus(dataset_str):
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'adj']
    objects = []
    for i in range(len(names)):
        with open("/content/data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, adj = tuple(objects)
    print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)
    features = sp.vstack((allx, tx)).tolil()
    labels = np.vstack((ally, ty))
    print(len(labels))

    train_idx_orig = parse_index_file(
        "/content/data/{}.train.index".format(dataset_str))
    train_size = len(train_idx_orig)

    val_size = train_size - x.shape[0]
    test_size = tx.shape[0]

    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + val_size)
    idx_test = range(allx.shape[0], allx.shape[0] + test_size)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size
setup_seed(9)

## model


In [8]:
class finetunedphoBert(Module):
    def __init__(self, class_num):
        super().__init__()
        #bert-base-uncased
        # self.model = RobertaModel.from_pretrained('/content/PhoBERT_base_fairseq', checkpoint_file='model.pt', bpe='fastbpe', bpe_codes='/content/PhoBERT_base_fairseq/bpe.codes').eval()
        self.model = AutoModel.from_pretrained('vinai/phobert-base')
        # self.model = AutoModel.from_pretrained('bert-base-uncased')
        self.linear = Linear(768, class_num)

    def forward(self, input):
        predict = self.model(input).last_hidden_state[:,0,:]
        return self.linear(predict)

class GCN(Module):
    def __init__(self, class_num):
        super().__init__()
        self.Conv1 = GraphConv(768, 200, weight=True, activation=relu)
        self.Conv2 = GraphConv(200, class_num, weight=True)
        self.dropout=Dropout(0.1)
        self.dropout2=Dropout(0.2)


    def forward(self, graph: dgl.graph, feature: torch.tensor):
        predict1 = self.dropout(self.Conv1(graph, feature, edge_weight=graph.edata['w']))
        predict2 = self.dropout2(self.Conv2(graph, predict1, edge_weight=graph.edata['w']))
        return predict2


class phoBertGCN(Module):
    def __init__(self, pretrained_path, label_size, lam=0.3):
        """
        :param pretrained_path: path to pretrained bert model
        """
        super().__init__()
        self.phoBertModel = finetunedphoBert(label_size)
        self.phoBertModel.load_state_dict(torch.load(pretrained_path))
        self.gcn = GCN(label_size)
        self.lam = lam

    def forward(self, sentences, features,attention, graph, indexs):
        last_predict = self.phoBertModel.model(sentences,attention_mask=attention).last_hidden_state[:, 0, :]
        features[indexs] = last_predict.detach()
        gcn_predict = self.gcn(graph, features)
        bert_predict = self.phoBertModel.linear(last_predict)
        predict = softmax(gcn_predict[indexs], -1) * self.lam + (1 - self.lam) * softmax(bert_predict, -1)
        return predict

## buildgraph for training

In [9]:
import os
import random
import numpy as np
import pickle as pkl
# import networkx as nx
import scipy.sparse as sp
from math import log
from sklearn import svm
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine


datasets = ['20ng', 'R8', 'R52', 'ohsumed', 'mr']
# build corpus
dataset = dataet_name


# Read Word Vectors
# word_vector_file = 'data/glove.6B/glove.6B.300d.txt'
# word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
#_, embd, word_vector_map = loadWord2Vec(word_vector_file)
# word_embeddings_dim = len(embd[0])

word_embeddings_dim = 300
word_vector_map = {}

# shulffing
doc_name_list = []
doc_train_list = []
doc_test_list = []

with open('/content/data/' + dataset + '.txt', 'r',errors='replace') as f:
    lines = f.readlines()
    for line in lines:
        doc_name_list.append(line.strip())
        temp = line.split("\t")
        if temp[1].find('test') != -1:
            doc_test_list.append(line.strip())
        elif temp[1].find('train') != -1:
            doc_train_list.append(line.strip())
# print(doc_train_list)
# print(doc_test_list)

doc_content_list = []
with open('/content/data/corpus/' + dataset + '.txt','r',errors='replace') as f:
    lines = f.readlines()
    for line in lines:
        doc_content_list.append(line.strip())
# print(doc_content_list)

train_ids = []
for train_name in doc_train_list:
    train_id = doc_name_list.index(train_name)
    train_ids.append(train_id)
# print(train_ids)
random.shuffle(train_ids)

# partial labeled data
#train_ids = train_ids[:int(0.2 * len(train_ids))]

train_ids_str = '\n'.join(str(index) for index in train_ids)
with open('/content/data/' + dataset + '.train.index', 'w') as f:
    f.write(train_ids_str)


test_ids = []
for test_name in doc_test_list:
    test_id = doc_name_list.index(test_name)
    test_ids.append(test_id)
print(test_ids)
random.shuffle(test_ids)

test_ids_str = '\n'.join(str(index) for index in test_ids)
with open('/content/data/' + dataset + '.test.index', 'w') as f:
    f.write(test_ids_str)


ids = train_ids + test_ids
print(ids)
print(len(ids))

shuffle_doc_name_list = []
shuffle_doc_words_list = []
for id in ids:
    # print(id)
    shuffle_doc_name_list.append(doc_name_list[int(id)])
    shuffle_doc_words_list.append(doc_content_list[int(id)])
shuffle_doc_name_str = '\n'.join(shuffle_doc_name_list)
shuffle_doc_words_str = '\n'.join(shuffle_doc_words_list)

with open('/content/data/' + dataset + '_shuffle.txt', 'w') as f:
    f.write(shuffle_doc_name_str)

with open('/content/data/corpus/' + dataset + '_shuffle.txt', 'w') as f:
    f.write(shuffle_doc_words_str)


# build vocab
word_freq = {}
word_set = set()
for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    for word in words:
        word_set.add(word)
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

vocab = list(word_set)
vocab_size = len(vocab)

word_doc_list = {}

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    appeared = set()
    for word in words:
        if word in appeared:
            continue
        if word in word_doc_list:
            doc_list = word_doc_list[word]
            doc_list.append(i)
            word_doc_list[word] = doc_list
        else:
            word_doc_list[word] = [i]
        appeared.add(word)

word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

word_id_map = {}
for i in range(vocab_size):
    word_id_map[vocab[i]] = i

vocab_str = '\n'.join(vocab)

with open('/content/data/corpus/' + dataset + '_vocab.txt', 'w') as f:
    f.write(vocab_str)

label_set = set()
for doc_meta in shuffle_doc_name_list:
    temp = doc_meta.split('\t')
    label_set.add(temp[2])
label_list = list(label_set)

label_list_str = '\n'.join(label_list)
with open('/content/data/corpus/' + dataset + '_labels.txt', 'w') as f:
    f.write(label_list_str)


# x: feature vectors of training docs, no initial features
# slect 90% training set
train_size = len(train_ids)
val_size = int(0.1 * train_size)
real_train_size = train_size - val_size  # - int(0.5 * train_size)
# different training rates

real_train_doc_names = shuffle_doc_name_list[:real_train_size]
real_train_doc_names_str = '\n'.join(real_train_doc_names)

with open('/content/data/' + dataset + '.real_train.name', 'w') as f:
    f.write(real_train_doc_names_str)


row_x = []
col_x = []
data_x = []
for i in range(real_train_size):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_x.append(i)
        col_x.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_x.append(doc_vec[j] / doc_len)  # doc_vec[j]/ doc_len

# x = sp.csr_matrix((real_train_size, word_embeddings_dim), dtype=np.float32)
x = sp.csr_matrix((data_x, (row_x, col_x)), shape=(
    real_train_size, word_embeddings_dim))

y = []
for i in range(real_train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    label = temp[2]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    y.append(one_hot)
y = np.array(y)
print(y)

# tx: feature vectors of test docs, no initial features
test_size = len(test_ids)

row_tx = []
col_tx = []
data_tx = []
for i in range(test_size):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = shuffle_doc_words_list[i + train_size]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_tx.append(i)
        col_tx.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_tx.append(doc_vec[j] / doc_len)  # doc_vec[j] / doc_len

# tx = sp.csr_matrix((test_size, word_embeddings_dim), dtype=np.float32)
tx = sp.csr_matrix((data_tx, (row_tx, col_tx)),
                   shape=(test_size, word_embeddings_dim))

ty = []
for i in range(test_size):
    doc_meta = shuffle_doc_name_list[i + train_size]
    temp = doc_meta.split('\t')
    label = temp[2]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    ty.append(one_hot)
ty = np.array(ty)
print(ty)

# allx: the the feature vectors of both labeled and unlabeled training instances
# (a superset of x)
# unlabeled training instances -> words

word_vectors = np.random.uniform(-0.01, 0.01,
                                 (vocab_size, word_embeddings_dim))

for i in range(len(vocab)):
    word = vocab[i]
    if word in word_vector_map:
        vector = word_vector_map[word]
        word_vectors[i] = vector

row_allx = []
col_allx = []
data_allx = []

for i in range(train_size):
    doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embeddings_dim):
        row_allx.append(int(i))
        col_allx.append(j)
        # np.random.uniform(-0.25, 0.25)
        data_allx.append(doc_vec[j] / doc_len)  # doc_vec[j]/doc_len
for i in range(vocab_size):
    for j in range(word_embeddings_dim):
        row_allx.append(int(i + train_size))
        col_allx.append(j)
        data_allx.append(word_vectors.item((i, j)))


row_allx = np.array(row_allx)
col_allx = np.array(col_allx)
data_allx = np.array(data_allx)

allx = sp.csr_matrix(
    (data_allx, (row_allx, col_allx)), shape=(train_size + vocab_size, word_embeddings_dim))

ally = []
for i in range(train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    label = temp[2]
    one_hot = [0 for l in range(len(label_list))]
    label_index = label_list.index(label)
    one_hot[label_index] = 1
    ally.append(one_hot)

for i in range(vocab_size):
    one_hot = [0 for l in range(len(label_list))]
    ally.append(one_hot)

ally = np.array(ally)

print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)

'''
Doc word heterogeneous graph
'''

# word co-occurence with context windows
window_size = 20
windows = []

for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    length = len(words)
    if length <= window_size:
        windows.append(words)
    else:
        # print(length, length - window_size + 1)
        for j in range(length - window_size + 1):
            window = words[j: j + window_size]
            windows.append(window)
            # print(window)


word_window_freq = {}
for window in windows:
    appeared = set()
    for i in range(len(window)):
        if window[i] in appeared:
            continue
        if window[i] in word_window_freq:
            word_window_freq[window[i]] += 1
        else:
            word_window_freq[window[i]] = 1
        appeared.add(window[i])

word_pair_count = {}
for window in windows:
    for i in range(1, len(window)):
        for j in range(0, i):
            word_i = window[i]
            word_i_id = word_id_map[word_i]
            word_j = window[j]
            word_j_id = word_id_map[word_j]
            if word_i_id == word_j_id:
                continue
            word_pair_str = str(word_i_id) + ',' + str(word_j_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1
            # two orders
            word_pair_str = str(word_j_id) + ',' + str(word_i_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1

row = []
col = []
weight = []

# pmi as weights

num_window = len(windows)

for key in word_pair_count:
    temp = key.split(',')
    i = int(temp[0])
    j = int(temp[1])
    count = word_pair_count[key]
    word_freq_i = word_window_freq[vocab[i]]
    word_freq_j = word_window_freq[vocab[j]]
    pmi = log((1.0 * count / num_window) /
              (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
    if pmi <= 0:
        continue
    row.append(train_size + i)
    col.append(train_size + j)
    weight.append(pmi)

# word vector cosine similarity as weights

'''
for i in range(vocab_size):
    for j in range(vocab_size):
        if vocab[i] in word_vector_map and vocab[j] in word_vector_map:
            vector_i = np.array(word_vector_map[vocab[i]])
            vector_j = np.array(word_vector_map[vocab[j]])
            similarity = 1.0 - cosine(vector_i, vector_j)
            if similarity > 0.9:
                print(vocab[i], vocab[j], similarity)
                row.append(train_size + i)
                col.append(train_size + j)
                weight.append(similarity)
'''
# doc word frequency
doc_word_freq = {}

for doc_id in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[doc_id]
    words = doc_words.split()
    for word in words:
        word_id = word_id_map[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_id_map[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_size)
        col.append(train_size + j)
        idf = log(1.0 * len(shuffle_doc_words_list) /
                  word_doc_freq[vocab[j]])
        weight.append(freq * idf)
        doc_word_set.add(word)

node_size = train_size + vocab_size + test_size
adj = sp.csr_matrix(
    (weight, (row, col)), shape=(node_size, node_size))

# dump objects
with open("/content/data/ind.{}.x".format(dataset), 'wb') as f:
    pkl.dump(x, f)

with open("/content/data/ind.{}.y".format(dataset), 'wb') as f:
    pkl.dump(y, f)

with open("/content/data/ind.{}.tx".format(dataset), 'wb') as f:
    pkl.dump(tx, f)

with open("/content/data/ind.{}.ty".format(dataset), 'wb') as f:
    pkl.dump(ty, f)

with open("/content/data/ind.{}.allx".format(dataset), 'wb') as f:
    pkl.dump(allx, f)

with open("/content/data/ind.{}.ally".format(dataset), 'wb') as f:
    pkl.dump(ally, f)

with open("/content/data/ind.{}.adj".format(dataset), 'wb') as f:
    pkl.dump(adj, f)

[7000, 7001, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7009, 7010, 7011, 7012, 7013, 7014, 7015, 7016, 7017, 7018, 7019, 7020, 7021, 7022, 7023, 7024, 7025, 7026, 7027, 7028, 7029, 7030, 7031, 7032, 7033, 7034, 7035, 7036, 7037, 7038, 7039, 7040, 7041, 7042, 7043, 7044, 7045, 7046, 7047, 7048, 7049, 7050, 7051, 7052, 7053, 7054, 7055, 7056, 7057, 7058, 7059, 7060, 7061, 7062, 7063, 7064, 7065, 7066, 7067, 7068, 7069, 7070, 7071, 7072, 7073, 7074, 7075, 7076, 7077, 7078, 7079, 7080, 7081, 7082, 7083, 7084, 7085, 7086, 7087, 7088, 7089, 7090, 7091, 7092, 7093, 7094, 7095, 7096, 7097, 7098, 7099, 7100, 7101, 7102, 7103, 7104, 7105, 7106, 7107, 7108, 7109, 7110, 7111, 7112, 7113, 7114, 7115, 7116, 7117, 7118, 7119, 7120, 7121, 7122, 7123, 7124, 7125, 7126, 7127, 7128, 7129, 7130, 7131, 7132, 7133, 7134, 7135, 7136, 7137, 7138, 7139, 7140, 7141, 7142, 7143, 7144, 7145, 7146, 7147, 7148, 7149, 7150, 7151, 7152, 7153, 7154, 7155, 7156, 7157, 7158, 7159, 7160, 7161, 7162, 7163, 7164, 7165, 716

In [10]:
word_vector_map = {}

# shulffing
doc_name_list = []
doc_train_list = []
doc_test_list = []

with open('/content/data/' + dataset + '.txt', 'r',errors='replace') as f:
    lines = f.readlines()
    for line in lines:
        doc_name_list.append(line.strip())
        temp = line.split("\t")
        if temp[1].find('test') != -1:
            doc_test_list.append(line.strip())
        elif temp[1].find('train') != -1:
            doc_train_list.append(line.strip())
# print(doc_train_list)
# print(doc_test_list)

doc_content_list = []
with open('/content/data/corpus/' + dataset + '.txt','r',errors='replace') as f:
    lines = f.readlines()
    for line in lines:
        doc_content_list.append(line.strip())
# print(doc_content_list)

train_ids = []
for train_name in doc_train_list:
    train_id = doc_name_list.index(train_name)
    train_ids.append(train_id)
# print(train_ids)
# random.shuffle(train_ids)

### delete to freeram

In [11]:
# del(doc_name_list)
# del(doc_train_list)
# del(doc_test_list)
# del(lines)
# del(train_ids)
# del(train_ids_str)
# del(test_ids)
# del(test_ids_str)
# del(ids)
# del(shuffle_doc_name_list)
# del(shuffle_doc_words_list)
# del(word_doc_list)
# del(word_id_map)
# del(vocab_str)
# del(label_set)
# del(train_size)
# del(real_train_doc_names)
# del(real_train_doc_names_str)
# del(row_x)
# del(col_x)
# del(data_x)
# del(x)
# del(y)
# del(row_tx)
# del(col_tx)
# del(data_tx)
# del(tx)
# del(ty)
# del(word_vectors)
# del(row_allx)
# del(col_allx)
# del(data_allx)
# del(row_allx)
# del(col_allx)
# del(data_allx)
# del(allx)
# del(ally)
# del(windows)
# del(word_window_freq)
# del(word_pair_count)
# del(row)
# del(col)
# del(weight)
# del(num_window)
# del(doc_word_freq)
# del()

## training data loader

In [12]:
from torch.utils.data import Dataset
# from utils import load_corpus,normalize_adj
from scipy.sparse import eye
import torch
import dgl
def encode_input(text, tokenizer):
    input = tokenizer(text, max_length=256, truncation=True, padding='max_length', return_tensors='pt')
#     print(input.keys())
    return input.input_ids, input.attention_mask
class Data_set(Dataset):
    def __init__(self,name):
        adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(name)
        doc_mask=train_mask+test_mask+val_mask
        adj=normalize_adj(adj+eye(adj.shape[0]))
        train_num=train_mask.sum().item()
        val_num=val_mask.sum().item()
        test_num=test_mask.sum().item()
        node_size=adj.shape[0]
        y=torch.tensor(y_train+y_val+y_test)
        self.y=torch.argmax(y,-1)

        self.train_index=[i for i in range(train_num+val_num)] + [i for i in range(node_size-test_num,node_size)]
        corpse_file = open('/content/data/corpus/' + name +'_shuffle.txt').readlines()
        token=AutoTokenizer.from_pretrained('vinai/phobert-base')
        # token=AutoTokenizer.from_pretrained('bert-base-uncased')
        
        self.dataset,self.attention_mask=encode_input(corpse_file,token)
        self.attention_mask=torch.tensor(self.attention_mask)
        self.dataset=torch.tensor(self.dataset)
        self.graph=dgl.from_scipy(adj,eweight_name='w')
        self.graph.ndata['label']=self.y
        self.label_num=len(y_train[0])
        self.graph.edata['w']=self.graph.edata['w'].float()
        self.graph.ndata['train_mask']=torch.tensor(train_mask)
        self.graph.ndata['valid_mask']=torch.tensor(val_mask)
        self.graph.ndata['test_mask']=torch.tensor(test_mask)
        self.train_mask=train_mask
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, item):
        return self.dataset[item],self.y[self.train_index[item]],self.attention_mask[item],self.train_mask[self.train_index[item]],self.train_index[item]

## pretrain(fine tune) data loader

In [13]:
from torch.utils.data import Dataset
from torch import tensor
import pickle


class DataSet(Dataset):
    def __init__(self, name, usage, label_dict=None):
        tokenizers = AutoTokenizer.from_pretrained('vinai/phobert-base')
        # tokenizers = AutoTokenizer.from_pretrained('bert-base-uncased')
        
        # data=open(usage+'_data.txt').readlines()
        # label=open(usage+'_label.txt').readlines()
        # self.data=[tokenizers.encode(each,max_length=512) for each in data]
        # self.label=[int(i) for i in label]
        # self.class_num=23
        label_path = '/content/data/' + name + '_labels.pkl'
        indexs = '/content/data/' + name + '_indexs.pkl'
        current_usage = '/content/data/' + name + '_' + usage + '_index.pkl'
        current_usage = pickle.load(open(current_usage, 'rb'))
        orig_data_path = '/content/data/corpus/' + name + '.txt'
        orig_data = open(orig_data_path).readlines()
        indexs = pickle.load(open(indexs, 'rb'))
        labels = pickle.load(open(label_path, 'rb'))
        orig_data = [orig_data[i] for i in indexs]
        orig_data = [tensor(tokenizers.encode(each, max_length=256,truncation=True)) for each in orig_data]
        self.data = [orig_data[i] for i in current_usage]
        self.label = [labels[i] for i in current_usage]
        self.class_num = len(pickle.load(open('/content/data/' + name + '_label_dict.pkl', 'rb')))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return tensor(self.data[item]), tensor(self.label[item])
# data=DataSet('ohsumed','train')

## remove stop words

In [14]:
from nltk.corpus import stopwords
import nltk
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
# from utils import clean_str
import sys
from tqdm import tqdm


dataset = dataet_name


nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

# Read Word Vectors
# word_vector_file = 'data/glove.6B/glove.6B.200d.txt'
# vocab, embd, word_vector_map = loadWord2Vec(word_vector_file)
# word_embeddings_dim = len(embd[0])
# dataset = '20ng'

doc_content_list = []
# with open('data/wiki_long_abstracts_en_text.txt', 'r') as f:
with open('/content/data/corpus/' + dataset + '.txt', 'rb') as f:
    for line in tqdm(f.readlines()):
        doc_content_list.append(line.strip().decode('latin1'))

word_freq = {}  # to remove rare words

for doc_content in tqdm(doc_content_list):
    temp = clean_str(doc_content)
    words = temp.split()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

clean_docs = []
for doc_content in tqdm(doc_content_list):
    temp = clean_str(doc_content)
    words = temp.split()
    doc_words = []
    for word in words:
        # word not in stop_words and word_freq[word] >= 5
        if dataset == 'mr':
            doc_words.append(word)
        elif word not in stop_words and word_freq[word] >= 5:
            doc_words.append(word)

    doc_str = ' '.join(doc_words).strip()
    # if doc_str == '':
    # doc_str = temp
    clean_docs.append(doc_str)

clean_corpus_str = '\n'.join(clean_docs)

with open('/content/data/corpus/' + dataset + '.clean.txt', 'w') as f:
    f.write(clean_corpus_str)

# dataset = '20ng'
min_len = 10000
aver_len = 0
max_len = 0

with open('/content/data/corpus/' + dataset + '.clean.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        temp = line.split()
        aver_len = aver_len + len(temp)
        if len(temp) < min_len:
            min_len = len(temp)
        if len(temp) > max_len:
            max_len = len(temp)

aver_len = 1.0 * aver_len / len(lines)
print('Min_len : ' + str(min_len))
print('Max_len : ' + str(max_len))
print('Average_len : ' + str(aver_len))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


{'have', 'her', 'hadn', "that'll", 'wouldn', 'some', 'on', 'a', 'is', 'over', 'ourselves', 'than', 'your', 'our', "you're", 'will', 'each', 'again', 'did', "shan't", "she's", "wasn't", 'do', 'or', 'you', 'their', 'such', 'which', 'because', "don't", 'its', 'nor', 'shouldn', 'his', 'out', 'being', "you'd", "won't", 'all', 'if', 'between', 'shan', 'few', 'am', 'the', 'these', "wouldn't", 'mustn', 'didn', 'that', "you'll", 'very', 'what', 'most', 'hasn', 'ours', 'them', 'once', 'own', 'couldn', "hasn't", 'hers', 'wasn', 'ma', 'o', 'haven', 'theirs', 'up', 'then', 'doing', 'during', 'in', 'can', 'after', 'where', 'does', 'to', 'themselves', 'an', "haven't", 'with', 'no', 'they', 'i', 'who', 'were', 'itself', 'both', "mustn't", "needn't", 'when', "weren't", 'ain', 'isn', 'here', 'aren', 'from', "couldn't", 'm', "shouldn't", 'd', "it's", 'yourselves', "didn't", 'whom', 'he', 'into', 'll', 'him', 're', "mightn't", 'of', 'under', 'only', 'so', 'those', "isn't", 'too', 'myself', 'below', 'same'

100%|██████████| 8000/8000 [00:00<00:00, 687506.29it/s]
100%|██████████| 8000/8000 [00:00<00:00, 85196.83it/s]
100%|██████████| 8000/8000 [00:00<00:00, 63286.01it/s]

Min_len : 0
Max_len : 325
Average_len : 28.0705





In [15]:
doc_content_list = []
dataset = 'VSMEC'
# with open('data/wiki_long_abstracts_en_text.txt', 'r') as f:
with open('/content/data/corpus/' + dataset + '.txt', 'rb') as f:
    for line in tqdm(f.readlines()):
        doc_content_list.append(line.strip().decode('utf-8'))

word_freq = {}  # to remove rare words
c = []
for doc_content in tqdm(doc_content_list):
    temp = clean_str(doc_content)
    words = temp.split()
    c.append(words)


100%|██████████| 6241/6241 [00:00<00:00, 803408.36it/s]
100%|██████████| 6241/6241 [00:00<00:00, 48246.20it/s]


In [16]:
c[0]

['cho', 'mình', 'xin', 'bài', 'nhạc', 'tên', 'là', 'gì', 'với', 'ạ', 'khác']

In [17]:
doc_content_list[0]

'cho mình xin bài nhạc tên là gì với ạ khác'

## tokenize sentence 

In [18]:
def convert_data_to_index(context):
    dictionary = {}
    index_context = []
    context_length = len(context)
    for line in context:
        line = line.strip().lower().split(' ')
        for word in line:
            if word not in dictionary.keys():
                dictionary[word] = len(dictionary.keys()) + context_length
        index_context.append([dictionary[each] for each in line])
    return dictionary, index_context


def word_window_num(windows):
    single_word_fluency = collections.defaultdict(int)
    tuple_word_fluency = collections.defaultdict(int)
    for window in tqdm(windows):
        current_appear = set()
        for i in range(len(window)):
            if window[i] not in current_appear:
                single_word_fluency[window[i]] += 1
                current_appear.add(window[i])
            for j in range(i + 1, len(window)):
                str1=str(window[i])+','+str(window[j])
                str2=str(window[j])+','+str(window[i])
                if window[i] == window[j]:
                    continue
                if str1 in current_appear or str2 in current_appear:
                    continue
                else:
                    tuple_word_fluency[str1] += 1
                    tuple_word_fluency[str2] += 1
                    current_appear.add(str1)
                    current_appear.add(str2)
    return single_word_fluency, tuple_word_fluency


def word_document(indexed_document, start, end, weight):
    word_document = collections.defaultdict(int)
    for document in indexed_document:
        appear = set()
        for word in document:
            if word not in appear:
                word_document[word] += 1
                appear.add(word)
    for i in range(len(indexed_document)):
        word_dict = collections.defaultdict(int)
        line = indexed_document[i]
        for word in line:
            word_dict[word] += 1
        for key in word_dict.keys():
            start.append(i)
            end.append(key)
            TF = word_dict[key] / len(indexed_document[i])
            IDF = math.log(len(indexed_document) / word_document[key])
            weight.append(TF * IDF)
            start.append(key)
            end.append(i)
            weight.append(TF*IDF)

    return start, end, weight


def build_graph(index_context: list, window_size):
    windows = []
    for line in tqdm(index_context):
        if len(line) <= window_size:
            windows.append(line)
        else:
            for i in range(len(line) - window_size + 1):
                windows.append(line[i:i + window_size])
    print('generating word relations')
    single_word_fluency, tuple_word_fluency = word_window_num(windows)
    window_num = len(windows)
    start = []
    end = []
    weight = []
    for sen in tuple_word_fluency.keys():
        s,t=sen.split(',')
        s,t=int(s),int(t)
        score = math.log(tuple_word_fluency[str(s)+','+str(t)] / window_num / (
                single_word_fluency[s] / window_num * single_word_fluency[t] / window_num))
        if score < 0:
            continue
        start.append(s)
        end.append(t)
        weight.append(score)
    start, end, weight = word_document(index_context, start, end, weight)
    return start, end, weight

dataset = dataet_name
file_path = '/content/data/' + dataset + '.txt'
with open(file_path) as f:
    lines = f.readlines()
titles = lines
orig_titles = copy.deepcopy(titles)
content_path = '/content/data/corpus/' + dataset + '.clean.txt'
content = open(content_path).readlines()
shuffle(titles)
indexs = [orig_titles.index(each) for each in titles]
content=[content[i] for i in indexs]
dictonary, index_data = convert_data_to_index(content)
train_index = []
test_index = []
label_dict = {}
labels = []
for i in range(len(indexs)):
    line = titles[i].strip().split()
    if line[1] == 'train':
        train_index.append(i)
    else:
        test_index.append(i)
    label = line[-1]
    if label not in label_dict.keys():
        label_dict[label] = len(label_dict)
    labels.append(label_dict[label])
shuffle(train_index)
valid_index = train_index[int(len(train_index) * 0.9):]
train_index = train_index[:int(len(train_index) * 0.9)]
start, end, weight = build_graph(index_data, 20)
matrix = coo_matrix((weight, (start, end)))

pickle.dump(matrix, open('/content/data/' + dataset + '_matrix.pkl', 'wb'))
pickle.dump(indexs, open('/content/data/' + dataset + '_indexs.pkl', 'wb'))
pickle.dump(train_index, open('/content/data/' +dataset + '_train_index.pkl', 'wb'))
pickle.dump(test_index, open('/content/data/' + dataset + '_test_index.pkl', 'wb'))
pickle.dump(valid_index, open('/content/data/' + dataset + '_valid_index.pkl', 'wb'))
pickle.dump(labels, open('/content/data/' + dataset + '_labels.pkl', 'wb'))
pickle.dump(dictonary, open('/content/data/' + dataset + '_dict.pkl', 'wb'))
pickle.dump(label_dict, open('/content/data/' +dataset + '_label_dict.pkl', 'wb'))

100%|██████████| 8000/8000 [00:00<00:00, 18009.44it/s]


generating word relations


100%|██████████| 108283/108283 [00:35<00:00, 3040.60it/s]


## finetune phobert

In [19]:
def collate_fn(batch):
    src, label = [], []
    for s, t in batch:
        src.append(s)
        label.append(t)
    src = pad_sequence(src, batch_first=True, padding_value=0)
    label = torch.tensor(label)
    return src, label


def train(i, model, optim, data_loader, device):
    model.train()
    losses = []
    correct = 0
    total=0
    for src, trg in tqdm(data_loader):
        optim.zero_grad()
        src = src.to(device)
        trg = trg.to(device)
        predict = model(src)
        loss = cross_entropy(predict, trg.long())
        loss.backward()
        optim.step()
        losses.append(loss.item())
        correct += (torch.argmax(predict, -1) == trg).sum().item()
        total+=predict.shape[0]
    print("train epoch {} accuracy {} || loss {}".format(i, correct / total, mean(losses)))


def eval(i, model, best_loss, no_increase, data_loader, device):
    model.eval()
    losses = []
    correct = 0
    total = 0
    for src, trg in tqdm(data_loader):
        src = src.to(device)
        trg = trg.to(device)
        predict = model(src)
        loss = cross_entropy(predict, trg.long())
        losses.append(loss.item())
        correct += (torch.argmax(predict, -1) == trg).sum().item()
        total += predict.shape[0]
    loss = mean(losses)
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), 'best_pretrain_phobert.pkl')
        no_increase = 0
    else:
        no_increase += 1
    print("eval epoch {} accuracy {} || loss {}".format(i, correct / total, mean(losses)))
    return best_loss, no_increase


def test( model, data_loader, device):
    model.eval()
    losses = []
    correct = 0
    total = 0
    for src, trg in tqdm(data_loader):
        src = src.to(device)
        trg = trg.to(device)
        predict = model(src)
        loss = cross_entropy(predict, trg.long())
        losses.append(loss.item())
        correct += (torch.argmax(predict, -1) == trg).sum().item()
        total += predict.shape[0]
    loss = mean(losses)
    print("test accuracy {} || loss {}".format( correct / total, mean(losses)))


dataset = dataet_name
# args = get_args()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data = DataSet(dataset, 'train')
train_loader = DataLoader(data, collate_fn=collate_fn, batch_size=7,shuffle=True)
valid_data = DataSet(dataset, 'valid')
val_loader = DataLoader(valid_data, collate_fn=collate_fn, batch_size=7)
test_data=DataSet(dataset,'test')
test_loader=DataLoader(test_data,collate_fn=collate_fn,batch_size=7)




Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
model=finetunedphoBert(data.class_num)
model = model.to(device)
optim = AdamW(model.parameters(), lr=2e-5)
best_loss = 1e10
no_increasing = 0
for i in range(12):
    train(i, model, optim, train_loader, device)
    with torch.no_grad():
        no_increasing, best_loss = eval(i, model, best_loss, no_increasing, val_loader, device)
        if no_increasing>3:
            break
model.load_state_dict(torch.load('best_pretrain_phobert.pkl'))
with torch.no_grad():
    test(model,test_loader,device)

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 900/900 [02:31<00:00,  5.96it/s]


train epoch 0 accuracy 0.9214285714285714 || loss 0.1780990522631651


100%|██████████| 100/100 [00:04<00:00, 24.44it/s]


eval epoch 0 accuracy 0.9442857142857143 || loss 0.11331650348904077


100%|██████████| 900/900 [02:29<00:00,  6.02it/s]


train epoch 1 accuracy 0.9577777777777777 || loss 0.09311121890981061


100%|██████████| 100/100 [00:04<00:00, 24.43it/s]


eval epoch 1 accuracy 0.9628571428571429 || loss 0.0859376654837979


100%|██████████| 900/900 [02:28<00:00,  6.04it/s]


train epoch 2 accuracy 0.9663492063492064 || loss 0.07665733697125689


100%|██████████| 100/100 [00:04<00:00, 24.50it/s]


eval epoch 2 accuracy 0.9585714285714285 || loss 0.08636046658153645


100%|██████████| 900/900 [02:28<00:00,  6.07it/s]


train epoch 3 accuracy 0.9763492063492063 || loss 0.056068493031392184


100%|██████████| 100/100 [00:04<00:00, 24.55it/s]


eval epoch 3 accuracy 0.9642857142857143 || loss 0.08689971587344189


100%|██████████| 900/900 [02:29<00:00,  6.03it/s]


train epoch 4 accuracy 0.9855555555555555 || loss 0.035119901684617316


100%|██████████| 100/100 [00:04<00:00, 24.56it/s]


eval epoch 4 accuracy 0.9457142857142857 || loss 0.1359344152547419


100%|██████████| 900/900 [02:27<00:00,  6.09it/s]


train epoch 5 accuracy 0.9904761904761905 || loss 0.02347027996807204


100%|██████████| 100/100 [00:04<00:00, 24.47it/s]


eval epoch 5 accuracy 0.9514285714285714 || loss 0.17120389128263924


100%|██████████| 900/900 [02:28<00:00,  6.06it/s]


train epoch 6 accuracy 0.9938095238095238 || loss 0.017921170811379044


100%|██████████| 100/100 [00:04<00:00, 24.63it/s]


eval epoch 6 accuracy 0.9271428571428572 || loss 0.23790726555511355


100%|██████████| 900/900 [02:28<00:00,  6.06it/s]


train epoch 7 accuracy 0.9895238095238095 || loss 0.029809769124840388


100%|██████████| 100/100 [00:04<00:00, 24.44it/s]


eval epoch 7 accuracy 0.9585714285714285 || loss 0.16337823301975732


100%|██████████| 900/900 [02:28<00:00,  6.05it/s]


train epoch 8 accuracy 0.9955555555555555 || loss 0.014767863418932797


100%|██████████| 100/100 [00:04<00:00, 24.59it/s]


eval epoch 8 accuracy 0.9557142857142857 || loss 0.15678860538959272


100%|██████████| 900/900 [02:28<00:00,  6.05it/s]


train epoch 9 accuracy 0.9968253968253968 || loss 0.007546681734869102


100%|██████████| 100/100 [00:04<00:00, 24.54it/s]


eval epoch 9 accuracy 0.9542857142857143 || loss 0.24653905410450533


100%|██████████| 900/900 [02:28<00:00,  6.05it/s]


train epoch 10 accuracy 0.996984126984127 || loss 0.011639417441458338


100%|██████████| 100/100 [00:04<00:00, 24.64it/s]


eval epoch 10 accuracy 0.9571428571428572 || loss 0.1768391746483394


  7%|▋         | 64/900 [00:10<02:19,  6.01it/s]


KeyboardInterrupt: ignored

# trainning process

In [21]:
setup_seed(9)
def train(i, dataset, model: phoBertGCN, optim, features, graph, device):
    print(' epoch ',i+1)
    model.train()
    losses = []
    correct = 0
    total = 0
    increase=0
    for src, label, attention,mask, index in tqdm(dataset):
        mask = mask.to(device)
        src = src.to(device)
        attention=attention.to(device)
        label = label.to(device)
        predict = model(src, features,attention, graph, index)
        predict = predict[mask]
        label = label[mask]
        if predict.shape[0] == 0:
            continue
        # print(predict,label)
        loss = nll_loss(torch.log(predict), label)
        loss.backward(retain_graph=True)
        increase+=1
        if increase%3==0:
            optim.step()
            optim.zero_grad()
            increase=0
        total += predict.shape[0]
        correct += (torch.argmax(predict, -1) == label).sum().item()
        losses.append(loss.item())
        a = torch.Tensor.cpu(torch.argmax(predict, -1))
        b = torch.Tensor.cpu(label)
    print("training set : (loss {} || accuracy {}  || f1_score {})".format(mean(losses), correct / total,f1_score(a,b,average= f1_type)))

In [22]:
def update_features(features, dataset, model, device):
    print('update featre')
    with torch.no_grad():
        model.eval()
        for src, label,attention, mask, idx in tqdm(dataset):
            src = src.to(device)
            attention=attention.to(device)
            current_features = model.phoBertModel.model(src,attention_mask=attention).last_hidden_state[:, 0, :]
            features[idx] = current_features.detach()
    return features
def eval(i, dataset, model: phoBertGCN, features, graph, usage, device, best_loss=None, best_accuracy=None,
         no_increasing=None):
    model.eval()
    if usage == 'valid':
        mask = graph.ndata['valid_mask']
    else:
        mask = graph.ndata['test_mask']
    mask = (mask == 1)
    if usage == 'test':
        model.load_state_dict(torch.load('best_phoBert_GCN_model.pkl'))
        features = update_features(features, dataset, model, device)
    predict = model.phoBertModel.linear(features)
    graph_predict = model.gcn(graph, features)
    predict = softmax(predict[mask], -1) * (1 - model.lam) + softmax(graph_predict[mask], -1) * model.lam
    label = graph.ndata['label']
    loss = nll_loss(torch.log(predict), label[mask])
    correct = (torch.argmax(predict, -1) == label[mask]).sum().item()
    # print(label[mask])
    a = torch.Tensor.cpu(torch.argmax(predict, -1))
    b = torch.Tensor.cpu(label[mask])
    # print(torch.argmax(predict, -1)
    total = sum(mask).item()
    print("{} loss {} || accuracy {} || f1_score {}".format(usage, loss.item(), correct / total,f1_score(a,b,average= 'macro')))
    if usage == 'valid':
        if best_loss > loss.item():
            best_loss = loss.item()
            no_increasing = 0
            torch.save(model.state_dict(), 'best_phoBert_GCN_model.pkl')
            print("saving to file best_phoBert_GCN_model.pkl")

        else:
            no_increasing += 1
        if best_accuracy < correct / total:
            best_accuracy = correct / total
            torch.save(model.state_dict(), 'best_accuracy.pkl')
            print("saving to file best_accuracy.pkl")
        return best_loss, best_accuracy, no_increasing

In [23]:
warnings.filterwarnings('ignore')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dataset = Data_set(dataet_name)
graph = dataset.graph.to(device)
data_loader = DataLoader(dataset,  batch_size=7, shuffle=True)
features = torch.zeros(graph.num_nodes(), 768, requires_grad=False).to(device)
model = phoBertGCN('./best_pretrain_phobert.pkl', dataset.label_num)
model = model.to(device)
optim = torch.optim.Adam([{'params': model.gcn.parameters(), 'lr': 1e-3}, {'params': model.phoBertModel.parameters(), 'lr': 1e-5}])
scheduler=lr_scheduler.MultiStepLR(optim,milestones=[30],gamma=0.1)
features = update_features(features, data_loader, model, device)
best_loss = 1e10
no_increasing = 0
best_accuracy = 0
for i in range(6):
    torch.cuda.empty_cache()
    train(i, data_loader, model, optim, features, graph, device)
    scheduler.step()
    torch.cuda.empty_cache()
    with torch.no_grad():
        features = update_features(features, data_loader, model, device)
        best_loss, best_accuracy, no_increasing = eval(i, data_loader, model, features, graph, 'valid', device,
                                                        best_loss,
                                                        best_accuracy,
                                                        no_increasing)
    if no_increasing >= 10:
        break  # for i in range(20):
with torch.no_grad():
    eval(0, data_loader, model, features, graph, 'test', device, best_loss, best_accuracy, no_increasing)

(6300, 300) (6300, 2) (1000, 300) (1000, 2) (25310, 300) (25310, 2)
26310


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


update featre


100%|██████████| 1143/1143 [01:53<00:00, 10.11it/s]


 epoch  1


100%|██████████| 1143/1143 [06:42<00:00,  2.84it/s]


training set : (loss 0.8503361744700246 || accuracy 0.5914285714285714  || f1_score 0.37499999999999994)
update featre


100%|██████████| 1143/1143 [01:52<00:00, 10.17it/s]


valid loss 0.7488362193107605 || accuracy 0.6371428571428571 || f1_score 0.38917975567190227
saving to file best_phoBert_GCN_model.pkl
saving to file best_accuracy.pkl
 epoch  2


100%|██████████| 1143/1143 [06:42<00:00,  2.84it/s]


training set : (loss 0.5179453734976621 || accuracy 0.7801587301587302  || f1_score 0.42857142857142855)
update featre


100%|██████████| 1143/1143 [01:52<00:00, 10.15it/s]


valid loss 0.20643958449363708 || accuracy 0.9771428571428571 || f1_score 0.9750214102198117
saving to file best_phoBert_GCN_model.pkl
saving to file best_accuracy.pkl
 epoch  3


100%|██████████| 1143/1143 [06:42<00:00,  2.84it/s]


training set : (loss 0.1815943146940917 || accuracy 0.9922222222222222  || f1_score 1.0)
update featre


100%|██████████| 1143/1143 [01:52<00:00, 10.16it/s]


valid loss 0.17127282917499542 || accuracy 0.9914285714285714 || f1_score 0.9907154415470727
saving to file best_phoBert_GCN_model.pkl
saving to file best_accuracy.pkl
 epoch  4


 98%|█████████▊| 1122/1143 [06:35<00:07,  2.84it/s]


KeyboardInterrupt: ignored

In [24]:
# model = phoBertGCN('/content/best_phoBert_GCN_model.pkl', dataset.label_num)

with torch.no_grad():
    eval(0, data_loader, model, features, graph, 'test', device, best_loss, best_accuracy, no_increasing)

update featre


100%|██████████| 1143/1143 [01:53<00:00, 10.07it/s]


test loss 0.36304208636283875 || accuracy 0.876 || f1_score 0.8647847023518739
