In [1]:
import cl_graph_bert as cgm
import torch
from torch import nn
import json
from transformers import BertModel, BertConfig, BertTokenizer, AdamW
import tqdm

from torch.utils.data import Dataset, DataLoader
# '''
# class ReviewDataset(torch.utils.data.Dataset):
#     def __init__(self, tokens, labels, ids):
#         self.tokens = tokens
#         self.labels = labels
#         self.ids = ids
        
#     def __len__(self):
#         return len(self.tokens['input_ids'])

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}
#         out = {"tokens": item, "label": self.labels[idx], "id": self.ids[idx]}
#         return out

# def process_text(filepath, batch_size):
#     reviews = []
#     data = open(filepath)
#     for line in data.readlines():
#         reviews.append(json.loads(line))

#     review_texts = []
#     review_scores = []

#     for sample in reviews:
#         if 'reviewText' in sample and 'overall' in sample:
#             review_texts.append(sample['reviewText'])
#             if sample['overall'] >= 4:
#                 review_scores.append(1)
#             else:
#                 review_scores.append(0)
                
#     train_reviews = review_texts[:len(review_texts)//2]
#     train_ids = [i for i in range(0, len(review_texts)//2+1)]
#     test_reviews = review_texts[len(review_texts)//2:]
#     test_ids = [i for i in range(len(review_texts)//2, len(review_texts))]
#     train_scores = review_scores[:len(review_texts)//2]
#     test_scores = review_scores[len(review_texts)//2:]

#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#     tokenized_train_reviews = tokenizer(train_reviews, return_tensors="pt", padding='max_length', truncation=True)
#     tokenized_test_reviews = tokenizer(test_reviews, return_tensors="pt", padding='max_length', truncation=True)

#     train_dataset = ReviewDataset(tokenized_train_reviews, train_scores, train_ids)
#     test_dataset = ReviewDataset(tokenized_test_reviews, test_scores, test_ids)

#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
#     return train_loader, test_loader
# '''

In [2]:
import dgl
g = dgl.load_graphs("./graphs/industrial_and_scientific_5_core_new.dgl")[0][0]

train_nids = g.ndata["train_mask"]
test_nids = g.ndata['test_mask']

sampler = dgl.dataloading.NeighborSampler([5, 10], prefetch_labels = {'Review': ['input_ids', 'attention_mask', 'token_type_ids']})

In [3]:
import math
num_reviews = g.num_nodes("Review")

In [4]:
train_nids = {}
test_nids = {}
for ntype in g.ntypes:
    if ntype != "Review":
        pass
    else:
        train_nids[ntype] = g.nodes(ntype)[g.ndata["train_mask"]["Review"] == 1]
        test_nids[ntype] = g.nodes(ntype)[g.ndata["test_mask"]["Review"] == 1]

In [5]:
batch_size = 24

train_loader = dgl.dataloading.DataLoader(
    g, train_nids, sampler,
    batch_size=batch_size,
    shuffle=True,
    drop_last=False,
    num_workers=0)

test_loader = dgl.dataloading.DataLoader(
    g, test_nids, sampler,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0)

In [6]:
model = cgm.CLIPGraphModel(
    rel_types = g.etypes,
    emb_types = {x: g.number_of_nodes(x) for x in g.ntypes},
    device="cuda"
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
device = 'cuda'
model.to(device)
# model.graph_model.load_state_dict(torch.load("./pretrained_statedict_0.02531265653669834.pt", map_location=torch.device(device)))
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([{"params":model.language_model.parameters(),"lr": 0.00001},
                              {"params":model.graph_model.parameters(), "lr": 0.00001},
                              {"params":model.language_projection.parameters(), "lr": 0.001},
                              {"params":model.graph_projection.parameters(), "lr": 0.001}])

In [8]:
epochs = 1

model.train()

for epoch in range(epochs):
    epoch_loss = 0
    for batch, labels, blocks in tqdm.tqdm(train_loader):            
        optimizer.zero_grad()
        blocks = [b.to(device) for b in blocks]
        loss = model("Review", blocks)["loss"]    
        epoch_loss += loss
        
        loss.backward()
        optimizer.step()
    epoch_loss /= len(train_loader)
    val_loss = 0
    with torch.no_grad():
        for batch, labels, blocks in tqdm.tqdm(test_loader):
            blocks = [b.to(device) for b in blocks]
            loss = model("Review", blocks)["loss"]    

            val_loss += loss
    val_loss /= len(test_loader)
    print("End of Epoch", epoch)
    print("Training loss:", epoch_loss)
    print("Test loss:", val_loss)

100%|█████████████████████████████████████████████████████████| 1606/1606 [11:11<00:00,  2.39it/s]
100%|█████████████████████████████████████████████████████████| 1606/1606 [04:24<00:00,  6.06it/s]

End of Epoch 0
Training loss: tensor(3.1840, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
Test loss: tensor(3.1775, device='cuda:0', dtype=torch.float64)





In [9]:
class BertGraphMLP(nn.Module):
    def __init__(self, model, encoder="language_emb", num_labels=2, finetune=False):
        super(BertGraphMLP, self).__init__()
        self.num_labels = num_labels
        self.BertGraph = model
        self.encoder = encoder
        self.classifier = nn.Linear(512, num_labels)
        self.finetune = finetune
       
        if self.finetune:
            self.BertGraph.eval()

    def forward(self, vertex_type, blocks):
        if self.finetune:
            out = self.BertGraph(vertex_type, blocks)
        else:
            with torch.no_grad():
                out = self.BertGraph(vertex_type, blocks)
        if self.encoder == "language_emb":
            embs = out["language_emb"]
        elif self.encoder == "graph_emb":
            embs = out["graph_emb"]
        elif self.encoder == "mean":
            embs = (out["language_emb"] + out["graph_emb"])/2
        else:
            raise("Not Implemented")
        logits = self.classifier(embs.float())
        return logits


In [10]:
import torch.nn.functional as F

def evaluate_review_sentiment(model, encoder, finetune=False):
    eval_model = BertGraphMLP(model, encoder, finetune=finetune)

    loss_function = torch.nn.CrossEntropyLoss(weight=torch.tensor([0.877, 1-0.877]).to('cuda')) # 
    optimizer = torch.optim.Adam(
        [
            {"params":eval_model.classifier.parameters(), "lr": 0.001},       
    ])

    epochs = 2
    device= 'cuda'
    eval_model.to(device)
    if not(finetune):
        eval_model.BertGraph.eval()
    preds = []
    labels = g.nodes["Review"].data["Positive"]
    
    
    for epoch in range(epochs):
        epoch_loss = 0
        for batch, ids, blocks in tqdm.tqdm(train_loader):
            optimizer.zero_grad()  
            blocks = [b.to(device) for b in blocks]
            embs = eval_model("Review", blocks)  
            loss = loss_function(embs, labels[ids['Review']].to(torch.int64).to(device)) / batch_size
            epoch_loss += loss
            loss.backward()
            optimizer.step()    
        print(epoch_loss/len(train_loader))

    eval_model.eval()
    preds = []
    test_labels = []
    with torch.no_grad():
        for batch, ids, blocks in tqdm.tqdm(test_loader):
            blocks = [b.to(device) for b in blocks]
            embs = eval_model("Review", blocks)  
            preds.extend(torch.round(F.softmax(embs)[:,1]).to(torch.int64))
            test_labels.extend(labels[ids['Review']].to(torch.int64))

    from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

    preds_cpu = [i.to('cpu') for i in preds]
    labels_cpu = [i.to('cpu') for i in test_labels]
    print("End of training f1 score accuracy", f1_score(labels_cpu, preds_cpu))
    print("End of training accuracy", accuracy_score(labels_cpu, preds_cpu))
    print("End of training precision", precision_score(labels_cpu, preds_cpu))
    print("End of training recall", recall_score(labels_cpu, preds_cpu))

In [11]:
enc_methods = ["language_emb", "graph_emb", "mean"]

for method in enc_methods:
    print("====== EVALUATING:", method, "======")
    evaluate_review_sentiment(model, method)



100%|█████████████████████████████████████████████████████████| 1606/1606 [04:05<00:00,  6.54it/s]


tensor(0.0284, device='cuda:0', grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████| 1606/1606 [04:06<00:00,  6.52it/s]


tensor(0.0283, device='cuda:0', grad_fn=<DivBackward0>)


  preds.extend(torch.round(F.softmax(embs)[:,1]).to(torch.int64))
100%|█████████████████████████████████████████████████████████| 1606/1606 [04:05<00:00,  6.54it/s]


End of training f1 score accuracy 0.934325344691679
End of training accuracy 0.8767453932001038
End of training precision 0.8767453932001038
End of training recall 1.0


100%|█████████████████████████████████████████████████████████| 1606/1606 [04:05<00:00,  6.54it/s]


tensor(0.0286, device='cuda:0', grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████| 1606/1606 [04:05<00:00,  6.54it/s]


tensor(0.0284, device='cuda:0', grad_fn=<DivBackward0>)


  preds.extend(torch.round(F.softmax(embs)[:,1]).to(torch.int64))
100%|█████████████████████████████████████████████████████████| 1606/1606 [04:05<00:00,  6.55it/s]


End of training f1 score accuracy 0.9281881906991652
End of training accuracy 0.866260057098365
End of training precision 0.8769222666947546
End of training recall 0.9858204316035641


100%|█████████████████████████████████████████████████████████| 1606/1606 [04:05<00:00,  6.54it/s]


tensor(0.0284, device='cuda:0', grad_fn=<DivBackward0>)


100%|█████████████████████████████████████████████████████████| 1606/1606 [04:06<00:00,  6.52it/s]


tensor(0.0283, device='cuda:0', grad_fn=<DivBackward0>)


  preds.extend(torch.round(F.softmax(embs)[:,1]).to(torch.int64))
100%|█████████████████████████████████████████████████████████| 1606/1606 [04:04<00:00,  6.56it/s]


End of training f1 score accuracy 0.8959522498768581
End of training accuracy 0.8135997923695821
End of training precision 0.8773443041566179
End of training recall 0.9153666262099998


In [9]:
# torch.save(model.state_dict(), "./base_statedict_{}.pt".format(float(val_loss)))

In [13]:
model.load_state_dict(torch.load("./base_statedict_3.1703879752973503.pt", map_location=torch.device('cuda')))
model.eval()

CLIPGraphModel(
  (graph_model): StochasticTwoLayerRGCN(
    (embed): HeteroEmbedding(
      (embeds): ModuleDict(
        (Brand): Embedding(1900, 512)
        (Customer): Embedding(11041, 512)
        (Product): Embedding(5334, 512)
        (Review): Embedding(77060, 512)
      )
    )
    (conv1): HeteroGraphConv(
      (mods): ModuleDict(
        (rev_SOLD_BY): GraphConv(in=512, out=256, normalization=both, activation=None)
        (WROTE): GraphConv(in=512, out=256, normalization=both, activation=None)
        (SOLD_BY): GraphConv(in=512, out=256, normalization=both, activation=None)
        (rev_REVIEW_OF): GraphConv(in=512, out=256, normalization=both, activation=None)
        (REVIEW_OF): GraphConv(in=512, out=256, normalization=both, activation=None)
        (rev_WROTE): GraphConv(in=512, out=256, normalization=both, activation=None)
      )
    )
    (conv2): HeteroGraphConv(
      (mods): ModuleDict(
        (rev_SOLD_BY): GraphConv(in=256, out=256, normalization=both, activ

In [17]:
model.to('cpu')
model.graph_model.embed({'Review': ids})

torch.Size([1, 512])