In [1]:
import cl_graph_bert as cgm
import torch
from torch import nn
import json
from transformers import BertModel, BertConfig, BertTokenizer, AdamW
import tqdm

from torch.utils.data import Dataset, DataLoader
'''
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, labels, ids):
        self.tokens = tokens
        self.labels = labels
        self.ids = ids
        
    def __len__(self):
        return len(self.tokens['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}
        out = {"tokens": item, "label": self.labels[idx], "id": self.ids[idx]}
        return out

def process_text(filepath, batch_size):
    reviews = []
    data = open(filepath)
    for line in data.readlines():
        reviews.append(json.loads(line))

    review_texts = []
    review_scores = []

    for sample in reviews:
        if 'reviewText' in sample and 'overall' in sample:
            review_texts.append(sample['reviewText'])
            if sample['overall'] >= 4:
                review_scores.append(1)
            else:
                review_scores.append(0)
                
    train_reviews = review_texts[:len(review_texts)//2]
    train_ids = [i for i in range(0, len(review_texts)//2+1)]
    test_reviews = review_texts[len(review_texts)//2:]
    test_ids = [i for i in range(len(review_texts)//2, len(review_texts))]
    train_scores = review_scores[:len(review_texts)//2]
    test_scores = review_scores[len(review_texts)//2:]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tokenized_train_reviews = tokenizer(train_reviews, return_tensors="pt", padding='max_length', truncation=True)
    tokenized_test_reviews = tokenizer(test_reviews, return_tensors="pt", padding='max_length', truncation=True)

    train_dataset = ReviewDataset(tokenized_train_reviews, train_scores, train_ids)
    test_dataset = ReviewDataset(tokenized_test_reviews, test_scores, test_ids)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_loader, test_loader
'''

'\nclass ReviewDataset(torch.utils.data.Dataset):\n    def __init__(self, tokens, labels, ids):\n        self.tokens = tokens\n        self.labels = labels\n        self.ids = ids\n        \n    def __len__(self):\n        return len(self.tokens[\'input_ids\'])\n\n    def __getitem__(self, idx):\n        item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}\n        out = {"tokens": item, "label": self.labels[idx], "id": self.ids[idx]}\n        return out\n\ndef process_text(filepath, batch_size):\n    reviews = []\n    data = open(filepath)\n    for line in data.readlines():\n        reviews.append(json.loads(line))\n\n    review_texts = []\n    review_scores = []\n\n    for sample in reviews:\n        if \'reviewText\' in sample and \'overall\' in sample:\n            review_texts.append(sample[\'reviewText\'])\n            if sample[\'overall\'] >= 4:\n                review_scores.append(1)\n            else:\n                review_scores.append(0)\n            

In [2]:
import dgl
g = dgl.load_graphs("./graphs/industrial_and_scientific_5_core_new.dgl")[0][0]

In [3]:
import math
num_reviews = g.num_nodes("Review")
'''
class IDDataset(torch.utils.data.IterableDataset):
    def __init__(self, start, end):
        super(IDDataset).__init__()
        assert end > start, "this example code only works with end >= start"
        self.start = start
        self.end = end

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:  # single-process data loading, return the full iterator
            iter_start = self.start
            iter_end = self.end
        else:  # in a worker process
            # split workload
            per_worker = int(math.ceil((self.end - self.start) / float(worker_info.num_workers)))
            worker_id = worker_info.id
            iter_start = self.start + worker_id * per_worker
            iter_end = min(iter_start + per_worker, self.end)
        return iter(range(iter_start, iter_end))

'''

class IDDataset(torch.utils.data.Dataset):
    def __init__(self, ids):
        self.ids = ids
        
    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        out = {"id": self.ids[idx]}
        return out

train_ds = IDDataset(torch.tensor([i for i in range(0, num_reviews//2+1)]))
test_ds = IDDataset(torch.tensor([i for i in range(num_reviews//2, num_reviews)]))

In [4]:
batch_size = 16
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size)

In [5]:
model = cgm.CLIPGraphModel(
    rel_types = g.etypes,
    emb_types = {x: g.number_of_nodes(x) for x in g.ntypes} 
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
device = 'cpu'
model.to(device)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([{"params":model.language_model.parameters(),"lr": 0.00001},
                              {"params":model.graph_model.parameters(), "lr": 0.001},
                              {"params":model.language_projection.parameters(), "lr": 0.001},
                              {"params":model.graph_projection.parameters(), "lr": 0.001}])

In [7]:
epochs = 2

model.train()

for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        loss = model(g.to(device), "Review", batch["id"].to(device))["loss"]    
                
        epoch_loss += loss
        
        loss.backward()
        optimizer.step()
       
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm.tqdm(test_loader):
            loss = model(g.to(device), "Review", batch["id"].to(device))["loss"]

            val_loss += loss
        
    print("End of Epoch", epoch)
    print("Training loss:", epoch_loss)
    print("Test loss:", val_loss)

  0%|          | 0/2409 [00:17<?, ?it/s]


KeyboardInterrupt: 

In [28]:
class BertGraphMLP(nn.Module):
    def __init__(self, model, encoder="language_emb", num_labels=2, finetune=False):
        super(BertGraphMLP, self).__init__()
        self.num_labels = num_labels
        self.BertGraph = model
        self.encoder = encoder
        self.classifier = nn.Linear(512, num_labels)
        self.finetune = finetune

    def forward(self, g, ids):
        if self.finetune:
            out = self.BertGraph(g, "Review", ids)
        else:
            with torch.no_grad():
                out = self.BertGraph(g, "Review", ids)
        if self.encoder == "language_emb":
            embs = out["language_emb"]
        elif self.encoder == "graph_emb":
            embs = out["graph_emb"]
        elif self.encoder == "mean":
            embs = (out["langauge_emb"] + out["graph_emb"])/2
        else:
            raise("Not Implemented")
        logits = self.classifier(embs)
        return logits


In [None]:
import torch.nn.functional as F

def evaluate_review_sentiment(model, encoder, finetune=False):
    eval_model = BertGraphMLP(model, encoder, finetune=finetune)

    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        [
            {"params":eval_model.classifier.parameters(), "lr": 0.001},       
    ])

    epochs = 10
    device= 'cuda'
    eval_model.to(device)
    if not(finetune):
        eval_model.BertGraph.eval()
    preds = []

    for epoch in tqdm.tqdm(range(epochs)):
        epoch_loss = 0
        for batch in tqdm.tqdm(train_loader):
            optimizer.zero_grad()  
            embs = eval_model(g.to(device), batch["id"].to(device))  
            loss = loss_function(embs, batch['label'].to(device)) / batch_size
            epoch_loss += loss
            loss.backward()
            optimizer.step()    
        print(epoch_loss/len(train_loader))

    eval_model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in tqdm.tqdm(test_loader):
            scores = eval_model(g.to(device), batch["id"].to(device))
            preds.extend(torch.round(F.softmax(scores)[:,1]).to(torch.int64))
            labels.extend(g.nodes["Review"].data["Positive"][batch["id"]].to(torch.int64))

    from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

    preds_cpu = [i.to('cpu') for i in preds]
    labels_cpu = [i.to('cpu') for i in labels]
    print("End of training f1 score accuracy", f1_score(labels_cpu, preds_cpu))
    print("End of training accuracy", accuracy_score(labels_cpu, preds_cpu))
    print("End of training precision", precision_score(labels_cpu, preds_cpu))
    print("End of training recall", recall_score(labels_cpu, preds_cpu))

  item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}
100%|███████████████████████████████████████| 2409/2409 [09:44<00:00,  4.12it/s]


tensor(135.6381, device='cuda:0', grad_fn=<AddBackward0>)


 24%|█████████▋                              | 585/2409 [02:22<07:23,  4.11it/s]

In [None]:
enc_methods = ["language_emb", "graph_emb", "mean"]

for method in enc_methods:
    print("====== EVALUATING:", method, "======")
    evaluate_review_sentiment(model, method)

In [None]:
torch.save(model.state_dict(), "./base_statedict_{}.pt".format(float(val_loss)))

In [17]:
g.nodes["Review"].data["Positive"]

tensor(68201)