In [1]:
import torch
import cl_graph_bert as cgm
from torch import nn
import torch.nn.functional as F

import dgl
# Read the serialized graph file and keep element [0][0] of what
# dgl.load_graphs returns; it is used as the graph everywhere below.
g = dgl.load_graphs(
    "./graphs/industrial_and_scientific_5_core.dgl"
)[0][0]

# Configure the model from the graph's schema: its edge types are passed as
# relation types, and every node type is mapped to its node count.
model = cgm.CLIPGraphModel(
    rel_types=g.etypes,
    emb_types={ntype: g.number_of_nodes(ntype) for ntype in g.ntypes},
)

In [None]:
import json
from transformers import BertModel, BertConfig, BertTokenizer, AdamW
import tqdm

from torch.utils.data import Dataset, DataLoader

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized reviews with a label and an id.

    Args:
        tokens: Mapping from field name to an indexable container (tensor or
            nested list) with one entry per sample. Must contain the key
            ``'input_ids'``, which determines the dataset length.
        labels: Indexable sequence holding one label per sample.
        ids: Indexable sequence holding one id per sample.
    """

    def __init__(self, tokens, labels, ids):
        self.tokens = tokens
        self.labels = labels
        self.ids = ids

    def __len__(self):
        """Return the number of samples (length of the ``'input_ids'`` field)."""
        return len(self.tokens['input_ids'])

    def __getitem__(self, idx):
        """Return one sample as ``{"tokens": {field: tensor}, "label": ..., "id": ...}``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.tokens.items()}
        return {"tokens": item, "label": self.labels[idx], "id": self.ids[idx]}

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        Calling ``torch.tensor`` on an existing tensor emits a copy-construct
        ``UserWarning`` for every item, so tensors are copied with
        ``detach().clone()`` instead; anything else is converted normally.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

def process_text(filepath, batch_size):
    """Build train/test DataLoaders of tokenized reviews from a JSON-lines file.

    Each non-blank line of ``filepath`` is parsed as one JSON object. Records
    that have both ``'reviewText'`` and ``'overall'`` are kept; the label is 1
    when ``overall >= 4`` and 0 otherwise. The kept reviews are split in file
    order: the first half is the training set, the remainder the test set.
    Every sample's id is its position in the list of kept reviews.

    Args:
        filepath: Path to the JSON-lines review file.
        batch_size: Batch size used for both returned loaders.

    Returns:
        Tuple ``(train_loader, test_loader)`` of shuffling DataLoaders over
        ``ReviewDataset`` instances.
    """
    review_texts = []
    review_scores = []

    # Stream the file line by line and close it deterministically; the
    # original left the handle open and materialized every line up front.
    with open(filepath, encoding="utf-8") as data:
        for line in data:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            sample = json.loads(line)
            if 'reviewText' in sample and 'overall' in sample:
                review_texts.append(sample['reviewText'])
                review_scores.append(1 if sample['overall'] >= 4 else 0)

    split = len(review_texts) // 2

    train_reviews = review_texts[:split]
    test_reviews = review_texts[split:]
    train_scores = review_scores[:split]
    test_scores = review_scores[split:]

    # Ids are positions in the filtered review list. The original train range
    # ended at split + 1, one element longer than train_reviews.
    # NOTE(review): the evaluation cell compares these ids against indices into
    # the graph's "Review" embeddings -- confirm both use the same ordering.
    train_ids = list(range(split))
    test_ids = list(range(split, len(review_texts)))

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tokenized_train_reviews = tokenizer(train_reviews, return_tensors="pt", padding='max_length', truncation=True)
    tokenized_test_reviews = tokenizer(test_reviews, return_tensors="pt", padding='max_length', truncation=True)

    train_dataset = ReviewDataset(tokenized_train_reviews, train_scores, train_ids)
    test_dataset = ReviewDataset(tokenized_test_reviews, test_scores, test_ids)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # NOTE(review): the test loader is shuffled too, so evaluating only a
    # prefix of its batches samples different reviews on each run.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_loader, test_loader

In [None]:
# Batch size shared by the train and test loaders.
batch_size = 16
train_loader, test_loader = process_text(
    filepath="./data/Industrial_and_Scientific_5.json",
    batch_size=batch_size,
)

In [None]:
# Target device for the checkpoint and for the evaluation below.
device = "cpu"

# Restore the saved parameters, remapping their storage onto `device`, then
# switch the model to evaluation mode.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model.load_state_dict(
    torch.load("./state_dict.pt", map_location=device)
)
model.eval()

In [None]:
# Embed the graph's "Review" nodes once, project them, and scale each row to
# unit L2 norm. This is inference only, so gradient tracking is disabled to
# avoid recording an autograd graph over the full-graph forward pass.
with torch.no_grad():
    review_emb = model.graph_projection(model.graph_model(g)["Review"].double())
    review_emb = review_emb / review_emb.norm(dim=-1, keepdim=True)

In [None]:
# Retrieval evaluation: for each test review, rank all "Review" graph
# embeddings by cosine similarity to the review's language embedding and
# record whether the review's own id appears in the top 1 / 5 / 10.
# NOTE(review): this assumes batch["id"] indexes rows of review_emb -- confirm
# the dataset ids and the graph's Review node ids share the same ordering.
MAX_EVAL_BATCHES = 10  # evaluate only the first 10 batches, as before

top1 = []
top5 = []
top10 = []

# The graph does not change between batches, so move it to the device once.
graph_on_device = g.to(device)

# Inference only: disable autograd so no graph is recorded per batch.
with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
        if batch_idx >= MAX_EVAL_BATCHES:
            break
        out = model(graph_on_device, "Review", batch["tokens"], batch["id"].to(device))
        for i in range(len(batch["id"])):
            target_id = batch["id"][i]
            # Indices of the 10 most similar Review embeddings, best first.
            pred_ids = F.cosine_similarity(review_emb, out["language_emb"][i]).topk(k=10)[1]
            top1.append(bool(pred_ids[0] == target_id))
            top5.append(target_id in pred_ids[:5])
            top10.append(target_id in pred_ids)

# Boolean hit tensors; the following cells turn them into accuracies.
top1 = torch.tensor(top1)
top5 = torch.tensor(top5)
top10 = torch.tensor(top10)

In [None]:
# Top-1 accuracy: share of evaluated reviews whose own id was the best match.
top1.sum() / top1.numel()

In [None]:
# Top-5 accuracy: share of evaluated reviews whose own id was in the best 5.
top5.sum() / top5.numel()

In [None]:
# Top-10 accuracy: share of evaluated reviews whose own id was in the best 10.
top10.sum() / top10.numel()