In [1]:
import torch
import cl_graph_bert as cgm
from torch import nn
import torch.nn.functional as F

import dgl
g = dgl.load_graphs("./graphs/industrial_and_scientific_5_core.dgl")[0][0]

model = cgm.CLIPGraphModel(
    rel_types = g.etypes,
    emb_types = {x: g.number_of_nodes(x) for x in g.ntypes} 
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import json
from transformers import BertModel, BertConfig, BertTokenizer, AdamW
import tqdm

from torch.utils.data import Dataset, DataLoader

class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, labels, ids):
        self.tokens = tokens
        self.labels = labels
        self.ids = ids
        
    def __len__(self):
        return len(self.tokens['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}
        out = {"tokens": item, "label": self.labels[idx], "id": self.ids[idx]}
        return out

def process_text(filepath, batch_size):
    reviews = []
    data = open(filepath)
    for line in data.readlines():
        reviews.append(json.loads(line))

    review_texts = []
    review_scores = []

    for sample in reviews:
        if 'reviewText' in sample and 'overall' in sample:
            review_texts.append(sample['reviewText'])
            if sample['overall'] >= 4:
                review_scores.append(1)
            else:
                review_scores.append(0)
                
    train_reviews = review_texts[:len(review_texts)//2]
    train_ids = [i for i in range(0, len(review_texts)//2+1)]
    test_reviews = review_texts[len(review_texts)//2:]
    test_ids = [i for i in range(len(review_texts)//2, len(review_texts))]
    train_scores = review_scores[:len(review_texts)//2]
    test_scores = review_scores[len(review_texts)//2:]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tokenized_train_reviews = tokenizer(train_reviews, return_tensors="pt", padding='max_length', truncation=True)
    tokenized_test_reviews = tokenizer(test_reviews, return_tensors="pt", padding='max_length', truncation=True)

    train_dataset = ReviewDataset(tokenized_train_reviews, train_scores, train_ids)
    test_dataset = ReviewDataset(tokenized_test_reviews, test_scores, test_ids)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_loader, test_loader, train_dataset, test_dataset

In [3]:
batch_size = 16
train_loader, test_loader, train_dataset, test_dataset = process_text("./data/Industrial_and_Scientific_5.json", batch_size)

In [4]:
device = "cpu"

model.load_state_dict(torch.load("./base_statedict_6668.011407389138.pt", map_location=torch.device(device)))
model.eval()

CLIPGraphModel(
  (graph_model): RGCN(
    (embed): ParameterDict(
        (Brand): Parameter containing: [torch.FloatTensor of size 1900x512]
        (Customer): Parameter containing: [torch.FloatTensor of size 11041x512]
        (Product): Parameter containing: [torch.FloatTensor of size 5334x512]
        (Review): Parameter containing: [torch.FloatTensor of size 77071x512]
    )
    (conv1): HeteroGraphConv(
      (mods): ModuleDict(
        (rev_SOLD_BY): GraphConv(in=512, out=256, normalization=both, activation=None)
        (WROTE): GraphConv(in=512, out=256, normalization=both, activation=None)
        (SOLD_BY): GraphConv(in=512, out=256, normalization=both, activation=None)
        (rev_REVIEW_OF): GraphConv(in=512, out=256, normalization=both, activation=None)
        (REVIEW_OF): GraphConv(in=512, out=256, normalization=both, activation=None)
        (rev_WROTE): GraphConv(in=512, out=256, normalization=both, activation=None)
      )
    )
    (conv2): HeteroGraphConv(
     

In [5]:
review_emb = model.graph_projection(model.graph_model(g)["Review"].double())
review_emb = review_emb / review_emb.norm(dim=-1, keepdim=True)

In [6]:
top1 = []
top5 = []
top10 = []
j = 0
for batch in test_loader:
    if j > 0:
        break
    out = model(g.to(device), "Review", batch["tokens"], batch["id"].to(device))
    for i in range(len(batch["id"])):
        pred_ids = F.cosine_similarity(review_emb, out["language_emb"][i]).topk(k=10)[1]
        print(pred_ids)
        print(batch["id"][i])
        if pred_ids[0] == batch["id"][i]:
            top1.append(True)
        else:
            top1.append(False)
        if batch["id"][i] in pred_ids[:5]:
            top5.append(True)
        else:
            top5.append(False)
        if batch["id"][i] in pred_ids:
            top10.append(True)
        else:
            top10.append(False)
    j += 1

top1 = torch.tensor(top1)
top5 = torch.tensor(top5)
top10 = torch.tensor(top10)

  item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}


tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(47909)
tensor([34698, 13274, 17340, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(40386)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(41293)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(38909)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(60732)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(53219)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(75326)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(68499)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(52540)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978])
tensor(55740)
tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,

In [7]:
F.cosine_similarity(review_emb, out["language_emb"][-1]).topk(k=10)

torch.return_types.topk(
values=tensor([0.2654, 0.2654, 0.2654, 0.2654, 0.2654, 0.2654, 0.2654, 0.2654, 0.2654,
        0.2654], dtype=torch.float64, grad_fn=<TopkBackward0>),
indices=tensor([13274, 17340, 34698, 32265, 33239, 11144,   436, 27734,  1424,   978]))

In [8]:
def reviewFromTokens(data):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return tokenizer.decode(data["tokens"]["input_ids"])

In [9]:
reviewFromTokens(test_dataset[batch["id"][-1]])

IndexError: index 67781 is out of bounds for dimension 0 with size 38530

In [10]:
reviewFromTokens(test_dataset[37378])

  item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}


'[CLS] what can i say i love hatchbox filament great prints every time [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [11]:
top1.sum()/len(top1)

tensor(0.)

In [12]:
top5.sum()/len(top5)

tensor(0.)

In [13]:
top10.sum()/len(top10)

tensor(0.)