In [4]:
import cl_graph_bert as cgm
import torch
from torch import nn
import json
from transformers import BertModel, BertConfig, BertTokenizer, AdamW
import tqdm

from torch.utils.data import Dataset, DataLoader


class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, labels, ids):
        self.tokens = tokens
        self.labels = labels
        self.ids = ids
        
    def __len__(self):
        return len(self.tokens['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.tokens.items()}
        out = {"tokens": item, "label": self.labels[idx], "id": self.ids[idx]}
        return out

def process_text(filepath, batch_size):
    reviews = []
    data = open(filepath)
    for line in data.readlines():
        reviews.append(json.loads(line))

    review_texts = []
    review_scores = []
    missing = []
    i = 0
    for sample in reviews:
        if 'reviewText' in sample and 'overall' in sample:
            review_texts.append(sample['reviewText'])
            if sample['overall'] >= 4:
                review_scores.append(1)
            else:
                review_scores.append(0)
        else:
            review_texts.append("NULL")
            missing.append(i)
            
        i += 1

     
            
    print(missing)            
    train_reviews = review_texts[:len(review_texts)//2]
    train_ids = [i for i in range(0, len(review_texts)//2+1)]
    test_reviews = review_texts[len(review_texts)//2:]
    test_ids = [i for i in range(len(review_texts)//2, len(review_texts))]
    train_scores = review_scores[:len(review_texts)//2]
    test_scores = review_scores[len(review_texts)//2:]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    tokenized_train_reviews = tokenizer(train_reviews, return_tensors="pt", padding='max_length', truncation=True)
    tokenized_test_reviews = tokenizer(test_reviews, return_tensors="pt", padding='max_length', truncation=True)

    tokenized_reviews = tokenizer(review_texts, return_tensors="pt", padding='max_length', truncation=True)
    
    train_dataset = ReviewDataset(tokenized_train_reviews, train_scores, train_ids)
    test_dataset = ReviewDataset(tokenized_test_reviews, test_scores, test_ids)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_loader, test_loader, tokenized_reviews

# %%
batch_size = 16
train_loader, test_loader, review_tokens = process_text("../review_dataset/Industrial_and_Scientific_5.json", batch_size)

# %%
import dgl
g = dgl.load_graphs("./graphs/industrial_and_scientific_5_core.dgl")[0][0]

[603, 14747, 18813, 21401, 35713, 38271, 41986, 41987, 43997, 55969, 56410]


In [5]:
model = cgm.CLIPGraphModel(
    rel_types = g.etypes,
    emb_types = {x: g.number_of_nodes(x) for x in g.ntypes} 
)

device = "cpu"

model.load_state_dict(torch.load("./base_statedict_6668.011407389138.pt", map_location=torch.device(device)))
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CLIPGraphModel(
  (graph_model): RGCN(
    (embed): ParameterDict(
        (Brand): Parameter containing: [torch.FloatTensor of size 1900x512]
        (Customer): Parameter containing: [torch.FloatTensor of size 11041x512]
        (Product): Parameter containing: [torch.FloatTensor of size 5334x512]
        (Review): Parameter containing: [torch.FloatTensor of size 77071x512]
    )
    (conv1): HeteroGraphConv(
      (mods): ModuleDict(
        (rev_SOLD_BY): GraphConv(in=512, out=256, normalization=both, activation=None)
        (WROTE): GraphConv(in=512, out=256, normalization=both, activation=None)
        (SOLD_BY): GraphConv(in=512, out=256, normalization=both, activation=None)
        (rev_REVIEW_OF): GraphConv(in=512, out=256, normalization=both, activation=None)
        (REVIEW_OF): GraphConv(in=512, out=256, normalization=both, activation=None)
        (rev_WROTE): GraphConv(in=512, out=256, normalization=both, activation=None)
      )
    )
    (conv2): HeteroGraphConv(
     

In [6]:
g

Graph(num_nodes={'Brand': 1900, 'Customer': 11041, 'Product': 5334, 'Review': 77071},
      num_edges={('Brand', 'rev_SOLD_BY', 'Product'): 5555, ('Customer', 'WROTE', 'Review'): 77071, ('Product', 'SOLD_BY', 'Brand'): 5555, ('Product', 'rev_REVIEW_OF', 'Review'): 77071, ('Review', 'REVIEW_OF', 'Product'): 77071, ('Review', 'rev_WROTE', 'Customer'): 77071},
      metagraph=[('Brand', 'Product', 'rev_SOLD_BY'), ('Product', 'Brand', 'SOLD_BY'), ('Product', 'Review', 'rev_REVIEW_OF'), ('Customer', 'Review', 'WROTE'), ('Review', 'Product', 'REVIEW_OF'), ('Review', 'Customer', 'rev_WROTE')])

In [7]:
review_tokens.input_ids.shape

torch.Size([77071, 512])

In [8]:
def tokens_to_cuda(tokens, device):
    dictionary = {}
    for key, value in tokens.items():
        dictionary[key] = value.to(device)
    return dictionary

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
import tqdm

bert_base = BertModel.from_pretrained('bert-base-uncased')

bert_base.to('cuda')
model.language_model.to('cuda')

reivew_ids = []

gnn_sims = []
bert_base_sims = []
bert_finetuned_sims = []

with torch.no_grad():

    for i in tqdm.tqdm(range(g.num_nodes('Product'))):
        review_id = g.successors(i, 'rev_REVIEW_OF')
        
        if len(review_id) > 1 and len(review_id) < 650:

            reivew_ids.append(review_id)
                
            tokens = review_tokens.input_ids[review_id].to('cuda')
            attentions = review_tokens.attention_mask[review_id].to('cuda')
            token_types = review_tokens.token_type_ids[review_id].to('cuda')

            gnn_embs = model.graph_model.embed["Review"][review_id].detach().numpy()
            
            base_bert_embs = bert_base(input_ids = tokens, 
                 attention_mask= attentions, 
                 token_type_ids= token_types).last_hidden_state[:,0].detach().cpu().numpy()
            
            bert_finetuned_embs = model.language_model(input_ids = tokens, 
                 attention_mask= attentions, 
                 token_type_ids= token_types).last_hidden_state[:,0].detach().cpu().numpy()
            
            gnn_sims.append( (np.sum(cosine_similarity(gnn_embs)) - len(review_id)) /  (len(review_id) ** 2 - len(review_id)) )
            bert_base_sims.append( (np.sum(cosine_similarity(base_bert_embs)) - len(review_id)) /  (len(review_id) ** 2 - len(review_id)) )
            bert_finetuned_sims.append( (np.sum(cosine_similarity(bert_finetuned_embs)) - len(review_id)) / (len(review_id) ** 2 - len(review_id)) )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████████████████████████████████████████| 5334/5334 [14:02<00:00,  6.33it/s]


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

review_id = g.successors(0, 'rev_REVIEW_OF')

embs = model.graph_model.embed["Review"][review_id].detach().numpy()
(np.sum(cosine_similarity(embs)) - 13) / (13*13 - 13)

0.7741479140061599

In [11]:
bert_base.to('cpu')
embs = bert_base(input_ids = review_tokens.input_ids[review_id], 
                 attention_mask= review_tokens.attention_mask[review_id], 
                 token_type_ids= review_tokens.token_type_ids[review_id]).last_hidden_state[:,0].detach().numpy()
(np.sum(cosine_similarity(embs)) - 13) / (12*12)

0.8752829233805338

In [56]:
np.argmin(cosine_similarity(embs))

155

In [21]:
len(review_id)

305

In [4]:
g.successors(0, 'WROTE')

tensor([    0,  1023,  3244,  3429,  4197,  5451,  5550, 21650, 21807, 58913])

In [11]:
for i in range(10):
    print(gnn_sims[i], bert_base_sims[i], bert_finetuned_sims[i])

0.4351822747124566 0.7986572265625 1.0
0.6400400161743164 0.8430681228637695 1.0
0.6438650131225586 0.8048208872477214 0.9999998728434245
0.2493367936876085 0.7941697014702691 1.0
0.1823503017425537 0.8022414207458496 1.0
0.26235570907592776 0.7868319511413574 1.0
0.438725217183431 0.757946523030599 1.0
0.5948143674616229 0.7766534459521199 1.0
0.5275150934855143 0.762452761332194 1.0
0.6180869511195591 0.8213873363676525 1.0
