In [2]:
from transformers import BertTokenizer
from transformers import AutoModelForSequenceClassification
#from datasets import IterableDataset
from tqdm import tqdm
import json
import transformers
#from datasets import Dataset
transformers.logging.set_verbosity_error()
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:

maps = torch.load('./data/FB15k-237/maps.pt')
uri_to_id = maps['ent_ids']
relation_uri_to_id = maps['rel_ids']
        
edge_index = []
edge_type = []
with open('./data/FB15k-237/ind-train.tsv') as triples_in:
    for line in triples_in:
        head, relation, tail = line[:-1].split('\t')
        edge_index.append([uri_to_id[head], uri_to_id[tail]])
        edge_type.append(relation_uri_to_id[relation])

In [12]:
sentences = ['' for _ in range(max(uri_to_id.values()) + 1)]
with open('./data/FB15k-237/entity2textlong.txt') as sentences_in:
    for line in sentences_in:
        uri, description = line[:-1].split('\t')
        if uri not in uri_to_id:
            continue
        sentences[uri_to_id[uri]] = description


In [13]:
samples = []
for i in tqdm(range(len(edge_index))):
    samples.append((sentences[edge_index[i][0]], sentences[edge_index[i][1]], edge_type[i]))

100%|██████████| 215082/215082 [00:00<00:00, 2488769.72it/s]


In [25]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

dataset = []
for idx, (first_sentence, second_sentence, label) in enumerate(tqdm(samples)):
    encoding = tokenizer(first_sentence, second_sentence, padding="max_length", truncation=True)
    encoding['idx'] = idx
    encoding['labels'] = label
    #encoding['sentence1'] = first_sentence
    #encoding['sentence2'] = second_sentence
    dataset.append(encoding)
    if idx == 10000:
        break


Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.51MB/s]
  5%|▍         | 10000/215082 [00:34<11:40, 292.93it/s]


In [26]:
from datasets import Dataset

d = Dataset.from_list(dataset)

In [27]:
import torch

d.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
dataloader = torch.utils.data.DataLoader(d, batch_size=64)

In [28]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 20.3MB/s]


In [29]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [30]:
#id2label = {0: "relation", 1: "inverse_relation"}
#label2id = {"relation": 0, "inverse_relation": 1}

In [31]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(relation_uri_to_id.keys()))

In [32]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [33]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [34]:
import torch
torch.cuda.empty_cache()

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [35]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()


for epoch in tqdm(range(num_epochs)):
    loss_total = 0
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        loss_total += loss.detach().cpu()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    print('loss_total:', loss_total)

  0%|          | 0/1570 [00:00<?, ?it/s]
  0%|          | 0/10 [02:49<?, ?it/s]2, 129.81s/it]


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), '../data/wikidata5m_inductive/relation_classification_model.pt')

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

## Use model to make predictions

In [8]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(relation_uri_to_id.keys()))
model.load_state_dict(torch.load('../data/wikidata5m_inductive/relation_classification_model.pt'))

<All keys matched successfully>

In [9]:
page_links = torch.load('../data/wikidata5m_inductive/page_links.pt')

In [None]:
page_links

In [None]:
print('test')


In [None]:
samples = []
for i in tqdm(range(page_links.size(0))):
    head, tail = page_links[i]
    samples.append((sentences[head], sentences[tail]))

In [None]:
# first try with manually constructed sentences
germany = "Germany, officially the Federal Republic of Germany, is a country in Central Europe."
berlin = "Berlin is the capital and largest city of Germany by both area and population."
samples = []
samples.append((berlin, germany))
samples.append((germany, berlin))

In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model.to('cuda')
model.eval()

dataset = []
batch_edge_index = []
for idx in tqdm(range(page_links.size(0))):
    head, tail = page_links[idx]
    batch_edge_index.append([idx, head, tail])
    encoding = tokenizer(sentences[head], sentences[tail], padding="max_length", truncation=True)
    encoding['idx'] = idx
    dataset.append(encoding)
    if idx % 1000 == 0:
        d = Dataset.from_list(dataset)
        d.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask"])
        dataloader = torch.utils.data.DataLoader(d, batch_size=64, shuffle=False)
        with torch.no_grad():
            for batch in tqdm(dataloader):
                batch = {k: v.to('cuda') for k, v in batch.items()}
                with torch.no_grad():
                    outputs = model(**batch)
                    predictions = torch.argmax(outputs.logits, dim=1)
                    
        with open('page_ling_graph.txt', 'a+') as triples_out:
            for (idx, head, tail), relation in zip(batch_edge_index, predictions.tolist()):
                triples_out.write('\t'.join([str(idx), str(head), str(relation), str(tail)]) + '\n')
            
        dataset = []
        batch_edge_index = []
            

  0%|          | 0/100109235 [00:00<?, ?it/s]


NameError: name 'Dataset' is not defined

In [None]:
torch.save(dataset, 'test.pt')

In [None]:
d = Dataset.from_list(dataset)

d.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask"])
dataloader = torch.utils.data.DataLoader(d, batch_size=64, shuffle=False)

model.to('cuda')
model.eval()


#for batch in dataloader:
#    batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
    for batch in tqdm(dataloader):
        batch = {k: v.to('cuda') for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=1)


#    logits = outputs.logits
#    predictions = torch.argmax(logits, dim=-1)
#    metric.add_batch(predictions=predictions, references=batch["labels"])



In [None]:
outputs

In [None]:
torch.argmax(outputs.logits, dim=1)

In [None]:
relation_uri_to_id()

In [None]:
id_to_relation_uri = {v:k for k,v in relation_uri_to_id.items()}

In [None]:
id_to_relation_uri[42]

In [14]:
degree = {'a': 10, 'b': 5, 'c': 2}

In [15]:
x = ['c', 'a', 'b']

In [17]:
sorted(x, key=lambda e: - degree[e])

['a', 'b', 'c']

In [129]:
import torch

In [130]:
e1_emb = torch.tensor([1.,2,3,4])
rel_emb = torch.tensor([5.,6,7,8])
e2_emb = torch.tensor([1.,2,3,4])

# requirement: height * width = embedding dim
height = 2
width = 2
e1_emb = e1_emb.view(-1, 1, height, width)

rel_emb = rel_emb.view(-1, 1, height, width)

In [131]:
stacked_inputs = torch.cat([e1_emb, rel_emb], 2)

In [132]:
stacked_inputs

tensor([[[[1., 2.],
          [3., 4.],
          [5., 6.],
          [7., 8.]]]])

In [133]:
conv1 = torch.nn.Conv2d(1, 32, (2, 2), 1, 0)

In [134]:
x_conv = conv1(stacked_inputs)

In [135]:
x_conv.size()

torch.Size([1, 32, 3, 1])

In [136]:
x_conv = torch.relu(x_conv)

In [126]:
x_conv

tensor([[[[0.0000],
          [0.0000],
          [0.0000]],

         [[1.3309],
          [2.1268],
          [2.9226]],

         [[0.0000],
          [0.0000],
          [0.0000]],

         [[1.0137],
          [0.8094],
          [0.6051]],

         [[0.0000],
          [0.0000],
          [0.0000]],

         [[0.0000],
          [0.0000],
          [0.0000]],

         [[0.4274],
          [0.6910],
          [0.9546]],

         [[2.7062],
          [4.0017],
          [5.2972]],

         [[0.0000],
          [0.0000],
          [0.0000]],

         [[0.0000],
          [0.0000],
          [0.0000]],

         [[1.6623],
          [2.5654],
          [3.4685]],

         [[1.7829],
          [3.6603],
          [5.5377]],

         [[1.5170],
          [2.8361],
          [4.1551]],

         [[1.6728],
          [2.4682],
          [3.2635]],

         [[0.0000],
          [0.0000],
          [0.0000]],

         [[0.1390],
          [0.0000],
          [0.0000]],

        

In [137]:
x_flatten = x_conv.flatten()

In [141]:
x_flatten.size()

torch.Size([96])

In [142]:
embedding_dim = 4
# 96 = x.size(0)
fc = torch.nn.Linear(96, embedding_dim)
x_fc = fc(x_flatten)


In [143]:
x_fc.size()

torch.Size([4])

In [144]:
e2_emb.size()

torch.Size([4])

In [145]:
torch.dot(x.flatten(), e2_emb.t())

tensor(10.2821, grad_fn=<DotBackward0>)

Types page link graph to tensor of triples

In [6]:
import torch
triples = []
with open('../data/wikidata5m_inductive/page_links_typed.txt') as triples_in:
    for line in triples_in:
        head, relation, tail = line[:-1].split('\t')
        triples.append([int(head), int(relation), int(tail)])
triples = torch.tensor(triples)

In [7]:
triples.size()

torch.Size([100109001, 3])

In [8]:
torch.save(triples, '../data/wikidata5m_inductive/page_links_typed.pt')