In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Creating data and models

In [69]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
# Build KG from entities and relations and get them.
import data_builder
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from models import TransE, TransM
import numpy as np
import utilities

%run -i "data_builder.py"

entities, relations = data_builder.get_entities_and_relations()

In [71]:
# Read datasets and indices.

import model_utilities
from torch.utils.data import DataLoader

entities_index = create_index_dictionary("data/kg/entities_index.csv")
relations_index = create_index_dictionary("data/kg/relations_index.csv")

kg = model_utilities.MLDataset("data/kg/kg.csv", entities_index, relations_index)
train_size = int(0.7 * len(kg))
val_size = int(0.1 * len(kg))
test_size = len(kg) - train_size - val_size
train_kg, test_kg, val_kg = torch.utils.data.random_split(kg, [train_size, test_size, val_size],
                                                          generator=torch.Generator().manual_seed(42))

train_loader = DataLoader(train_kg, batch_size=128)

In [72]:
# Define test routine for both TransE and TransM models.

def test(model, data_loader, model_name):
    precision = 0
    num_examples = 0

    for h, l, t in test_loader:
        # print(f"batch size: {h.size()[0]}")
        h, l, t = (h.to(device), t.to(device), l.to(device))
        
        if model_name == "TransM":
            # Compute weights for each relation.
            rel_weights = []

        batch_size = h.size()[0]

        # Build h, l, t for batch.
        entities_times_batch = torch.arange(end=len(entities_index), device=device).unsqueeze(0).repeat(batch_size, 1)
        h_times_batch = h.reshape(-1, 1).repeat(1, entities_times_batch.size()[1])
        l_times_batch = l.reshape(-1, 1).repeat(1, entities_times_batch.size()[1])
        t_times_batch = t.reshape(-1, 1).repeat(1, entities_times_batch.size()[1])

        # Compute prediction on all head/tail variatios.
        head_corrupted_triplets = torch.stack((entities_times_batch, l_times_batch, t_times_batch), dim=2).reshape(-1, 3)
        tail_corrupted_triplets = torch.stack((h_times_batch, l_times_batch, entities_times_batch), dim=2).reshape(-1, 3)
        
        head_corrupted_predictions = model.predict(head_corrupted_triplets).reshape(batch_size, -1)
        tail_corrupted_predictions = model.predict(tail_corrupted_triplets).reshape(batch_size, -1)

        # Compute precision at 10.
        predictions = torch.cat((tail_corrupted_predictions, head_corrupted_predictions), dim=0)
        target = torch.cat((t.reshape(-1, 1), h.reshape(-1, 1)))

        _, indices = predictions.topk(k=10, largest=False)
        precision += torch.where(indices == target, torch.tensor([1], device=device), torch.tensor([0], device=device)).sum().item()
        num_examples += predictions.size()[0]

    # print(precision/num_examples)
        
    return(precision/num_examples)


In [73]:
# Define train routine for both TransE and TransM.

def train(model, data_loader, optimizer, num_epochs, summary_writer, model_name="TransE", validation=True):
    best_score = 0.0
    
    print(f"Training {model.__class__.__name__} model.")
    
    training_distances_list = []
    corrupted_distances_list = []
    loss_list = []
    perc_non_zero_loss_samples = []
          
    for epoch in range(num_epochs):
        print(f"epoch: {epoch}")
        model.train()

        non_zero_loss_samples = 0
        sample_count = 0
        step = 0

        for h, l, t in data_loader:
            h, l, t = (h.to(device), t.to(device), l.to(device))
            
            training_triples = torch.stack((h, l, t), dim=1)
            
            if model_name == "TransM":
                # Compute weights for each relation.
                rel_weights = []

                for rel in l:
                    mask = l == rel
                    num_rel = mask.sum()
                    num_tails = torch.numel(t[mask].unique())
                    num_heads = torch.numel(h[mask].unique())
                    rel_weights.append(1 / np.log(num_rel/num_tails + num_rel/num_heads))

                rel_weights = torch.tensor(rel_weights)
            
            # Generating corrupted triplets by replacing either head or tail with random entity.
            replacement_mask = torch.randint(high=2, size=h.size(), device=device)
            corrupted_entities = torch.randint(high=len(entities_index), size=h.size(), device=device)
            corrupted_h = torch.where(replacement_mask==0, corrupted_entities, h)
            corrupted_t = torch.where(replacement_mask==1, corrupted_entities, t)

            corrupted_triples = torch.stack((corrupted_h, l, corrupted_t), dim=1)

            optimizer.zero_grad()
            
            if model_name == "TransM":
                loss, training_distances, corrupted_distances = model(training_triples, corrupted_triples, rel_weights)
            else:
                loss, training_distances, corrupted_distances = model(training_triples, corrupted_triples)
            loss.mean().backward()

            summary_writer.add_scalar('Loss/train', loss.mean().data.cpu().numpy(), global_step=step)
            summary_writer.add_scalar('Distance/training', training_distances.sum().data.cpu().numpy(), global_step=step)
            summary_writer.add_scalar('Distance/corrupted', corrupted_distances.sum().data.cpu().numpy(), global_step=step)
            
            # Appending to list.
            loss_list.append(loss.mean().data.cpu().numpy())
            training_distances_list.append(training_distances.sum().data.cpu().numpy())
            corrupted_distances_list.append(corrupted_distances.sum().data.cpu().numpy())

            loss = loss.data.cpu()
            non_zero_loss_samples += loss.nonzero().size()[0]
            sample_count += loss.size()[0]

            optimizer.step()
            step+=1
            # if step%100 == 0:
            #     print(step)

        print(f"\t train non zero fraction: {non_zero_loss_samples/sample_count}")
        
        perc_non_zero_loss_samples.append(non_zero_loss_samples/sample_count)

        summary_writer.add_scalar('Metrics/loss_impacting_samples', 10, global_step=epoch)
        if validation:
            print(f"\t validation hit@10: {test(model, test_loader, model_name)}")
        
    return loss_list, training_distances_list, corrupted_distances_list, perc_non_zero_loss_samples


In [74]:
# Define model and parameters.

# TransE.
model = TransE(num_entities=len(entities), num_relations=len(relations), p=1, k=50, gamma=1.0)
device = torch.device("cpu")
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.1)
summary_writer = SummaryWriter()
best_score = 0.0
epochs = 20
batch_size = 128
print(model)

TransE(
  (criterion): MarginRankingLoss()
  (entities_embedding): Embedding(94768, 50)
  (relations_embedding): Embedding(47384, 50)
)


In [81]:
with utilities.codeTimer(f" {model_name} training"):
    loss, training_distances, corrupted_distances, perc_non_zero_loss = train(model, train_loader, 
                                                                              optimizer, epochs, 
                                                                              summary_writer, model_name="TransM",
                                                                              validation=False)

Training TransM model.
epoch: 0
	 train non zero fraction: 0.34141341051616014
epoch: 1
	 train non zero fraction: 0.2679992764109986
epoch: 2
	 train non zero fraction: 0.22976965750120598
epoch: 3
	 train non zero fraction: 0.2124035214664737
epoch: 4
	 train non zero fraction: 0.1949770863482875
epoch: 5
	 train non zero fraction: 0.18189218523878437
epoch: 6
	 train non zero fraction: 0.16814399421128798
epoch: 7
	 train non zero fraction: 0.16009406657018813
epoch: 8
	 train non zero fraction: 0.15481789676796912
epoch: 9
	 train non zero fraction: 0.14477809937288952
epoch: 10
	 train non zero fraction: 0.14161239749155813
epoch: 11
	 train non zero fraction: 0.13736131210805597
epoch: 12
	 train non zero fraction: 0.13615533043897732
epoch: 13
	 train non zero fraction: 0.13286903039073805
epoch: 14
	 train non zero fraction: 0.12557284129281235
epoch: 15
	 train non zero fraction: 0.12322117703810902
epoch: 16
	 train non zero fraction: 0.12080921369995176
epoch: 17
	 train non

In [80]:
# TransM

model = TransM(num_entities=len(entities), num_relations=len(relations), p=1, k=50, gamma=1.0)
device = torch.device("cpu")
model.to(device)
num_epochs = 10

optimizer = optim.SGD(model.parameters(), lr=0.1)
summary_writer = SummaryWriter()
best_score = 0.0
epochs = 20
batch_size = 128
print(model)


TransM(
  (criterion): MarginRankingLoss()
  (entities_embedding): Embedding(94768, 50)
  (relations_embedding): Embedding(47384, 50)
)


In [22]:
# loss, training_distances, corrupted_distances, perc_non_zero_loss = train(model, train_loader, 
                                                                          # optimizer, epochs, 
                                                                          # summary_writer, model_name="TransM", 
                                                                          # validation=False)

In [131]:
# Save results to file.

results_folder = "results/"

rows = zip(loss, perc_non_zero_loss)

with open(results_folder+f"{model.__class__.__name__}_train.csv", "w") as f:
    f.write("loss, perc_non_zero_loss\n")
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)

## Test

In [77]:
# model_name = "TransE"
model_name = "TransM"

In [82]:
# Test on whole data, i.e., all relations.

test_loader = DataLoader(test_kg, batch_size=128)

precision_at_10 = 0

with utilities.codeTimer(f"hit@10 {model_name} prediction"):
    precision_at_10 = test(model, test_loader, model_name)
    
print(precision_at_10)

Executed 'hit@10 TransE prediction'.  Elapsed time: 29.510294s
0.31114159105296474


In [83]:
# Test on feedback specifically.

feedback_kg = model_utilities.MLDataset("data/kg/kg.csv", entities_index, relations_index, mask='feedback')
train_size = int(0.7 * len(feedback_kg))
test_size = len(feedback_kg) - train_size
_, test_feedback_kg = torch.utils.data.random_split(feedback_kg, [train_size, test_size], generator=torch.Generator().manual_seed(42))

test_loader = DataLoader(test_feedback_kg, batch_size=128)

with utilities.codeTimer(f"hit@10 {model_name} prediction (using only feedback relation)"):
    precision_at_10 = test(model, test_loader, model_name)
    
print(precision_at_10)

Executed 'hit@10 TransE prediction (using only feedback relation)'.  Elapsed time: 34.310445s
0.38910193525372333


## DP embedding

In [63]:
from opacus import PrivacyEngine
from opacus.accountants import RDPAccountant

model = TransM(num_entities=len(entities), num_relations=len(relations), p=1, k=50, gamma=1.0)
optimizer = optim.SGD(model.parameters(), lr=0.1)

privacy_engine = PrivacyEngine()
dp_model, dp_optimizer, dp_train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=1.1,
    max_grad_norm=1.0,
)

# Privacy accounting.
accountant = RDPAccountant()
dp_optimizer.attach_step_hook(accountant.get_optimizer_hook_fn(sample_rate=128/len(train_kg)))

In [47]:
# for epoch in range(5):
#     print(f"epoch: {epoch}")
#     dp_model.train()
    
#     non_zero_loss_samples = 0
#     sample_count = 0
#     step = 0
    
#     for h, l, t in dp_train_loader:
#         h, l, t = (h.to(device), t.to(device), l.to(device))
        
#         training_triples = torch.stack((h, l, t), dim=1)
        
#         # Generating corrupted triplets by replacing either head 
#         # or tail with random entity.
#         replacement_mask = torch.randint(high=2, size=h.size(), device=device)
#         corrupted_entities = torch.randint(high=len(entities_index), size=h.size(), device=device)
#         corrupted_h = torch.where(replacement_mask==0, corrupted_entities, h)
#         corrupted_t = torch.where(replacement_mask==1, corrupted_entities, t)
        
#         corrupted_triples = torch.stack((corrupted_h, l, corrupted_t), dim=1)
        
#         dp_optimizer.zero_grad()
        
#         loss, training_distances, corrupted_distances = dp_model(training_triples, corrupted_triples)
#         loss.mean().backward()
        
#         summary_writer.add_scalar('Loss/train', loss.mean().data.cpu().numpy(), global_step=step)
#         summary_writer.add_scalar('Distance/training', training_distances.sum().data.cpu().numpy(), global_step=step)
#         summary_writer.add_scalar('Distance/corrupted', corrupted_distances.sum().data.cpu().numpy(), global_step=step)
        
        
#         loss = loss.data.cpu()
#         non_zero_loss_samples += loss.nonzero().size()[0]
#         sample_count += loss.size()[0]
        
#         dp_optimizer.step()
#         step+=1
#         # print(step)
#         if step%100 == 0:
#             print(step)
        
#     print(non_zero_loss_samples/sample_count)
        
#     summary_writer.add_scalar('Metrics/loss_impacting_samples', 10, global_step=epoch)
    
#     # TODO: add some validation and probably better logging.

In [64]:
def dp_train(model, data_loader, optimizer, num_epochs, summary_writer, model_name="TransE"):
    
    for epoch in range(num_epochs):
        print(f"epoch: {epoch}")
        model.train()
        
        loss_list = []
        perc_non_zero_loss_samples = []

        non_zero_loss_samples = 0
        sample_count = 0
        step = 0

        for h, l, t in train_loader:
            h, l, t = (h.to(device), t.to(device), l.to(device))

            training_triples = torch.stack((h, l, t), dim=1)
            
            if model_name == "TransM":
                # Compute weights for each relation.
                rel_weights = []

                for rel in l:
                    mask = l == rel
                    num_rel = mask.sum()
                    num_tails = torch.numel(t[mask].unique())
                    num_heads = torch.numel(h[mask].unique())
                    rel_weights.append(1 / np.log(num_rel/num_tails + num_rel/num_heads))

                rel_weights = torch.tensor(rel_weights)

            # Generating corrupted triplets by replacing either head 
            # or tail with random entity.
            replacement_mask = torch.randint(high=2, size=h.size(), device=device)
            corrupted_entities = torch.randint(high=len(entities_index), size=h.size(), device=device)
            corrupted_h = torch.where(replacement_mask==0, corrupted_entities, h)
            corrupted_t = torch.where(replacement_mask==1, corrupted_entities, t)

            corrupted_triples = torch.stack((corrupted_h, l, corrupted_t), dim=1)

            optimizer.zero_grad()

            if model_name == "TransM":
                loss, training_distances, corrupted_distances = model(training_triples, corrupted_triples, rel_weights)
            else:
                loss, training_distances, corrupted_distances = model(training_triples, corrupted_triples)
            loss.mean().backward()

            summary_writer.add_scalar('Loss/train', loss.mean().data.cpu().numpy(), global_step=step)
            summary_writer.add_scalar('Distance/training', training_distances.sum().data.cpu().numpy(), global_step=step)
            summary_writer.add_scalar('Distance/corrupted', corrupted_distances.sum().data.cpu().numpy(), global_step=step)


            loss = loss.data.cpu()
            non_zero_loss_samples += loss.nonzero().size()[0]
            sample_count += loss.size()[0]

            optimizer.step()
            step+=1
            # print(step)
            if step%100 == 0:
                print(step)

        print(f"\t train non zero fraction: {non_zero_loss_samples/sample_count}")
        
        perc_non_zero_loss_samples.append(non_zero_loss_samples/sample_count)

        summary_writer.add_scalar('Metrics/loss_impacting_samples', 10, global_step=epoch)
        
    return loss_list, perc_non_zero_loss_samples
        


In [65]:
with utilities.codeTimer(f"DP-{model_name} training"):
    loss_list, perc_non_zero_loss = dp_train(dp_model, dp_train_loader, dp_optimizer, 10, summary_writer, model_name)

epoch: 0
100
200
	 train non zero fraction: 0.4772672455378678
epoch: 1
100
200
	 train non zero fraction: 0.41181259044862517
epoch: 2
100
200
	 train non zero fraction: 0.3796731789676797
epoch: 3
100
200
	 train non zero fraction: 0.3617342016401351
epoch: 4
100
200
	 train non zero fraction: 0.3524481427882296
epoch: 5
100
200
	 train non zero fraction: 0.34346357935359384
epoch: 6
100
200
	 train non zero fraction: 0.33475036179450074
epoch: 7
100
200
	 train non zero fraction: 0.3273034249879402
epoch: 8
100
200
	 train non zero fraction: 0.31871080559575493
epoch: 9
100
200
	 train non zero fraction: 0.3155752532561505
Executed 'DP-TransM training'.  Elapsed time: 5607.763006s


In [66]:
accountant.get_epsilon(delta=1/len(train_kg))

0.9584085158564951

In [None]:
results_folder = "results/"

rows = zip(loss, perc_non_zero_loss)

with open(results_folder+f"{model.__class__.__name__}_dp_train.csv", "w") as f:
    f.write("loss, perc_non_zero_loss\n")
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)

In [84]:
# test_loader = DataLoader(test_kg, batch_size=128)
with utilities.codeTimer(f"DP-{model_name} training"):
    test_loader = DataLoader(test_feedback_kg, batch_size=128)

    precision = 0
    num_examples = 0

    dp_model.eval()

    for h, l, t in test_loader:
        # print(f"batch size: {h.size()[0]}")
        h, l, t = (h.to(device), t.to(device), l.to(device))

        batch_size = h.size()[0]

        # Build h, l, t for batch.
        entities_times_batch = torch.arange(end=len(entities_index), device=device).unsqueeze(0).repeat(batch_size, 1)
        h_times_batch = h.reshape(-1, 1).repeat(1, entities_times_batch.size()[1])
        l_times_batch = l.reshape(-1, 1).repeat(1, entities_times_batch.size()[1])
        t_times_batch = t.reshape(-1, 1).repeat(1, entities_times_batch.size()[1])

        # Compute prediction on all head/tail variatios.
        head_corrupted_triplets = torch.stack((entities_times_batch, l_times_batch, t_times_batch), dim=2).reshape(-1, 3)
        tail_corrupted_triplets = torch.stack((h_times_batch, l_times_batch, entities_times_batch), dim=2).reshape(-1, 3)

        # Note, the dp wrapper does not allow to compute predict(), so this is a workaround to get distances using forward().
        _, _, head_corrupted_predictions = dp_model(head_corrupted_triplets, head_corrupted_triplets)
        _, _, tail_corrupted_predictions = dp_model(tail_corrupted_triplets, tail_corrupted_triplets)

        # Compute precision at 10.
        predictions = torch.cat((tail_corrupted_predictions, head_corrupted_predictions), dim=0)
        target = torch.cat((t.reshape(-1, 1), h.reshape(-1, 1)))

        _, indices = predictions.topk(k=10, largest=False)
        precision += torch.where(indices == target, torch.tensor([1], device=device), torch.tensor([0], device=device)).sum().item()
        num_examples += predictions.size()[0]


    print(precision/num_examples)

5.408915528446937e-09
Executed 'DP-TransE training'.  Elapsed time: 74.596323s


---