In [1]:
!pip install rdflib



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
from rdflib import Graph, Literal, RDF, URIRef
import numpy as np
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [2]:
def read_ttl_file(file_path):
    graph = Graph()
    graph.parse(file_path, format="turtle")
    # Extract triples from the graph
    triples = [(str(s), str(p), str(o)) for s, p, o in graph]
    return triples


In [3]:
def convert_to_tensors(triples, entity_to_index, relation_to_index):
    triple_indices = [
        (entity_to_index[s], relation_to_index[p], entity_to_index[o])
        for s, p, o in triples
    ]
    tensor_data = torch.tensor(triple_indices, dtype=torch.long)
    return tensor_data

In [4]:
def calculateNumOfUniqueValues(data):
    unique_entities =set()
    unique_relations =set()
    for triple in data:
        unique_entities.add(triple[0])  # Subject
        unique_relations.add(triple[1]) # Predicate (relation)
        unique_entities.add(triple[2])  # Object
    
    # Count the number of unique entities
    num_entities = len(unique_entities)
    num_relations = len(unique_relations)
    print(f"Number of unique entities: {num_entities}")
    print(f"Number of unique relations: {num_relations}")
    return num_entities,num_relations


In [5]:

class TransEModel(nn.Module):
    def __init__(self, num_entities, num_relations, embedding_dim):
        super(TransEModel, self).__init__()
        self.entity_embedding = nn.Embedding(num_entities, embedding_dim)
        self.relation_embedding = nn.Embedding(num_relations, embedding_dim)

    def forward(self, triples):
        # Ensure indices are within the valid range
        num_entities = self.entity_embedding.weight.size(0)
        num_relations = self.relation_embedding.weight.size(0)
        
        subject_indices = torch.clamp(triples[:, 0], max=num_entities - 1)
        relation_indices = torch.clamp(triples[:, 1], max=num_relations - 1)
        object_indices = torch.clamp(triples[:, 2], max=num_entities - 1)
        
        subject_embedding = self.entity_embedding(subject_indices)
        relation_embedding = self.relation_embedding(relation_indices)
        object_embedding = self.entity_embedding(object_indices)
        
        score = torch.sum(torch.abs(subject_embedding + relation_embedding - object_embedding), dim=1)

        return score

In [6]:
# Define MyDataset
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, triples):
        self.triples = triples
    def __len__(self):
        return len(self.triples)
    def __getitem__(self, idx):
        return self.triples[idx]


In [7]:
# Define DataLoader for negative triplets
class NegativeTripletsDataset(Dataset):
    def __init__(self, positive_triples, num_entities):
        self.positive_triples = positive_triples
        self.num_entities = num_entities
    def __len__(self):
        return len(self.positive_triples)
    def __getitem__(self, idx):
        return create_negative_triples(self.positive_triples[idx], self.num_entities)


In [8]:
def create_negative_triples(positive_triples, num_entities, device):
    batch_size = positive_triples.size(0)
    # Generate random indices for negative subjects and objects
    negative_subjects = torch.randint(0, num_entities, (batch_size,)).to(device)
    negative_objects = torch.randint(0, num_entities, (batch_size,)).to(device)
    
    # Clone positive triples to create negative triples
    negative_triples = positive_triples.clone()

    # Replace either subject or object in each positive triple with a negative entity
    replace_subject = torch.rand(batch_size).to(device) > 0.5
    negative_triples[:, 0] = torch.where(replace_subject, negative_subjects, negative_triples[:, 0])
    negative_triples[:, 2] = torch.where(~replace_subject, negative_objects, negative_triples[:, 2])
    
    return negative_triples

In [9]:
def train_model(model, data_loader, criterion, optimizer, num_epochs):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0

        for batch_data in data_loader:
            positive_scores = model(batch_data.to(device))

            # Generate negative triples
            negative_triples = create_negative_triples(batch_data, num_entities, device)
            negative_scores = model(negative_triples)

            # Ensure consistent batch size
            target = torch.ones(positive_scores.size(0)).to(device)

            # Calculate loss
            loss = criterion(positive_scores, negative_scores, target)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        average_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}")

In [10]:
def evaluate_model(model, test_data_loader, num_entities, k_values=[1, 3, 5, 10]):
    model.eval()  # Set the model to evaluation mode
    all_scores = []
    all_labels = []
    with torch.no_grad():
        for batch_data in test_data_loader:
            positive_scores = model(batch_data.to(device))
            # Create negative triples by corrupting positive triples
            negative_triples = create_negative_triples(batch_data, num_entities,device)
            negative_scores = model(negative_triples)
            # Concatenate positive and negative scores
            batch_scores = torch.cat([positive_scores, negative_scores])
            all_scores.append(batch_scores)
            # Create labels (1 for positive, 0 for negative)
            batch_labels = torch.cat([torch.ones_like(positive_scores), torch.zeros_like(negative_scores)])
            all_labels.append(batch_labels)

    # Concatenate scores and labels across batches
    all_scores = torch.cat(all_scores)
    all_labels = torch.cat(all_labels)

    # Print dimensions for debugging
    print("Dimensions:")
    print("y_true:", all_labels.cpu().detach().numpy().shape)
    print("y_score:", all_scores.cpu().detach().numpy().shape)
    # Calculate Hits@k
    hits_at_k = {}
    sorted_indices = torch.argsort(positive_scores, descending=True)
    for k in k_values:
        top_k_indices = sorted_indices[:k]
        hits_at_k[k] = torch.sum(top_k_indices < len(positive_scores)).item() / len(positive_scores)
    # Calculate precision-recall metrics
    ap_score = average_precision_score(all_labels.cpu().detach().numpy(), all_scores.cpu().detach().numpy())
    print(f'Average Precision (AP): {ap_score:.4f}')
    for k in k_values:
        print(f'Hits@{k}: {hits_at_k[k]:.4f}')

In [11]:
#initializing the variables
batch_size = 64
random_seed = 42
embedding_dim = 100
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reading the data (we fucused only on formula 1 data)
f1_file_path = "FormulaOne_FinalOutPut.ttl"
data = read_ttl_file(f1_file_path)

# Creating the train and test sets
train_triples, test_triples = train_test_split(data, test_size=0.2, random_state=random_seed)

# Calculating the number of entities and relations
num_entities,num_relations = calculateNumOfUniqueValues(train_triples)

# Transforming the entities and relations into dictionaries that are compatible with our class
entity_to_index = {entity: index for index, entity in enumerate(set([s for s, _, _ in data] + [o for _, _, o in data]))}
relation_to_index = {relation: index for index, relation in enumerate(set([p for _, p, _ in data]))}

# Converting the data into a tensors format
tensor_data = convert_to_tensors(train_triples, entity_to_index, relation_to_index).to(device)
dataset = MyDataset(tensor_data)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initializing the model according to number of relations and entities
model = TransEModel(num_entities, num_relations, embedding_dim).to(device)


Number of unique entities: 257222
Number of unique relations: 189


In [12]:
# Create DataLoader for negative triplets
negative_triplets_dataset = NegativeTripletsDataset(tensor_data, num_entities)
negative_triplets_loader = DataLoader(negative_triplets_dataset, batch_size=batch_size, shuffle=True)

In [13]:
# Define optimizer and learning rate scheduler
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
criterion = nn.MarginRankingLoss(margin=1.0).to(device)

In [14]:
# Train the model
train_model(model, data_loader, criterion, optimizer, num_epochs)

Epoch 1/10, Average Loss: 3.2504
Epoch 2/10, Average Loss: 2.1872
Epoch 3/10, Average Loss: 1.7438
Epoch 4/10, Average Loss: 1.4729
Epoch 5/10, Average Loss: 1.2859
Epoch 6/10, Average Loss: 1.1335
Epoch 7/10, Average Loss: 1.0138
Epoch 8/10, Average Loss: 0.9178
Epoch 9/10, Average Loss: 0.8334
Epoch 10/10, Average Loss: 0.7605


In [15]:
# Transforming the test set into a data loader
test_batch_size = 64
test_tensor_data = convert_to_tensors(test_triples, entity_to_index, relation_to_index).to(device)
test_data_loader = DataLoader(test_tensor_data, batch_size=test_batch_size, shuffle=False)
# Evaluate the model
num_entities_test,num_relations_test = calculateNumOfUniqueValues(test_triples)
evaluate_model(model, test_data_loader,num_entities_test)

Number of unique entities: 169585
Number of unique relations: 189
Dimensions:
y_true: (772754,)
y_score: (772754,)
Average Precision (AP): 0.7121
Hits@1: 0.1111
Hits@3: 0.3333
Hits@5: 0.5556
Hits@10: 1.0000


In [16]:
example_triple = ("27", 'http://example.org/motor-sports/formula-one/sprint_results/status', "20")

subject_idx = entity_to_index.get(example_triple[0], -1)
relation_idx = relation_to_index.get(example_triple[1], -1)
object_idx = entity_to_index.get(example_triple[2], -1)

print("Indices:", subject_idx, relation_idx, object_idx)

if subject_idx == -1 or relation_idx == -1 or object_idx == -1:
    print("One or more entities/relations wasn't found -> Unable to evaluate.")
else:
    # Transforming the triple into tensor format
    custom_triple_tensor = torch.tensor([[subject_idx, relation_idx, object_idx]], dtype=torch.long).to(device)

    # Predict the score for the positive (true) triple
    positive_score = model(custom_triple_tensor).detach().cpu().numpy()

    # Corrupt the triple to create a negative example
    negative_subject = torch.randint(0, num_entities, (1,)).item()
    negative_object = torch.randint(0, num_entities, (1,)).item()

    negative_triple_tensor = torch.tensor([[negative_subject, relation_idx, negative_object]], dtype=torch.long).to(device)

    # Predict the score for the negative triple
    negative_score = model(negative_triple_tensor).item()

    print("Positive Score:", positive_score)
    print("Negative Score:", negative_score)

    # Print the prediction
    prediction = "Positive" if positive_score > negative_score else "Negative"
    print(f"The exsistance of such triple according to our model is: {prediction}")

Indices: 115523 103 120252
Positive Score: [183.40065]
Negative Score: 134.9213104248047
The exsistance of such triple according to our model is: Positive


In [25]:
# example query (subject -> constructor:27 , predicate -> status , object -> ?)
query_triple = ("http://example.org/motor-sports/formula-one/constructors/27", 'http://example.org/motor-sports/formula-one/sprint_results/status', None)

# Get indices for subject and relation
subject_idx = entity_to_index.get(query_triple[0], -1)
relation_idx = relation_to_index.get(query_triple[1], -1)

if subject_idx != -1 and relation_idx != -1:
    # Create tensor for the given subject and relation
    input_tensor = torch.tensor([[subject_idx, relation_idx, 0]], dtype=torch.long).to(device)

    # Predict the scores for all objects
    object_scores = model(input_tensor).detach().cpu().numpy()

    # Get the indices of the top-k predictions
    k = 1
    top_k_indices = torch.topk(torch.tensor(object_scores), k=k).indices.numpy()

    # Convert indices back to entity names using entity_to_index dictionary
    top_k_entities = [key for key, value in entity_to_index.items() if value in top_k_indices]

    # Print the results
    print("Top-k Predictions:", top_k_entities)
else:
    print("One or more entities/relations not found in the mapping. Unable to evaluate.")

Top-k Predictions: ['turbo']
