In [1]:
# set app as default directory to address imports
import os
import sys
sys.path.append(os.path.join(os.getcwd(), './../../'))

#activate autoreload to easier test classes
%load_ext autoreload
%autoreload 2

In [2]:
from app.services.sparql_graph import SPARQLGraph
from app.config.enums import Environment
# Build the project's SPARQLGraph for the DEV environment; later cells read
# relation labels from it via graph.get_relations_labels().
# NOTE(review): the meaning of the second positional argument (False) is not
# visible in this notebook — confirm against SPARQLGraph's constructor.
graph = SPARQLGraph(Environment.DEV, False)

Metadata loaded successfully from JSON files.
Initializing SPARQLGraph
Graph loaded with 94107 triples after 0:00:14.296556


In [13]:
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sentence_transformers import SentenceTransformer, util
import torch

# Relation labels as a flat list; the order of this list is the column order
# of every similarity matrix computed against `relation_embeddings`.
relations = graph.get_relations_labels()

# Load the transformer model once (the original cell loaded the same
# 'all-mpnet-base-v2' model twice, doubling the start-up cost for no benefit).
model = SentenceTransformer('all-mpnet-base-v2')

# Embed the relation labels into the embedding space
relation_embeddings = model.encode(relations, convert_to_tensor=True)

# nltk.word_tokenize needs the 'punkt' tokenizer data.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

def extract_ngrams(text):
    """Split ``text`` into stop-word-free unigram, bigram and trigram phrases.

    The text is tokenized with ``nltk.word_tokenize`` and every token whose
    lowercase form is in ``ENGLISH_STOP_WORDS`` is discarded. N-grams are built
    from the remaining tokens only, so words that were separated by a removed
    stop word end up adjacent.

    Args:
        text: The sentence to split.

    Returns:
        A tuple ``(unigrams, bigrams, trigrams)`` of lists of strings. Bigram
        and trigram entries are the member tokens joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if token.lower() in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)

    # Sliding windows of width 2 and 3 over the kept tokens.
    two_word = [' '.join(pair) for pair in zip(kept, kept[1:])]
    three_word = [' '.join(triple) for triple in zip(kept, kept[1:], kept[2:])]

    return kept, two_word, three_word

# Example query
# query = "Who was the screen writer of the movie"
query = "On which day was  released"

# Step 1: Extract unigrams, bigrams, and trigrams
unigrams, bigrams, trigrams = extract_ngrams(query)
print("Unigrams extracted:", unigrams)
print("Bigrams extracted:", bigrams)
print("Trigrams extracted:", trigrams)

# Step 2: Embed each n-gram and score it against every relation
ngrams = unigrams + bigrams + trigrams
if not ngrams:
    # With no n-grams there is nothing to maximise over below; fail with a
    # readable message instead of a tensor error.
    raise ValueError(f"No n-grams left after stop-word removal for query: {query!r}")
ngrams_embeddings = model.encode(ngrams, convert_to_tensor=True)

# Cosine similarities: one row per n-gram, one column per relation
# (columns follow the order of `relations`).
cosine_scores_ngrams = util.pytorch_cos_sim(ngrams_embeddings, relation_embeddings)

# Print the cosine similarity of every n-gram with every relation.
# Converting the matrix to nested lists once avoids one .item() call per cell.
for ngram, ngram_scores in zip(ngrams, cosine_scores_ngrams.tolist()):
    print(f"\nCosine similarities for n-gram: '{ngram}'")
    for relation, similarity_score in zip(relations, ngram_scores):
        print(f"  Similarity with relation '{relation}': {similarity_score:.4f}")

# Step 3: Find the most similar relation by maximizing over all n-grams.
# max(dim=0) collapses the n-gram axis, leaving the best score per relation.
max_scores_ngrams, _ = cosine_scores_ngrams.max(dim=0)
most_similar_idx_ngrams = int(max_scores_ngrams.argmax())
most_similar_relation_ngrams = relations[most_similar_idx_ngrams]

# Print the cosine similarity and relation for the most similar n-gram
print(f"\nMost similar relation using n-grams: '{most_similar_relation_ngrams}' with cosine similarity: {max_scores_ngrams[most_similar_idx_ngrams].item():.4f}")

# Step 4: Whole sentence comparison.
# extract_ngrams returns the stop-word-filtered tokens as its unigrams, so
# they are reused here instead of tokenizing and filtering the query again.
filtered_query_tokens = list(unigrams)
filtered_query = ' '.join(filtered_query_tokens)
query_embedding = model.encode(filtered_query, convert_to_tensor=True)

# Cosine similarities between the whole query and relations (a single row)
cosine_scores_sentence = util.pytorch_cos_sim(query_embedding, relation_embeddings)

# Index of the most similar relation for the whole sentence
most_similar_idx_sentence = int(cosine_scores_sentence.argmax())
most_similar_relation_sentence = relations[most_similar_idx_sentence]

# Print the cosine similarity and relation for the whole sentence
print(f"\nCosine similarity using whole sentence: {cosine_scores_sentence[0, most_similar_idx_sentence].item():.4f}")
print(f"Most similar relation using whole sentence: {most_similar_relation_sentence}")

# Step 5: Overall comparison - selecting the highest similarity from all methods.
# `all_scores` holds the per-relation n-gram maxima followed by the
# per-relation sentence scores, so the label list must repeat `relations`
# once per half. (Appending only the single best sentence relation left the
# two sequences misaligned: a winning sentence score mapped to the wrong
# label or raised IndexError.)
all_scores = torch.cat((max_scores_ngrams, cosine_scores_sentence.flatten()))
all_relations = relations + relations

# Index of the overall most similar relation
most_similar_idx_overall = int(all_scores.argmax())
most_similar_relation_overall = all_relations[most_similar_idx_overall]

# Print the final most similar relation and its cosine similarity
print(f"\nOverall most similar relation: '{most_similar_relation_overall}' with cosine similarity: {all_scores[most_similar_idx_overall].item():.4f}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unigrams extracted: ['day', 'released']
Bigrams extracted: ['day released']
Trigrams extracted: []

Cosine similarities for n-gram: 'day'
  Similarity with relation 'IFCO rating': 0.0765
  Similarity with relation 'located in present-day administrative territorial entity': 0.0795
  Similarity with relation 'assessment': 0.2442
  Similarity with relation 'production company': 0.1486
  Similarity with relation 'named after': 0.1638
  Similarity with relation 'place of death': 0.1324
  Similarity with relation 'ancestral home': 0.1600
  Similarity with relation 'based on': 0.2089
  Similarity with relation 'ethnic group': 0.1286
  Similarity with relation 'location': 0.2160
  Similarity with relation 'original film format': 0.0788
  Similarity with relation 'depicts': 0.1865
  Similarity with relation 'head of state': 0.1213
  Similarity with relation 'different from': 0.2399
  Similarity with relation 'narrator': 0.1462
  Similarity with relation 'sound designer': 0.1025
  Similarity wit