In [None]:
# set app as default directory to address imports
import os
import sys
sys.path.append(os.path.join(os.getcwd(), './../../'))

#activate autoreload to easier test classes
%load_ext autoreload
%autoreload 2

In [None]:
from app.services.sparql_graph import SPARQLGraph
from app.config.enums import Environment
graph = SPARQLGraph(Environment.DEV, False)

Metadata loaded successfully from JSON files.
Initializing SPARQLGraph
Graph loaded with 94107 triples after 0:00:02.678702


In [20]:
from sentence_transformers import SentenceTransformer, util

# Load the transformer model
model = SentenceTransformer('all-mpnet-base-v2')

# Define the relations from the graph
relations = graph.get_relations_labels()

# Array of paraphrased queries
queries = [
    # Original + Variations for "Who is the director of Star Wars: Episode VI - Return of the Jedi?"
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?",
    "Can you tell me who directed the movie",
    "Who was the director of the movie X?",
    "Do you know who directed X?",
    
    # Original + Variations for "Who is the screenwriter of The Masked Gang: Cyprus?"
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    "Can you tell me who wrote the script for The Masked Gang: Cyprus?",
    "Who was responsible for writing X?",
    "Who worked as the screenwriter on The Masked Gang: Cyprus?",

    # Original + Variations for "When was 'The Godfather' released?"
    "When was X released?",
    "What is the release date of X?",
    "Can you tell me when X was first released?",
    "When did X come out?"
]

# Convert relations into embeddings
relation_embeddings = model.encode(relations, convert_to_tensor=True)

# Loop through each query, compute similarity, and print the most similar relation
for user_query in queries:
    # Convert user query into embedding
    query_embedding = model.encode(user_query, convert_to_tensor=True)
    
    # Compute cosine similarities
    cosine_scores = util.pytorch_cos_sim(query_embedding, relation_embeddings)
    
    # Find the most similar relation
    most_similar_idx = cosine_scores.argmax()
    
    # Print the most similar relation for the current query
    print(f"User Query: {user_query}")
    print(f"Most similar relation: {relations[most_similar_idx]}\n")


User Query: Who is the director of Star Wars: Episode VI - Return of the Jedi?
Most similar relation: director

User Query: Can you tell me who directed the movie
Most similar relation: director

User Query: Who was the director of the movie X?
Most similar relation: director

User Query: Do you know who directed X?
Most similar relation: director

User Query: Who is the screenwriter of The Masked Gang: Cyprus?
Most similar relation: screenwriter

User Query: Can you tell me who wrote the script for The Masked Gang: Cyprus?
Most similar relation: screenwriter

User Query: Who was responsible for writing X?
Most similar relation: author

User Query: Who worked as the screenwriter on The Masked Gang: Cyprus?
Most similar relation: screenwriter

User Query: When was X released?
Most similar relation: time period

User Query: What is the release date of X?
Most similar relation: publication date

User Query: Can you tell me when X was first released?
Most similar relation: time period

Use

In [12]:
query_embedding = model.encode("Test", convert_to_tensor=True)
    
# Compute cosine similarities
cosine_scores = util.pytorch_cos_sim(query_embedding, relation_embeddings)

cosine_scores = cosine_scores.flatten()
# Find the most similar relation
most_similar_idx = cosine_scores.argmax()
print(cosine_scores[most_similar_idx])

tensor(0.4904)


In [22]:
import re
from sentence_transformers import SentenceTransformer, util

# Load the transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample relations from the graph
relations = graph.get_relations_labels()

import re
from nltk.corpus import stopwords

# Load stop words list
stop_words = set(stopwords.words('english'))

# Function to mask the movie name and remove stop words
def mask_movie_name(query):
    # Step 1: Mask the movie name
    # remove MOVIE_NAME from the query
    masked_query = re.sub(r'MOVIE_NAME', '', query)
    
    # Step 2: Remove stop words
    # Convert query to lowercase and split into words
    query_words = masked_query.lower().split()
    
    # Filter out stop words
    filtered_words = [word for word in query_words if word not in stop_words]
    
    # Join the filtered words back into a cleaned query string
    cleaned_query = " ".join(filtered_words)
    
    return cleaned_query

# Queries to process
queries = [
    # Queries related to the director of a movie
    "Who is the director of MOVIE_NAME?",
    "Can you tell me who directed MOVIE_NAME?",
    "Who was the director of the movie MOVIE_NAME?",
    "Do you know who directed MOVIE_NAME?",

    # Queries related to the screenwriter of a movie
    "Who is the screenwriter of MOVIE_NAME?",
    "Can you tell me who wrote the script for MOVIE_NAME?",
    "Who was responsible for writing MOVIE_NAME?",
    "Who worked as the screenwriter on MOVIE_NAME?",


    # Queries related to the release date of a movie
    "When was MOVIE_NAME released?",
    "What is the release date of MOVIE_NAME?",
    "Can you tell me when MOVIE_NAME was first released?",
    "When did the movie MOVIE_NAME come out?"
]


# Convert relations into embeddings
relation_embeddings = model.encode(relations, convert_to_tensor=True)

# Loop through each query, mask the movie name, compute similarity, and print the most similar relation
for user_query in queries:
    # Mask the movie name in the query
    masked_query = mask_movie_name(user_query)
    
    # Convert masked query into embedding
    query_embedding = model.encode(masked_query, convert_to_tensor=True)
    
    # Compute cosine similarities
    cosine_scores = util.pytorch_cos_sim(query_embedding, relation_embeddings)
    
    # Find the most similar relation
    most_similar_idx = cosine_scores.argmax()
    
    # Print the original and masked query, and the most similar relation
    print(f"Original Query: {user_query}")
    print(f"Masked Query: {masked_query}")
    print(f"Most similar relation: {relations[most_similar_idx]}\n")


Original Query: Who is the director of MOVIE_NAME?
Masked Query: director ?
Most similar relation: director

Original Query: Can you tell me who directed MOVIE_NAME?
Masked Query: tell directed ?
Most similar relation: director

Original Query: Who was the director of the movie MOVIE_NAME?
Masked Query: director movie ?
Most similar relation: director

Original Query: Do you know who directed MOVIE_NAME?
Masked Query: know directed ?
Most similar relation: director

Original Query: Who is the screenwriter of MOVIE_NAME?
Masked Query: screenwriter ?
Most similar relation: screenwriter

Original Query: Can you tell me who wrote the script for MOVIE_NAME?
Masked Query: tell wrote script ?
Most similar relation: screenwriter

Original Query: Who was responsible for writing MOVIE_NAME?
Masked Query: responsible writing ?
Most similar relation: writing language

Original Query: Who worked as the screenwriter on MOVIE_NAME?
Masked Query: worked screenwriter ?
Most similar relation: screenwrit

In [27]:
import re

# Custom stop words list that excludes domain-specific terms like 'release', 'date', 'come', etc.
custom_stop_words = {"who", "the", "of", "is", "was", "when", "can", "tell", "me", "first"}

# List of key phrases to preserve
key_phrases = ["release date", "come out", "published in", "first appearance", "screenwriter", "director"]

# Function to preprocess the query by masking movie names and preserving important context
def mask_movie_name(query):
    # Step 1: Mask the movie name
    movie_name_pattern = r"'(.*?)'|\"(.*?)\""  # Regex to find movie names enclosed in quotes
    masked_query = re.sub(movie_name_pattern, "", query)
    
    # Step 2: Remove stop words but preserve key phrases
    # Convert to lowercase and split into words
    query_words = masked_query.lower().split()
    
    # Filter out custom stop words
    filtered_words = [word for word in query_words if word not in custom_stop_words]
    
    # Rejoin filtered words
    filtered_query = " ".join(filtered_words)
    
    # Step 3: Ensure key phrases are preserved
    for phrase in key_phrases:
        if phrase in query.lower():
            filtered_query = re.sub(phrase, phrase, filtered_query)
    
    return filtered_query

# Example queries
queries = [
    "Who worked as the screenwriter on 'The Godfather'?",
    "When was 'The Godfather' released?",
    "What is the release date of 'The Godfather'?",
    "Can you tell me when 'The Godfather' was first released?",
    "When did the movie 'The Godfather' come out?"
]

# Apply the improved mask_movie_name function to each query
for query in queries:
    cleaned_query = mask_movie_name(query)
    print(f"Original Query: {query}")
    print(f"Cleaned Query: {cleaned_query}")
    query_embedding = model.encode(cleaned_query, convert_to_tensor=True)
    
    # Compute cosine similarities
    cosine_scores = util.pytorch_cos_sim(query_embedding, relation_embeddings)
    
    # Find the most similar relation
    most_similar_idx = cosine_scores.argmax()
    
    # Print the original and masked query, and the most similar relation
    print(f"Original Query: {user_query}")
    print(f"Masked Query: {masked_query}")
    print(f"Most similar relation: {relations[most_similar_idx]}\n")


Original Query: Who worked as the screenwriter on 'The Godfather'?
Cleaned Query: worked as screenwriter on ?
Original Query: When did the movie MOVIE_NAME come out?
Masked Query: movie come out?
Most similar relation: screenwriter

Original Query: When was 'The Godfather' released?
Cleaned Query: released?
Original Query: When did the movie MOVIE_NAME come out?
Masked Query: movie come out?
Most similar relation: published in

Original Query: What is the release date of 'The Godfather'?
Cleaned Query: what release date ?
Original Query: When did the movie MOVIE_NAME come out?
Masked Query: movie come out?
Most similar relation: publication date

Original Query: Can you tell me when 'The Godfather' was first released?
Cleaned Query: you released?
Original Query: When did the movie MOVIE_NAME come out?
Masked Query: movie come out?
Most similar relation: published in

Original Query: When did the movie 'The Godfather' come out?
Cleaned Query: did movie come out?
Original Query: When did

In [57]:
import re
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util

# Load stop words
stop_words = set(stopwords.words('english'))

# List of additional common phrases to remove
additional_phrases = [
    "who", "tell me", "can you tell me", "could you", "i think", "was", "is", "the", "of", "and", "when", "what", "movie"
    # if we remove movie we get better stuff for director which is weird
]

# Function to clean and preprocess the query
def preprocess_query(query):
    # Convert to lowercase
    query = query.lower()
    
    # Step 1: Remove additional phrases
    for phrase in additional_phrases:
        query = re.sub(r'\b' + re.escape(phrase) + r'\b', '', query)
    
    # Step 2: Remove stop words
    query_words = query.split()
    filtered_words = [word for word in query_words if word not in stop_words]
    
    # Rejoin the filtered words into a cleaned query string
    cleaned_query = " ".join(filtered_words).strip()
    
    return cleaned_query

# Example query
# user_query = "Can you tell me who wrote the screenplay for 'The Godfather'?"
# user_query = "When was the movie released?"
user_query = "Who directed"

# Step 3: Preprocess the query
cleaned_query = preprocess_query(user_query)

# Load the transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample relations from the graph (you would replace this with your actual relations)
relations = graph.get_relations_labels()

# Step 4: Compute relation embeddings
relation_embeddings = model.encode(relations, convert_to_tensor=True)

# Step 5: Encode the cleaned query and compute similarity
query_embedding = model.encode(cleaned_query, convert_to_tensor=True)
cosine_scores = util.pytorch_cos_sim(query_embedding, relation_embeddings)

# Step 6: Find the most similar relation
most_similar_idx = cosine_scores.argmax()

# Output the results
print(f"Original User Query: {user_query}")
print(f"Cleaned Query: {cleaned_query}")
print(f"Most similar relation: {relations[most_similar_idx]}\n")


Original User Query: Who directed
Cleaned Query: directed
Most similar relation: director



In [13]:
from nltk.corpus import wordnet

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

# Example usage
synonyms = get_synonyms("released")
print(synonyms)  # Could include terms like "publication", "launch", etc.


{'publish', 'unfreeze', 'issue', 'discharge', 'unloose', 'liberate', 'bring_out', 'let_go_of', 'relinquish', 'secrete', 'resign', 'give_up', 'unloosen', 'loose', 'free', 'expel', 'exhaust', 'turn', 'eject', 'unblock', 'let_go', 'put_out', 'release'}


In [22]:
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

relations_with_synonyms = {
    "release date": ["release date", "launch date", "publication date", "when was it released"],
    "director": ["director", "film director", "who directed"],
    # Add more relations and synonyms
}

# Ensure you have the required data for bigrams
nltk.download('punkt')
nltk.download('punkt_tab')

def extract_bigrams(text):
    # Tokenize the sentence
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    filtered_tokens = [word for word in tokens if word.lower() not in ENGLISH_STOP_WORDS]
    # Generate bigrams
    bigrams = list(nltk.bigrams(filtered_tokens))
    # Join bigrams into phrases
    bigram_phrases = [' '.join(bigram) for bigram in bigrams]
    return bigram_phrases

# Example query
query = ""

# Extract bigrams
bigrams = extract_bigrams(query)
print("Bigrams extracted:", bigrams)

# Embed each bigram and find the most similar relation
bigram_embeddings = model.encode(bigrams, convert_to_tensor=True)

# Find the most similar relation
cosine_scores = util.pytorch_cos_sim(bigram_embeddings, relation_embeddings)
max_scores, _ = cosine_scores.max(dim=0)

# Get the index of the most similar relation
most_similar_idx = max_scores.argmax()

# Get the most similar relation based on the correct index
most_similar_relation = relations[most_similar_idx]

print("Cosine similarity:", 
print("Most similar relation:", most_similar_relation)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\adam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Bigrams extracted: ['tell release', 'release date', 'date Inception']
Cosine similarity: tensor([ 0.1188,  0.0998,  0.1987,  0.2629,  0.1713,  0.0906,  0.1797,  0.2561,
         0.1208,  0.2855,  0.2235,  0.2089,  0.1138,  0.2049,  0.2717,  0.1884,
         0.1105,  0.2477,  0.1864, -0.0122,  0.1661,  0.1852,  0.2064,  0.1725,
         0.2165,  0.0435,  0.1647,  0.1851,  0.3142,  0.2061,  0.0998,  0.2491,
         0.0841,  0.1790,  0.1812,  0.1259,  0.1812,  0.3307,  0.0884,  0.1524,
         0.1666,  0.1644,  0.1304,  0.1589,  0.1670,  0.3560,  0.1180,  0.1336,
         0.0926,  0.1659,  0.2088,  0.0993,  0.2596,  0.1823,  0.1926,  0.1732,
         0.2436,  0.0491,  0.0803,  0.1198,  0.1902,  0.1331,  0.1898,  0.1683,
         0.2164,  0.1689,  0.0939,  0.1859,  0.0760,  0.0631,  0.3519,  0.2288,
         0.2405,  0.1886,  0.2791,  0.1844,  0.0902,  0.1957,  0.1254,  0.0765,
         0.1765,  0.1650,  0.2236,  0.0777,  0.0440,  0.0688,  0.1629,  0.1829,
         0.1670,  0.2199,  0.22

In [23]:
def find_most_similar_relation(self, query):
    """
    Finds the most similar relation to the given query based on cosine similarity for each bigram.
    :param query: The user query as a string.
    :return: The most similar relation.
    """
    
    # Step 1: Extract bigrams from the query
    bigrams = extract_bigrams(query)  # Assuming you have the extract_bigrams function implemented
    print(f"Bigrams extracted: {bigrams}")
    
    # Step 2: Encode each bigram
    bigram_embeddings = self.transformer_model.encode(bigrams, convert_to_tensor=True)
    
    # Step 3: Compute cosine similarities between each bigram and all relation embeddings
    cosine_scores = util.pytorch_cos_sim(bigram_embeddings, self.relation_embeddings)
    
    # Step 4: Iterate over each bigram and print cosine similarity scores
    for i, bigram in enumerate(bigrams):
        print(f"\nCosine similarities for bigram: '{bigram}'")
        for j, relation in enumerate(self.relations):
            similarity_score = cosine_scores[i, j].item()  # Get the similarity score for the bigram and relation
            print(f"  Similarity with relation '{relation}': {similarity_score:.4f}")
    
    # Step 5: Reduce the scores to find the most similar relation (e.g., by max similarity)
    max_scores, _ = cosine_scores.max(dim=0)  # Maximize over bigrams for each relation
    most_similar_idx = max_scores.argmax()  # Get the index of the most similar relation

    # Step 6: Get and print the most similar relation and its cosine similarity
    most_similar_relation = self.relations[most_similar_idx]
    most_similar_distance = max_scores[most_similar_idx].item()
    print(f"\nMost similar relation: '{most_similar_relation}' with cosine similarity: {most_similar_distance:.4f}")
    
    # Step 7: Return the most similar relation if it passes the threshold
    if most_similar_distance < self.threshold:
        return None
    return most_similar_relation


In [None]:
find