In [8]:
%pip install thefuzz

Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.10.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.10.0-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.6 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.6 MB 985.5 kB/s eta 0:00:02
   ------------- -------------------------- 0.5/1.6 MB 985.5 kB/s eta 0:00:02
   ------------------- -------------------- 0.8/1.6 MB 907.1 kB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 1.1 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 1.2 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-

In [1]:
# import labels and relationships from the json files
import json
with open('./../../useful_dataset/graph/unique_movies.json') as f:
    file = json.load(f)
    entities = file
    # convert to lowercase
    entities = [entity.lower() for entity in entities]

with open('./../../useful_dataset/graph/unique_relationships.json.') as f:
    file = json.load(f)
    relations = file
    # convert to lowercase
    relations = [relation.lower() for relation in relations]

with open('./../../useful_dataset/graph/unique_persons.json') as f:
    file = json.load(f)
    persons = file
    # convert to lowercase
    persons = [person.lower() for person in persons]

In [2]:
import spacy
from thefuzz import process

# Step 2: Create the spaCy model and add the EntityRuler
nlp = spacy.blank("en")  # create an empty spaCy model
ruler = nlp.add_pipe("entity_ruler")  # add EntityRuler to the pipeline

# Step 3: Create patterns for movies and relationships
patterns = []

# Add movie entities
for movie in entities:
    patterns.append({"label": "movie", "pattern": movie})

# Add relationships (you could have relationships as separate labels)
for relation in relations:
    patterns.append({"label": "relation", "pattern": relation})

for relation in persons:
    patterns.append({"label": "person", "pattern": relation})

# Step 4: Add the patterns to the ruler
ruler.add_patterns(patterns)

In [3]:
# Fallback Fuzzy Search if no entity is found by exact match
def fuzzy_match_entity(input_text, labels):
    # Use fuzzy matching to find the closest match to the input text from a list of labels
    result = process.extractOne(input_text, labels)

    if result is None:
        return None
    
    match, score = result
    return match if score > 10 else None  # Set a score threshold for matching (75 is common for good matches)

# Function to get entities using spaCy and fallback to fuzzy search
def get_entities_with_fuzzy_matching(text):
    text = text.lower()
    doc = nlp(text)
    movies = []
    relations = []
    persons = []
    
    # Exact matching via spaCy
    for ent in doc.ents:
        if ent.label_ == "movie":
            movies.append(ent.text)
        elif ent.label_ == "relation":
            relations.append(ent.text)
        elif ent.label_ == "person":
            persons.append(ent.text)
        
    
    # Fallback to fuzzy matching if no exact matches are found
    if not movies:  # If no movies were found
        fuzzy_movie = fuzzy_match_entity(text, entities)
        if fuzzy_movie:
            movies.append(fuzzy_movie)
    
    if not relations:  # If no relations were found
        fuzzy_relation = fuzzy_match_entity(text, relations)
        if fuzzy_relation:
            relations.append(fuzzy_relation)
    
    return {"movies": movies, "relations": relations, "persons": persons}

# Test cases with fuzzy matching
sentences = [
    "Who is the director of Inception", 
    "When was 'The Godfather' released", 
    "Who is the director of 'The Godfather'" ,
    "What is the genre of Inception",
    "Show me an Picture of Leonardo Di Cesare"
]

for sentence in sentences:
    print(f"Sentence: {sentence}")
    found_entities = get_entities_with_fuzzy_matching(sentence)
    print(found_entities)
    print()

Sentence: Who is the director of Inception
{'movies': ['inception'], 'relations': ['director'], 'persons': []}

Sentence: When was 'The Godfather' released
{'movies': ['the godfather'], 'relations': [], 'persons': []}

Sentence: Who is the director of 'The Godfather'
{'movies': ['the godfather'], 'relations': ['director'], 'persons': []}

Sentence: What is the genre of Inception
{'movies': ['inception'], 'relations': ['genre'], 'persons': []}

Sentence: Show me an Picture of Leonardo Di Cesare
{'movies': ['lilies of the field'], 'relations': [], 'persons': ['leonardo di cesare']}



In [4]:
# Save the model to disk so it can be reused
nlp.to_disk("movie_relation_ner_model")