In [1]:
# Make the directory two levels above the notebook's working directory
# importable, so that the `app.*` imports used in the cells below resolve.
import os
import sys

# Normalise the path (no trailing "./../../" segments) so the entry is stable.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Enable IPython's autoreload extension (mode 2) so that edits to the project
# classes under test are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the knowledge graph for the DEV environment; per the cell output this
# takes roughly 26 s and yields 94107 triples.
from app.services.sparql_graph import SPARQLGraph

from app.config.enums import Environment
# NOTE(review): the meaning of the second positional argument (False) is not
# visible from this notebook -- confirm against SPARQLGraph.__init__.
graph = SPARQLGraph(Environment.DEV, False)

Metadata loaded successfully from JSON files.
Initializing SPARQLGraph
Graph loaded with 94107 triples after 0:00:25.869326


In [3]:
# Shared extractor instance: it is passed to RelationshipExtractor and
# MovieExtractor below and used directly for the person lookup.
from app.services.extractors.main import SpacyExtractor
spacy_extractor = SpacyExtractor()

# Relationship extractor

In [None]:
# Build the relationship extractor on top of the shared spaCy extractor.
from app.services.extractors.relationship_extraction import RelationshipExtractor

relation_extractor = RelationshipExtractor(spacy_extractor)

Spacy extractor initialized
Embedding extractor initialized


In [5]:
# Baseline: the relation word 'screenwriter' appears verbatim in the query and
# is returned as-is (score 100).
relation_extractor.extract("Who is the screenwriter of the movie The Godfather?")

Mutated text: who is the screenwriter of the movie the godfather?
Matches after filtering overlapping matches: [{'original_text': 'screenwriter', 'start': 11, 'end': 23}]
NLTK matches: [{'original_text': 'screenwriter', 'start': 11, 'end': 23}]
Full word matches: [{'match_text': 'screenwriter', 'original_text': 'screenwriter', 'label': 'relation', 'score': 100.0, 'start': 11, 'end': 23, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'screenwriter', 'original_text': 'screenwriter', 'label': 'relation', 'score': 100.0, 'start': 11, 'end': 23, 'is_full_word': True}]
NLTK relationship found: ['screenwriter']


['screenwriter']

In [6]:
# Works once the movie title is removed -> remember that relation extraction
# is run on masked queries, so the title is represented here by empty quotes.
relation_extractor.extract('Who directed the movie ""')

Mutated text: who directed the movie ""
Matches after filtering overlapping matches: []
NLTK matches: []
Full word matches: [{'match_text': 'directed', 'original_text': 'director', 'label': 'relation', 'score': 75.0, 'start': 4, 'end': 12, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'directed', 'original_text': 'director', 'label': 'relation', 'score': 75.0, 'start': 4, 'end': 12, 'is_full_word': True}]
NLTK relationship found: ['director']


['director']

In [8]:
# Multi-word relation: 'imdb id' is matched in the lower-cased query and the
# extractor returns it as 'IMDb ID'.
relation_extractor.extract('What is the imdb id of the movie "The Matrix"?')

Mutated text: what is the imdb id of the movie "the matrix"?
Matches after filtering overlapping matches: [{'original_text': 'imdb id', 'start': 12, 'end': 19}]
NLTK matches: [{'original_text': 'imdb id', 'start': 12, 'end': 19}]
Full word matches: [{'match_text': 'imdb id', 'original_text': 'imdb id', 'label': 'relation', 'score': 100.0, 'start': 12, 'end': 19, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'imdb id', 'original_text': 'imdb id', 'label': 'relation', 'score': 100.0, 'start': 12, 'end': 19, 'is_full_word': True}]
NLTK relationship found: ['IMDb ID']


['IMDb ID']

In [9]:
# Known false positive: 'directs' is fuzzy-matched to 'depicts' (score ~71.4),
# so the call returns ['depicts'].
# NOTE(review): 'director' was presumably the intended relation -- confirm and
# consider tightening the fuzzy-match threshold or adding verb forms.
relation_extractor.extract("who directs the movie ?")

Mutated text: who directs the movie ?
Matches after filtering overlapping matches: []
NLTK matches: []
Full word matches: [{'match_text': 'directs', 'original_text': 'depicts', 'label': 'relation', 'score': 71.42857142857143, 'start': 4, 'end': 11, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'directs', 'original_text': 'depicts', 'label': 'relation', 'score': 71.42857142857143, 'start': 4, 'end': 11, 'is_full_word': True}]
NLTK relationship found: ['depicts']


['depicts']

# Movie extractor

In [4]:
# Build the movie-title extractor on top of the shared spaCy extractor.
from app.services.extractors.movie_extraction import MovieExtractor
movie_extractor = MovieExtractor(spacy_extractor)

In [9]:
# Scratch check: one third of the query length in characters (the query is
# 50 characters long).
# NOTE(review): the divisor 3 is unexplained here -- presumably it mirrors a
# length-based threshold inside the extractor; confirm or remove this cell.
len("Can you tell me who is the author of The Inception") / 3

16.666666666666668

In [23]:
# The common word 'tell' also matches a movie title (score 100) in the
# intermediate steps, but only 'Inception' ends up in the returned list.
movie_extractor.extract("Can you tell me who is the author of The Inception")

Mutated text: can you tell me who is the author of the inception
Matches after filtering overlapping matches: [{'original_text': 'inception', 'start': 41, 'end': 50}, {'original_text': 'tell', 'start': 8, 'end': 12}]
NLTK matches: [{'original_text': 'inception', 'start': 41, 'end': 50}, {'original_text': 'tell', 'start': 8, 'end': 12}]
Alignment result: None
Full word matches: [{'match_text': 'tell', 'original_text': 'tell', 'label': 'movie', 'score': 100.0, 'start': 8, 'end': 12, 'is_full_word': True}, {'match_text': 'inception', 'original_text': 'inception', 'label': 'movie', 'score': 100.0, 'start': 41, 'end': 50, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'inception', 'original_text': 'inception', 'label': 'movie', 'score': 100.0, 'start': 41, 'end': 50, 'is_full_word': True}, {'match_text': 'tell', 'original_text': 'tell', 'label': 'movie', 'score': 100.0, 'start': 8, 'end': 12, 'is_full_word': True}]
Combined results: [{'original_text': 'tell', 'start': 8, 'end': 12, '

['Inception']

In [98]:
# Several titles in one query; 'Nightmare on Elm Street' (missing its leading
# article) is resolved to 'A Nightmare on Elm Street' via a fuzzy match (score 96.0).
movie_extractor.extract("Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween")

Mutated text: recommend movies like nightmare on elm street, friday the 13th, and halloween
Matches after filtering overlapping matches: [{'original_text': 'halloween', 'start': 68, 'end': 77}, {'original_text': 'friday the 13th', 'start': 47, 'end': 62}]
NLTK matches: [{'original_text': 'halloween', 'start': 68, 'end': 77}, {'original_text': 'friday the 13th', 'start': 47, 'end': 62}]
Alignment result: None
Full word matches: [{'match_text': 'friday the 13th', 'original_text': 'friday the 13th', 'label': 'movie', 'score': 100.0, 'start': 47, 'end': 62, 'is_full_word': True}, {'match_text': 'e nightmare on elm street', 'original_text': 'a nightmare on elm street', 'label': 'movie', 'score': 96.0, 'start': 20, 'end': 45, 'is_full_word': True}, {'match_text': 'halloween', 'original_text': 'halloween', 'label': 'movie', 'score': 100.0, 'start': 68, 'end': 77, 'is_full_word': True}, {'match_text': 'friday', 'original_text': 'friday', 'label': 'movie', 'score': 100.0, 'start': 47, 'end': 53

['A Nightmare on Elm Street', 'Friday the 13th', 'Halloween']

In [99]:
# Title given without its leading article and with inconsistent casing; it is
# still resolved to 'The Masked Gang: Cyprus' via a fuzzy match (score ~88.9).
movie_extractor.extract("Who is the screenwriter of masked Gang: cyprus ")

Mutated text: who is the screenwriter of masked gang: cyprus 
Matches after filtering overlapping matches: []
NLTK matches: []
Alignment result: None
Full word matches: [{'match_text': 'masked gang: cyprus', 'original_text': 'the masked gang: cyprus', 'label': 'movie', 'score': 88.88888888888889, 'start': 26, 'end': 48, 'is_full_word': True}, {'match_text': 'of masked gang:', 'original_text': 'the masked gang', 'label': 'movie', 'score': 80.0, 'start': 24, 'end': 39, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'masked gang: cyprus', 'original_text': 'the masked gang: cyprus', 'label': 'movie', 'score': 88.88888888888889, 'start': 26, 'end': 48, 'is_full_word': True}]
NLTK entity found: ['The Masked Gang: Cyprus']


['The Masked Gang: Cyprus']

In [100]:
# Exact, quoted title: the full title wins over the shorter 'godzilla' match.
movie_extractor.extract('Who directed "Godzilla vs. Gigan"')

Mutated text: who directed "godzilla vs. gigan"
Matches after filtering overlapping matches: [{'original_text': 'godzilla vs. gigan', 'start': 14, 'end': 32}]
NLTK matches: [{'original_text': 'godzilla vs. gigan', 'start': 14, 'end': 32}]
Alignment result: None
Full word matches: [{'match_text': 'godzilla vs. gigan', 'original_text': 'godzilla vs. gigan', 'label': 'movie', 'score': 100.0, 'start': 14, 'end': 32, 'is_full_word': True}, {'match_text': 'godzilla', 'original_text': 'godzilla', 'label': 'movie', 'score': 100.0, 'start': 14, 'end': 22, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'godzilla vs. gigan', 'original_text': 'godzilla vs. gigan', 'label': 'movie', 'score': 100.0, 'start': 14, 'end': 32, 'is_full_word': True}]
NLTK entity found: ['Godzilla vs. Gigan']


['Godzilla vs. Gigan']

In [101]:
# Same title in lower case and without the period after 'vs'; it is still
# resolved to 'Godzilla vs. Gigan' via a fuzzy match (score ~94.4).
movie_extractor.extract('Who directed "godzilla vs gigan"')

Mutated text: who directed "godzilla vs gigan"
Matches after filtering overlapping matches: [{'original_text': 'godzilla', 'start': 14, 'end': 22}]
NLTK matches: [{'original_text': 'godzilla', 'start': 14, 'end': 22}]
Alignment result: None
Full word matches: [{'match_text': 'godzilla vs gigan"', 'original_text': 'godzilla vs. gigan', 'label': 'movie', 'score': 94.44444444444444, 'start': 14, 'end': 32, 'is_full_word': True}, {'match_text': 'godzilla', 'original_text': 'godzilla', 'label': 'movie', 'score': 100.0, 'start': 14, 'end': 22, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'godzilla vs gigan"', 'original_text': 'godzilla vs. gigan', 'label': 'movie', 'score': 94.44444444444444, 'start': 14, 'end': 32, 'is_full_word': True}]
NLTK entity found: ['Godzilla vs. Gigan']


['Godzilla vs. Gigan']

In [102]:
# Recommendation-style query with three titles: the misspelled 'Pochontas' is
# resolved to 'Pocahontas' (score 90.0) and 'The Beauty and the Beast' is
# returned as 'Beauty and the Beast'.
movie_extractor.extract("Given that I like The Lion King, Pochontas, and The Beauty and the Beast, can you recommend some movies")

Mutated text: given that i like the lion king, pochontas, and the beauty and the beast, can you recommend some movies
Matches after filtering overlapping matches: [{'original_text': 'beauty and the beast', 'start': 52, 'end': 72}, {'original_text': 'the lion king', 'start': 18, 'end': 31}]
NLTK matches: [{'original_text': 'beauty and the beast', 'start': 52, 'end': 72}, {'original_text': 'the lion king', 'start': 18, 'end': 31}]
Alignment result: None
Full word matches: [{'match_text': 'beauty and the beast', 'original_text': 'beauty and the beast', 'label': 'movie', 'score': 100.0, 'start': 52, 'end': 72, 'is_full_word': True}, {'match_text': 'beast', 'original_text': 'beast', 'label': 'movie', 'score': 100.0, 'start': 67, 'end': 72, 'is_full_word': True}, {'match_text': 'pochontas', 'original_text': 'pocahontas', 'label': 'movie', 'score': 90.0, 'start': 32, 'end': 42, 'is_full_word': True}, {'match_text': 'beauty', 'original_text': 'beauty', 'label': 'movie', 'score': 100.0, 'start'

['The Lion King', 'Pocahontas', 'Beauty and the Beast']

# Person extractor

In [21]:
# Person lookup goes through the shared spaCy extractor directly, restricted
# to the "person" label; the result is a dict with 'movies', 'relations' and
# 'people' keys.
message = "Show me a picture of Tom Holland"
spacy_extractor.get_entities_with_fuzzy_matching(message, "person")

Mutated text: show me a picture of tom holland
Matches after filtering overlapping matches: [{'original_text': 'tom holland', 'start': 21, 'end': 32}]
NLTK matches: [{'original_text': 'tom holland', 'start': 21, 'end': 32}]
Matches for candidate 'tom holland': ScoreAlignment(score=100.0, src_start=0, src_end=11, dest_start=21, dest_end=32)
Matches before filtering full word matches: [{'match_text': 'tom holland', 'original_text': 'tom holland', 'label': 'person', 'score': 100.0, 'start': 21, 'end': 32, 'is_full_word': True}]
Full word matches: [{'match_text': 'tom holland', 'original_text': 'tom holland', 'label': 'person', 'score': 100.0, 'start': 21, 'end': 32, 'is_full_word': True}]
Fuzzy matches: [{'match_text': 'tom holland', 'original_text': 'tom holland', 'label': 'person', 'score': 100.0, 'start': 21, 'end': 32, 'is_full_word': True}]


{'movies': [], 'relations': [], 'people': ['Tom Holland']}