In [None]:
# Install sentence-transformers (provides the SentenceTransformer embedding model used below)
%pip install -U sentence-transformers
# Install the ipywidgets library
# NOTE(review): not imported directly anywhere in this notebook; presumably needed for progress-bar widgets — confirm
%pip install ipywidgets
# Install python-graphql-client (provides GraphqlClient, used below to query Dgraph's /graphql endpoint)
%pip install python-graphql-client

In [None]:
# Utils: connection settings and the port-check helper used by the cells below

# Host name of the Dgraph server; later cells build the admin and GraphQL URLs from it
dgraph_hostname = "localhost"

import socket

def check_port(url, port):
    """
    Report whether a TCP connection to ``url``:``port`` can be established.

    Args:
        url: Host name or IPv4 address to connect to. Despite the parameter
            name this is a bare host, not a full URL: it is passed straight
            to ``socket.connect_ex`` together with ``port``.
        port: TCP port number to probe.

    Returns:
        True if the connection attempt succeeds, False if it is refused,
        does not complete within the 3 second timeout, or fails with a
        socket-level error (for example an unresolvable host name).
    """
    try:
        # The context manager closes the socket on every path, including when
        # connect_ex raises (e.g. name resolution failure). Previously the
        # explicit close() was skipped in that case and the socket leaked.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(3)  # Bound the wait so an unreachable host fails fast
            # connect_ex reports connect() failures as an error code; 0 means success
            return sock.connect_ex((url, port)) == 0
    except socket.error:
        return False

# Fail fast when Dgraph's HTTP port is unreachable, before any later cell tries to use it
dgraph_http_port = 8080
if check_port(dgraph_hostname, dgraph_http_port):
    print("Required port(s) accepting connections")
else:
    raise Exception(f"HTTP Port {dgraph_http_port} at {dgraph_hostname} not responding, is the server running?")

In [None]:
import requests

# Apply the GraphQL schema to Dgraph by POSTing the schema file to /admin/schema

admin_url = f'http://{dgraph_hostname}:{dgraph_http_port}/admin/schema'
# Load the schema file. Reading inside a `with` block closes the file handle
# as soon as the contents are in memory; the previous bare open().read()
# left the handle open until garbage collection.
schema_file = "schema.graphql"
with open(schema_file) as schema_fh:
    schema = schema_fh.read()
headers = {
    'Content-Type': 'application/octet-stream',
}
response = requests.post(admin_url, data=schema, headers=headers)
if response.status_code == 200:
    message = response.text
    # A 200 status alone is not treated as success: a body that mentions
    # "errors" is raised as a failure with the full response text.
    if "errors" in message:
        raise Exception(message)
    print("Schema applied successfully")
else:
    raise Exception(f"Error applying schema: {response.text}")



Handy browser utility from Apollo (Apollo Sandbox) for exploring the GraphQL endpoint:

https://studio.apollographql.com/sandbox/explorer?endpoint=http://localhost:8080/graphql

In [None]:
# Create an embedding model using the sentence-transformers library

from sentence_transformers import SentenceTransformer
# 'all-MiniLM-L6-v2' is the model name handed to SentenceTransformer; the query
# cells further down reuse this `model` object to embed their search text
model = SentenceTransformer('all-MiniLM-L6-v2')

from python_graphql_client import GraphqlClient
# NOTE(review): episode_importer / script_importer are presumably local modules next to this notebook — confirm
from episode_importer import load_episodes
from script_importer import load_lines

# Load episodes and lines into Dgraph, pass the model to the line and episodes importer
# NOTE(review): presumably the importers use the model to compute the stored embeddings — confirm in the importer modules

client = GraphqlClient(endpoint=f"http://{dgraph_hostname}:{dgraph_http_port}/graphql")
load_episodes(client, model)
load_lines(client, model)

In [None]:
from python_graphql_client import GraphqlClient

# (Re)create the GraphQL client used by this and the following query cells
client = GraphqlClient(endpoint=f"http://{dgraph_hostname}:{dgraph_http_port}/graphql")

# Query Dgraph for the most similar lines to the input line
line = "Feelings of intense dread"
# encode() is given a one-element list, so [0] selects the embedding of `line`;
# tolist() turns it into a plain list that can be sent as a JSON variable
line_embedding = model.encode([line])[0].tolist()
# Top-3 similarity search on the text_v embedding, also pulling each match's
# episode (with its first 5 lines by number) and speaking character
query = """
    query byLine($vector: [Float!]!) {    
        querySimilarLineByEmbedding(by: text_v, topK: 3, vector: $vector) {
            id
            text
            vector_distance
            episode {
                identifier
                title
                summary
                lines(first: 5, order: {asc: number}) {
                    number
                    text
                    character {
                        name
                        lines(first: 5, order: {asc: number}) {
                            text
                        }
                    }
                }
            }
            character {
                name
            }
        }
    }
"""
variables = {
    "vector": line_embedding
}
data = client.execute(query=query, variables=variables)
# iterate results, pull out the text, episode, and character
# The loop variable is deliberately not called `line`: reusing that name would
# rebind the query text above (a str) to a result dict once the cell has run.
for similar_line in data["data"]["querySimilarLineByEmbedding"]:
    print(f"{similar_line['episode']['title']} ({similar_line['episode']['identifier']}) - {similar_line['character']['name']}: {similar_line['text']} (distance: {similar_line['vector_distance']})")


In [None]:
import json
# Pretty-print the complete nested payload of the first result returned by the
# line-similarity query, to show everything the one-line summary leaves out
print(json.dumps(data["data"]["querySimilarLineByEmbedding"][0], indent=2))


In [None]:
# Query Dgraph for the most similar episodes to the input sentence
# (relies on `model` and `client` created in earlier cells)
sentence = "Joys of the holidays"
# encode() is given a one-element list, so [0] selects the embedding of `sentence`;
# tolist() turns it into a plain list that can be sent as a JSON variable
sentence_embedding = model.encode([sentence])[0].tolist()
# Top-3 similarity search on the summary_v embedding.
# The operation name is spelled "byEpisiode" (sic); it is part of the query text.
query = """
    query byEpisiode($vector: [Float!]!) {    
        querySimilarEpisodeByEmbedding(by: summary_v, topK: 3, vector: $vector) {
            identifier
            title
            summary
            vector_distance
        }
    }
"""
variables = {
    "vector": sentence_embedding
}
# Note: this rebinds `data`, replacing the result of the earlier line query
data = client.execute(query=query, variables=variables)
# iterate results, pull out the episode, summary, and distance
for episode in data["data"]["querySimilarEpisodeByEmbedding"]:
    print(f"{episode['identifier']} {episode['title']} - {episode['summary']} (distance: {episode['vector_distance']})")


In [None]:
# Query similar episodes by episode identifier
# No embedding is computed in the notebook here: the query passes the episode
# identifier ("S03E12") instead of a vector.

query = """
    query {
        querySimilarEpisodeById(by: summary_v, topK: 3, identifier: "S03E12") {
            identifier
            title
            summary
        }
    }
"""
data = client.execute(query=query)
# iterate results, pull out the episode identifier, title, and summary
# (this query does not request vector_distance, so no distance is printed)
for episode in data["data"]["querySimilarEpisodeById"]:
    print(f"{episode['identifier']} {episode['title']} - {episode['summary']}")



### Caveats

* The data generated in this Docker container is not persisted
* Vectors are not updated automatically: for instance, if I update an Episode summary, I also need to regenerate its vector embedding. I think Dgraph's parent, Hypermode, is working on that feature and other related things
* There seem to be some issues with cascading queries when using the generated `querySimilar<Object>ByEmbedding` endpoints

In [None]:
# Issues with cascading queries with the vector searching (Part 1)

# For instance, this cascading query works as expected
# Baseline without vector search: a term search on the line text ("bagel")
# combined with @cascade and a nested filter on season number 9.
# Uses `json`, imported in an earlier cell, to print the raw response.

query = """
  query {
    queryLine(filter: { text: { anyofterms: "bagel" } }) @cascade {
      text
      episode {
        season(filter: { number: { eq: 9 } }) {
          number
        }
      }
    }
  }
"""
data = client.execute(query=query)
print(json.dumps(data, indent=2))

In [None]:
# Issues with cascading queries with the vector searching (continued)

# This cascading query is NOT working with the built-in similarity search
# Same @cascade + season-number filter as the baseline cell, but applied to the
# generated querySimilarLineByEmbedding search.
# Note: `sentence` and `sentence_embedding` are rebound here, replacing the
# values set by the episode-similarity cell.
sentence = "Food"
sentence_embedding = model.encode([sentence])[0].tolist()
query = """
    query byLine($vector: [Float!]!) {    
        querySimilarLineByEmbedding(by: text_v, topK: 3, vector: $vector) @cascade {
            text
            vector_distance
            episode {
              identifier
              season(filter: { number: {eq: 9} }) {
                number
              }
            }
        }
    }
"""
variables = {
    "vector": sentence_embedding
}
data = client.execute(query=query, variables=variables)
# Print the whole response (not just the data section) so any errors are visible too
print(json.dumps(data, indent=2))