In [12]:
from langchain_openai import OpenAIEmbeddings
from graphdatascience import GraphDataScience
from getpass import getpass
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


# Set up connection information

In [13]:
openai_api_key = getpass()

 ········


In [14]:
neo4j_password = getpass()

 ········


In [15]:
neo4j_uri = "neo4j+s://2fe3bf28.databases.neo4j.io"
neo4j_user = "neo4j"
gds = GraphDataScience(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Clean up themes

Remove themes without English characters. Films with non-English titles sometimes caused the LLM to give themes in the language of the title even if the overview was in English.

In [29]:
gds.run_cypher("""
MATCH (t:Theme) WHERE t.description =~ '^[^a-zA-Z0-9]*$'
DETACH DELETE t""")

If the content is too explicit for the LLM to work with, drop it from our dataset.

In [38]:
gds.run_cypher("""
MATCH (t:Theme)<-[:HAS_THEME]-(m)
WHERE t.description STARTS WITH "I will not"
OR t.description STARTS WITH "I am sorry"
OR t.description STARTS WITH "I apologize"
OR t.description STARTS WITH "I will not generate a summary"
OR t.description STARTS WITH "I cannot provide"
OR t.description STARTS WITH "I will have to politely decline"
DETACH DELETE m""")

In [62]:
gds.run_cypher("""
MATCH (t:Theme) WHERE NOT EXISTS {()-[:HAS_THEME]->(t)} DETACH DELETE t""")

Sometimes LLM gave us a heading followed by a colon and a list. Drop the heading and just keep the list element.

If the first list item already exists as a separate node, merge this node with the existing node.

In [45]:
gds.run_cypher("""
MATCH (t:Theme) WHERE t.description contains ":\n" 
MATCH (t2:Theme)
WHERE t2.description = trim(split(t.description, ":\n")[1])
WITH [t2, t] AS nodeList
CALL apoc.refactor.mergeNodes(nodeList, {properties:"discard"}) YIELD node
RETURN node.description AS newDescription""")

Unnamed: 0,newDescription
0,Alone
1,Awakening
2,Cinema
3,Coming of age
4,Doctor Who
5,Family bonds
6,Friendship
7,Grandmother
8,Hero
9,Honk


If the same item exists with different headings, merge those nodes together.

In [46]:
gds.run_cypher("""
MATCH (t:Theme) WHERE t.description contains ":\n" 
WITH trim(split(t.description, ":\n")[1]) AS newDescription, collect(t) AS nodeList
CALL apoc.refactor.mergeNodes(nodeList, {properties:"discard"}) YIELD node
SET node.description = trim(split(node.description, ":\n")[1])
RETURN node.description AS newDescription""")

Unnamed: 0,newDescription
0,Child exploitation
1,Egotrip
2,Children's festival
3,Cosmic powers
4,Inventions
...,...
63,Chūshingura
64,Fallen Angel
65,Endaneleyay
66,Looking out


In [47]:
gds.run_cypher("""MATCH (t:Theme)<-[:HAS_THEME]-(m) WHERE t.description =~ ".*public figure.*\..*\.$" DETACH DELETE m RETURN count(*)""")

Unnamed: 0,count(*)
0,5


Sometimes the LLM delimited the themes with carriage return instead of commas.

In [88]:
gds.run_cypher("""
    MATCH (t:Theme) WHERE t.description contains "\n" 
    WITH t, split(t.description, "\n") AS split
    UNWIND split AS desc
    WITH t, desc
    WHERE desc <> ""
    AND size(desc) <= 50
    WITH t, CASE WHEN desc =~ '\d\..*' THEN split(desc, ". ")[1] 
    WHEN desc ENDS WITH "," THEN substring(desc, 0, size(desc)-1) ELSE desc END as cleanDesc
    MERGE (t2:Theme {description: cleanDesc})
    WITH t, t2
    MATCH (m:Movie)-[:HAS_THEME]->(t)
    MERGE (m)-[:HAS_THEME]->(t2)
    DETACH DELETE t
    RETURN count(*)""")

Unnamed: 0,count(*)
0,91


Drop trailing periods unless there is more than one period in the description like "A.I."

In [89]:
gds.run_cypher("""
MATCH (t:Theme) WHERE NOT t.description =~ ".*.*\..*\.$" AND t.description ENDS WITH '.' 
MATCH (t2:Theme)
WHERE trim(t2.description) = substring(t.description, 0, size(t.description)-1)
WITH [t2, t] AS nodeList
CALL apoc.refactor.mergeNodes(nodeList, {properties:"discard"}) YIELD node
RETURN node.description AS newDescription""")


Unnamed: 0,newDescription


In [90]:
gds.run_cypher("""
MATCH (t:Theme) WHERE NOT t.description =~ ".*.*\..*\.$" AND t.description ENDS WITH '.' 
SET t.description = substring(t.description, 0, size(t.description)-1)
RETURN t.description""")


Unnamed: 0,t.description


In [91]:
pd.set_option('display.max_colwidth', None)

We asked the LLM for memorable themes, settings, and public figures. Sometimes those terms came through in the output. Drop those themes.

In [92]:
gds.run_cypher("""
MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "memorable themes" 
OR toLower(t.description) CONTAINS "settings"
OR toLower(t.description) CONTAINS "public figure"
RETURN t.description ORDER BY t.description""")


Unnamed: 0,t.description


In [93]:
gds.run_cypher("""
MATCH (t:Theme) WHERE t.description = "Elon (public figure)"
SET t.description = "Elon" """)

In [94]:
gds.run_cypher("""
MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "memorable themes" 
OR toLower(t.description) CONTAINS "settings"
OR toLower(t.description) CONTAINS "public figure"
DETACH DELETE t""")


In [48]:
gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "overview" RETURN t.description AS description""")['description'].tolist()

['Based on the provided title and overview',
 'Based on the title "Another Alone" and the overview provided',
 'Based on the title and overview provided',
 "but the overview is quite vague and doesn't provide enough context to extract more relevant phrases"]

In [49]:
gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "overview" DETACH DELETE t""")

In [50]:
gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "title" RETURN t.description AS description""")['description'].tolist()

['Alternate titles',
 'Alétheia (from the title)',
 'Based on the title "Zakaria: a Hero in Memory"',
 'Based solely on the title "ChayaBrikkho"',
 'Based solely on the title "Good Looking Out"',
 'British title',
 'Championship title',
 'Entitled behavior',
 'Flower (from title)',
 'King title',
 'Light heavyweight title',
 'Lightweight title',
 'Poetic intertitles',
 'Reflective intertitles',
 'Speculative intertitles',
 'Subtitles',
 'Tag Team Titles',
 'Title',
 'Title IX',
 'Title contention',
 'Title defense',
 'Title eliminator',
 'Title fight',
 'Title match',
 'Titles',
 'Ultraviolent Title',
 'Undisputed titles',
 'World title',
 'World title fight',
 'World titles',
 'epic title',
 'title challenger',
 'untitled']

In [51]:
gds.run_cypher("""MATCH (t:Theme) 
WHERE t.description in ['Based on the title "Zakaria: a Hero in Memory"',
 'Based solely on the title "ChayaBrikkho"',
 'Based solely on the title "Good Looking Out"']
 DETACH DELETE t RETURN COUNT(*) AS deletedCount""")

Unnamed: 0,deletedCount
0,3


In [53]:
gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "theme" RETURN t.description AS description""")['description'].tolist()

['African themes',
 'Christian themes',
 'Feminist themes',
 'Haunted theme park',
 'LGBTQ themes',
 'LGBTQ+ themes',
 'Mature themes',
 'Religious themes',
 'Satya themes',
 'Theme park',
 'adult themes',
 'dark themes',
 'environmental themes',
 'erotic themes',
 "it's difficult to infer other relevant themes",
 'philosophical themes',
 'social themes',
 'themes']

In [54]:
gds.run_cypher("""MATCH (t:Theme) 
WHERE t.description in ["it's difficult to infer other relevant themes",
 "themes"]
 DETACH DELETE t RETURN COUNT(*) AS deletedCount""")

Unnamed: 0,deletedCount
0,2


In [55]:
gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "relevant" RETURN t.description AS description""")['description'].tolist()

['Irrelevant groups',
 'but more context would be needed to provide additional relevant phrases']

In [56]:
gds.run_cypher("""MATCH (t:Theme) 
WHERE t.description in ['but more context would be needed to provide additional relevant phrases']
 DETACH DELETE t RETURN COUNT(*) AS deletedCount""")

Unnamed: 0,deletedCount
0,1


In [58]:
gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) STARTS WITH "based" RETURN t.description AS description""")['description'].tolist()

['Based on the limited information provided',
 'Based on the very limited information provided']

In [59]:
gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) STARTS WITH "based"
 DETACH DELETE t RETURN COUNT(*) AS deletedCount""")

Unnamed: 0,deletedCount
0,2


# Generate embeddings
Embed the themes for clustering and the movie text for testing RAG.

In [96]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)

In [99]:
themes = gds.run_cypher("""
                        MATCH (t:Theme)
                        WHERE t.embedding IS NULL
                        RETURN t.description AS description""")

In [98]:
movie_text = gds.run_cypher("""
MATCH (m:Movie)
RETURN m.tmdbId AS id, m.title + ". " + m.overview AS movieText""")

In [100]:
themes

Unnamed: 0,description
0,Acid barrel murderer
1,Altering time
2,Combat swimmers
3,Corporate world
4,Detective inspector
5,Edinburgh 2021
6,Enduring wonder
7,Female pioneer
8,Financial risk
9,First loves


In [68]:
movie_text

Unnamed: 0,id,movieText
0,27296,"The Gingerdead Man. An evil yet adorable Gingerbread man comes to life with the soul of a convicted killer, and this real life cookie monster wreaks havoc on the girl who sent the killer to the electric chair."
1,65456,The Atrocity Exhibition. A doctor in a mental research institution is driven insane by the spectacle of the horrors of the twentieth century.
2,75594,"Blind Turn. Samantha Holt had the perfect life with a handsome fiancée and a future that couldn’t look brighter. But an unexpected turn on a dark lonely road puts her on a collision course with Bruce Miller, a devoted family man who descends into madness when he loses everything he ever cared about. One year later, Bruce decides to give Samantha the punishment he believes she escaped. Samantha has struggled with her own demons since that fateful night, and Bruce will take her struggles to the edge when he implements a demented plan to teach her a twisted lesson that may cost her the ultimate price."
3,187716,"Universal Groove. Filmed in 1999, Universal Groove explores 90s rave culture through eight diverse characters seeking escape and self-discovery amidst drugs and dance, offering a nostalgic journey into the underground party scene of the era."
4,269509,"The Canterville Ghost. An American family moves in to the Canterville Chase, a London mansion that has been haunted by ghost Sir Simon De Canterville for 300 years."
...,...,...
16171,1256551,"Agastya – Chapter 1. Saurya is haunted by a difficult past and grapples with his troubled mind. After a year, his brother embarks on a quest to find him. A touching story unfolds as Nisan delves into Saurya's history."
16172,1256574,"Eye of the Fen. Far in the future, in a place that was once called Finland, there lives a people whose history is shrouded in mystery. A mythical song describes the past, but it is not enough for the Young One. Fortunately, there is a sect in nearby fen that preserves ancient knowledge."
16173,1256587,"Mushrooms. n Mushrooms (2023), the artist trains her lens on the boreal forests of Northeast Asia. Beneath the canopy’s dappled light, the camera follows the transfer of energy and information through the micro-ecosystems on the forest’s floor, presenting the interplay of mushrooms, moss, insects, and soil as a grand dance."
16174,1256588,"How I Trafficked $5 Million Worth of Cocaine | Posh Pete’s Full Story. ""Former cocaine smuggler Pieter Tritton returns for a second interview with Business Insider about his experience trafficking drugs from South America to the United Kingdom. Tritton says he started selling cocaine in the illegal rave scene in the UK in the 2000s. He then established a cartel connection and began importing cocaine to Europe in larger quantities. Tritton was arrested in Ecuador and served 10 years in prison there, first in Garcia Moreno in Quito, and later in Litoral Penitentiary in Guayaquil, which is one of the world's most violent and corrupt prisons. ""He now works as a public speaker on the dangers of drugs, consults with the UK police force, and is writing a follow-up to his 2017 memoir, 'El Infierno: Drugs, Gangs, Riots and Murder: My time inside Ecuador's toughest prisons.'"""


## Create indexes for the vector properties.

In [12]:
gds.run_cypher("""CREATE VECTOR INDEX theme_vectors IF NOT EXISTS 
                  FOR (t:Theme)
                  ON (t.embedding)
                  OPTIONS {indexConfig: 
                      {`vector.dimensions`: 1536,
                       `vector.similarity_function`: 'cosine'
                       }}
                       """)

In [17]:
gds.run_cypher("""CREATE VECTOR INDEX movie_text_vectors IF NOT EXISTS 
                  FOR (m:Movie)
                  ON (m.embedding)
                  OPTIONS {indexConfig: 
                      {`vector.dimensions`: 1536,
                       `vector.similarity_function`: 'cosine'
                       }}
                       """)

## Send vectors to Neo4j

In [101]:
for i in range(0, int(themes.shape[0]/5000) + 1):
    theme_slice = themes.iloc[i*5000:(i+1)*5000].copy()
    theme_slice['embedding'] = embeddings.embed_documents(theme_slice['description'].to_list())
    gds.run_cypher("""
        UNWIND $themeData AS d
        MATCH (t:Theme {description:d['description']})
        CALL db.create.setNodeVectorProperty(t, 'embedding', d['embedding'])
        RETURN count(*) AS updateCount""",
                   {'themeData': theme_slice.to_dict('records')})
    print(f"Finished row {(i+1)*5000}.")



Finished row 5000.


In [75]:
for i in range(0, int(movie_text.shape[0]/5000) + 1):
    movie_slice = movie_text.iloc[i*5000:(i+1)*5000].copy()
    movie_slice['embedding'] = embeddings.embed_documents(movie_slice['movieText'].to_list())
    gds.run_cypher("""
        UNWIND $movieData AS d
        MATCH (m:Movie {tmdbId:d['id']})
        CALL db.create.setNodeVectorProperty(m, 'embedding', d['embedding'])
        RETURN count(*) AS updateCount""",
                   {'movieData': movie_slice.to_dict('records')})
    print(f"Finished row {(i+1)*5000}.")



Finished row 5000.




Finished row 10000.




Finished row 15000.
Finished row 20000.
