In [148]:
# from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from graphdatascience import GraphDataScience
from getpass import getpass
import pandas as pd
import os

In [149]:
from google.oauth2 import service_account
from dotenv import dotenv_values
import json


# Set up connection information

In [150]:
# Read the local .env file into `config`.
config = dotenv_values(dotenv_path=".env")

In [151]:
# Resolve the Neo4j password at run time instead of hard-coding the secret in
# the notebook. Lookup order: NEO4J_PASSWORD in the .env file loaded into
# `config`, then NEO4J_PASSWORD in the process environment, and finally an
# interactive prompt that does not echo the input.
neo4j_password = (
    config.get("NEO4J_PASSWORD")
    or os.environ.get("NEO4J_PASSWORD")
    or getpass("Neo4j password: ")
)

In [152]:
# Connection settings for the Neo4j instance.
# NOTE(review): neo4j_database_name is not referenced by any other visible cell.
neo4j_user = "neo4j"
neo4j_database_name = "neo4j"
bolt_url = "bolt://52.87.157.164:7687"

# Open the Graph Data Science client with user/password authentication.
neo4j_credentials = (neo4j_user, neo4j_password)
gds = GraphDataScience(bolt_url, auth=neo4j_credentials)

# Clean up themes

Remove themes whose descriptions contain no English (ASCII) letters or digits. Films with non-English titles sometimes caused the LLM to give themes in the language of the title even if the overview was in English.

In [153]:
# Delete every Theme whose whole description matches '^[^a-zA-Z0-9]*$',
# i.e. contains no ASCII letter or digit at all.
non_ascii_theme_query = """
MATCH (t:Theme) WHERE t.description =~ '^[^a-zA-Z0-9]*$'
DETACH DELETE t"""
gds.run_cypher(non_ascii_theme_query)

If the content is too explicit for the LLM to work with, drop it from our dataset.

In [154]:
# Delete every node `m` that has a HAS_THEME relationship to a Theme whose
# description starts with one of the listed refusal phrases. Note that the
# node removed is `m`, not the Theme itself.
refusal_cleanup_query = """
MATCH (t:Theme)<-[:HAS_THEME]-(m)
WHERE t.description STARTS WITH "I will not"
OR t.description STARTS WITH "I am sorry"
OR t.description STARTS WITH "I apologize"
OR t.description STARTS WITH "I will not generate a summary"
OR t.description STARTS WITH "I cannot provide"
OR t.description STARTS WITH "I will have to politely decline"
DETACH DELETE m"""
gds.run_cypher(refusal_cleanup_query)

In [155]:
# Delete Theme nodes that no longer have any incoming HAS_THEME relationship.
orphan_theme_query = """
MATCH (t:Theme) WHERE NOT EXISTS {()-[:HAS_THEME]->(t)} DETACH DELETE t"""
gds.run_cypher(orphan_theme_query)

Sometimes the LLM gave us a heading followed by a colon and a list. Drop the heading and just keep the list element.

If the first list item already exists as a separate node, merge this node with the existing node.

In [156]:
# For each Theme `t` whose description contains a colon followed by a newline,
# look for another Theme `t2` whose description equals the trimmed segment
# that follows the first ":\n" in t's description. Each matching pair is merged
# with apoc.refactor.mergeNodes, passing t2 first in the node list, and the
# merged node's description is returned.
merge_into_existing_query = """
MATCH (t:Theme) WHERE t.description contains ":\n" 
MATCH (t2:Theme)
WHERE t2.description = trim(split(t.description, ":\n")[1])
WITH [t2, t] AS nodeList
CALL apoc.refactor.mergeNodes(nodeList, {properties:"discard"}) YIELD node
RETURN node.description AS newDescription"""
gds.run_cypher(merge_into_existing_query)

Unnamed: 0,newDescription
0,Time Travel
1,The Great Gatsby
2,Justice
3,Choice


If the same item exists with different headings, merge those nodes together.

In [157]:
# Group the remaining "heading:\n item" Themes by the trimmed segment after the
# first ":\n", merge each group into a single node with
# apoc.refactor.mergeNodes, then overwrite the merged node's description with
# that same segment taken from its own description.
merge_same_item_query = """
MATCH (t:Theme) WHERE t.description contains ":\n" 
WITH trim(split(t.description, ":\n")[1]) AS newDescription, collect(t) AS nodeList
CALL apoc.refactor.mergeNodes(nodeList, {properties:"discard"}) YIELD node
SET node.description = trim(split(node.description, ":\n")[1])
RETURN node.description AS newDescription"""
gds.run_cypher(merge_same_item_query)

Unnamed: 0,newDescription
0,1. The main character's arc and transformation.\n\n2. The central conflict and its resolution.\n\n3. The setting's influence on the story.\n\n4. The role of secondary characters in advancing the plot.\n\n5. The use of symbolism and thematic elements.\n\n6. The director's unique style and how it contributes to the film.\n\n7. The cinematography and its impact on the viewer's experience.\n\n8. The soundtrack and its contribution to the mood.\n\n9. The film's cultural or historical significance.\n\n10. The critical reception and any awards or nominations.\n\nOnly include phrases that are directly relevant to the movie's content and critical acclaim.\n\nPrioritize phrases that are central to the film's identity.\n\nYou can provide fewer than ten phrases.\n\nvery important
1,1. The main character's arc and transformation.\n\n2. Key supporting characters and their roles.\n\n3. The central conflict and its resolution.\n\n4. The setting and its influence on the story.\n\n5. The film's thematic elements.\n\n6. The director's unique style or signature elements.\n\n7. The cinematography and its contribution to the mood.\n\n8. The soundtrack and its impact on the narrative.\n\n9. The critical reception and any awards or nominations.\n\n10. The movie's legacy or influence on future works.\n\nOnly include phrases that are directly relevant to the movie's content and critical acclaim.\n\nPrioritize phrases that are central to the movie's identity.\n\nYou can provide fewer than ten phrases.\n\nvery important
2,1. The protagonist's psychological journey and internal conflicts.\n\n2. The antagonist's psychological motivations and flaws.\n\n3. The novel's exploration of existential themes.\n\n4. The influence of secondary characters on the protagonist's development.\n\n5. The setting's role in reflecting the protagonist's inner turmoil.\n\n6. The narrative structure and its effect on the pacing of the story.\n\n7. The use of literary devices to enhance the thematic depth.\n\n8. The author's writing style and its impact on the reader's experience.\n\n9. The novel's contribution to the genre and its originality.\n\n10. The historical and cultural context of the novel's setting.\n\n11. The novel's reception by the public and literary critics.\n\n12. The philosophical questions raised by the novel.\n\nOnly include phrases that are directly related to the novel's content and not general literary terms.\n\nPrioritize phrases that are central to the novel's identity.\n\nYou can provide fewer than twelve phrases.\n\nvery important
3,1. The main character's arc (using no more than three words).\n\n2. The antagonist's motivation (using no more than four words).\n\n3. The central conflict (using no more than three words).\n\n4. The resolution (using no more than three words).\n\n5. The setting's influence on the plot (using no more than four words).\n\n6. The film's genre (using no more than two words).\n\n7. The director's signature style (using no more than four words).\n\n8. The movie's impact on society (using no more than three words).\n\n9. The critical reception (using no more than four words).\n\n10. The film's contribution to the genre (using no more than three words).\n\n11. The movie's thematic depth (using no more than four words).\n\n12. The cinematography's role (using no more than four words).\n\n13. The soundtrack's effect (using no more than four words).\n\n14. The movie's cultural significance (using no more than three words).\n\n15. The lead actor's performance (using no more than four words).\n\n16. The supporting cast's contribution (using no more than four words).\n\n17. The movie's influence on future works (using no more than three words).\n\n18. The movie's legacy (using no more than four words).\n\n19. The movie's relevance to contemporary issues (using no more than four words).\n\n20. The movie's educational value (using no more than three words).\n\n\nReturn the analysis as a structured list
4,1. The central theme of the movie.\n\n2. Key settings that are pivotal to the plot.\n\n3. The main character's arc
...,...
178,1) Return the phrases as a pipe separated list.\n 2) Return only the list without a heading.\n title: The Shawshank Redemption\n synopsis: Two imprisoned men bond over a number of years
179,"1) Return the phrases as a pipe separated list.\n 2) Return only the list without a heading.\n title: The Echoes of Time\n overview: In ""The Echoes of Time"
180,1) Return the phrases as a pipe separated list.\n 2) Return only the list without a heading.\n title: The Godfather\n overview: The film follows the aging patriarch of an organized crime dynasty as he contemplates relinquishing his empire to his reluctant son. The saga explores themes of power
181,1) Use only one paragraph for each element.\n\n2) Include at least two direct quotes from the film to support your analysis.\n\n3) Ensure that your essay flows logically from one element to the next.\n\n4) Do not exceed 1500 words in total.\n\n5) Cite at least three film critics' reviews to support your points.\n\n6) Avoid spoilers for those who have not seen the film.\n\n7) Use APA citation style for any references.\n\n8) Provide a concluding paragraph that synthesizes your overall analysis.\n\n\ntitle: The Shawshank Redemption\n\nsynopsis: The story of Andy Dufresne


In [158]:
# Delete every node `m` linked by HAS_THEME to a Theme whose description
# contains "Return the" (presumably echoed prompt instructions), and return
# the number of matched rows.
echoed_prompt_query = """MATCH (t:Theme)<-[:HAS_THEME]-(m) WHERE t.description CONTAINS "Return the" DETACH DELETE m RETURN count(*)"""
gds.run_cypher(echoed_prompt_query)

Unnamed: 0,count(*)
0,139


In [159]:
# Delete every node `m` linked by HAS_THEME to a Theme whose description
# mentions "public figure" and then contains at least two periods, the last
# one ending the text. Returns the number of matched rows.
# The backslashes are doubled so Python no longer sees the invalid escape
# sequence "\."; the query text sent to Neo4j is unchanged.
gds.run_cypher("""MATCH (t:Theme)<-[:HAS_THEME]-(m) WHERE t.description =~ ".*public figure.*\\..*\\.$" DETACH DELETE m RETURN count(*)""")

Unnamed: 0,count(*)
0,1


Sometimes the LLM delimited the themes with newlines instead of commas.

In [160]:
# Split Themes whose description contains newlines into one Theme per line.
# - Empty lines and lines longer than 50 characters are ignored.
# - A leading list number ("3. " or "12. ") is stripped. The pattern uses \d+
#   so multi-digit numbering is handled, and the remainder is taken with
#   substring() so text containing a later ". " is kept whole instead of being
#   cut at it.
# - A trailing comma is removed.
# Every movie attached to the original Theme is linked to each cleaned Theme,
# after which the original Theme is deleted.
# Regex backslashes are doubled so Python does not see the invalid escape
# sequences "\d" and "\."; "\n" is still turned into a real newline by Python
# before the query is sent.
gds.run_cypher("""
    MATCH (t:Theme) WHERE t.description contains "\n" 
    WITH t, split(t.description, "\n") AS split
    UNWIND split AS desc
    WITH t, desc
    WHERE desc <> ""
    AND size(desc) <= 50
    WITH t, CASE WHEN desc =~ '\\d+\\. .*' THEN substring(desc, size(split(desc, ". ")[0]) + 2)
    WHEN desc ENDS WITH "," THEN substring(desc, 0, size(desc)-1) ELSE desc END as cleanDesc
    MERGE (t2:Theme {description: cleanDesc})
    WITH t, t2
    MATCH (m:Movie)-[:HAS_THEME]->(t)
    MERGE (m)-[:HAS_THEME]->(t2)
    DETACH DELETE t
    RETURN count(*)""")

Unnamed: 0,count(*)
0,2205


Drop trailing periods unless the description contains more than one period, as in "A.I."

In [161]:
# For each Theme `t` that ends with a single trailing period (descriptions
# with two or more periods are excluded by the regex), find a Theme `t2` whose
# trimmed description equals t's description without that final period, and
# merge the pair with apoc.refactor.mergeNodes (t2 listed first).
# The backslashes are doubled so Python no longer sees the invalid escape
# sequence "\."; the query text sent to Neo4j is unchanged.
gds.run_cypher("""
MATCH (t:Theme) WHERE NOT t.description =~ ".*.*\\..*\\.$" AND t.description ENDS WITH '.' 
MATCH (t2:Theme)
WHERE trim(t2.description) = substring(t.description, 0, size(t.description)-1)
WITH [t2, t] AS nodeList
CALL apoc.refactor.mergeNodes(nodeList, {properties:"discard"}) YIELD node
RETURN node.description AS newDescription""")


Unnamed: 0,newDescription
0,1
1,Gustave H
2,Hollywood
3,The director's unique style
4,Vito Corleone
5,and public figures
6,film
7,love
8,stage


In [162]:
# Strip the single trailing period from the remaining Theme descriptions
# (descriptions with two or more periods are left alone).
# The step stays best-effort, but it now catches Exception rather than using a
# bare `except:`, so KeyboardInterrupt/SystemExit propagate and the actual
# error is printed instead of being assumed to be a duplicate-constraint clash.
# Regex backslashes are doubled to avoid Python's invalid "\." escape; the
# query text sent to Neo4j is unchanged.
try:
    gds.run_cypher("""
    MATCH (t:Theme) WHERE NOT t.description =~ ".*.*\\..*\\.$" AND t.description ENDS WITH '.' 
    SET t.description = substring(t.description, 0, size(t.description)-1)
    RETURN t.description""")
except Exception as exc:
    print(f"duplicated constrain: {exc}")

In [163]:
# Show full cell contents when DataFrames are displayed (no column-width cap).
pd.set_option("display.max_colwidth", None)

We asked the LLM for memorable themes, settings, and public figures. Sometimes those terms came through in the output. Drop those themes.

In [164]:
# List, in alphabetical order, the Theme descriptions that contain (case-
# insensitively) "memorable themes", "settings" or "public figure".
prompt_term_preview_query = """
MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "memorable themes" 
OR toLower(t.description) CONTAINS "settings"
OR toLower(t.description) CONTAINS "public figure"
RETURN t.description ORDER BY t.description"""
gds.run_cypher(prompt_term_preview_query)


Unnamed: 0,t.description
0,33) If the movie's public figures
1,7) If a public figure is a character
2,If a public figure is mentioned in the title
3,If the movie features a notable public figure
4,If the movie involves a public figure
...,...
290,using no more than ten one-to-two word phrases.\n Include only the names of public figures if they are globally recognized.\n Prioritize phrases that are central to the movie's title.\n If the movie has a significant cultural impact
291,"which could be a central theme. ""Film"" and ""Data"" are included as they are relevant to the context of a movie. ""Public Figures"" and ""Memorable Themes"" are included as they are part of the instruction. ""Settings"" and ""Important"" are also included as they are relevant to the context of a movie."
292,which is a common setting for stories involving mystery or surprise. No public figures are mentioned
293,"which is why ""Memorable"" and ""Frightening"" are included. ""Themes"" and ""Settings"" are inferred from the context of a girl playing with a jack-in-the-box"


In [165]:
# Rewrite the Theme "Public Figures: Jimmy Lloyd" to just "Jimmy Lloyd".
# The rename stays best-effort, but a failure is now reported instead of being
# silently swallowed, and KeyboardInterrupt/SystemExit are no longer trapped.
try:
    gds.run_cypher("""
MATCH (t:Theme) WHERE t.description = "Public Figures: Jimmy Lloyd"
SET t.description = "Jimmy Lloyd" """)
except Exception as exc:
    print(f"Could not rename 'Public Figures: Jimmy Lloyd': {exc}")

In [166]:
# Delete every Theme whose description contains (case-insensitively)
# "memorable themes", "settings" or "public figure".
prompt_term_delete_query = """
MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "memorable themes" 
OR toLower(t.description) CONTAINS "settings"
OR toLower(t.description) CONTAINS "public figure"
DETACH DELETE t"""
gds.run_cypher(prompt_term_delete_query)


In [167]:
# Delete every Theme whose description contains "1)" and return the number of
# matched rows.
numbered_item_delete_query = """MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "1)" DETACH DELETE t RETURN count(*)"""
gds.run_cypher(numbered_item_delete_query)

Unnamed: 0,count(*)
0,151


In [168]:
# Collect Theme descriptions that mention "overview" (case-insensitive) and
# show the first ten.
overview_matches = gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "overview" RETURN t.description AS description""")
out = overview_matches['description'].tolist()
out[:10]

['    overview: A man with anterograde amnesia',
 '    overview: Andy Dufresne',
 '    overview: Clarice Starling',
 '    overview: Directed by Wes Anderson',
 '    overview: Florence',
 '    overview: In "The Echoes of Time',
 '    overview: In a dystopian future',
 '    overview: In a remote cabin in the woods',
 '    overview: In a small coastal town',
 '    overview: In a small town']

In [169]:
# Delete every Theme whose description mentions "overview" (case-insensitive).
overview_delete_query = """MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "overview" DETACH DELETE t"""
gds.run_cypher(overview_delete_query)

In [170]:
# Collect Theme descriptions that mention "title" (case-insensitive) and show
# the first five.
title_matches = gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "title" RETURN t.description AS description""")
out = title_matches['description'].tolist()
out[:5]

["    5) If an element appears in the movie's title",
 "    8) If a phrase is repeated in the film's title",
 '    title: *666',
 '    title: Comic Relief 2024: Funny for Money',
 '    title: Echoes of the Mind']

In [171]:
# Delete the three Themes whose description is exactly one of the listed
# "Based ... on the title" sentences, and return how many rows were matched.
based_on_title_delete_query = """MATCH (t:Theme) 
WHERE t.description in ['Based on the title "Zakaria: a Hero in Memory"',
 'Based solely on the title "ChayaBrikkho"',
 'Based solely on the title "Good Looking Out"']
 DETACH DELETE t RETURN COUNT(*) AS deletedCount"""
gds.run_cypher(based_on_title_delete_query)

Unnamed: 0,deletedCount
0,0


In [172]:
# Collect Theme descriptions that mention "theme" (case-insensitive) and show
# the first ten.
theme_word_matches = gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "theme" RETURN t.description AS description""")
out = theme_word_matches['description'].tolist()
out[:10]

['    12) If a theme is a commentary on society',
 '    15) If a theme is a moral lesson',
 '    18) If a theme is a recurring motif',
 '    24) If a theme is a recurring motif',
 '    27) If a theme is a commentary on society',
 '    30) If a theme is a recurring motif',
 '    33) If a theme is a commentary on society',
 '    36) If a theme is a recurring motif',
 '    5) If a theme is complex',
 "    6) If a theme is central to the movie's plot"]

In [173]:
# Delete Themes whose description is exactly one of the two listed generic
# phrases, and return how many rows were matched.
generic_phrase_delete_query = """MATCH (t:Theme) 
WHERE t.description in ["it's difficult to infer other relevant themes",
 "themes"]
 DETACH DELETE t RETURN COUNT(*) AS deletedCount"""
gds.run_cypher(generic_phrase_delete_query)

Unnamed: 0,deletedCount
0,1


In [174]:
# Collect Theme descriptions that mention "relevant" (case-insensitive) and
# show the first ten.
relevant_matches = gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) CONTAINS "relevant" RETURN t.description AS description""")
out = relevant_matches['description'].tolist()
out[:10]

["- Its message remains relevant in today's society",
 'the list includes generic terms that are relevant to the plot.)',
 'the list is based on the most relevant and prominent elements mentioned.)',
 'which might be relevant to the movie\'s title or themes. If "Caixa" is not a relevant term']

In [175]:
# Delete the Theme whose description is exactly the listed "more context would
# be needed" sentence, and return how many rows were matched.
more_context_delete_query = """MATCH (t:Theme) 
WHERE t.description in ['but more context would be needed to provide additional relevant phrases']
 DETACH DELETE t RETURN COUNT(*) AS deletedCount"""
gds.run_cypher(more_context_delete_query)

Unnamed: 0,deletedCount
0,0


In [176]:
# List Theme descriptions that start with "based" (case-insensitive).
based_prefix_matches = gds.run_cypher("""MATCH (t:Theme) WHERE toLower(t.description) STARTS WITH "based" RETURN t.description AS description""")
based_prefix_matches['description'].tolist()

[]

In [177]:
# Delete every Theme whose description starts with "based" (case-insensitive)
# and return how many rows were matched.
based_prefix_delete_query = """MATCH (t:Theme) WHERE toLower(t.description) STARTS WITH "based"
 DETACH DELETE t RETURN COUNT(*) AS deletedCount"""
gds.run_cypher(based_prefix_delete_query)

Unnamed: 0,deletedCount
0,0


# Generate embeddings
Embed the themes for clustering and the movie text for testing RAG.

In [178]:
#!pip install --upgrade --quiet  langchain sentence_transformers langchain-huggingface

In [179]:
from langchain_huggingface import HuggingFaceEmbeddings

In [180]:
# Embedding model used by the embedding cells that follow.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [181]:
# Smoke test: embed a single sample sentence.
sample_documents = ["test de embeddings"]
emb = embeddings.embed_documents(sample_documents)

In [182]:
# Length of the vector returned for the sample sentence; the vector indexes
# created later in this notebook are configured with 768 dimensions.
first_vector = emb[0]
len(first_vector)

768

In [183]:
# Fetch the descriptions of all Themes that do not have an embedding yet.
themes_without_embedding_query = """
                        MATCH (t:Theme)
                        WHERE t.embedding IS NULL
                        RETURN t.description AS description"""
themes = gds.run_cypher(themes_without_embedding_query)



In [184]:
# Fetch each Movie's tmdbId together with the text to embed: the title and
# the overview joined by ". ".
movie_text_query = """
MATCH (m:Movie)
RETURN m.tmdbId AS id, m.title + ". " + m.overview AS movieText"""
movie_text = gds.run_cypher(movie_text_query)

In [185]:
# Preview the first five theme descriptions that still need an embedding.
themes.head(n=5)


Unnamed: 0,description
0,- Return only the list without a heading
1,- Return the list as a pipe-separated list
2,12) If a theme is a commentary on society
3,13) If the movie has won any awards
4,13) The list should not exceed eight elements


In [186]:
# Preview the first five movie texts that will be embedded.
movie_text.head(n=5)

Unnamed: 0,id,movieText
0,42050,"Seduction: The Cruel Woman. Wanda is a dominatrix who runs a gallery in a building on the Hamburg waterfront, where audiences pay for the privilege of watching her humiliate her slaves. She is a business woman who smashes sexual stereotypes and social taboos with icy self-possession and an enigmatic smile. As artist she specializes in the staging of elaborate BDSM fantasies and her affairs transgress the usual boundaries of personal and professional life. Along the way she leaves her German lesbian lover, a shoe fetishist, for an American ""trainee,"" and does more than step on the toes of the male performer who has broken the rules of the master-slave relationship by falling in love with her."
1,65456,The Atrocity Exhibition. A doctor in a mental research institution is driven insane by the spectacle of the horrors of the twentieth century.
2,68621,"Serious Mixing (Part 1). Let the legend teach you how to be a DJ, taking you from the most basic level, to the more advanced techniques of how to mix. It features advice from other Serious artists such as Alan Walker, Illenium, Dj Khaled, along with amazing footage from various nights across the country, In the studio, the action is covered from four camera angles, providing the best view, as you are taught the principles of beat matching."
3,187716,"Universal Groove. Filmed in 1999, Universal Groove explores 90s rave culture through eight diverse characters seeking escape and self-discovery amidst drugs and dance, offering a nostalgic journey into the underground party scene of the era."
4,269579,A Circus Tale & A Love Song. A boy who grew up at the circus decides to leave his old life and go after the love of his life.


## Create indexes for the vector properties.

In [187]:
# Create (if missing) a 768-dimension cosine vector index named theme_vectors
# on Theme.embedding.
theme_index_query = """CREATE VECTOR INDEX theme_vectors IF NOT EXISTS 
                  FOR (t:Theme)
                  ON (t.embedding)
                  OPTIONS {indexConfig: 
                      {`vector.dimensions`: 768,
                       `vector.similarity_function`: 'cosine'
                       }}
                       """
gds.run_cypher(theme_index_query)

In [188]:
# Create (if missing) a 768-dimension cosine vector index named
# movie_text_vectors on Movie.embedding.
movie_index_query = """CREATE VECTOR INDEX movie_text_vectors IF NOT EXISTS 
                  FOR (m:Movie)
                  ON (m.embedding)
                  OPTIONS {indexConfig: 
                      {`vector.dimensions`: 768,
                       `vector.similarity_function`: 'cosine'
                       }}
                       """
gds.run_cypher(movie_index_query)

## Send vectors to Neo4j

In [189]:
# Embed the theme descriptions in batches and store each vector on the Theme
# node with the matching description.
THEME_BATCH_SIZE = 5000
theme_total = themes.shape[0]
# Stepping through the frame by batch size avoids the extra empty batch that
# int(n / 5000) + 1 produced when n was 0 or an exact multiple of 5000.
for start in range(0, theme_total, THEME_BATCH_SIZE):
    theme_slice = themes.iloc[start:start + THEME_BATCH_SIZE].copy()
    theme_slice['embedding'] = embeddings.embed_documents(theme_slice['description'].to_list())
    gds.run_cypher("""
        UNWIND $themeData AS d
        MATCH (t:Theme {description:d['description']})
        CALL db.create.setNodeVectorProperty(t, 'embedding', d['embedding'])
        RETURN count(*) AS updateCount""",
                   {'themeData': theme_slice.to_dict('records')})
    # Report the number of rows actually processed, not the nominal batch end.
    print(f"Finished row {min(start + THEME_BATCH_SIZE, theme_total)}.")

Finished row 5000.
Finished row 10000.
Finished row 15000.
Finished row 20000.
Finished row 25000.
Finished row 30000.
Finished row 35000.
Finished row 40000.
Finished row 45000.
Finished row 50000.
Finished row 55000.


In [190]:
# Embed the movie texts in batches and store each vector on the Movie node
# with the matching tmdbId.
MOVIE_BATCH_SIZE = 5000
movie_total = movie_text.shape[0]
# Stepping through the frame by batch size avoids the extra empty batch that
# int(n / 5000) + 1 produced when n was 0 or an exact multiple of 5000.
for start in range(0, movie_total, MOVIE_BATCH_SIZE):
    movie_slice = movie_text.iloc[start:start + MOVIE_BATCH_SIZE].copy()
    movie_slice['embedding'] = embeddings.embed_documents(movie_slice['movieText'].to_list())
    gds.run_cypher("""
        UNWIND $movieData AS d
        MATCH (m:Movie {tmdbId:d['id']})
        CALL db.create.setNodeVectorProperty(m, 'embedding', d['embedding'])
        RETURN count(*) AS updateCount""",
                   {'movieData': movie_slice.to_dict('records')})
    # Report the number of rows actually processed, not the nominal batch end.
    print(f"Finished row {min(start + MOVIE_BATCH_SIZE, movie_total)}.")

Finished row 5000.
Finished row 10000.
Finished row 15000.
