In [1]:
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain


from pathlib import Path
from typing import List

from langchain.chains.openai_functions import create_structured_output_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.graphs import Neo4jGraph
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
from langchain.text_splitter import TokenTextSplitter
from neo4j.exceptions import ClientError

import os
import glob
import json
import pandas as pd
from pathlib import Path

from typing import Any


from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())



##### Define source text file

In [2]:
txt_files = glob.glob("data/*.txt")
txt_files

['data/Harry Potter and the Deathly Hallows – Part 1.txt',
 'data/Harry Potter and the Deathly Hallows – Part 2.txt',
 'data/Harry Potter and the Prisoner of Azkaban (film).txt',
 'data/Harry Potter and the Goblet of Fire (film).txt',
 "data/Harry Potter and the Philosopher's Stone (film).txt",
 'data/Harry Potter and the Order of the Phoenix (film).txt',
 'data/Harry Potter and the Half-Blood Prince (film).txt',
 'data/Harry Potter and the Chamber of Secrets (film).txt']

##### graph model

<img src="img/graph_model.png" width="800"/>

##### Define a function to extract movie's information (structure information) from text files

In [3]:
def get_movies_info(file_list: list,movie_df):
    movies_template = """given the article about Harry Potter movie {information} ,Carefully read the article, and I want you to create:
         1. movie name
         2. producer name
         3. director name
         4. release year
         - NEVER Impute missing values.
         - DO NOT MISS out any movie related information
         - If you found multiple producer or director information, please return nested list of name
         return information in json format with fields name 'movieName' , 'producerName','directorName','releaseYear'
     """
    summary_movie_prompt_template = PromptTemplate(
        input_variables=["information"], template=movies_template)
    llm = ChatOpenAI(temperature=0, model_name=os.getenv('OPENAI_MODEL'))
    chain = LLMChain(llm=llm, prompt=summary_movie_prompt_template)
    for file in file_list:
        with open(file, 'r', encoding='utf-8', errors='ignore') as input_file:
            first_line = input_file.readline().strip('\n')
            content = input_file.read()
            data = chain.run(information=content)
            tmp_df = pd.DataFrame([json.loads(data.replace("```json\n",'').replace("\n```",''))])
            tmp_df['url'] = first_line
            movie_df = pd.concat([movie_df,tmp_df])
        movie_df = movie_df.explode('producerName')
        movie_df = movie_df.explode('directorName')
    return movie_df

In [15]:
movie_df = pd.DataFrame()
movie_df = get_movies_info(txt_files,movie_df)

In [16]:
movie_df

Unnamed: 0,movieName,producerName,directorName,releaseYear,url
0,Harry Potter and the Deathly Hallows – Part 1,David Heyman,David Yates,2010,Harry Potter and the Deathly Hallows – Part 1
0,Harry Potter and the Deathly Hallows – Part 1,David Barron,David Yates,2010,Harry Potter and the Deathly Hallows – Part 1
0,Harry Potter and the Deathly Hallows – Part 2,David Heyman,David Yates,2011,Harry Potter and the Deathly Hallows – Part 2
0,Harry Potter and the Prisoner of Azkaban,David Heyman,Alfonso Cuarón,2004,Harry Potter and the Prisoner of Azkaban
0,Harry Potter and the Prisoner of Azkaban,Chris Columbus,Alfonso Cuarón,2004,Harry Potter and the Prisoner of Azkaban
0,Harry Potter and the Goblet of Fire,David Heyman,Mike Newell,2005,Harry Potter and the Goblet of Fire
0,Harry Potter and the Philosopher's Stone,David Heyman,Chris Columbus,2001,Harry Potter and the Philosopher's Stone
0,Harry Potter and the Order of the Phoenix,David Heyman,David Yates,2007,Harry Potter and the Order of the Phoenix
0,Harry Potter and the Half-Blood Prince,David Heyman,David Yates,2009,Harry Potter and the Half-Blood Prince
0,Harry Potter and the Chamber of Secrets,David Heyman,Chris Columbus,2002,Harry Potter and the Chamber of Secrets


In [17]:
movie_df.to_csv('data/movie_df.csv',index=False)

In [5]:
movie_df = pd.read_csv('data/movie_df.csv')

##### Define a function to extract movie's information (structure information) from text files

In [18]:
def get_casts_info(file_list: list,casts_df):
    movies_template = """Given the article about the Harry Potter movie and its information {information}, Carefully read the article, and I want you to create:
         1. actor/actress name
         2. his/her role
         - NEVER Impute missing values.
         - DO NOT MISS out any actor/actress related information
         return information in json format with fields name 'name' , 'role'
     """
    summary_movie_prompt_template = PromptTemplate(
        input_variables=["information"], template=movies_template)
    llm = ChatOpenAI(temperature=0, model_name=os.getenv('OPENAI_MODEL'))
    chain = LLMChain(llm=llm, prompt=summary_movie_prompt_template)
    for file in file_list:
        with open(file, 'r', encoding='utf-8', errors='ignore') as input_file:
            first_line = input_file.readline().strip('\n')
            content = input_file.read()
            data = chain.run(information=content)
            tmp_df = pd.DataFrame(json.loads(data.replace("```json\n",'').replace("\n```",'')))
            tmp_df['url'] = first_line
            casts_df = pd.concat([casts_df,tmp_df])
    return casts_df

In [19]:
casts_df = pd.DataFrame()
casts_df = get_casts_info(txt_files,casts_df)

In [21]:
casts_df.to_csv('data/casts_df.csv',index=False)

In [6]:
casts_df =pd.read_csv('data/casts_df.csv')

In [7]:
movie_df

Unnamed: 0,movieName,producerName,directorName,releaseYear,url
0,Harry Potter and the Deathly Hallows – Part 1,David Heyman,David Yates,2010,Harry Potter and the Deathly Hallows – Part 1
1,Harry Potter and the Deathly Hallows – Part 1,David Barron,David Yates,2010,Harry Potter and the Deathly Hallows – Part 1
2,Harry Potter and the Deathly Hallows – Part 2,David Heyman,David Yates,2011,Harry Potter and the Deathly Hallows – Part 2
3,Harry Potter and the Prisoner of Azkaban,David Heyman,Alfonso Cuarón,2004,Harry Potter and the Prisoner of Azkaban
4,Harry Potter and the Prisoner of Azkaban,Chris Columbus,Alfonso Cuarón,2004,Harry Potter and the Prisoner of Azkaban
5,Harry Potter and the Goblet of Fire,David Heyman,Mike Newell,2005,Harry Potter and the Goblet of Fire
6,Harry Potter and the Philosopher's Stone,David Heyman,Chris Columbus,2001,Harry Potter and the Philosopher's Stone
7,Harry Potter and the Order of the Phoenix,David Heyman,David Yates,2007,Harry Potter and the Order of the Phoenix
8,Harry Potter and the Half-Blood Prince,David Heyman,David Yates,2009,Harry Potter and the Half-Blood Prince
9,Harry Potter and the Chamber of Secrets,David Heyman,Chris Columbus,2002,Harry Potter and the Chamber of Secrets


In [8]:
casts_df

Unnamed: 0,name,role,url
0,Daniel Radcliffe,Harry Potter,Harry Potter and the Deathly Hallows – Part 1
1,Rupert Grint,Ron Weasley,Harry Potter and the Deathly Hallows – Part 1
2,Emma Watson,Hermione Granger,Harry Potter and the Deathly Hallows – Part 1
3,Bonnie Wright,Ginny Weasley,Harry Potter and the Deathly Hallows – Part 1
4,Helena Bonham Carter,Bellatrix Lestrange,Harry Potter and the Deathly Hallows – Part 1
...,...,...,...
278,Alfred Burke,Master Dippet,Harry Potter and the Chamber of Secrets
279,Daisy Bates,moving picture,Harry Potter and the Chamber of Secrets
280,David Tysall,moving picture,Harry Potter and the Chamber of Secrets
281,Peter Taylor,moving picture,Harry Potter and the Chamber of Secrets


##### Ingest structure data to the graph database

In [9]:
graph = Neo4jGraph(url=os.getenv('NEO4J_URI'), username=os.getenv('NEO4J_USER'), password=os.getenv('NEO4J_PASSWORD'),database=os.getenv('NEO4J_DATABASE'))
for index, row in movie_df.iterrows():
    params = {
        "movieName": row['movieName'],
        "producerName": row['producerName'],
        "directorName": row['directorName'],
        "releaseYear": row['releaseYear'],
        "url": row['url']
    }
    graph.query(
        """
    MERGE (m:Movie {title: $movieName})
    SET m.releaseYear = toInteger($releaseYear),
    m.url = $url
    MERGE (a:Article {movieName: $movieName})
    MERGE (d:Person:Director {name: $producerName})
    MERGE (p:Person:Producer {name: $producerName})
    MERGE (m)-[:HAS_ARTICLE]->(a)
    MERGE (m)<-[:DIRECTED]-(d)
    MERGE (m)<-[:PRODUCED]-(p)
    """,
        params,
)


##### graph model

<img src="img/graph_model.png" width="800"/>

##### Create NEXT relationship

<img src="img/next_rels.png" width="800"/>

In [10]:
graph.query("""
            CALL apoc.periodic.iterate(
            'MATCH (m:Movie) RETURN m as movies order by m.releaseYear',
            'WITH movies as m
             WITH collect(m) AS ms
             WITH ms, ms[1..] as nextM
             UNWIND range(0,size(nextM)-1,1) as index
             WITH ms[index] as first, nextM[index] as second
            CREATE (first)-[:NEXT]->(second)',{batchSize: 1000, batchMode: 'BATCH'});
            """)

[{'batches': 1,
  'total': 8,
  'timeTaken': 0,
  'committedOperations': 8,
  'failedOperations': 0,
  'failedBatches': 0,
  'retries': 0,
  'errorMessages': {},
  'batch': {'total': 1, 'errors': {}, 'committed': 1, 'failed': 0},
  'operations': {'total': 8, 'errors': {}, 'committed': 8, 'failed': 0},
  'wasTerminated': False,
  'failedParams': {},
  'updateStatistics': {'relationshipsDeleted': 0,
   'relationshipsCreated': 7,
   'nodesDeleted': 0,
   'nodesCreated': 0,
   'labelsRemoved': 0,
   'labelsAdded': 0,
   'propertiesSet': 0}}]

##### Create relationships between Movies and casts

In [11]:
graph = Neo4jGraph(url=os.getenv('NEO4J_URI'), username=os.getenv('NEO4J_USER'), password=os.getenv('NEO4J_PASSWORD'),database=os.getenv('NEO4J_DATABASE'))
for index, row in casts_df.iterrows():
    params = {
        "name": row['name'],
        "role": row['role'],
        "url": row['url']
    }
    graph.query(
        """
    MATCH (m:Movie {url: $url})
    MERGE (a:Person:Cast {name: $name})
    MERGE (m)<-[:ACTED_IN {role: $role}]-(a)
    """,
        params,
    )

##### Ingest unstructure data to the graph database

In [12]:
def ingest_docs(parent_documents,movieName,embeddings,child_splitter):
    graph = Neo4jGraph(url=os.getenv('NEO4J_URI'), username=os.getenv('NEO4J_USER'), password=os.getenv('NEO4J_PASSWORD'),database=os.getenv('NEO4J_DATABASE'))

    for i, parent in enumerate(parent_documents):
        child_documents = child_splitter.split_documents([parent])
        #print(child_documents)
        params = {
        "parent_text": parent.page_content,
        "parent_id": i,
        "movieName" : movieName,
        "parent_embedding": embeddings.embed_query(parent.page_content + movieName),
        "children": [
            {
                "text": c.page_content,
                "id": f"{i}-{ic}-{movieName}",
                "embedding": embeddings.embed_query(c.page_content + movieName),
            }
            for ic, c in enumerate(child_documents)
        ],
        }
        # Ingest data
        graph.query(
        """
        MERGE (p:Parent:Document {id: $parent_id,movieName:$movieName})
        SET p.text = $parent_text
        WITH p
        CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
        YIELD node
        WITH p
        UNWIND $children AS child
        MERGE (c:Child:Document {id: child.id})
        SET c.text = child.text
        MERGE (c)<-[:HAS_CHILD]-(p)
        WITH c, child
        CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
        YIELD node
        RETURN count(*)
        """,
            params,
        )


In [13]:
# Embeddings
embeddings = OpenAIEmbeddings()
embedding_dimension = 1536
llm = ChatOpenAI(temperature=0)


for file in txt_files:
    with open(file, 'r', encoding='utf-8', errors='ignore') as input_file:
        first_line = input_file.readline().strip('\n')
        content = input_file.read()
    documents =  [Document(page_content=content, metadata={"source": file,"movieName":first_line})]
    # Ingest Parent-Child node pairs
    parent_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
    child_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=24)
    parent_documents = parent_splitter.split_documents(documents)
    print(f"ingest : {first_line}")
    ingest_docs(parent_documents,first_line,embeddings,child_splitter)


ingest : Harry Potter and the Deathly Hallows – Part 1
ingest : Harry Potter and the Deathly Hallows – Part 2
ingest : Harry Potter and the Prisoner of Azkaban
ingest : Harry Potter and the Goblet of Fire
ingest : Harry Potter and the Philosopher's Stone
ingest : Harry Potter and the Order of the Phoenix
ingest : Harry Potter and the Half-Blood Prince
ingest : Harry Potter and the Chamber of Secrets


<img src="img/has_content.png" width="1000"/>

In [14]:
graph = Neo4jGraph(url=os.getenv('NEO4J_URI'), username=os.getenv('NEO4J_USER'), password=os.getenv('NEO4J_PASSWORD'),database=os.getenv('NEO4J_DATABASE'))
graph.query("""
            CALL apoc.periodic.iterate(
            'MATCH (n:Article),(m:Parent) WHERE n.movieName = m.movieName RETURN m,n',
            'WITH n,m
             MERGE (n)-[:HAS_CONTENT]->(m)',{batchSize: 1000, batchMode: 'BATCH'});
            """)

[{'batches': 1,
  'total': 117,
  'timeTaken': 0,
  'committedOperations': 117,
  'failedOperations': 0,
  'failedBatches': 0,
  'retries': 0,
  'errorMessages': {},
  'batch': {'total': 1, 'errors': {}, 'committed': 1, 'failed': 0},
  'operations': {'total': 117, 'errors': {}, 'committed': 117, 'failed': 0},
  'wasTerminated': False,
  'failedParams': {},
  'updateStatistics': {'relationshipsDeleted': 0,
   'relationshipsCreated': 117,
   'nodesDeleted': 0,
   'nodesCreated': 0,
   'labelsRemoved': 0,
   'labelsAdded': 0,
   'propertiesSet': 0}}]

##### create vector indext for article's content

In [15]:
try:
    graph.query(
        "CALL db.index.vector.createNodeIndex('child_document', "
        "'Child', 'embedding', $dimension, 'cosine')",
        {"dimension": embedding_dimension},
    )
except ClientError:  # already exists
    pass

In [16]:
try:
    graph.query(
        "CALL db.index.vector.createNodeIndex('typical_rag', "
        "'Parent', 'embedding', $dimension, 'cosine')",
        {"dimension": embedding_dimension},
    )
except ClientError:  # already exists
    pass