In [None]:
import os

import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

# Neo4j connection settings.
# `setdefault` is used so that a URI / username / password already exported in
# the environment (shell, .env loader, secrets manager) is respected instead of
# being silently overwritten by the local-development values below.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
# NOTE(review): "password" is only a local-development fallback. Export
# NEO4J_PASSWORD before starting the kernel for any real database; do not
# commit real credentials to the notebook.
os.environ.setdefault("NEO4J_PASSWORD", "password")


# LangChain tracing is notebook-specific, so these are forced rather than
# defaulted: runs from this notebook are always traced under "GraphRAG".
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "GraphRAG"

# Chat model used for graph extraction; temperature is pinned to 0.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Neo4j handle built without explicit connection arguments.
# NOTE(review): presumably picks up the NEO4J_* environment variables — confirm.
graph = Neo4jGraph()

# Transformer that turns documents into graph documents. It is given the node
# labels of interest, asked to extract node properties, and run with
# strict_mode switched off.
llm_transformer = LLMGraphTransformer(
    llm=llm,
    node_properties=True,
    strict_mode=False,
    allowed_nodes=[
        "Person",
        "College",
        "University",
        "Organization",
        "Institute",
        "Business",
    ],
)
# Alumni listing pages to scrape, one per institution.
# The URLs are built from a single template so every entry has the same shape;
# previously the MIT entry was the only one missing its trailing slash.
_ALUMNI_URL_TEMPLATE = "https://edurank.org/uni/{slug}/alumni/"
_college_slugs = [
    "princeton-university",
    "harvard-university",
    "stanford-university",
    "university-of-michigan-ann-arbor",
    "cornell-university",
    "university-of-california-berkeley",
    "massachusetts-institute-of-technology",
    "columbia-university-in-the-city-of-new-york",
    "university-of-washington-seattle-campus",
    "johns-hopkins-university",
    "university-of-california-los-angeles",
]
colleges = [_ALUMNI_URL_TEMPLATE.format(slug=slug) for slug in _college_slugs]
# Seconds to wait on each alumni page before giving up; without a timeout a
# stalled connection would hang the whole (long-running) ingestion.
REQUEST_TIMEOUT_S = 30

# Every Wikipedia document loaded during the run, kept for later inspection.
docs = []
for college in colleges:
    # TLS certificate verification is left at its default (enabled); the
    # earlier `verify=False` accepted any certificate. If a corporate proxy
    # needs a custom CA, point REQUESTS_CA_BUNDLE at it instead of disabling
    # verification.
    list_page = requests.get(college, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on 4xx/5xx rather than parsing an error page into zero names.
    list_page.raise_for_status()
    soup = BeautifulSoup(list_page.text, "html.parser")
    celebrity_names = soup.select("li.alumni h2.alumni__name")
    # Trim surrounding whitespace from the heading text and drop empty
    # headings so they are never sent to Wikipedia as queries.
    names = [
        stripped
        for stripped in (element.text.strip() for element in celebrity_names)
        if stripped
    ]
    for name in names:
        # Load up to 2 Wikipedia documents for this person.
        new_docs = WikipediaLoader(
            query=name, load_max_docs=2, doc_content_chars_max=100000
        ).load()
        # Extract nodes/relationships with the LLM and write them to Neo4j.
        graph_documents_props = llm_transformer.convert_to_graph_documents(new_docs)
        graph.add_graph_documents(graph_documents_props)
        # In-place extend avoids re-copying the growing list on every name.
        docs.extend(new_docs)
        print(college, " - ", name, " - done!")

In [17]:
# Rebind `llm_transformer` to a transformer that is given no `allowed_nodes`
# list; node properties are still extracted. This replaces whatever
# `llm_transformer` referred to before this cell ran.
llm_transformer = LLMGraphTransformer(node_properties=True, llm=llm)

# Load a single Wikipedia document for one person.
name = "John F. Kennedy"
new_docs = WikipediaLoader(
    doc_content_chars_max=100000,
    load_max_docs=1,
    query=name,
).load()

# Break the article into overlapping chunks and process them one at a time.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=1000,
    chunk_size=10000,
)
splitted_docs = text_splitter.split_documents(new_docs)

for doc in splitted_docs:
    # `graph_documents_props` is overwritten on each pass, so once the loop
    # finishes it holds only the result for the final chunk.
    graph_documents_props = llm_transformer.convert_to_graph_documents([doc])
    graph.add_graph_documents(graph_documents_props)

In [16]:
# Number of entries in `splitted_docs`.
# NOTE(review): this cell's execution count (16) is lower than that of the cell
# that assigns `splitted_docs` (17), so the recorded output may come from an
# earlier run — re-run after the splitting cell to refresh it.
len(splitted_docs)

76

In [18]:
# Show the most recently assigned `graph_documents_props`. The loops in this
# notebook reassign it on every iteration, so this is the result of the last
# conversion only, not everything written to the graph.
graph_documents_props

[GraphDocument(nodes=[Node(id='John F. Kennedy', type='Person', properties={'role': 'President', 'religion': 'Roman Catholic', 'birthdate': 'June 19, 1917'}), Node(id='Boston Red Sox', type='Sports team', properties={'league': 'Major League Baseball'}), Node(id='Boston Celtics', type='Sports team', properties={'league': 'National Basketball Association'}), Node(id='Hyannisport Club', type='Location', properties={'type': 'Golf Club', 'location': 'Massachusetts'}), Node(id='Palm Beach Country Club', type='Location', properties={'type': 'Golf Club', 'location': 'Florida'}), Node(id='Camelot', type='Concept', properties={'description': "Mythic grandeur of Kennedy's presidency"}), Node(id='Laetare Medal', type='Award', properties={'yearAwarded': '1961', 'description': 'Most prestigious award for American Catholics'}), Node(id='Pacem In Terris Award', type='Award', properties={'description': 'Peace on Earth', 'awardedPosthumously': 'True'}), Node(id='Presidential Medal Of Freedom', type='Awa