In [1]:
%pip install --upgrade langchain langchain-experimental langchain_google_genai python-dotenv pyvis

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
import os
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI 
from pyvis.network import Network

In [3]:
# Model identifier used as the default `model_name` of get_gemini_llm().
GEMINI_MODEL_NAME = "gemini-2.0-flash"
# Default `temperature` of get_gemini_llm(); 0 is presumably chosen to keep
# the extraction output as repeatable as possible — confirm against the model docs.
LLM_TEMPERATURE = 0

In [4]:
# Sample input document: a biographical passage about Albert Einstein that
# main() feeds to the graph extraction. The text still contains inline
# citation markers such as "[a]", "[1][5]" and "[note 1]".
LONG_TEXT = """Albert Einstein[a] (14 March 1879 – 18 April 1955) was a German-born theoretical physicist who is best known for developing the theory of relativity. Einstein also made important contributions to quantum mechanics.[1][5] His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world's most famous equation".[6] He received the 1921 Nobel Prize in Physics for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect.[7]

Born in the German Empire, Einstein moved to Switzerland in 1895, forsaking his German citizenship (as a subject of the Kingdom of Württemberg)[note 1] the following year. In 1897, at the age of seventeen, he enrolled in the mathematics and physics teaching diploma program at the Swiss federal polytechnic school in Zurich, graduating in 1900. He acquired Swiss citizenship a year later, which he kept for the rest of his life, and afterwards secured a permanent position at the Swiss Patent Office in Bern. In 1905, he submitted a successful PhD dissertation to the University of Zurich. In 1914, he moved to Berlin to join the Prussian Academy of Sciences and the Humboldt University of Berlin, becoming director of the Kaiser Wilhelm Institute for Physics in 1917; he also became a German citizen again, this time as a subject of the Kingdom of Prussia.[note 1] In 1933, while Einstein was visiting the United States, Adolf Hitler came to power in Germany. Horrified by the Nazi persecution of his fellow Jews,[8] he decided to remain in the US, and was granted American citizenship in 1940.[9] On the eve of World War II, he endorsed a letter to President Franklin D. Roosevelt alerting him to the potential German nuclear weapons program and recommending that the US begin similar research.

In 1905, sometimes described as his annus mirabilis (miracle year), he published four groundbreaking papers.[10] In them, he outlined a theory of the photoelectric effect, explained Brownian motion, introduced his special theory of relativity, and demonstrated that if the special theory is correct, mass and energy are equivalent to each other. In 1915, he proposed a general theory of relativity that extended his system of mechanics to incorporate gravitation. A cosmological paper that he published the following year laid out the implications of general relativity for the modeling of the structure and evolution of the universe as a whole.[11][12] In 1917, Einstein wrote a paper which introduced the concepts of spontaneous emission and stimulated emission, the latter of which is the core mechanism behind the laser and maser, and which contained a trove of information that would be beneficial to developments in physics later on, such as quantum electrodynamics and quantum optics.[13]

In the middle part of his career, Einstein made important contributions to statistical mechanics and quantum theory. Especially notable was his work on the quantum physics of radiation, in which light consists of particles, subsequently called photons. With physicist Satyendra Nath Bose, he laid the groundwork for Bose–Einstein statistics. For much of the last phase of his academic life, Einstein worked on two endeavors that ultimately proved unsuccessful. First, he advocated against quantum theory's introduction of fundamental randomness into science's picture of the world, objecting that God does not play dice.[14] Second, he attempted to devise a unified field theory by generalizing his geometric theory of gravitation to include electromagnetism. As a result, he became increasingly isolated from mainstream modern physics.
"""

In [5]:
# Filters handed to KnowledgeGraphProcessor.extract_graph() by main().
# Node types passed as `allowed_nodes` in the second and third extraction runs.
# NOTE(review): despite the name, the list also contains "Theory" and "Field".
ALLOWED_PERSON_AWARD_FIELDS = ["Person", "Award", "Theory", "Field"]
# Relationship filter passed as `allowed_relationships` in the third run.
# Presumably each tuple is (source node type, relationship type, target node type)
# — confirm against the LLMGraphTransformer documentation.
# TODO: consider a better name for this constant.
ALLOWED_PERSON_AWARD_RELATIONSHIPS = [("Person", "CORRESPONDENT", "Award")]


In [6]:
def get_gemini_llm(api_key: str, model_name: str = GEMINI_MODEL_NAME, temperature: float = LLM_TEMPERATURE):
    """Build the Gemini chat model client used for graph extraction.

    Args:
        api_key: Google API key, forwarded to the client as ``google_api_key``.
        model_name: Model identifier; defaults to ``GEMINI_MODEL_NAME``.
        temperature: Sampling temperature; defaults to ``LLM_TEMPERATURE``.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance configured with the given values.

    Raises:
        ValueError: If ``api_key`` is empty or ``None``.
    """
    if api_key:
        return ChatGoogleGenerativeAI(
            model=model_name,
            temperature=temperature,
            google_api_key=api_key,
        )
    raise ValueError("Google API key not found. Please set the GOOGLE_API_KEY environment variable.")

In [7]:
def _initialize_pyvis_network():
    """Create the Pyvis ``Network`` used for rendering, with the notebook's default look.

    Returns:
        A directed ``Network`` with a dark background and white labels,
        configured for standalone HTML output (``notebook=False``).
    """
    network_settings = {
        "height": "1200px",
        "width": "100%",
        "directed": True,
        "notebook": False,
        "bgcolor": "#222222",
        "font_color": "white",
    }
    return Network(**network_settings)

In [8]:
def _add_nodes_to_network(net: Network, nodes: list):
    """Insert each graph node into the Pyvis network.

    The node id (as a string) serves as both the Pyvis id and the label; the
    node type (as a string) serves as the hover title and the group.
    A node that cannot be added is reported with a warning and skipped.

    Args:
        net: Network that receives the nodes.
        nodes: Objects exposing ``id`` and ``type`` attributes.
    """
    for graph_node in nodes:
        try:
            node_id = str(graph_node.id)
            node_type = str(graph_node.type)
            net.add_node(node_id, label=node_id, title=node_type, group=node_type)
        except Exception as exc:
            # Best effort: one bad node must not abort the whole rendering.
            print(f"Warning: Could not add node {graph_node.id}: {exc}")

In [9]:
def _add_edges_to_network(net: Network, relationships: list):
    """Insert each relationship into the Pyvis network as a labelled edge.

    Source and target ids are converted to strings; the edge label is the
    lower-cased relationship type. A relationship that cannot be added is
    reported with a warning and skipped.

    Args:
        net: Network that receives the edges.
        relationships: Objects exposing ``source.id``, ``target.id`` and ``type``.
    """
    for relation in relationships:
        try:
            source_id = str(relation.source.id)
            target_id = str(relation.target.id)
            edge_label = str(relation.type).lower()
            net.add_edge(source_id, target_id, label=edge_label)
        except Exception as exc:
            # Best effort: one bad edge must not abort the whole rendering.
            print(f"Warning: Could not add edge {relation.source.id}-{relation.target.id} ({relation.type}): {exc}")

In [10]:
def _configure_pyvis_physics(net: Network):
    """Apply the layout physics settings to the Pyvis network.

    Selects the ``forceAtlas2Based`` solver and sets its gravitational
    constant, central gravity, spring length and spring constant, plus the
    minimum velocity, by passing a JSON options string to ``net.set_options``.

    Args:
        net: Network whose options are set.
    """
    physics_options = """
        {
            "physics": {
                "forceAtlas2Based": {
                    "gravitationalConstant": -100,
                    "centralGravity": 0.01,
                    "springLength": 200,
                    "springConstant": 0.08
                },
                "minVelocity": 0.75,
                "solver": "forceAtlas2Based"
            }
        }
    """
    net.set_options(physics_options)

In [11]:
def _save_and_open_html(net: Network, output_file: str = "knowledge_graph.html"):
    """Save the graph as an HTML file and try to open it in a web browser.

    Opening the browser is best effort: any failure is reported with a
    printed message and never raised to the caller.

    Args:
        net: Network to write out via ``net.save_graph``.
        output_file: Path of the HTML file to create.
    """
    from pathlib import Path

    net.save_graph(output_file)
    absolute_path = os.path.abspath(output_file)
    print(f"Graph saved to {absolute_path}")
    try:
        import webbrowser
        # Path.as_uri() builds a well-formed, percent-encoded file URI on every
        # platform; concatenating "file://" with the path breaks on Windows
        # drive paths and on paths containing spaces or other special characters.
        launched = webbrowser.open(Path(absolute_path).as_uri())
    except Exception as e:
        print(f"Could not open browser automatically: {e}")
    else:
        # webbrowser.open signals "no browser could be launched" by returning
        # False rather than raising, so report that case as well.
        if not launched:
            print("Could not open browser automatically: no usable browser was found.")

In [12]:
def visualize_graph(graph_documents, output_file: str = "knowledge_graph.html",
                    include_isolated_nodes: bool = False):
    """Render the first extracted graph document as an interactive Pyvis HTML page.

    Only ``graph_documents[0]`` is visualized. By default, nodes that do not
    appear as the source or target of any relationship are left out.

    Args:
        graph_documents: Sequence of graph documents exposing ``nodes`` and
            ``relationships``; may be empty or ``None``.
        output_file: Path of the HTML file to write. Every call overwrites this
            file, so pass distinct names to keep several graphs side by side.
        include_isolated_nodes: When True, also draw nodes that take part in
            no relationship (useful when the extraction produced no edges).

    Returns:
        None. Prints a notice and returns early when there is nothing to draw.
    """
    if not graph_documents or not graph_documents[0].nodes:
        print("No graph documents or nodes to visualize.")
        return

    first_document = graph_documents[0]
    relationships = first_document.relationships

    if include_isolated_nodes:
        nodes_to_add = list(first_document.nodes)
    else:
        # Ids of every node that is an endpoint of at least one relationship.
        connected_ids = {
            endpoint.id
            for rel in relationships
            for endpoint in (rel.source, rel.target)
        }
        nodes_to_add = [node for node in first_document.nodes if node.id in connected_ids]

    net = _initialize_pyvis_network()
    _add_nodes_to_network(net, nodes_to_add)
    _add_edges_to_network(net, relationships)
    _configure_pyvis_physics(net)
    _save_and_open_html(net, output_file)

In [13]:
# 3. Class for Encapsulating Logic
class KnowledgeGraphProcessor:
    """Extracts, prints and visualizes knowledge graphs produced by an LLM.

    Attributes:
        llm: Chat model used by every graph transformer this object creates.
        graph_transformer: Unfiltered ``LLMGraphTransformer`` built once in
            ``__init__`` and reused for every extraction without filters.
    """

    def __init__(self, llm_model: ChatGoogleGenerativeAI):
        """Store the chat model and build the default (unfiltered) transformer."""
        self.llm = llm_model
        self.graph_transformer = LLMGraphTransformer(llm=self.llm)

    async def extract_graph(self, text: str, allowed_nodes: list = None, allowed_relationships: list = None):
        """Extract knowledge graph documents from text with optional filtering.

        Args:
            text: Raw text wrapped into a single ``Document`` for extraction.
            allowed_nodes: Optional node types to restrict extraction to;
                ``None`` or an empty list means no restriction.
            allowed_relationships: Optional relationship filter; ``None`` or an
                empty list means no restriction.

        Returns:
            The graph documents returned by
            ``LLMGraphTransformer.aconvert_to_graph_documents``.
        """
        filters = {}
        if allowed_nodes:
            filters["allowed_nodes"] = allowed_nodes
        if allowed_relationships:
            filters["allowed_relationships"] = allowed_relationships

        if filters:
            # Filters are constructor arguments of the transformer, so a
            # dedicated instance is needed for each filtered extraction.
            transformer = LLMGraphTransformer(llm=self.llm, **filters)
        else:
            # Reuse the transformer built in __init__ instead of constructing
            # an identical one on every unfiltered call.
            transformer = self.graph_transformer

        documents = [Document(page_content=text)]
        return await transformer.aconvert_to_graph_documents(documents)

    def print_graph_details(self, graph_documents):
        """Print the nodes and relationships of the first graph document.

        Prints a fallback message when ``graph_documents`` is empty or its
        first element is falsy.
        """
        if graph_documents and graph_documents[0]:
            first_document = graph_documents[0]
            print(f"Nodes: {first_document.nodes}")
            print(f"Relationship: {first_document.relationships}")
        else:
            print("No graph documents to display.")

    def visualize(self, graph_documents):
        """Render the graph by delegating to the module-level ``visualize_graph``."""
        visualize_graph(graph_documents)



In [14]:
# 4. Main Execution Block
async def main(): # async because KnowledgeGraphProcessor.extract_graph must be awaited
    """Run three extractions over LONG_TEXT, printing and visualizing each result.

    The runs are, in order: no filters, allowed node types only, and allowed
    node types together with allowed relationships. The Google API key is read
    from the ``GOOGLE_API_KEY`` environment variable after loading ``.env``;
    when it is missing, a warning is printed and nothing else happens.
    """
    load_dotenv()
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if not google_api_key:
        print("Warning: GOOGLE_API_KEY environment variable not set. Please set it in your .env file.")
        return

    kg_processor = KnowledgeGraphProcessor(get_gemini_llm(google_api_key))
    text_to_process = LONG_TEXT

    # One entry per run: the banner to print and the filters for extract_graph.
    extraction_runs = (
        (
            "\n--- Full Knowledge Graph Extraction ---",
            {},
        ),
        (
            "\n--- Knowledge Graph Extraction with Allowed Nodes ---",
            {"allowed_nodes": ALLOWED_PERSON_AWARD_FIELDS},
        ),
        (
            "\n--- Knowledge Graph Extraction with Allowed Relationships ---",
            {
                "allowed_nodes": ALLOWED_PERSON_AWARD_FIELDS,
                "allowed_relationships": ALLOWED_PERSON_AWARD_RELATIONSHIPS,
            },
        ),
    )

    for banner, filters in extraction_runs:
        print(banner)
        graph_documents = await kg_processor.extract_graph(text_to_process, **filters)
        kg_processor.print_graph_details(graph_documents)
        kg_processor.visualize(graph_documents)


In [18]:
# Entry cell: runs the async main() pipeline to completion.
# nest_asyncio.apply() patches asyncio so that an event loop can be re-entered;
# presumably this is needed because the notebook kernel already runs a loop,
# in which case a plain asyncio.run() would fail — confirm for your environment.
# NOTE(review): nest_asyncio is not in the %pip install list of the first cell.
import nest_asyncio
nest_asyncio.apply()

import asyncio
# In a notebook kernel __name__ is "__main__", so this guard passes there too.
if __name__ == "__main__":
    asyncio.run(main())


--- Full Knowledge Graph Extraction ---
Nodes: [Node(id='Albert Einstein', type='Person', properties={}), Node(id='Germany', type='Country', properties={}), Node(id='Switzerland', type='Country', properties={}), Node(id='Swiss Patent Office', type='Organization', properties={}), Node(id='University Of Zurich', type='Organization', properties={}), Node(id='Prussian Academy Of Sciences', type='Organization', properties={}), Node(id='Humboldt University Of Berlin', type='Organization', properties={}), Node(id='Kaiser Wilhelm Institute For Physics', type='Organization', properties={}), Node(id='United States', type='Country', properties={}), Node(id='Adolf Hitler', type='Person', properties={}), Node(id='Franklin D. Roosevelt', type='Person', properties={}), Node(id='Kingdom Of Prussia', type='Country', properties={}), Node(id='Satyendra Nath Bose', type='Person', properties={}), Node(id='World War Ii', type='Event', properties={}), Node(id='Nobel Prize In Physics', type='Award', properti