### Building Knowledge graphs

#### 1. Get the TOC from a PDF (using Gemini)

In [2]:
import google.generativeai as genai
import os
import fitz

In [3]:
import getpass 
  
import google.generativeai as genai
import os

# Prompt for the Gemini API key with getpass, keep it in the process
# environment, and configure the google.generativeai client with it.
os.environ["API_KEY"] = getpass.getpass(prompt = 'Enter the Gemini API key')
genai.configure(api_key=os.environ["API_KEY"])


# Create the model handle; no content is generated here. The handle is used by
# the generate_content call in the TOC extraction cell.
model = genai.GenerativeModel(model_name='gemini-1.5-pro')


Enter the Gemini API key ·······································


In [4]:
# import PIL.Image


# sample_img = PIL.Image.open("page_pngs/page_0.png")
# response = model.generate_content(["Print the content of the image.Do not hallucinate", sample_img])
# print(response.text)

In [17]:
# Upload the chapter PDF through the google.generativeai client; the returned
# handle is passed to model.generate_content together with the prompt below.
# The path is relative, so the file is looked up in the working directory.
sample_pdf = genai.upload_file("leph102.pdf")
response = model.generate_content(["""Prepare table of contents for the given file with the page numbers given at the bottom.Include sub sections as well.
Format should be json.
For eg:
```
[{
  "chapter_no" : 1,
  "chapter_name": "FIELDS",  
  "content" :
     [{ "section_no" : "1.1"
        "section_name": "Introduction",
         "page_no" : 1,
         "sub_sections" : []
         },
         { "section_no" : "1.2"
        "section_name": "Properties",
         "page_no" : 2
         "sub_sections" : [
             {
              "sub_section_no":1.2.1,
              "sub_section_name" : "Additivity"
              "sub_section_page_no" : 4
              },
              {
              "sub_section_no":1.2.2
              "sub_section_name" : "multiply"
              "sub_section_page_no" : 4
              }
              .
              .
         ]
         },
         { "section_no" : NULL
        "section_name": "Summary",
         "page_no" : 12,
         "sub_sections" : []
         }
    ]
},
 {
  "chapter_name": "MAGNETS",
  "chapter_no" : 2,
  contents: [
  ...
  ]
 }      
]
```
Do not hallucinate.

""",sample_pdf])
print(response.text)

```json
[
  {
    "chapter_no": 2,
    "chapter_name": "ELECTROSTATIC POTENTIAL AND CAPACITANCE",
    "content": [
      {
        "section_no": "2.1",
        "section_name": "Introduction",
        "page_no": 45,
        "sub_sections": []
      },
      {
        "section_no": "2.2",
        "section_name": "Electrostatic Potential",
        "page_no": 47,
        "sub_sections": []
      },
      {
        "section_no": "2.3",
        "section_name": "POTENTIAL DUE TO A POINT CHARGE",
        "page_no": 48,
        "sub_sections": []
      },
      {
        "section_no": "2.4",
        "section_name": "POTENTIAL DUE TO AN ELECTRIC DIPOLE",
        "page_no": 49,
        "sub_sections": []
      },
      {
        "section_no": "2.5",
        "section_name": "POTENTIAL DUE TO A SYSTEM OF CHARGES",
        "page_no": 51,
        "sub_sections": []
      },
      {
        "section_no": "2.6",
        "section_name": "Equipotential Surfaces",
        "page_no": 54,
        "sub_secti

#### 2. Parse the output JSON

In [18]:
from langchain_core.output_parsers import JsonOutputParser

In [19]:
parser = JsonOutputParser()

In [20]:
# Parse the model's text reply into Python objects (a list of chapter dicts in
# the captured output). The bare `toc` on the last line displays the result.
toc = parser.invoke(response.text)
toc

[{'chapter_no': 2,
  'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE',
  'content': [{'section_no': '2.1',
    'section_name': 'Introduction',
    'page_no': 45,
    'sub_sections': []},
   {'section_no': '2.2',
    'section_name': 'Electrostatic Potential',
    'page_no': 47,
    'sub_sections': []},
   {'section_no': '2.3',
    'section_name': 'POTENTIAL DUE TO A POINT CHARGE',
    'page_no': 48,
    'sub_sections': []},
   {'section_no': '2.4',
    'section_name': 'POTENTIAL DUE TO AN ELECTRIC DIPOLE',
    'page_no': 49,
    'sub_sections': []},
   {'section_no': '2.5',
    'section_name': 'POTENTIAL DUE TO A SYSTEM OF CHARGES',
    'page_no': 51,
    'sub_sections': []},
   {'section_no': '2.6',
    'section_name': 'Equipotential Surfaces',
    'page_no': 54,
    'sub_sections': [{'sub_section_no': '2.6.1',
      'sub_section_name': 'Relation between field and potential',
      'sub_section_page_no': 55}]},
   {'section_no': '2.7',
    'section_name': 'POTENTIAL ENERGY OF 

In [21]:
# Wrap the parsed chapter list in a top-level dict that carries the subject.
# NOTE: `toc` is rebound from a list to a dict here, so this cell is not
# idempotent -- running it a second time wraps the already-wrapped dict again.
# Re-run the parsing cell first if this cell has to be executed again.
toc = {'subject':'physics', 'subject_content':toc}
toc

{'subject': 'physics',
 'subject_content': [{'chapter_no': 2,
   'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE',
   'content': [{'section_no': '2.1',
     'section_name': 'Introduction',
     'page_no': 45,
     'sub_sections': []},
    {'section_no': '2.2',
     'section_name': 'Electrostatic Potential',
     'page_no': 47,
     'sub_sections': []},
    {'section_no': '2.3',
     'section_name': 'POTENTIAL DUE TO A POINT CHARGE',
     'page_no': 48,
     'sub_sections': []},
    {'section_no': '2.4',
     'section_name': 'POTENTIAL DUE TO AN ELECTRIC DIPOLE',
     'page_no': 49,
     'sub_sections': []},
    {'section_no': '2.5',
     'section_name': 'POTENTIAL DUE TO A SYSTEM OF CHARGES',
     'page_no': 51,
     'sub_sections': []},
    {'section_no': '2.6',
     'section_name': 'Equipotential Surfaces',
     'page_no': 54,
     'sub_sections': [{'sub_section_no': '2.6.1',
       'sub_section_name': 'Relation between field and potential',
       'sub_section_page_no': 55}

#### 3. Remove single and double quotes from text values

In [22]:
def remove_quotes_from_dict(data):
    """
    Strip single and double quote characters from every string value in a
    nested structure of dicts and lists.

    Dicts and lists are modified in place and the same object is returned;
    a string input yields a new string. Dictionary keys are left untouched,
    and values of any other type are returned unchanged.
    """
    # Leaf case: a plain string has both kinds of quote removed
    if isinstance(data, str):
        return data.replace("'", "").replace('"', "")

    if isinstance(data, dict):
        # Only values are rewritten, so the key set stays stable while iterating
        for name in data:
            data[name] = remove_quotes_from_dict(data[name])
    elif isinstance(data, list):
        for position, element in enumerate(data):
            data[position] = remove_quotes_from_dict(element)

    return data


# Strip quote characters from every string value in the TOC before its values
# are interpolated into quoted Cypher literals in the next section.
# remove_quotes_from_dict edits dicts and lists in place and returns the same object.
toc = remove_quotes_from_dict(toc)
print(toc)


{'subject': 'physics', 'subject_content': [{'chapter_no': 2, 'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE', 'content': [{'section_no': '2.1', 'section_name': 'Introduction', 'page_no': 45, 'sub_sections': []}, {'section_no': '2.2', 'section_name': 'Electrostatic Potential', 'page_no': 47, 'sub_sections': []}, {'section_no': '2.3', 'section_name': 'POTENTIAL DUE TO A POINT CHARGE', 'page_no': 48, 'sub_sections': []}, {'section_no': '2.4', 'section_name': 'POTENTIAL DUE TO AN ELECTRIC DIPOLE', 'page_no': 49, 'sub_sections': []}, {'section_no': '2.5', 'section_name': 'POTENTIAL DUE TO A SYSTEM OF CHARGES', 'page_no': 51, 'sub_sections': []}, {'section_no': '2.6', 'section_name': 'Equipotential Surfaces', 'page_no': 54, 'sub_sections': [{'sub_section_no': '2.6.1', 'sub_section_name': 'Relation between field and potential', 'sub_section_page_no': 55}]}, {'section_no': '2.7', 'section_name': 'POTENTIAL ENERGY OF A SYSTEM OF CHARGES', 'page_no': 55, 'sub_sections': []}, {'section_

#### 4. Insert the nested TOC dict into Neo4j

In [23]:
def clean_property_value(value):
    """
    Convert a property value to a string with single quotes removed.

    None becomes the empty string; every other value goes through str().
    Only single quotes (') are stripped -- double quotes are kept as they are.
    """
    text = "" if value is None else str(value)
    return text.replace("'", "")

def insert_nested_dict_to_neo4j(data, key_to_insert, node_label, primary_key, parent_label=None, parent_primary_key=None, parent_node=None, parent_props=None):
    """
    Walk a nested dict/list structure and MERGE one Neo4j node for every dict
    found in a list stored under `key_to_insert`.

    Each node receives the non-list values of its own dict plus the scalar
    values inherited from all enclosing dicts, cleaned with
    clean_property_value. When both `parent_label` and `parent_primary_key`
    are given, the node is also linked from the matching parent node with a
    HAS_<key_to_insert> relationship.

    Args:
        data: Nested dict or list to walk. Any other type is ignored.
        key_to_insert: Key whose list value holds the dicts to insert.
        node_label: Label given to the created nodes.
        primary_key: Item key whose value identifies the node in MERGE.
        parent_label: Label of the parent node, or None for no parent link.
        parent_primary_key: Property identifying the parent node; its value
            is looked up among the properties inherited from enclosing dicts.
        parent_node: Not used by this function. The recursion passes the
            dict key under which the current value was found.
        parent_props: Scalar properties collected from enclosing dicts.

    Returns:
        None. Queries are printed and sent through execute_query.

    Items that lack `primary_key` (or hold a list there) are skipped with a
    warning instead of raising KeyError and aborting the whole import.

    NOTE(review): labels, keys and values are interpolated directly into the
    Cypher text; only single quotes are stripped from values. Prefer query
    parameters if this is ever fed data that is not already sanitised.
    """
    # Handle list: descend into each dict element with the same context
    if isinstance(data, list):
        for item in data:
            if isinstance(item, dict):
                insert_nested_dict_to_neo4j(
                    item, key_to_insert, node_label, primary_key,
                    parent_label, parent_primary_key, parent_node, parent_props
                )
        return

    # Handle dict
    if isinstance(data, dict):
        # Scalar values at this level, cleaned; nested dicts and lists are excluded
        current_props = {k: clean_property_value(v) for k, v in data.items()
                         if not isinstance(v, (dict, list))}
        # Values from this level override inherited ones with the same key
        all_props = {**(parent_props or {}), **current_props}

        for key, value in data.items():
            if key == key_to_insert:
                # The target list is inserted but not searched any further
                if isinstance(value, list):
                    for item in value:
                        if isinstance(item, dict):
                            # Filter out lists and clean property values
                            filtered_item = {k: clean_property_value(v) for k, v in item.items()
                                             if not isinstance(v, list)}

                            # Without a primary key there is nothing to MERGE on
                            if primary_key not in filtered_item:
                                print(f"Warning: Primary key {primary_key} not found in item, skipping")
                                continue

                            # Combine with parent properties
                            node_props = {**all_props, **filtered_item}
                            print(node_props)

                            # Remove primary key from props_str as it's handled in MERGE
                            props_dict = {k: v for k, v in node_props.items()
                                          if k != primary_key}
                            props_str = ", ".join([f"{k}: '{v}'" for k, v in props_dict.items()])

                            if parent_label and parent_primary_key:
                                parent_key_value = all_props.get(parent_primary_key)
                                if parent_key_value:
                                    query = f"""
                                    MATCH (parent:{parent_label} {{{parent_primary_key}: '{parent_key_value}'}})
                                    MERGE (n:{node_label} {{{primary_key}: '{filtered_item[primary_key]}'}})
                                    ON CREATE SET n += {{{props_str}}}
                                    ON MATCH SET n += {{{props_str}}}
                                    MERGE (parent)-[:HAS_{key}]->(n)
                                    """
                                else:
                                    print(f"Warning: Parent primary key {parent_primary_key} not found in properties")
                                    continue
                            else:
                                query = f"""
                                MERGE (n:{node_label} {{{primary_key}: '{filtered_item[primary_key]}'}})
                                ON CREATE SET n += {{{props_str}}}
                                ON MATCH SET n += {{{props_str}}}
                                """
                            print(query)
                            execute_query(query)

            elif isinstance(value, (dict, list)):
                # Keep searching deeper, carrying the accumulated properties
                insert_nested_dict_to_neo4j(
                    value, key_to_insert, node_label, primary_key,
                    parent_label, parent_primary_key, key, all_props
                )
                
                
                
                

In [24]:
# Insert the TOC one level at a time, top-down. The order matters: each level
# MATCHes its parent nodes, so chapters must exist before sections, and
# sections before subsections.
for level_kwargs in (
    {
        "key_to_insert": "subject_content",
        "node_label": "CHAPTER",
        "primary_key": "chapter_name",
        "parent_label": None,
        "parent_primary_key": None,
    },
    {
        "key_to_insert": "content",
        "node_label": "SECTION",
        "primary_key": "section_name",
        "parent_label": "CHAPTER",
        "parent_primary_key": "chapter_name",
    },
    {
        "key_to_insert": "sub_sections",
        "node_label": "SUBSECTION",
        "primary_key": "sub_section_name",
        "parent_label": "SECTION",
        "parent_primary_key": "section_name",
    },
):
    insert_nested_dict_to_neo4j(data=toc, **level_kwargs)

{'subject': 'physics', 'chapter_no': '2', 'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE'}

                                MERGE (n:CHAPTER {chapter_name: 'ELECTROSTATIC POTENTIAL AND CAPACITANCE'})
                                ON CREATE SET n += {subject: 'physics', chapter_no: '2'}
                                ON MATCH SET n += {subject: 'physics', chapter_no: '2'}
                                
{'subject': 'physics', 'chapter_no': '2', 'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE', 'section_no': '2.1', 'section_name': 'Introduction', 'page_no': '45'}

                                    MATCH (parent:CHAPTER {chapter_name: 'ELECTROSTATIC POTENTIAL AND CAPACITANCE'})
                                    MERGE (n:SECTION {section_name: 'Introduction'})
                                    ON CREATE SET n += {subject: 'physics', chapter_no: '2', chapter_name: 'ELECTROSTATIC POTENTIAL AND CAPACITANCE', section_no: '2.1', page_no: '45'}
                      

#### 5. Insert sequential relationships in TOC

In [25]:
def find_nested_list(data, list_key):
    """
    Collect every list stored under `list_key` anywhere in a nested structure.

    Args:
        data: Nested dict/list structure to search; other types give no matches
        list_key: Dictionary key to look for; only list values count as a match

    Returns:
        list: The matching list objects themselves (not copies), in depth-first
        order. A matched list is searched as well, so matches nested inside
        another match are included.
    """
    def _walk(node):
        # Pre-order traversal: report a match before descending into it
        if isinstance(node, dict):
            for name, child in node.items():
                if name == list_key and isinstance(child, list):
                    yield child
                if isinstance(child, (dict, list)):
                    yield from _walk(child)
        elif isinstance(node, list):
            for element in node:
                if isinstance(element, (dict, list)):
                    yield from _walk(element)

    return list(_walk(data))

def create_sequential_relationships(data, list_key, node_label, primary_key):
    """
    Link consecutive items of every `list_key` list with a NEXT relationship.

    For each list found by find_nested_list, every adjacent pair of items is
    turned into a Cypher query that matches both nodes by `primary_key` and
    MERGEs (current)-[:NEXT]->(next). Each query is printed and then run
    through execute_query.

    Args:
        data: The nested dictionary containing the lists
        list_key: The key whose value is the list of dictionaries
        node_label: Label of the nodes to match
        primary_key: The property name used to identify each node

    Returns:
        "" when no list is stored under `list_key`; otherwise None. The
        queries themselves are not returned.

    NOTE(review): nodes are matched by label and `primary_key` value only, so
    items sharing a name under different parents match the same nodes.
    """
    sibling_lists = find_nested_list(data, list_key)
    if not sibling_lists:
        return ""

    for siblings in sibling_lists:
        # Pair each item with its successor; lists shorter than two give no pairs
        for first, second in zip(siblings, siblings[1:]):
            query = f"""
            MATCH (current:{node_label}), (next:{node_label})
            WHERE current.{primary_key} = '{first[primary_key]}' 
            AND next.{primary_key} = '{second[primary_key]}'
            MERGE (current)-[:NEXT]->(next)"""

            print(query)
            execute_query(query)
            
    
    

In [26]:
# Create NEXT relationships between consecutive sections, then between
# consecutive subsections.
for sequence_kwargs in (
    {"list_key": 'content', "node_label": 'SECTION', "primary_key": 'section_name'},
    {"list_key": 'sub_sections', "node_label": 'SUBSECTION', "primary_key": 'sub_section_name'},
):
    create_sequential_relationships(data=toc, **sequence_kwargs)




            MATCH (current:SECTION), (next:SECTION)
            WHERE current.section_name = 'Introduction' 
            AND next.section_name = 'Electrostatic Potential'
            MERGE (current)-[:NEXT]->(next)

            MATCH (current:SECTION), (next:SECTION)
            WHERE current.section_name = 'Electrostatic Potential' 
            AND next.section_name = 'POTENTIAL DUE TO A POINT CHARGE'
            MERGE (current)-[:NEXT]->(next)

            MATCH (current:SECTION), (next:SECTION)
            WHERE current.section_name = 'POTENTIAL DUE TO A POINT CHARGE' 
            AND next.section_name = 'POTENTIAL DUE TO AN ELECTRIC DIPOLE'
            MERGE (current)-[:NEXT]->(next)

            MATCH (current:SECTION), (next:SECTION)
            WHERE current.section_name = 'POTENTIAL DUE TO AN ELECTRIC DIPOLE' 
            AND next.section_name = 'POTENTIAL DUE TO A SYSTEM OF CHARGES'
            MERGE (current)-[:NEXT]->(next)

            MATCH (current:SECTION), (next:SECTIO

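### Neo4j utility functions

Run the cells in this section before sections 4 and 5: `execute_query`, defined here, is called by the insertion and relationship cells above.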

In [74]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.27.0-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.27.0-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.27.0


In [1]:
import neo4j

def execute_query(query, parameters=None):
    """
    Run one Cypher statement against the Neo4j server and return its records.

    Connection settings come from the environment so that no credentials are
    stored in the notebook:
        NEO4J_URI       server URI (defaults to the Aura instance used here)
        NEO4J_USERNAME  user name (defaults to 'neo4j')
        NEO4J_PASSWORD  password; prompted for with getpass when unset and
                        then kept in the process environment for later calls

    Args:
        query: Cypher text to execute.
        parameters: Optional dict of query parameters passed to the driver.
            Prefer this over interpolating values into the Cypher text.

    Returns:
        list: The records produced by the query, fetched before the session
        is closed. (A Result object cannot be read once its session has
        been closed, so the records are materialised here.)
    """
    import getpass
    import os

    uri = os.environ.get("NEO4J_URI", "neo4j+s://a83594c4.databases.neo4j.io")
    user = os.environ.get("NEO4J_USERNAME", "neo4j")
    if not os.environ.get("NEO4J_PASSWORD"):
        os.environ["NEO4J_PASSWORD"] = getpass.getpass(prompt='Enter the Neo4j password')
    password = os.environ["NEO4J_PASSWORD"]

    # Both context managers close their resource even if the query raises
    with neo4j.GraphDatabase.driver(uri, auth=neo4j.basic_auth(user, password)) as driver:
        with driver.session() as session:
            result = session.run(query, parameters)
            return list(result)
# Loop through rows and MERGE one node per row into the Neo4j server
def insert_into_neo4j(records,node_label,primary_key):
    """
    MERGE a node labelled `node_label` for every row in `records`.

    Each row is indexed by position: the column is found by looking up
    `primary_key` in the module-level HEADER sequence, which is not defined
    in this cell (NOTE(review): confirm where HEADER comes from). The key
    value is printed and then used as the node's `primary_key` property.
    """
    for record in records:
        key_value = record[HEADER.index(primary_key)]
        print(key_value)
        execute_query(f"MERGE (n:{node_label} {{{primary_key}: '{key_value}'}})")

# Loop through a list of dictionaries and create one relationship per entry
def create_relationships(list_of_dicts):
    """
    Run one CREATE query per dictionary, keyed on its 'name' value.

    NOTE(review): both endpoints (n1 and n2) are matched on the same 'name',
    so each query links nodes with that name to themselves. Presumably
    separate source and target keys were intended -- confirm before use.
    """
    for entry in list_of_dicts:
        name = entry['name']
        execute_query(f"MATCH (n1:Node {{name: '{name}'}}), (n2:Node {{name: '{name}'}}) CREATE (n1)-[:RELATIONSHIP]->(n2)")



In [194]:
# insert_into_neo4j(chapter_list,node_label='CHAPTERS',primary_key='chapter_name')
# insert_into_neo4j(sections_list,node_label='SECTIONS',primary_key='section_name')
# insert_into_neo4j(subsections_list,node_label='SUBSECTIONS',primary_key='sub_section_name')

In [None]:
# MATCH (n:SUBJECT),(m:CHAPTER) where m.subject='physics' MERGE (n)-[r:HAS_CHAPTER]->(m)  RETURN r
# MERGE (n:SUBJECT{name:'physics'}) RETURN n