### Building Knowledge graphs

#### 1. Get TOC from PDF (using Gemini)

In [38]:
import google.generativeai as genai
import os
import fitz
from langchain_core.output_parsers import JsonOutputParser

In [93]:
import getpass 
  
import google.generativeai as genai
import os

# Prompt for the Gemini API key interactively and keep it in the API_KEY
# environment variable, so the key is never written into the notebook source.
os.environ["API_KEY"] = getpass.getpass(prompt = 'Enter the Gemini API key')
genai.configure(api_key=os.environ["API_KEY"])


# Module-level Gemini model handle; extract_content_from_pdf reads this global.
model = genai.GenerativeModel(model_name='gemini-2.0-flash-exp')  # earlier runs used 'gemini-1.5-pro'


Enter the Gemini API key ·······································


In [4]:
# import PIL.Image


# sample_img = PIL.Image.open("page_pngs/page_0.png")
# response = model.generate_content(["Print the content of the image.Do not hallucinate", sample_img])
# print(response.text)

In [96]:
# Handles of PDFs already uploaded during this kernel session, keyed by path.
# Extracting many sections from one PDF would otherwise upload it once per call.
# NOTE(review): the Gemini Files API retains uploads only for a limited time;
# re-run this cell to reset the cache if a stale handle is ever rejected.
_UPLOADED_PDFS = {}


def extract_content_from_pdf(section_name, pdf_path="leph102.pdf"):
    """Ask Gemini for the markdown content of one section of a PDF.

    The PDF is uploaded on first use and the returned handle is reused for
    later calls with the same ``pdf_path``.

    Args:
        section_name: Title of the section to extract, as it appears in the PDF.
        pdf_path: Path of the PDF file to read the section from.

    Returns:
        The model's reply text (``response.text``), requested as markdown.

    Raises:
        Whatever ``genai.upload_file`` / ``model.generate_content`` raise, and
        whatever ``response.text`` raises when the reply carries no text.
    """
    sample_pdf = _UPLOADED_PDFS.get(pdf_path)
    if sample_pdf is None:
        sample_pdf = genai.upload_file(pdf_path)
        _UPLOADED_PDFS[pdf_path] = sample_pdf

    prompt = """ From the given pdf, extract and print the content of the section given.
    Extract the full content of the section even if it spans multiple pages.
    Extract all the content till the next section starts.
    Do not hallucinate. Do not include any prefix or suffix. Do not include ``` or markdown prefix.
    
    The output should contain entire content. section no and title can be skipped.
    Output needs to be in markdown format. 
    
    section to be extracted : {section_name} 

    
    
    """

    prompt = prompt.format(section_name = section_name)
    # Long timeout: the model has to read the whole PDF before answering.
    response = model.generate_content([prompt,sample_pdf],request_options={"timeout": 1000})
    return response.text
    

In [95]:
# Smoke test: extract the "Introduction" section from the default PDF (leph102.pdf).
extract_content_from_pdf("Introduction")

"In Chapters 5 and 7 (Class XI), the notion of potential energy was\nintroduced. When an external force does work in taking a body from a\npoint to another against a force like spring force or gravitational force,\nthat work gets stored as potential energy of the body. When the external\nforce is removed, the body moves, gaining kinetic energy and losing\nan equal amount of potential energy. The sum of kinetic and\npotential energies is thus conserved. Forces of this kind are called\nconservative forces. Spring force and gravitational force are examples of\nconservative forces.\n\nCoulomb force between two (stationary) charges is also a conservative\nforce. This is not surprising, since both have inverse-square dependence\non distance and differ mainly in the proportionality constants: the\nmasses in the gravitational law are replaced by charges in Coulomb's\nlaw. Thus, like the potential energy of a mass in a gravitational\nfield, we can define electrostatic potential energy of a char

#### 2. Loop through the TOC and insert the content into Neo4j

In [34]:
# Table of contents, nested as subject -> chapters -> sections; the top-level
# 'subject' key names the subject that the chapters belong to.
toc = {
    'subject': 'physics',
    'subject_content': [
        {
            'chapter_no': 2,
            'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE',
            'content': [
                {
                    'section_no': '2.1',
                    'section_name': 'Introduction',
                    'page_no': 45,
                    'sub_sections': [],
                },
                {
                    'section_no': '2.2',
                    'section_name': 'Electrostatic Potential',
                    'page_no': 47,
                    'sub_sections': [],
                },
            ],
        },
    ],
}
toc

{'subject': 'physics',
 'subject_content': [{'chapter_no': 2,
   'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE',
   'content': [{'section_no': '2.1',
     'section_name': 'Introduction',
     'page_no': 45,
     'sub_sections': []},
    {'section_no': '2.2',
     'section_name': 'Electrostatic Potential',
     'page_no': 47,
     'sub_sections': []}]}]}

#### 3. Remove single and double quotes from text

In [22]:
def remove_quotes_from_dict(data):
    """Strip ' and " characters from every string value in a nested structure.

    Dicts and lists are walked recursively and updated in place (dictionary
    keys are left untouched). A string is returned as a new, quote-free
    string. Any other value is returned unchanged.
    """
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict keeps its size while iterated.
        for key in data:
            data[key] = remove_quotes_from_dict(data[key])
        return data

    if isinstance(data, list):
        for index, item in enumerate(data):
            data[index] = remove_quotes_from_dict(item)
        return data

    if isinstance(data, str):
        return data.replace('"', "").replace("'", "")

    return data


# Sanitize the TOC (string values only; nested dicts/lists are updated in place)
# and show the result.
toc = remove_quotes_from_dict(toc)
print(toc)


{'subject': 'physics', 'subject_content': [{'chapter_no': 2, 'chapter_name': 'ELECTROSTATIC POTENTIAL AND CAPACITANCE', 'content': [{'section_no': '2.1', 'section_name': 'Introduction', 'page_no': 45, 'sub_sections': []}, {'section_no': '2.2', 'section_name': 'Electrostatic Potential', 'page_no': 47, 'sub_sections': []}, {'section_no': '2.3', 'section_name': 'POTENTIAL DUE TO A POINT CHARGE', 'page_no': 48, 'sub_sections': []}, {'section_no': '2.4', 'section_name': 'POTENTIAL DUE TO AN ELECTRIC DIPOLE', 'page_no': 49, 'sub_sections': []}, {'section_no': '2.5', 'section_name': 'POTENTIAL DUE TO A SYSTEM OF CHARGES', 'page_no': 51, 'sub_sections': []}, {'section_no': '2.6', 'section_name': 'Equipotential Surfaces', 'page_no': 54, 'sub_sections': [{'sub_section_no': '2.6.1', 'sub_section_name': 'Relation between field and potential', 'sub_section_page_no': 55}]}, {'section_no': '2.7', 'section_name': 'POTENTIAL ENERGY OF A SYSTEM OF CHARGES', 'page_no': 55, 'sub_sections': []}, {'section_

### Neo4j utility functions

In [74]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.27.0-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.27.0-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.27.0


In [39]:
import neo4j

def execute_query(query, parameters=None):
    """Run one Cypher statement against Neo4j and return its rows.

    Connection settings come from the NEO4J_URI and NEO4J_USERNAME environment
    variables (falling back to the instance and user this notebook has always
    used) and from NEO4J_PASSWORD. If the password is not set it is asked for
    once with getpass and kept in the environment for later calls.

    SECURITY: the password used to be hard-coded in this function. It must not
    live in the notebook, and the previously exposed one should be rotated.

    Args:
        query: Cypher text. Use ``$name`` placeholders for values instead of
            formatting them into the string.
        parameters: Optional dict with the values for those placeholders.

    Returns:
        A list with one dict per returned record (empty for statements that
        return nothing). Rows are materialised before the session closes,
        because a result can no longer be read once its session is closed.
    """
    uri = os.environ.get("NEO4J_URI", "neo4j+s://a83594c4.databases.neo4j.io")
    user = os.environ.get("NEO4J_USERNAME", "neo4j")
    if not os.environ.get("NEO4J_PASSWORD"):
        os.environ["NEO4J_PASSWORD"] = getpass.getpass(prompt='Enter the Neo4j password')
    auth = neo4j.basic_auth(user, os.environ["NEO4J_PASSWORD"])

    # Both context managers close on exit, so no driver or session is leaked.
    with neo4j.GraphDatabase.driver(uri, auth=auth) as driver:
        with driver.session() as session:
            return session.run(query, parameters or {}).data()


def _quote_cypher_identifier(name):
    """Backtick-quote a label or property name (these cannot be query parameters)."""
    return "`" + str(name).replace("`", "``") + "`"


def insert_into_neo4j(records, node_label, primary_key):
    """MERGE one node per record, identified by its ``primary_key`` property.

    Args:
        records: Iterable of rows. A dict row supplies the value as
            ``row[primary_key]``; any other row is treated as positional and
            read at ``HEADER.index(primary_key)``, which requires a global
            ``HEADER`` list of column names to exist.
        node_label: Label given to the merged nodes.
        primary_key: Name of the identifying property.
    """
    statement = (
        f"MERGE (n:{_quote_cypher_identifier(node_label)} "
        f"{{{_quote_cypher_identifier(primary_key)}: $value}})"
    )
    column = None  # resolved lazily, only if a positional row shows up
    for row in records:
        if isinstance(row, dict):
            value = row[primary_key]
        else:
            if column is None:
                column = HEADER.index(primary_key)
            value = row[column]
        print(value)
        # str() keeps the property a string, as the quoted literal used to.
        execute_query(statement, {"value": str(value)})


def create_relationships(list_of_dicts):
    """Create a RELATIONSHIP edge for every entry, matched on its 'name'.

    NOTE(review): both endpoints are matched on the same ``name`` value, so
    every edge created here is a self-loop. That is what the code has always
    done; the intended target key is not visible here - confirm before use.

    Args:
        list_of_dicts: Iterable of dicts, each with a 'name' key.
    """
    statement = (
        "MATCH (n1:Node {name: $name}), (n2:Node {name: $name}) "
        "CREATE (n1)-[:RELATIONSHIP]->(n2)"
    )
    for entry in list_of_dicts:
        execute_query(statement, {"name": str(entry['name'])})



In [88]:
import time
def clean_property_value(value):
    """Return ``value`` as text with every single quote (') removed.

    None becomes the empty string; any other value is converted with str()
    first. Double quotes and all other characters are left untouched.
    """
    text = "" if value is None else str(value)
    return text.replace("'", "")

def update_sections_with_content(chapter_name, pdf_path="leph102.pdf", delay_seconds=30):
    """Fill ``section_content`` on the SECTION nodes of one chapter from its PDF.

    The section names are read from Neo4j, each section's text is extracted
    with ``extract_content_from_pdf`` and written back to the matching node.
    A section that fails is reported and skipped so the rest still run.

    Connection settings come from the NEO4J_URI / NEO4J_USERNAME environment
    variables (falling back to the instance and user this notebook has always
    used) and from NEO4J_PASSWORD, which is asked for once with getpass if it
    is not set. SECURITY: the password used to be hard-coded here; the
    previously exposed one should be rotated.

    Args:
        chapter_name: Name of the chapter whose sections need to be updated.
        pdf_path: Path of the PDF that contains the chapter.
        delay_seconds: Pause after each extraction before continuing
            (presumably to stay under the Gemini rate limit - confirm).
    """
    uri = os.environ.get("NEO4J_URI", "neo4j+s://a83594c4.databases.neo4j.io")
    user = os.environ.get("NEO4J_USERNAME", "neo4j")
    if not os.environ.get("NEO4J_PASSWORD"):
        os.environ["NEO4J_PASSWORD"] = getpass.getpass(prompt='Enter the Neo4j password')
    auth = neo4j.basic_auth(user, os.environ["NEO4J_PASSWORD"])

    # Values travel as query parameters, never as text formatted into the
    # Cypher: extracted markdown routinely contains quotes and backslashes that
    # would break or silently alter an interpolated string literal.
    sections_query = """
    MATCH (c:CHAPTER {chapter_name: $chapter_name})-[:HAS_content]->(s:SECTION)
    RETURN s.section_name as section_name
    """
    update_query = """
    MATCH (s:SECTION {section_name: $section_name, chapter_name: $chapter_name})
    SET s.section_content = $section_content
    """

    # One driver serves the whole run; sessions are opened only around each query.
    with neo4j.GraphDatabase.driver(uri, auth=auth) as driver:
        with driver.session() as session:
            rows = session.run(sections_query, {"chapter_name": chapter_name}).data()
        sections = [row["section_name"] for row in rows]
        print(f"Found sections: {sections}")

        for section_name in sections:
            try:
                section_content = extract_content_from_pdf(section_name, pdf_path)
                time.sleep(delay_seconds)
                print("sleep completed!")

                # No longer needed for quoting safety; kept so newly stored text
                # follows the same convention as content written by earlier runs.
                cleaned_content = clean_property_value(str(section_content))

                print(f"Updating content for section: {section_name}")
                with driver.session() as session:
                    session.run(
                        update_query,
                        {
                            "section_name": section_name,
                            "chapter_name": chapter_name,
                            "section_content": cleaned_content,
                        },
                    ).consume()

            except Exception as e:
                # Deliberate best effort: report the failure and move on.
                print(f"Error processing section {section_name}: {str(e)}")
                continue

In [97]:
# Populate section_content for every section of this chapter from its PDF.
# NOTE: slow - one Gemini request plus a 30-second pause per section.
update_sections_with_content(chapter_name="ELECTRIC CHARGES AND FIELDS",pdf_path="leph101.pdf")

Found sections: ['INTRODUCTION', 'ELECTRIC CHARGE', 'CONDUCTORS AND INSULATORS', 'BASIC PROPERTIES OF ELECTRIC CHARGE', 'COULOMBS LAW', 'FORCES BETWEEN MULTIPLE CHARGES', 'ELECTRIC FIELD', 'ELECTRIC FIELD LINES', 'ELECTRIC FLUX', 'ELECTRIC DIPOLE', 'DIPOLE IN A UNIFORM EXTERNAL FIELD', 'CONTINUOUS CHARGE DISTRIBUTION', 'GAUSSS LAW', 'APPLICATIONS OF GAUSS’S LAW', 'SUMMARY']
sleep completed!
Updating content for section: INTRODUCTION
sleep completed!
Updating content for section: ELECTRIC CHARGE
sleep completed!
Updating content for section: CONDUCTORS AND INSULATORS
sleep completed!
Updating content for section: BASIC PROPERTIES OF ELECTRIC CHARGE
sleep completed!
Updating content for section: COULOMBS LAW
sleep completed!
Updating content for section: FORCES BETWEEN MULTIPLE CHARGES


KeyboardInterrupt: 

In [194]:
# insert_into_neo4j(chapter_list,node_label='CHAPTERS',primary_key='chapter_name')
# insert_into_neo4j(sections_list,node_label='SECTIONS',primary_key='section_name')
# insert_into_neo4j(subsections_list,node_label='SUBSECTIONS',primary_key='sub_section_name')

In [None]:
# MATCH (n:SUBJECT),(m:CHAPTER) where m.subject='physics' MERGE (n)-[r:HAS_CHAPTER]->(m)  RETURN r
# MERGE (n:SUBJECT{name:'physics'}) RETURN n