In [7]:
import google.generativeai as genai
import os
import fitz
from langchain_core.output_parsers import JsonOutputParser
from openai import OpenAI
client = OpenAI()
from typing import Optional
import yaml
import sys
from pydantic import BaseModel,Field,create_model 
from langchain_openai import ChatOpenAI
import json
from db import *
from langchain_community.document_loaders import PyPDFLoader

In [2]:
# from langchain_core.pydantic_v1 import BaseModel, Field 
from typing import Optional
from typing import Union

# class CHAPTER(BaseModel):
#     chapter_name:str = Field(description = "name of the chapter")
        
# class SECTION_NAME(BaseModel):
#     section_name:str = Field(description="Name of the section")
        
# class SECTION(BaseModel):
#     section_name:SECTION_NAME = Field(description = "name of the section")
#     section_content:str = Field(description = "content of the section")
        
# class SUBSECTION(BaseModel):
#     subsection_name:str = Field(description = "name of the sub-section within the section")
#     subsection_content:str = Field(description = "content of the sub-section within the section")
        
        
# class sec_subsec_rel(BaseModel):
#     from_entity:SECTION_NAME = Field(description = "SECTION details extracted from the content")
#     to_entity:SUBSECTION = Field(description = "SUBSECTION details extracted from the content")
        
# class chapter_sec_rel(BaseModel):
#     from_entity:CHAPTER = Field(description = "CHAPTER details extracted from the content")
#     to_entity:SECTION = Field(description = "SECTION details extracted from the content")

# class Graph(BaseModel):
#     all_relationships:list[Union[sec_subsec_rel,chapter_sec_rel]] = Field("all required relationships between nodes extracted from the data")




   


In [3]:

def load_schema(yaml_file):
    """Parse the YAML file at ``yaml_file`` with ``yaml.safe_load`` and return the result."""
    with open(yaml_file, 'r') as schema_stream:
        return yaml.safe_load(schema_stream)
        
schema = load_schema('config/schema.yml')

In [4]:
print(schema)

{'graph': {'ENTITIES': [{'name': 'CHAPTER', 'description': 'A chapter of a book describing a broader topic', 'properties': [{'name': 'CHAPTER_TITLE', 'description': 'The title of the section', 'type': 'str', 'isPKEY': True}, {'name': 'CHAPTER_NO', 'description': 'The chapter no of the chapter', 'type': 'str'}]}, {'name': 'SECTION', 'description': 'A section of a book describing a broader topic', 'properties': [{'name': 'SECTION_TITLE', 'description': 'The title of the section', 'type': 'str', 'isPKEY': True}, {'name': 'SECTION_CONTENT', 'description': 'The content of the section', 'type': 'str'}]}, {'name': 'SUBSECTION', 'description': 'The list of subsections present in the section', 'properties': [{'name': 'SUBSECTION_TITLE', 'description': 'The title of the subsection', 'type': 'str', 'isPKEY': True}, {'name': 'SUBSECTION_CONTENT', 'description': 'The content of the subsection', 'type': 'str'}]}], 'RELATIONS': [{'name': 'chapter_sec_rel', 'properties': {'from': 'CHAPTER', 'to': 'SEC

In [8]:

def read_content_from_pdf(pdf_file_path):
    """Load ``pdf_file_path`` with ``PyPDFLoader`` and return all page text as one string.

    Side effect: prints how many documents the loader returned.
    """
    loaded_docs = PyPDFLoader(pdf_file_path).load()
    print(len(loaded_docs))
    # Concatenate the text of every loaded document, in order, with no separator.
    return ''.join(page.page_content[0:] for page in loaded_docs)

content = read_content_from_pdf("data/leph101.pdf")

44


In [9]:
def read_entities(schema):
    """Return the entity definitions stored under ``schema['graph']['ENTITIES']``."""
    return schema['graph']['ENTITIES']
    
def read_relations(schema):
    """Return ``schema['graph']['RELATIONS']``, or ``None`` when that key is absent."""
    graph_section = schema['graph']
    return graph_section['RELATIONS'] if 'RELATIONS' in graph_section else None

read_entities(schema)  
read_relations(schema)

[{'name': 'chapter_sec_rel',
  'properties': {'from': 'CHAPTER', 'to': 'SECTION', 'relation': 'includes'}},
 {'name': 'sec_subsec_rel',
  'properties': {'from': 'SECTION',
   'to': 'SUBSECTION',
   'relation': 'contains'}}]

In [10]:
class ENTITY_NODE(BaseModel):
    # Node of the schema-less ("free flow") graph: a free-text type label plus the entity's name.
    # Used as the payload of ENTITY and as both endpoints of RELATION.
    # Documented with comments on purpose; the Field descriptions below are runtime strings.
    entity_label:str = Field(description = "Type of entity extracted from the data For eg. PERSON,OBJECT,PLACE etc")
    entity_name:str = Field(description = "Name of the entity extracted For eg. Shakespear, London etc")   
        
class ENTITY(BaseModel):
    # Wrapper holding a single ENTITY_NODE under the key `entity`.
    # FREE_FLOW_GRAPH.entities is a list of these wrappers, and create_relation_class
    # uses this class as the endpoint type of its generic RELATION model.
    entity:ENTITY_NODE = Field(description="entity extracted from data")
              
class RELATION(BaseModel):
    # Directed relationship of the free-flow graph: from_entity -> to_entity,
    # both plain ENTITY_NODE values, labelled by a free-text relation_type.
    from_entity:ENTITY_NODE = Field(description = "Entity from which the relationship exists")
    to_entity:ENTITY_NODE = Field(description = "Entity to which the relationship points to")
    relation_type:str = Field(description="Relation between from and to enitites.For eg: HAS,INCLUDES, OWNS, VISITS etc")
                                                             
class FREE_FLOW_GRAPH(BaseModel):
    # Top-level model of the schema-less extraction: a list of ENTITY wrappers
    # and a list of RELATION edges. Returned as-is by create_freeflow_graph_model().
    entities:list[ENTITY] = Field(description="List of all entities extracted from data")
    relations:list[RELATION] = Field(description="Captures all relationships between all entities captured from data")
    
def create_freeflow_graph_model():
    """Return the predefined ``FREE_FLOW_GRAPH`` model class (no schema-specific entities)."""
    freeflow_model = FREE_FLOW_GRAPH
    return freeflow_model

def create_entity_class(all_defined_entities:dict):
    """Build the ``ENTITY`` wrapper model for a set of entity classes.

    ``all_defined_entities`` is a dict whose values are the entity model
    classes. The returned model has one field, ``entity``, typed as the
    ``Union`` of all those classes.
    """
    entity_union = Union[tuple(all_defined_entities.values())]
    entity_field = (entity_union, Field(description="entity extracted from data"))
    return create_model('ENTITY', entity=entity_field)

def str_to_class(datatype):
    """Look up the attribute named ``datatype`` on the module this function is defined in."""
    this_module = sys.modules[__name__]
    return getattr(this_module, datatype)


def create_relation_class(all_defined_relations):
    """Create the generic ``RELATION`` model linking two module-level ``ENTITY`` values.

    ``all_defined_relations`` is accepted for the caller's convenience but is
    not read by the body.
    """
    return create_model(
        'RELATION',
        from_entity=(ENTITY, Field(description = "Entity from which the relationship exists")),
        to_entity=(ENTITY, Field(description = "Entity to which the relationship points to")),
        relation_type=(str, Field(description = "Relation between from and to enitites.For eg: HAS,INCLUDES, OWNS, VISITS etc")),
    )
    
def create_dyn_model_with_entities(entities,relations=None):
    """Build a ``GRAPH`` pydantic model from schema-defined entities and relations.

    Parameters
    ----------
    entities : iterable of dict
        Each dict needs a ``name`` and a ``properties`` list; every property
        needs ``name``, ``type`` (a Python type expression such as ``'str'``)
        and ``description``.
    relations : iterable of dict, optional
        Each dict needs a ``name`` and a ``properties`` dict with ``from`` and
        ``to`` entity names. When ``None`` or empty, the single generic model
        returned by ``create_relation_class`` is used instead.

    Returns
    -------
    The generated ``GRAPH`` model class, or ``None`` (after printing
    "entities not present!") when a relation names an entity that is not
    defined in ``entities``.
    """
    all_defined_entities = {}
    for entity in entities:
        fields = {}
        for prop in entity['properties']:
            # SECURITY: eval() executes whatever expression the schema file supplies
            # as a type. Only load schema files you trust.
            fields[prop['name']] = (eval(prop['type']), Field(description=prop['description']))
        # Every entity carries its own label; it defaults to the entity's name.
        fields['entity_label'] = (str, entity['name'])
        print(f"creating model for {entity}")
        # One model per schema entity, named after the entity.
        all_defined_entities[entity['name']] = create_model(entity['name'], **fields)

    ENTITY_CLASS = create_entity_class(all_defined_entities)

    # Initialised before branching: it used to be assigned only in the
    # "relations given" branch, so relations=None raised UnboundLocalError.
    all_defined_relations = {}
    if not relations:
        # Also covers an empty list, for which Union[()] below would raise TypeError.
        # NOTE(review): create_relation_class builds on the module-level ENTITY model,
        # not on ENTITY_CLASS created above - confirm that is the intended endpoint type.
        RELATION_CLASS = create_relation_class(all_defined_relations)
    else:
        for rel in relations:
            properties = rel['properties']
            print(properties)
            if properties['from'] not in all_defined_entities or properties['to'] not in all_defined_entities:
                print("entities not present!")
                return None

            rel_fields = {
                'from_entity': (all_defined_entities[properties['from']],Field(description = "Entity from which the relationship exists")),
                'to_entity': (all_defined_entities[properties['to']],Field(description = "Entity to which the relationship exists")),
                'relation_type': (str,Field(description="Relation between from and to enitites.For eg: HAS,INCLUDES, OWNS, VISITS etc")),
            }
            all_defined_relations[rel['name']] = create_model(rel['name'],**rel_fields)

        # A relation in the graph may be any one of the schema-defined relation models.
        RELATION_CLASS = Union[tuple(all_defined_relations.values())]

    print(ENTITY_CLASS.model_json_schema())
    graph_fields = {
        'entities': (list[ENTITY_CLASS],Field(description="List of all entities extracted from data")),
        'relations': (list[RELATION_CLASS],Field(description="Captures all relationships between all entities captured from data")),
    }
    return create_model('GRAPH' , **graph_fields)
   
    
    
    
     
    
    
def create_structured_output_model(entities,relations):
    """Choose the output model for extraction.

    Returns the free-flow graph model when both ``entities`` and ``relations``
    are ``None``; otherwise returns the result of
    ``create_dyn_model_with_entities(entities, relations)``.
    """
    if entities is None and relations is None:
        return create_freeflow_graph_model()
    return create_dyn_model_with_entities(entities,relations)
    

In [11]:
read_entities(schema)

[{'name': 'CHAPTER',
  'description': 'A chapter of a book describing a broader topic',
  'properties': [{'name': 'CHAPTER_TITLE',
    'description': 'The title of the section',
    'type': 'str',
    'isPKEY': True},
   {'name': 'CHAPTER_NO',
    'description': 'The chapter no of the chapter',
    'type': 'str'}]},
 {'name': 'SECTION',
  'description': 'A section of a book describing a broader topic',
  'properties': [{'name': 'SECTION_TITLE',
    'description': 'The title of the section',
    'type': 'str',
    'isPKEY': True},
   {'name': 'SECTION_CONTENT',
    'description': 'The content of the section',
    'type': 'str'}]},
 {'name': 'SUBSECTION',
  'description': 'The list of subsections present in the section',
  'properties': [{'name': 'SUBSECTION_TITLE',
    'description': 'The title of the subsection',
    'type': 'str',
    'isPKEY': True},
   {'name': 'SUBSECTION_CONTENT',
    'description': 'The content of the subsection',
    'type': 'str'}]}]

In [17]:
structured_model = create_structured_output_model(read_entities(schema),read_relations(schema))
# structured_model = create_structured_output_model(None,None)
structured_model

creating model for {'name': 'CHAPTER', 'description': 'A chapter of a book describing a broader topic', 'properties': [{'name': 'CHAPTER_TITLE', 'description': 'The title of the section', 'type': 'str', 'isPKEY': True}, {'name': 'CHAPTER_NO', 'description': 'The chapter no of the chapter', 'type': 'str'}]}
creating model for {'name': 'SECTION', 'description': 'A section of a book describing a broader topic', 'properties': [{'name': 'SECTION_TITLE', 'description': 'The title of the section', 'type': 'str', 'isPKEY': True}, {'name': 'SECTION_CONTENT', 'description': 'The content of the section', 'type': 'str'}]}
creating model for {'name': 'SUBSECTION', 'description': 'The list of subsections present in the section', 'properties': [{'name': 'SUBSECTION_TITLE', 'description': 'The title of the subsection', 'type': 'str', 'isPKEY': True}, {'name': 'SUBSECTION_CONTENT', 'description': 'The content of the subsection', 'type': 'str'}]}
{'from': 'CHAPTER', 'to': 'SECTION', 'relation': 'include

__main__.GRAPH

In [13]:
def extract_structured_output(structured_output_model,content):
    """Ask ``gpt-4o`` (temperature 0) to extract ``content`` into ``structured_output_model``.

    The prompt embeds ``content`` verbatim. Returns whatever the
    structured-output runnable's ``invoke`` call returns.
    """
    extraction_prompt = f"""
              you are a helpful assistant good at extracting the contents discussed in the topic in a structured way.
              
              
              topic: {content}
              """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    # Bind the target schema to the chat model, then run the prompt through it.
    return llm.with_structured_output(structured_output_model).invoke(extraction_prompt)

In [None]:
# Short sample section for quick experiments. The active extraction call further
# down in this cell passes the full PDF `content`, so this sample is currently unused.
section_content = """Chapter 1- Basic physics concepts
1.4 BASIC PROPERTIES OF ELECTRIC CHARGE
We have seen that there are two types of charges, namely positive and negative and their effects tend to cancel each other. Here, we shall now describe some other properties of the electric charge.
If the sizes of charged bodies are very small as compared to the distances between them, we treat them as point charges. All the charge content of the body is assumed to be concentrated at one point in space.
1.4.1 Additivity of charges
We have not as yet given a quantitative definition of a charge; we shall follow it up in the next section. We shall tentatively assume that this can be done and proceed. If a system contains two point charges q1 and q2, the total charge of the system is obtained simply by adding
4
algebraically q1 and q2 , i.e., charges add up like real numbers or they are scalars like the mass of a body. If a system contains n charges q1, q2,q3,...,qn,thenthetotalchargeofthesystemisq1 +q2 +q3 +...+qn . Charge has magnitude but no direction, similar to mass. However, there is one difference between mass and charge. Mass of a body is always positive whereas a charge can be either positive or negative. Proper signs have to be used while adding the charges in a system. For example, the total charge of a system containing five charges +1, +2, –3, +4 and –5, in some arbitrary unit, is (+1) + (+2) + (–3) + (+4) + (–5) = –1 in the same unit.
1.4.2 Charge is conserved
We have already hinted to the fact that when bodies are charged by rubbing, there is transfer of electrons from one body to the other; no new charges are either created or destroyed. A picture of particles of electric charge enables us to understand the idea of conservation of charge. When we rub two bodies, what one body gains in charge the other body loses. Within an isolated system consisting of many charged bodies, due to interactions among the bodies, charges may get redistributed but it is found that the total charge of the isolated system is always conserved. Conservation of charge has been established experimentally."""
# Run the extraction on the full PDF text and keep the result as plain
# JSON-compatible dicts/lists (round-tripped through the model's JSON dump).
graph = json.loads(
    extract_structured_output(structured_model,content).model_dump_json()
)
graph

### Graph builder

In [15]:
# Split the extracted graph dict into its two lists; the node list is displayed
# as the cell output.
entity_nodes = graph['entities'] 
rel_nodes = graph['relations']
entity_nodes

[{'entity': {'entity_label': 'CONCEPT', 'entity_name': 'Electric Charge'}},
 {'entity': {'entity_label': 'PROPERTY',
   'entity_name': 'Additivity of Charges'}},
 {'entity': {'entity_label': 'PROPERTY',
   'entity_name': 'Conservation of Charge'}},
 {'entity': {'entity_label': 'OBJECT', 'entity_name': 'Point Charges'}},
 {'entity': {'entity_label': 'OBJECT', 'entity_name': 'Charged Bodies'}},
 {'entity': {'entity_label': 'OBJECT', 'entity_name': 'Electrons'}},
 {'entity': {'entity_label': 'OBJECT', 'entity_name': 'System of Charges'}}]

In [16]:
def convert_dict_to_str(node_dict):
    """Render a property dict as the inside of a Cypher map literal.

    Example: ``{'a': 'x', 'b': "it's"}`` becomes ``a:'x',b:'it`s'``.

    Every value is emitted as a single-quoted Cypher string:

    * non-string values are converted with ``str()`` (previously they raised
      ``AttributeError`` because ``.replace`` was called on them directly);
    * ``None`` values are skipped, since a null property can neither be
      merged nor matched by equality;
    * backslashes are doubled so a value cannot escape its closing quote;
    * single quotes are replaced by backticks, as before.

    Keys are inserted verbatim and must already be valid Cypher property names.
    Returns an empty string for an empty dict.
    """
    props_list = []
    for key, value in node_dict.items():
        if value is None:
            continue
        text = value if isinstance(value, str) else str(value)
        # Escape backslashes first, then neutralise single quotes.
        text = text.replace("\\", "\\\\").replace("'", "`")
        props_list.append(f"{key}:'{text}'")
    return ','.join(props_list)
    

def create_graph_nodes(entity_name, props):
    """MERGE one node with label ``entity_name`` and properties ``props``.

    The query text is printed and then passed to ``execute_query``.

    The label comes from model output, so it is wrapped in backticks (with any
    embedded backtick doubled). A label containing spaces or punctuation then
    stays a single identifier instead of breaking or altering the query. For
    plain labels such as ``CONCEPT`` the quoted form names the same label.
    """
    quoted_label = "`" + str(entity_name).replace("`", "``") + "`"
    query = f""" MERGE (n:{quoted_label} {{{convert_dict_to_str(props)}}}) RETURN n"""
    print(query)
    execute_query(query)


def create_nodes(entity_nodes):
    """Create one graph node per entry of ``entity_nodes``.

    Each entry is a dict with an ``entity`` key; that inner dict supplies both
    the node label (its ``entity_label`` value, which is also printed) and the
    node properties.
    """
    for wrapper in entity_nodes:
        node_props = wrapper['entity']
        label = node_props['entity_label']
        print(label)
        create_graph_nodes(label, node_props)
        
create_nodes(entity_nodes)

CONCEPT
 MERGE (n:CONCEPT {entity_label:'CONCEPT',entity_name:'Electric Charge'}) RETURN n
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
PROPERTY
 MERGE (n:PROPERTY {entity_label:'PROPERTY',entity_name:'Additivity of Charges'}) RETURN n
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
PROPERTY
 MERGE (n:PROPERTY {entity_label:'PROPERTY',entity_name:'Conservation of Charge'}) RETURN n
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
OBJECT
 MERGE (n:OBJECT {entity_label:'OBJECT',entity_name:'Point Charges'}) RETURN n
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
OBJECT
 MERGE (n:OBJECT {entity_label:'OBJECT',entity_name:'Charged Bodies'}) RETURN n
Initializing Neo4j connecti

In [20]:
def create_graph_relations(rel):
    """MERGE a relationship between two existing nodes described by ``rel``.

    ``rel`` is a dict with ``from_entity`` and ``to_entity`` (property dicts
    that each contain ``entity_label``) and ``relation_type``. Both nodes are
    matched on their label and full property set, then the relationship is
    merged from the first to the second.

    Labels and the relationship type come from model output, so each is
    wrapped in backticks (embedded backticks doubled). Names with spaces or
    punctuation then remain single identifiers instead of breaking or altering
    the query. Plain names are unaffected by the quoting.
    """
    def _quote(identifier):
        # Backtick-quote a Cypher identifier.
        return "`" + str(identifier).replace("`", "``") + "`"

    from_entity_props = rel['from_entity']
    from_node_entity = from_entity_props['entity_label']

    to_entity_props = rel['to_entity']
    to_node_entity = to_entity_props['entity_label']

    rel_type = rel['relation_type']

    query = f"""MATCH (p:{_quote(from_node_entity)}{{{convert_dict_to_str(from_entity_props)}}}), (c:{_quote(to_node_entity)}{{{convert_dict_to_str(to_entity_props)}}})
    WITH p,c
    MERGE (p)-[:{_quote(rel_type)}]->(c)"""
    print(f"Creating relation from {from_node_entity} to {to_node_entity} as {rel_type}")
    execute_query(query)

def create_relations(relations):
    """Create one graph relationship per entry by delegating to ``create_graph_relations``."""
    for relation in relations:
        create_graph_relations(relation)
        

create_relations(rel_nodes)

Creating relation from CONCEPT to PROPERTY as HAS
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
Creating relation from CONCEPT to PROPERTY as HAS
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
Creating relation from OBJECT to OBJECT as TREATED_AS
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
Creating relation from OBJECT to PROPERTY as EXHIBIT
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
Creating relation from OBJECT to PROPERTY as INVOLVED_IN
Initializing Neo4j connection...
Loading Neo4j credentials...
✓ Credentials loaded successfully
✓ Neo4j connection established
Creating relation from OBJECT to PROPERTY as EXHIBIT
Initializing Neo4j connection...
Loading Neo4j c