In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

import logging
import sys
from IPython.display import Markdown

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [5]:
from llama_index import (
    ServiceContext,
    StorageContext,
    LLMPredictor,
    VectorStoreIndex,
    load_index_from_storage
)

In [3]:
from src.index_struct.simple_vector_store import docs2vecstore
from src.utils.gen_utils import get_llm, get_llama_embeddings_model
from src.docstore.read_docs import read_docs

In [14]:
nodes, docs = docs2vecstore(
        path="./data/pub/building_codes.pdf",
        index_id = "property_collection",
        docname="International_Building_Code",
        chunk_chars=1024,
        overlap=5,
        force_pypdf=False,
        model_temperature=0,
        persist_dir="./_property_index_storage",
    )

DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
DEBUG:openai:api_version=None data='{"input": [[6190, 25, 7327, 97786, 287, 39184, 198, 11014, 25, 220, 16, 12, 17, 198, 1213, 1292, 25, 4857, 39582, 271, 88539, 220, 914, 12, 717, 13, 54528, 45, 15942, 14083, 50, 627, 3065, 33738, 220, 16, 13, 48797, 1753, 14083, 627, 52337, 220, 16, 13, 7327, 17283, 6247, 323, 8949, 99317, 627, 18332, 220, 914, 12, 717, 12, 16, 92511, 48797, 1753, 14083, 627, 4444, 340, 791, 7327, 17283, 6247, 11, 220, 2366, 16, 14398, 11, 4756, 555, 279, 7327, 6247, 9251, 3573, 2366, 16, 720, 34746, 17283, 6247, 909, 374, 18306, 323, 32762, 555, 5905, 1139, 420, 3857, 449, 279, 720, 451, 1169, 919, 304, 3804,

TypeError: cannot unpack non-iterable NoneType object

In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from llama_index.embeddings import LangchainEmbedding

# def get_llm(model_temperature):
#     os.environ["OPENAI_API_KEY"] = api_key
#     return ChatOpenAI(temperature=model_temperature, model_name="gpt-3.5-turbo")


# llm = get_llm(model_temperature=0)

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
llm_predictor_chat = LLMPredictor(llm=llm)
embed_model = LangchainEmbedding(OpenAIEmbeddings())

service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_chat,
    embed_model=embed_model,
    chunk_size=1024
)

storage_context = StorageContext.from_defaults(persist_dir="./_property_index_storage")

DEBUG:llama_index.storage.kvstore.simple_kvstore:Loading llama_index.storage.kvstore.simple_kvstore from ./_property_index_storage\docstore.json.
Loading llama_index.storage.kvstore.simple_kvstore from ./_property_index_storage\docstore.json.
Loading llama_index.storage.kvstore.simple_kvstore from ./_property_index_storage\docstore.json.
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/junk-drawer/_property_index_storage/docstore.json
open file: c:/Users/pdoub/Desktop/python_projects/junk-drawer/_property_index_storage/docstore.json
open file: c:/Users/pdoub/Desktop/python_projects/junk-drawer/_property_index_storage/docstore.json
DEBUG:llama_index.storage.kvstore.simple_kvstore:Loading llama_index.storage.kvstore.simple_kvstore from ./_property_index_storage\index_store.json.
Loading llama_index.storage.kvstore.simple_kvstore from ./_property_index_storage\index_store.json.
Loading llama_index.storage.kvstore.simple_kvstore from ./_property_index_storage\index_stor

In [16]:
bldg_index = load_index_from_storage(
    service_context=service_context,
    storage_context=storage_context,
    index_id='property_collection'
)

INFO:llama_index.indices.loading:Loading indices with ids: ['property_collection']
Loading indices with ids: ['property_collection']
Loading indices with ids: ['property_collection']


In [17]:
test_qe = bldg_index.as_query_engine()

In [18]:
res = test_qe.query("What are the regulations on roofing related repairs?")

DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
DEBUG:openai:api_version=None data='{"input": ["What are the regulations on roofing related repairs?"], "encoding_format": "base64"}' message='Post details'
api_version=None data='{"input": ["What are the regulations on roofing related repairs?"], "encoding_format": "base64"}' message='Post details'
api_version=None data='{"input": ["What are the regulations on roofing related repairs?"], "encoding_format": "base64"}' message='Post details'
DEBUG:urllib3.util.retry:Converted retries value: 2 -> Retry(total=2, connect=None, read=None, redirect=None, status=None)
Converted retries value: 2 -> Retry(total=2, connect=None, read=None

In [19]:
print(res)

Based on the given context information, the regulations on roofing related repairs are as follows:

- Asphalt shingles can be replaced with new asphalt shingles.
- Replacement of any roof covering that does not adversely affect the roof structure is allowed.
- Gypsum board repair that does not exceed 128 square feet is permitted, as long as it is not part of a fire resistance rated construction assembly, a shear-wall assembly, or a tub and shower surround.
- Emergency removal of water damaged material, such as gypsum board, insulation, wood paneling, etc., is allowed to avoid health hazard issues, but a permit is required for the repairs.


In [23]:
print(res.source_nodes[1].node.metadata)

{'document': 'International_Building_Code', 'pages': '5-5', 'file_name': 'building_codes'}


In [1]:
import os
import spacy
from typing import List, Tuple
from pathlib import Path
import pypdf
from llama_index.schema import Document, TextNode, NodeRelationship, RelatedNodeInfo

nlp = spacy.load('en_core_web_lg')

def get_file_name(path: Path) -> str:
    """
    Given a file path, it extracts and returns the file name without extension.

    Args:
        path (Path): The file path.

    Returns:
        str: The file name without extension.
    """
    return os.path.splitext(os.path.basename(path))[0]

def create_relationships(nodes: List) -> List:
    """
    Given a list of nodes, it creates relationships between each node and its previous one.

    Args:
        nodes (List): The list of nodes.

    Returns:
        List: The list of nodes with relationships.
    """
    for i, node in enumerate(nodes[1:], start=1):
        previous_node = nodes[i - 1]
        node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(node_id=previous_node.id_)
        previous_node.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(node_id=node.id_)
    return nodes

def create_text_node(split: str, chunk_chars: int, pages: List[str], file_name: str, docname: str, split_number: int) -> Tuple:
    """
    Creates a TextNode object with specific attributes extracted from a PDF file.

    Args:
        split (str): Text split from a document.
        chunk_chars (int): Maximum number of characters a text split can contain.
        pages (List[str]): List of page numbers containing the text split.
        file_name (str): Name of the PDF file.
        docname (str): Name of the document the text split belongs to.
        split_number (int): Number denoting the sequence of the text split in the document.

    Returns:
        tuple: A tuple containing a TextNode object and any remaining text after performing the split.
    """
    split = nlp(split)
    while len(split.text) > chunk_chars:
        sentences = list(split.sents)
        i = 0
        length = 0
        while length < chunk_chars and i < len(sentences):
            length += len(sentences[i])
            i += 1
        new_split = ' '.join([str(s) for s in sentences[:i]])
        pg = "-".join([pages[0], pages[-1]])
        return TextNode(
            text=new_split,
            id_=f"{file_name}_Page_{pages[0]}_Split_{split_number}",
            metadata={"document": f"{docname}", "pages": f"{pg}", "file_name": f"{file_name}"}
        ), split.text[len(new_split):]

def parse_pdf(path: Path, docname: str, chunk_chars: int, overlap: int) -> Tuple[List, Document]:
    """
    Parses a PDF file into smaller text splits(chunks) and creates a list of TextNode objects from these splits.

    Args:
        path (Path): The path to the PDF file.
        docname (str): The name of the document in the PDF file.
        chunk_chars (int): The number of characters each text split can contain.
        overlap (int): Determines if there will be an overlap of text between adjacent splits.

    Returns:
        tuple: A tuple containing a list of TextNode objects and a Document object representing the entire document.
    """   
    pdfFileObj = open(path, "rb")
    file_name = get_file_name(path)
    pdfReader = pypdf.PdfReader(pdfFileObj)
    split = ""
    page_docs: List[str] = []
    pages: List[str] = []
    nodes: List[TextNode] = []
    split_number = 1
    for i, page in enumerate(pdfReader.pages):
        page_doc = page.extract_text()
        page_docs.append(page_doc)
        split += page.extract_text()
        pages.append(str(i + 1))
        while len(split) > chunk_chars:
            node, split = create_text_node(split, chunk_chars, pages, file_name, docname, split_number)
            nodes.append(node)
            pages = [str(i + 1)]
            split_number += 1
    if len(split) > overlap:
        node, _ = create_text_node(split, chunk_chars, pages, file_name, docname, split_number)
        nodes.append(node)
    nodes = create_relationships(nodes)
    all_pages = "".join(page_docs)
    docs = Document(
        text=all_pages,
        id_=f"{file_name}",
        metadata={"document": f"{docname}"}
    )
    pdfFileObj.close()

    return nodes, docs



In [4]:
nodes, docs = parse_pdf(
    path="./data/HO3_sample.pdf", docname="HO3_Policy", chunk_chars=1024, overlap=0)

In [17]:
print(nodes[2].id_)

HO3_sample_Page_2_Split_3


In [50]:
print(nodes[2].metadata)

{'document': 'HO3_Policy', 'pages': '2-3', 'file_name': 'HO3_sample'}


In [8]:
docs

Document(id_='HO3_sample', embedding=None, metadata={'document': 'HO3_Policy'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='66da5be1c582f4aa68ec00901bbfbd2167d3dd8946ef688f04dc7a61da5e692c', text='HOMEOWNERS\nHO 00 03 10 00\nHO 00 03 10 00 Copyright, Insurance Services Office, Inc., 1999 Page 1 of 22HOMEOWNERS 3 – SPECIAL FORM\nAGREEMENT\nWe will provide the insurance described in this policy\nin return for the premium and compliance with allapplicable provisions of this policy.\nDEFINITIONS\nA.In this policy, "you" and "your" refer to the "named\ninsured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this in-surance.\nB.In addition, certain words and phrases are definedas follows:\n1."Aircraft Liability", "Hovercraft Liability", "Motor\nVehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the\nfollowing:\na.Liability for "bo

In [19]:
from llama_index import VectorStoreIndex, StorageContext, SimpleDirectoryReader
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser

In [27]:
documents = SimpleDirectoryReader(input_files=['./data/HO3_sample.pdf']).load_data()

In [44]:
from llama_index.langchain_helpers.text_splitter import SentenceSplitter

text_splitter = SentenceSplitter(
  chunk_size=512,
  chunk_overlap=128,
)

node_parser = SimpleNodeParser(text_splitter=text_splitter)

In [45]:
nodes_ = node_parser.get_nodes_from_documents(documents, show_progress=True)

Parsing documents into nodes: 100%|██████████| 22/22 [00:00<00:00, 338.31it/s]


In [46]:
len(nodes_)

65

In [49]:
print(nodes_[0].text)

HOMEOWNERS
HO 00 03 10 00
HO 00 03 10 00 Copyright, Insurance Services Office, Inc. 1999 Page 1 of 22HOMEOWNERS 3 – SPECIAL FORM
AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A.In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this in-surance.
B.In addition, certain words and phrases are definedas follows:
1."Aircraft Liability", "Hovercraft Liability", "Motor
Vehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the
following:
a.Liability for "bodily injury" or "property dam-age" arising out of the:
(1)Ownership of such vehicle or craft by an"insured";
(2)Maintenance, occupancy, operation,use, loading or unloading of such vehi-cle or craft by any person;
(3)Entrustment of such vehicle or craft by

In [48]:
print(nodes_[1].text)

For the purpose of this definition:
(1)Aircraft means any contrivance used ordesigned for flight except model orhobby aircraft not used or designed tocarry people or cargo;
(2)Hovercraft means a self-propelled mo-torized ground effect vehicle and in-cludes, but is not limited to, flarecraftand air cushion vehicles;
(3)Watercraft means a craft principallydesigned to be propelled on or in waterby wind, engine power or electric motor;and
(4)Motor vehicle means a "motor vehicle"as defined in 7. below.2."Bodily injury" means bodily harm, sickness ordisease, including required care, loss of serv-ices and death that results.
3."Business" means:
a.A trade, profession or occupation engagedin on a full-time, part-time or occasional ba-sis; or
b.Any other activity engaged in for money orother compensation, except the following:
(1)One or more activities, not described in(2) through (4) below, for which no "in-
sured" receives more than $2,000 intotal compensation for the 12 monthsbefore the begin