In [1]:
from langchain_community.document_loaders import UnstructuredXMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import fnmatch
import os

load_dotenv('../../.env')

path = '/home/dev/master-thesis/data/bygning/bygningspunkt_docs.xsd'
# path = '/home/dev/master-thesis/data/elveg/elveg_docs.xsd'
# path = '/home/dev/master-thesis/data/AR50/ar50_docs.xsd'
# path = '/home/dev/master-thesis/data/adresse/adresse_docs.xsd'
# path = '/home/dev/master-thesis/data/kvikkleire/kvikkleire_docs.xsd'

# loader = UnstructuredXMLLoader(path, mode='elements', xml_keep_tags=True)
# docs = loader.load()

def find_files(root_folder, extensions):
    matches = []
    for root, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if any(fnmatch.fnmatch(filename, f'*.{ext}') for ext in extensions):
                matches.append(os.path.join(root, filename))
    return matches


def get_docs_paths():
    root_folder = '/home/dev/master-thesis/data'
    docs_extensions = ['xsd']
    return find_files(root_folder, docs_extensions)

paths = get_docs_paths()
docs = []
for path in paths:
    loader = UnstructuredXMLLoader(path, mode='elements', xml_keep_tags=True)
    docs.append(*loader.load())


text_splitter = RecursiveCharacterTextSplitter(
    # chunk_size=5000,
    # chunk_overlap=500,
    # length_function=len,
    is_separator_regex=False,
    separators=[
        '<simpleType',
        '</simpleType>',
        '<complexType',
        '</complexType>',
        '<element'
        '</element>'
    ],
    keep_separator=True,
)

docs_splitted = text_splitter.split_documents(docs)

print(docs_splitted[0].page_content)

<?xml version="1.0" encoding="UTF-8"?><schema xmlns="http://www.w3.org/2001/XMLSchema" xmlns:app="http://skjema.geonorge.no/SOSI/produktspesifikasjon/Matrikkelen-Bygningspunkt/20211101" xmlns:gml="http://www.opengis.net/gml/3.2" xmlns:sc="http://www.interactive-instruments.de/ShapeChange/AppInfo" elementFormDefault="qualified" targetNamespace="http://skjema.geonorge.no/SOSI/produktspesifikasjon/Matrikkelen-Bygningspunkt/20211101" version="20211101">
  <annotation>
    <documentation>Bygningspunkt registrert i matrikkelen med bygningsnummer, bygningens nåværende (tiltaks-)status, bygningstype.

Inneholder ikke registrerte bygningsendringer (påbygg o l) eller historikken/datoer for tidligere bygningsstatuser.

Bygg med bygningstatusverdiene under er ikke med i produktet :
BR (Bygning er revet eller brent)
BA (Bygging avlyst)
BF (Bygning er flyttet)
BU (Bygningsnummer er utgått)</documentation>
    <appinfo>
      <taggedValue xmlns="http://www.interactive-instruments.de/ShapeChange/AppIn

In [2]:


path = '/home/dev/master-thesis/data/bygning/bygningspunkt_docs.xsd'

In [3]:
import xml.etree.ElementTree as ET


def split_xml_by_tags(xml_content, tags):
    # Parse the XML content
    root = ET.fromstring(xml_content)

    # Dictionary to store results
    split_contents = {tag: [] for tag in tags}

    # Recursively search for the tags and extract their content
    def search_tree(element):
        if element.tag in tags:
            split_contents[element.tag].append(
                ET.tostring(element, encoding='unicode'))
        for child in element:
            search_tree(child)

    search_tree(root)

    return split_contents


tags_to_split = ['{http://www.w3.org/2001/XMLSchema}complexType',
                 '{http://www.w3.org/2001/XMLSchema}element',
                 '{http://www.w3.org/2001/XMLSchema}simpleType']




In [4]:
print(len(docs_splitted))

print(docs_splitted[4].page_content)

241
<complexType name="ByggPropertyType">
    <sequence minOccurs="0">
      <element ref="app:Bygg"/>
    </sequence>
    <attributeGroup ref="gml:AssociationAttributeGroup"/>
    <attributeGroup ref="gml:OwnershipAttributeGroup"/>
  </complexType>
  <element name="Bygning" substitutionGroup="app:Bygg" type="app:BygningType">
    <annotation>
      <documentation>Bygning er matrikkelens representasjon av en planlagt, under oppføring, fullført eller av en eller annen grunn utgått bygning. Alle bygninger oppført etter 1983 er registrert. I enkelte kommuner har man registrert samtlige bygninger.

En bygning er identifisert med sitt bygningsnummer som er unikt på tvers av kommuner. Enhver bygning har en bygningsstatus som angir tilstand ved siste registrering. Det lagres også historikk for dette.

En bygning har en eller flere bruksenheter som beskriver logiske enheter ved bygningen (leiligheter, lokaler). Hver av disse kan ha en egen knytning til adresse og matrikkelenhet.
For bygninger 

In [5]:
docs_splitted[0].metadata

{'source': '/home/dev/master-thesis/data/bygning/bygningspunkt_docs.xsd',
 'file_directory': '/home/dev/master-thesis/data/bygning',
 'filename': 'bygningspunkt_docs.xsd',
 'last_modified': '2024-03-18T14:55:00',
 'languages': ['nor', 'dan'],
 'filetype': 'application/xml',
 'category': 'UncategorizedText'}

In [6]:
# import xml.etree.ElementTree as ET
# from xml.dom.minidom import parseString
# from langchain_core.documents import Document


# tree = ET.parse(path)
# root = tree.getroot()

# # Iterate through each child of the <schema> element
# docs = [Document(page_content=parseString(ET.tostring(child)
#                                           ).toprettyxml(newl="")) for child in root]

# docs

In [7]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings


faiss_index = FAISS.from_documents(
    docs_splitted, 
    OpenAIEmbeddings(show_progress_bar=True, chunk_size=3)
)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 84/84 [00:43<00:00,  1.92it/s]


In [8]:
# for r in res: 
#     print(f'{r.page_content}\n\n{"-" * 100}\n\n')

In [9]:
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_community.vectorstores.redis import RedisVectorStoreRetriever
from langchain_community.vectorstores.faiss import FAISS

retriever: VectorStoreRetriever = faiss_index.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.3}
)

In [10]:
res = faiss_index.similarity_search('vegetation')
for r in res: 
    # print(r)
    print(r.metadata)

100%|██████████| 1/1 [00:00<00:00,  4.36it/s]

{'source': '/home/dev/master-thesis/data/AR50/ar50_docs.xsd', 'file_directory': '/home/dev/master-thesis/data/AR50', 'filename': 'ar50_docs.xsd', 'last_modified': '2024-03-18T13:59:10', 'languages': ['eng'], 'filetype': 'application/xml', 'category': 'UncategorizedText'}
{'source': '/home/dev/master-thesis/data/elveg/elveg_docs.xsd', 'file_directory': '/home/dev/master-thesis/data/elveg', 'filename': 'elveg_docs.xsd', 'last_modified': '2024-03-18T14:00:25', 'languages': ['nor', 'dan'], 'filetype': 'application/xml', 'category': 'UncategorizedText'}
{'source': '/home/dev/master-thesis/data/elveg/elveg_docs.xsd', 'file_directory': '/home/dev/master-thesis/data/elveg', 'filename': 'elveg_docs.xsd', 'last_modified': '2024-03-18T14:00:25', 'languages': ['nor', 'dan'], 'filetype': 'application/xml', 'category': 'UncategorizedText'}
{'source': '/home/dev/master-thesis/data/elveg/elveg_docs.xsd', 'file_directory': '/home/dev/master-thesis/data/elveg', 'filename': 'elveg_docs.xsd', 'last_modifi




In [11]:
import json
from operator import itemgetter
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.pydantic_v1 import BaseModel, Field
from typing import List

class ValueWithDescription(BaseModel):
    value: str = Field(..., description=(
        'Machine-readable value that the attribute accepts'))
    description: str = Field(..., description=('Description of value'))

class AttributeInfo(BaseModel):
    name: str = Field(..., description='The attribute that is queried about')
    values: List[ValueWithDescription]

class AttributeInfos(BaseModel):
    attributes: List[AttributeInfo]


template = """Provide relevant context for the question based only on the following chunks from documentation (most relevant first):
{context}

Include any machine readable values together with a description. 
If asked about an enumeration type, make sure to include the enumeration values.
Make sure that enumeration annotations are coupled with its CORRECT parent enumeration value. 

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)


llm = ChatOpenAI()

chain = (
    {
        'context': itemgetter('question') | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    # | llm.with_structured_output(AttributeInfos)
    | llm
)

llm_res = chain.invoke({
    'question': (
        'gjentaksintervall'
    )
})


print(json.dumps(llm_res.dict(), indent=4))

100%|██████████| 1/1 [00:00<00:00,  2.03it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-3.5-turbo in organization org-kcpqglgeGsaWIn9z1kdGTR9k on tokens per min (TPM): Limit 60000, Requested 69127. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
# from typing import Literal

# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.pydantic_v1 import BaseModel, Field
# from langchain_openai import ChatOpenAI

# # Data model


# class RouteQuery(BaseModel):
#     """Route a user query to the most relevant datasource."""

#     datasource: Literal["python_docs", "js_docs", "golang_docs"] = Field(
#         ...,
#         description="Given a user question choose which datasource would be most relevant for answering their question",
#     )


# # LLM with function call
# llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# structured_llm = llm.with_structured_output(RouteQuery)

# # Prompt
# system = """You are an expert at routing a user question to the appropriate data source.

# Based on the programming language the question is referring to, route it to the relevant data source."""

# prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", system),
#         ("human", "{question}"),
#     ]
# )

# # Define router
# router = prompt | structured_llm

# question = """Why doesn't the following code work:

# from langchain_core.prompts import ChatPromptTemplate

# prompt = ChatPromptTemplate.from_messages(["human", "speak in {language}"])
# prompt.invoke("french")
# """

# result = router.invoke({"question": question})