## ---------------------------- Azure ----------------------------------

In [47]:
import os
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential

# Service settings come from the environment. Variables read with
# os.environ[...] are required: a missing one raises KeyError in this cell.
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
# The admin key is optional. When AZURE_SEARCH_ADMIN_KEY is unset or empty the
# cell falls back to DefaultAzureCredential. The presence check uses .get() so
# an unset variable reaches the fallback instead of raising KeyError.
credential = (
    AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])
    if os.environ.get("AZURE_SEARCH_ADMIN_KEY")
    else DefaultAzureCredential()
)
# NOTE(review): the "Create an index" cell reassigns index_name to a hard-coded
# value, so this setting is not what the later cells end up using.
index_name = os.environ["AZURE_SEARCH_INDEX"]
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = os.environ["BLOB_CONTAINER_NAME"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
# Optional as well: unset or empty becomes None, which is what gets passed as
# api_key to the vectorizer and the embedding skill further down.
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY") or None
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]

Connect to Blob Storage and list the stored documents

In [41]:
from azure.storage.blob import BlobServiceClient

# Connect to Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(blob_container_name)
# Create the container if it does not exist yet.
if not container_client.exists():
    container_client.create_container()

# Add documents to blob storage
# NOTE: the upload loop below is disabled (commented out), so this cell only
# lists blobs that are already present in the container.
#documents_directory = os.path.join("..", "..", "data", "documents")
#for file in os.listdir(documents_directory):
#    with open(os.path.join(documents_directory, file), "rb") as data:
#        name = os.path.basename(file)
#        if not container_client.get_blob_client(name).exists():
#            container_client.upload_blob(name=name, data=data)
# Print the name of every blob currently in the container.
for name in container_client.list_blob_names():
    print(name)

CCB NS 8415 Underentreprise, avklarende kontraktsm_te, referat.pades.pdf
Sample-Contract-Agreement-Template-PDF.pdf


Create data source

In [42]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)
from azure.search.documents.indexes._generated.models import NativeBlobSoftDeleteDeletionDetectionPolicy

# Name under which the data source connection is registered; the indexer cell
# refers to it through data_source.name.
data_source_name = "pdfdocumentsdata"

# Create a data source
indexer_client = SearchIndexerClient(endpoint, credential)
# Point the data source at the blob container configured in the first cell.
container = SearchIndexerDataContainer(name=blob_container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=data_source_name,
    type="azureblob",
    connection_string=blob_connection_string,
    container=container,
    # NOTE(review): presumably requires native blob soft delete to be enabled
    # on the storage account — confirm.
    data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy()
)
# Register the connection (create-or-update, per the method name) and keep the
# returned object for the indexer cell.
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

Data source 'pdfdocumentsdata' created or updated


Create an index

In [43]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# Embedding model name; passed as model_name to the Azure OpenAI vectorizer
# below and to the embedding skill in the skillset cell.
model_name = "text-embedding-ada-002"

# Create a search index client
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Define the fields for the search index
# chunk_id is the document key. parent_id, title, chunk and vector are the
# fields the skillset's index projection writes to.
fields = [
    SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="title", type=SearchFieldDataType.String),
    SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
    SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
    # NOTE(review): 1536 must equal the output size of the embedding deployment
    # (text-embedding-ada-002 emits 1536-dimensional vectors) — re-check if the
    # deployment or model_name changes.
    SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
]

# Configure the vector search configuration
# Two algorithm configurations and two profiles are registered. The "vector"
# field above references myHnswProfile only; myExhaustiveKnnProfile is defined
# but not referenced by any field in this cell.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="myOpenAI",  # Use the vectorizer name defined below
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
            vectorizer="myOpenAI",  # Use the vectorizer name defined below
        ),
    ],
    # Vectorizer the profiles refer to by name; it uses the same Azure OpenAI
    # endpoint, deployment and key as the embedding skill.
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myOpenAI",
            kind="azureOpenAI",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=azure_openai_endpoint,
                deployment_id=azure_openai_embedding_deployment,
                api_key=azure_openai_key,
                model_name=model_name  # Include the model name
            ),
        ),
    ],
)

# Semantic configuration with the "chunk" field as its only content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="chunk")]
    ),
)

# Create the semantic search with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index
# NOTE(review): this hard-coded name replaces the index_name read from
# AZURE_SEARCH_INDEX in the configuration cell; every later cell (skillset,
# indexer, query) uses this value — confirm the override is intended.
index_name = "myprefix-index"
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)

print(f"{result.name} created")

myprefix-index created


Create a Skillset

In [44]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset
)

# Skillset: split each document into pages, embed every page, then project the
# pages into the search index as individual chunk documents.
skillset_name = f"{index_name}-skillset"

# Step 1: chunk /document/content into overlapping "pages".
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=2000,
    page_overlap_length=500,
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# Step 2: one embedding per page, stored as "vector" under that page.
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_uri=azure_openai_endpoint,
    deployment_id=azure_openai_embedding_deployment,
    api_key=azure_openai_key,
    model_name=model_name,
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

# Step 3: map each page (text, vector, source file name) onto the index fields;
# parent documents themselves are skipped.
index_projections = SearchIndexerIndexProjections(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=[split_skill, embedding_skill],
    index_projections=index_projections,
)

# Register the skillset with the search service.
client = SearchIndexerClient(endpoint, credential)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

myprefix-index-skillset created


Create an indexer

In [45]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping
)

# Indexer: connects the data source, the skillset and the target index.
indexer_name = f"{index_name}-indexer"

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name=skillset_name,
    target_index_name=index_name,
    data_source_name=data_source.name,
    # Carry the blob's file name over into the index "title" field so search
    # results can show which PDF a hit came from.
    field_mappings=[
        FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
    ],
)

indexer_client = SearchIndexerClient(endpoint, credential)
indexer_result = indexer_client.create_or_update_indexer(indexer)

# Start a run right away. The call only triggers the run, which is why the
# message below tells the reader to retry queries that come back empty.
indexer_client.run_indexer(indexer_name)
print(f" {indexer_name} is created and running. If queries return no results, please wait a bit and try again.")

 myprefix-index-indexer is created and running. If queries return no results, please wait a bit and try again.


Perform vector similarity search

In [46]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Vector-only query: no keyword text is sent, only the vectorizable query.
query = "Kan man be om mer penger for hulltaking i himling?"

search_client = SearchClient(endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="vector",
    exhaustive=True,
)
# Alternative kept for reference: send a pre-computed embedding instead of
# passing the query text.
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    top=1,
)

# Show each hit: source document, chunk key, similarity score and chunk text.
for result in results:
    print(f"parent_id: {result['parent_id']}")
    print(f"chunk_id: {result['chunk_id']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['chunk']}")

parent_id: aHR0cHM6Ly9ibG9ic3RvcmFnZW5hbWV0aW51cy5ibG9iLmNvcmUud2luZG93cy5uZXQvcGRmZG9jdW1lbnRzL0NDQiUyME5TJTIwODQxNSUyMFVuZGVyZW50cmVwcmlzZSwlMjBhdmtsYXJlbmRlJTIwa29udHJha3RzbV90ZSwlMjByZWZlcmF0LnBhZGVzLnBkZg2
chunk_id: f527fb618209_aHR0cHM6Ly9ibG9ic3RvcmFnZW5hbWV0aW51cy5ibG9iLmNvcmUud2luZG93cy5uZXQvcGRmZG9jdW1lbnRzL0NDQiUyME5TJTIwODQxNSUyMFVuZGVyZW50cmVwcmlzZSwlMjBhdmtsYXJlbmRlJTIwa29udHJha3RzbV90ZSwlMjByZWZlcmF0LnBhZGVzLnBkZg2_pages_6
Score: 0.83721566
Content: daglig.  
Støvsuging utføres av UE ved behov – minimum ukentlig i soner der UE har arbeider.  
 

3.15.  Avfallsmengde:  
Prosjektet har et overordnet mål om en avfallsmengde på 18 kg/m2. Dette er et 
ambisiøst mål og det er derfor viktig at riktige tiltak treffes. UE bes synliggjøre hvilke 
tiltak som UE kan utføre i denne sammenheng.  
 
OBI: Returordning for himlingsplater fra Rockfon kan organiseres, men dette vil ha en 
kostnad som BH må dekke. 
 

3.16.  Akser, stikking, måltaking:  
UE benytter fasadeprofiler og påfori

## -----------------------------LangChain----------------------------------


In [38]:
# Both API keys must already be set in the environment. Re-assigning
# os.environ[X] = os.environ.get(X) does nothing when X is set and fails with
# an opaque "str expected, not NoneType" TypeError when it is not, so check
# explicitly and name the missing variables instead.
_missing_keys = [
    name for name in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY") if name not in os.environ
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# LangSmith tracing settings for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "RAG-test"

In [39]:
from langchain_openai import ChatOpenAI, OpenAI
# Chat model used as the answering step of the final RAG chain.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    api_key=os.environ.get("OPENAI_API_KEY"),
    temperature=0,
)

KeyboardInterrupt: 

In [None]:
import bs4
from langchain import hub
from langchain_chroma import Chroma

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings

from langchain.prompts import ChatPromptTemplate

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def vectorstore_from_pdf(pdf_file, chunk_size=1000, chunk_overlap=200, embedding_model="text-embedding-3-small"):
    """Build a Chroma vector store from the text of a PDF file.

    Args:
        pdf_file: Path of the PDF to load.
        chunk_size: Maximum chunk size handed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between consecutive chunks.
        embedding_model: Name of the OpenAI embedding model used to embed chunks.

    Returns:
        The Chroma vector store created from the chunked documents.
    """
    # Use load() rather than load_and_split(): the latter first runs the
    # loader's own default text splitter, so the text would be split twice and
    # chunk_size / chunk_overlap would not be the only settings in effect.
    pages = PyPDFLoader(pdf_file).load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    splits = text_splitter.split_documents(pages)
    return Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(model=embedding_model))

# Index the contract-meeting minutes with small, overlapping chunks and expose
# the store as a retriever for the chains below.
vectorstore = vectorstore_from_pdf(
    "CCB NS 8415 Underentreprise, avklarende kontraktsm_te, referat.pades.pdf",
    chunk_size=250,
    chunk_overlap=100,
)
retriever = vectorstore.as_retriever()

In [None]:
# RAG-Fusion: ask the model for several search queries derived from one input
# question, one query per output line.
template = """ 
Du er en hjelpsom assistent som genererer flere søkeforespørsler basert på en enkelt inngangsforespørsel. \n
Generer flere søkeforespørsler relatert til: {question} \n
Output (3 søkeforespørsler):
"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)


def _split_queries(text):
    """Split the model output into one query per line, dropping blank lines.

    Blank lines would otherwise be handed to the retriever as empty search
    queries.
    """
    return [line for line in text.split("\n") if line.strip()]


generate_queries = (
    prompt_rag_fusion
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Args:
        results: One list of documents per query; each list is assumed to be
            ordered from most to least relevant.
        k: Constant added to the rank in the RRF formula ``1 / (rank + k)``.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending fused
        score. A document that appears in several lists accumulates one
        contribution per appearance.
    """
    # Fused score per unique document, keyed by the document's serialized form
    # (documents must therefore be serializable with dumps()).
    fused_scores = {}

    for docs in results:
        # rank comes from enumerate() and is 0-based, so the first document of
        # each list contributes 1 / k.
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    # Highest fused score first; deserialize the keys back into documents.
    ranked = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ranked]

# Retrieval pipeline: generate several queries, run the retriever once per
# query (retriever.map()), then fuse the per-query result lists into a single
# scored ranking.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion

In [None]:
# RAG
from operator import itemgetter
# Answer prompt (Norwegian): instructs the model to judge whether an incoming
# payment claim is valid under the contract documents. {context} receives the
# fused retrieval results and {question} the claim text.
template = """
Du er en assistent som skal sjekke om den 
innkommende etterspørselen fra et selskap med 
krav om betaling er gyldig i henhold til relevante kontraktsdokumenter, 
eller om arbeidet egentlig er inkludert i prising gitt i kontrakten. 
Begrunn hvorfor/hvorfor ikke den innkommende etterspørselen er 
gyldig i henhold til kontraktsdokumentene. 
Dersom kontraktsdokumentet nevner at "prisen er inkludert" betyr dette at selskapet ikke kan kreve mer penger på denne posten.
Begrens svaret til 100 ord.: {context}

Question: {question}
"""

# Example claim to evaluate: title, description, proposed solution,
# compensation basis and a tab-separated cost table.
question = """
Tittel
El.kanal gjennom himling

Beskrivelse av forholdet
Det skal monteres el-kanal gjennom himlingene

Utredning og forslag til løsning
Kappet vegglister for el-kanalene i bygg F 5-6-7-8 etg

Vederlagsjustering etter
Regningsarbeid

Kostnadsoversikt
Postnr.	Beskrivelse	Firma	Enhet	Mengde	Enhetspris	Påslag	Totalbeløp	
1	arbeid	OBI	timer	8	660	-	5 280	
Totalsum alle kostnader	5 280
"""

# Build the answer prompt from the template defined above.
prompt = ChatPromptTemplate.from_template(template)

# The input dict goes to both branches: the retrieval chain receives the whole
# dict (its own prompt reads the "question" key), while itemgetter forwards the
# question text unchanged. Both results fill the answer prompt, which is sent
# to llm and parsed to a plain string.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

# Last expression of the cell, so the notebook displays the returned answer.
final_rag_chain.invoke({"question":question})

'Den innkommende etterspørselen om å montere el-kanal gjennom himlingene er ikke gyldig i henhold til kontraktsdokumentene. Kontrakten nevner at "Hulltaking i systemvegger for el ektrokanal langs fasade er inkludert", noe som indikerer at arbeidet med el-kanalene allerede er dekket i prisen gitt i kontrakten. Derfor kan selskapet ikke kreve ekstra betaling for dette arbeidet.'