In [1]:
import os
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from InstructorEmbedding import INSTRUCTOR
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader, UnstructuredExcelLoader, UnstructuredCSVLoader, BSHTMLLoader
from langchain.document_loaders import UnstructuredPDFLoader



  from tqdm.autonotebook import trange


In [2]:
load_dotenv("test.env") 

True

In [3]:
# Initialize LLM (Azure OpenAI)
llm = AzureChatOpenAI(
    azure_deployment=os.getenv("MODEL"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    verbose=False,
    temperature=0.3,
)

In [4]:
# Check LLM is working
response = llm.invoke("Tell me a motivational quote")
print(response)


content='"Success is not final, failure is not fatal: It is the courage to continue that counts." — Winston Churchill' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 12, 'total_tokens': 36, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_ded0d14823', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual'

In [5]:
# Step 1: Load Documents from Multiple Folders
def load_documents(directory_path):
    # Define document types and their corresponding loaders
    document_types = {
        "**/*.pdf": UnstructuredPDFLoader,
        "**/*.html": BSHTMLLoader,
        "**/*.xlsx": UnstructuredExcelLoader,
        "**/*.csv": UnstructuredCSVLoader
    }
    
    docs = []
    for glob, loader_cls in document_types.items():
        loader = DirectoryLoader(
            directory_path,
            glob=glob,
            show_progress=True,
            use_multithreading=True,
            silent_errors=True,
            recursive=True,
            loader_cls=loader_cls
        )
        docs.extend(loader.load())
    
    return docs

In [6]:
# Load both extracted files and extracted HTML files
docs1 = load_documents("extracted_files")
docs2 = load_documents("extracted_html_files")

  0%|          | 0/184 [00:00<?, ?it/s]CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
  1%|          | 1/184 [00:03<11:24,  3.74s/it]CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to Media

In [7]:
# Combine all loaded documents
all_documents = docs1 + docs2

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
split_documents = text_splitter.split_documents(all_documents)

In [16]:
# Initialize Instructor Embedding model
instructor_model = INSTRUCTOR('hkunlp/instructor-base')


load INSTRUCTOR_Transformer
max_seq_length  512


In [23]:
def create_embeddings(text_chunks):
    # Ensure each input follows the INSTRUCTOR format
    sample_data = [("Represent this document for retrieval", text.strip()) for text in text_chunks]


    # Debugging: Print first few samples
    print("Sample Input to INSTRUCTOR:", sample_data[:3])

    # Encode with Instructor Model (try batch_size=8 to avoid memory issues)
    embeddings = instructor_model.encode(sample_data, batch_size=8, show_progress_bar=True)
  

    return embeddings.tolist()


In [24]:
text_chunks = [str(doc.page_content) for doc in split_documents if doc.page_content]

# Debugging: Print first 3 text chunks before encoding
print("First 3 text chunks:", text_chunks[:3])

# Generate embeddings
embeddings = create_embeddings(text_chunks)


First 3 text chunks: ['R3 Security and Resilience Policy\n\nThe nature and breadth of our business means we are vulnerable to situations that can potentially impact the wellbeing of our people, disrupt our business and threaten the attainment of our strategic objectives.\n\nWe are committed to applying our R3 security and resilience management system to ensure we are adequately prepared to mitigate and manage the impact of any critical incident situation or R3 event.\n\nIn pursuing this commitment, we will:\n\nComply with applicable laws, regulations and governance standards in all areas\n\nwhere we operate\n\nApply risk management with an emphasis on preventative and preparatory actions,\n\nto effectively identify, monitor and manage our areas of exposure\n\nDevelop and maintain risk-appropriate security, response and recovery plans and\n\nsupport resources\n\nProvide regular training for our people responsible for managing declared R3 events\n\nAppoint authorized spokespersons in lin

Batches:   0%|          | 0/1593 [00:00<?, ?it/s]

ValueError: not support other modes

In [None]:
# Convert documents into text chunks for embedding
text_chunks = [str(doc.page_content) for doc in split_documents if doc.page_content]
embeddings = create_embeddings(text_chunks)


In [None]:
# Convert text chunks into LangChain Document format
documents = [Document(page_content=text) for text in text_chunks]

In [None]:
# Store embeddings in FAISS
db = FAISS.from_embeddings(embeddings, documents)

In [None]:
# Save FAISS database
db.save_local("faiss-db")

In [None]:
# Load vector store
new_vectorstore = FAISS.load_local("faiss-db", allow_dangerous_deserialization=True)


In [None]:
# Retrieval-Augmented Generation (RAG) setup
system_prompt = (
    "Use the given context to answer the question. "
    "If context is different from the question, say you don't know. "
    "Use three sentences maximum and keep the answer concise. "
    "Context: {context}"
)

In [None]:
retrieval_qa_chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])


In [None]:
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(new_vectorstore.as_retriever(), combine_docs_chain)


In [None]:
# Example Query
query = "What is RAG?"
response = retrieval_chain.invoke({"input": query})
print(response["answer"])