In [2]:
import os
from typing import List

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
from langchain_community.document_loaders import UnstructuredFileLoader

In [3]:
# Policy documents of interest: one PDF and one Word file.
files = [
    "Employment Policy - WB (Updated).pdf",
    "ATTENDANCE POLICY.docx",
]

In [6]:
# Point an UnstructuredFileLoader at the PIP form spreadsheet
# (the path is relative to the notebook's working directory).
loader = UnstructuredFileLoader("Performance Improvement Plan (form).xlsx")

In [7]:
# Parse the file into a list of LangChain Document objects.
docs = loader.load()

In [13]:
# Preview at most the first five loaded documents.
docs[:5]

[Document(page_content="PERFORMANCE IMPROVEMENT PLAN (PIP)\n\n\n\n\nName\n\nDOJ\n\nEmp ID\nPIP Period\nLine Manager\n\n\n\n\nFrom:\nTo:\n\n\n\n\nCurrent Designation\n\n\n\nDepartment\n\nEvaluation\\n(Tick the relevent one)\n\n\n\n\n\n\n\n\n\n\n\n\n\n1st                         2nd\n\n\n\n\n\n\n\nTarget\n\n\n\nPerformance Measures\nEmployee's  input\nLine Manager's Input\nDoes not Meet\n-\nMeet\n-\nExceed\n\n\nThis section should note the current performance issues that are to be addressed.\n\n\n\nWrite specifics like date/time, cost, \\nquality, % change etc.\nSummarize in qualitative terms, the extent objectives were met and the level of Measure Criteria mentioned achieved\n\n●\n●\n●\n●\n●\n\n\n\n\nObservation / Comments\n\nPlease Note : Unless improvement is made and sustained, termination of employment will occur\n\n\n\n\nScale Value\nObjectives Rating Scale\n\n\n\n\n\n\n\n\n1\nDoes Not Meet:\nResults did not meet expectations regarding this objective\n\n\n\n\n\n\n\n2\n-\nUse this a

In [10]:
# Display every loaded document (the output can be very long).
docs

[Document(page_content="PERFORMANCE IMPROVEMENT PLAN (PIP)\n\n\n\n\nName\n\nDOJ\n\nEmp ID\nPIP Period\nLine Manager\n\n\n\n\nFrom:\nTo:\n\n\n\n\nCurrent Designation\n\n\n\nDepartment\n\nEvaluation\\n(Tick the relevent one)\n\n\n\n\n\n\n\n\n\n\n\n\n\n1st                         2nd\n\n\n\n\n\n\n\nTarget\n\n\n\nPerformance Measures\nEmployee's  input\nLine Manager's Input\nDoes not Meet\n-\nMeet\n-\nExceed\n\n\nThis section should note the current performance issues that are to be addressed.\n\n\n\nWrite specifics like date/time, cost, \\nquality, % change etc.\nSummarize in qualitative terms, the extent objectives were met and the level of Measure Criteria mentioned achieved\n\n●\n●\n●\n●\n●\n\n\n\n\nObservation / Comments\n\nPlease Note : Unless improvement is made and sustained, termination of employment will occur\n\n\n\n\nScale Value\nObjectives Rating Scale\n\n\n\n\n\n\n\n\n1\nDoes Not Meet:\nResults did not meet expectations regarding this objective\n\n\n\n\n\n\n\n2\n-\nUse this a

In [9]:
# Number of Document objects the loader produced.
len(docs)

1

In [None]:
def read_unstructured_data(directory: str) -> str:
    """
    Load every file in `directory` and return the combined text as one string.

    Each regular file directly inside `directory` is parsed with
    `UnstructuredFileLoader`. The `page_content` of every resulting document
    is collected and the pieces are joined with a newline.

    :param directory: Path of the folder whose files should be read.
    :return: The concatenated text of all files ("" if nothing was loaded).
    """
    # Use a progress bar when tqdm is available, a plain loop otherwise.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    text = []
    # Sorted so the concatenation order does not depend on the filesystem.
    for file in progress(sorted(os.listdir(directory))):
        file_path = os.path.join(directory, file)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be handed to the file loader.
            continue
        docs = UnstructuredFileLoader(file_path).load()
        # load() returns a list of Documents; keep only their text so that
        # str.join() below receives strings rather than lists of objects.
        text.extend(doc.page_content for doc in docs)

    return '\n'.join(text)

In [30]:
# Chunk the loaded documents with chunk_size=1000 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# NOTE(review): the recorded `splits` output comes from 'ATTENDANCE POLICY.docx', but the only
# loader visible in this notebook reads the PIP .xlsx file. `docs` was presumably reassigned in a
# cell that is no longer present; confirm before relying on Restart & Run All.
splits = text_splitter.split_documents(docs)

In [None]:
def recursive_character_text_splitter(text: str, chunk_size: int = 1000, chunk_overlap: int = 0) -> List[str]:
    """
    Split the text into chunks of size `chunk_size` with overlap `chunk_overlap`.

    :param text: The raw text to split. For backward compatibility a sequence
        of LangChain documents is also accepted and split document by document.
    :param chunk_size: Chunk size passed to `RecursiveCharacterTextSplitter`.
    :param chunk_overlap: Overlap passed to `RecursiveCharacterTextSplitter`.
    :return: The chunks: strings for string input, documents for document input.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    if isinstance(text, str):
        # split_documents() expects Document objects and fails on a plain
        # string; split_text() is the string-in / strings-out API.
        return text_splitter.split_text(text)
    return text_splitter.split_documents(text)

In [31]:
# Inspect the generated chunks (displays the whole list).
splits

[Document(page_content='ATTENDANCE POLICY\n\nPolicy:\n\nIt is company policy that all employees should report to their work at the right time so that they have right output at the end of the day. The company considers it important and encourages compliance of the policy.\n\nPurpose:\n\nTo streamline and regulate the attendance of all the employees in the Company.\n\nScope:\n\nThis policy applies to all employees of the company working at Head Office.\n\nProcedure:\n\nEach employee has a RFID (Radio Frequency Identification) card containing the necessary details of the employee required for attendance & identification.\n\nAt office premises, employees need to have the employee card if you don’t have one please get it from HR.\n\nThe card will be the company’s property with the employee and in case of separation from the company, the card must be returned to HRD. Failure to do so, an amount of Rs.500/- will be deducted from final settlement.', metadata={'source': 'ATTENDANCE POLICY.docx'

In [32]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# The Hugging Face token is read from the environment so it never lands in the
# notebook or its version history. Set HUGGINGFACEHUB_API_TOKEN before starting
# the kernel; a missing variable raises KeyError naming it.
# SECURITY: the token that was previously hardcoded here has been exposed and
# should be revoked.
embeddings_model = HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    model_name="sentence-transformers/all-mpnet-base-v1",
)

In [59]:
# Confirm which embeddings class was instantiated.
type(embeddings_model)

langchain_community.embeddings.huggingface.HuggingFaceInferenceAPIEmbeddings

In [33]:
from langchain_community.vectorstores import Chroma

# Embed the chunks and index them in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings_model,
)

# Expose the store through the retriever interface consumed by the RAG chain.
retriever = vectorstore.as_retriever()

In [None]:
def create_vectorstore(documents: List[str], embedding):
    """
    Build a Chroma vector store from text chunks or LangChain documents.

    :param documents: The items to index. Plain strings (as the annotation
        promises) are indexed with `Chroma.from_texts`; any other input is
        forwarded to `Chroma.from_documents` as before.
    :param embedding: The embedding model handed to Chroma.
    :return: The Chroma vector store built from `documents`.
    """
    documents = list(documents)
    if documents and all(isinstance(item, str) for item in documents):
        # from_documents() needs Document objects; plain strings must go
        # through from_texts() instead.
        return Chroma.from_texts(texts=documents, embedding=embedding)
    return Chroma.from_documents(documents=documents, embedding=embedding)

def get_vectorstore_retriever(vectorstore):
    """
    Return the retriever view of a vector store.

    :param vectorstore: Any object exposing an `as_retriever()` method.
    :return: Whatever `vectorstore.as_retriever()` returns.
    """
    retriever = vectorstore.as_retriever()
    return retriever

In [34]:
# Pull the "rlm/rag-prompt" template from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")

In [35]:
# Show the pulled prompt template and its input variables.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [36]:
def format_docs(docs):
    """
    Merge the text of several documents into a single context string.

    :param docs: Iterable of objects exposing a `page_content` attribute.
    :return: The page contents separated by a blank line ("" for no documents).
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

In [38]:
import boto3
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms.bedrock import Bedrock

In [43]:
# Bedrock runtime client for the us-east-1 region; no credentials are passed explicitly here.
bedrock = boto3.client(service_name="bedrock-runtime", region_name="us-east-1")

In [None]:
def initialize_boto3_client(service_name: str = "bedrock-runtime", region_name: str = "us-east-1"):
    """
    Create a boto3 client for the given service and region.

    :param service_name: Name of the AWS service to connect to.
    :param region_name: AWS region the client should target.
    :return: The client returned by `boto3.client`.
    """
    client_options = {"service_name": service_name, "region_name": region_name}
    return boto3.client(**client_options)

In [44]:
def create_bedrock_instance(client, model_id: str = "meta.llama2-13b-chat-v1", streaming: bool = True, max_gen_len: int = 512, temperature: float = 0.2, top_p: float = 0.9):
    """
    Build a Bedrock LLM wrapper configured with the given generation settings.

    A `StreamingStdOutCallbackHandler` is always attached as the only callback.

    :param client: The bedrock client instance.
    :param model_id: Identifier of the Bedrock model to call.
    :param streaming: Whether streaming is enabled on the Bedrock wrapper.
    :param max_gen_len: Forwarded to the model as `max_gen_len`.
    :param temperature: Forwarded to the model as `temperature`.
    :param top_p: Forwarded to the model as `top_p`.
    :return: A Bedrock instance.
    """
    stdout_streamer = StreamingStdOutCallbackHandler()
    generation_settings = {
        'max_gen_len': max_gen_len,
        "temperature": temperature,
        "top_p": top_p,
    }
    return Bedrock(
        model_id=model_id,
        client=client,
        streaming=streaming,
        callbacks=[stdout_streamer],
        model_kwargs=generation_settings,
    )


In [46]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [47]:
# `llm` is not defined by any other cell in this notebook, so it is created
# here to keep the chain runnable from a fresh kernel.
# NOTE(review): assumes the original run used create_bedrock_instance with its
# default settings; confirm.
llm = create_bedrock_instance(bedrock)

# Retrieve context for the question, fill the prompt, call the model and
# return the answer as a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
def create_rag_chain(retriever, prompt, llm):
    """
    Assemble the retrieval-augmented generation chain.

    The chain input is passed through unchanged as "question" and is also sent
    to `retriever`, whose results are flattened by `format_docs` into
    "context". Both values feed `prompt`, then `llm`, then a `StrOutputParser`.

    :param retriever: Runnable that fetches documents for the input.
    :param prompt: Prompt template taking "context" and "question".
    :param llm: Language model that receives the filled prompt.
    :return: The composed runnable chain.
    """
    context_pipeline = retriever | format_docs
    chain_inputs = {"context": context_pipeline, "question": RunnablePassthrough()}
    answer_parser = StrOutputParser()
    return chain_inputs | prompt | llm | answer_parser

In [53]:
# Run one question through the chain end to end.
# NOTE(review): the recorded answer lists hospitals, while the chunks displayed earlier come from
# the attendance policy. The model may be answering from its own knowledge rather than from the
# retrieved context; verify what the retriever returned for this query.
rag_chain.invoke("list the hospitals which are in Lahore")

 Here is a list of hospitals in Lahore:

1. Mayo Hospital
2. Services Hospital
3. Lady Willingdon Hospital
4. Jinnah Hospital
5. Fatima Memorial Hospital
6. Lahore General Hospital
7. Shaukat Khanum Memorial Cancer Hospital and Research Centre
8. Punjab Institute of Cardiology
9. Bahria International Hospital
10. Akram Medical Complex

' Here is a list of hospitals in Lahore:\n\n1. Mayo Hospital\n2. Services Hospital\n3. Lady Willingdon Hospital\n4. Jinnah Hospital\n5. Fatima Memorial Hospital\n6. Lahore General Hospital\n7. Shaukat Khanum Memorial Cancer Hospital and Research Centre\n8. Punjab Institute of Cardiology\n9. Bahria International Hospital\n10. Akram Medical Complex'