In [10]:

from transformers import pipeline

In [11]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS


In [12]:
from langchain_community.document_loaders import PyMuPDFLoader

# List of PDF paths
pdf_paths = [
    r"C:\Users\Madhur.Gauri\Desktop\pdf_work\Ref_doc4 1_english.pdf",
    r"C:\Users\Madhur.Gauri\Desktop\pdf_work\Ref_doc2 2_english.pdf"
]

# Create a loader instance for each PDF path
loaders = [PyMuPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Optionally, you can load documents from all loaders
data = []
for loader in loaders:
    data.extend(loader.load())




In [15]:
data=loader.load()

In [16]:
data

[Document(metadata={'source': 'C:\\Users\\Madhur.Gauri\\Desktop\\pdf_work\\Ref_doc2 2_english.pdf', 'file_path': 'C:\\Users\\Madhur.Gauri\\Desktop\\pdf_work\\Ref_doc2 2_english.pdf', 'page': 0, 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'PDFium', 'producer': '', 'creationDate': 'D:20240812074418', 'modDate': '', 'trapped': ''}, page_content='Mayra  \nCitlali  \nLucero  \nMartinez  \n(luceroma)\nAuthor  \nauthorization  \nof  \nQuality  \nDepartment.  \nNOTICE:  \nThis  \nDocument  \nis  \nfor  \ninternal  \nuse  \nonly  \nand  \nshall  \nnot  \nbe  \nshared  \noutside  \nthe  \norganization  \nwithout  \nthe  \nMay  \n30,  \n2023  \nEffective  \nDate  \nLisette  \nLuna  \n(moon),\nVerónica  \nHernández  \nHerná,  \nJorge\nChange  \nDescription  \nNovember  \n10,  \n2020  \nDiana  \nRodriguez,  \nDiomary  \nRangel,  \nHilda\nSeptember  \n17,  \n2020  \nUpdate  \nto  \nFCAL  \n1-3  \nRev  \n8\nArmando  \nSalas,  \nDiana\nOt

In [17]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# As data is of type documents we can directly use split_documents over split_text in order to get the chunks.
docs = text_splitter.split_documents(data)

In [18]:
docs


[Document(metadata={'source': 'C:\\Users\\Madhur.Gauri\\Desktop\\pdf_work\\Ref_doc2 2_english.pdf', 'file_path': 'C:\\Users\\Madhur.Gauri\\Desktop\\pdf_work\\Ref_doc2 2_english.pdf', 'page': 0, 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'PDFium', 'producer': '', 'creationDate': 'D:20240812074418', 'modDate': '', 'trapped': ''}, page_content='Mayra  \nCitlali  \nLucero  \nMartinez  \n(luceroma)\nAuthor  \nauthorization  \nof  \nQuality  \nDepartment.  \nNOTICE:  \nThis  \nDocument  \nis  \nfor  \ninternal  \nuse  \nonly  \nand  \nshall  \nnot  \nbe  \nshared  \noutside  \nthe  \norganization  \nwithout  \nthe  \nMay  \n30,  \n2023  \nEffective  \nDate  \nLisette  \nLuna  \n(moon),\nVerónica  \nHernández  \nHerná,  \nJorge\nChange  \nDescription  \nNovember  \n10,  \n2020  \nDiana  \nRodriguez,  \nDiomary  \nRangel,  \nHilda\nSeptember  \n17,  \n2020  \nUpdate  \nto  \nFCAL  \n1-3  \nRev  \n8\nArmando  \nSalas,  \nDiana\nOt

In [19]:
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [20]:
def flatten_and_convert_to_string(docs):
    result = []
    for item in data:
        if isinstance(item, (list, tuple)):
            result.extend(flatten_and_convert_to_string(item))
        else:
            result.append(str(item))
    return result
 
# Flatten and convert split_text_0
split_text_0 = flatten_and_convert_to_string(docs)
 
# Now call embed_documents with a list of strings   

In [21]:
embedding=hf.embed_documents(split_text_0)

In [22]:
embedding

[[-0.019713928923010826,
  -0.005481869913637638,
  -0.0037891201209276915,
  0.03091745264828205,
  -0.024301547557115555,
  -0.040859438478946686,
  0.021455159410834312,
  -0.001186602283269167,
  -0.05242801457643509,
  0.002287711016833782,
  0.007316185161471367,
  0.028596902266144753,
  0.06380406767129898,
  0.07244372367858887,
  -0.01825796812772751,
  0.03045959770679474,
  -0.007457224186509848,
  0.03235986456274986,
  -0.08423221856355667,
  0.003455597907304764,
  -0.04802587628364563,
  0.033288512378931046,
  0.02242913655936718,
  -0.002875210717320442,
  0.013344774954020977,
  0.06123547628521919,
  -0.031539496034383774,
  -0.044592101126909256,
  -0.02300415188074112,
  -0.04405050352215767,
  0.07505400478839874,
  0.07313074916601181,
  -0.007953873835504055,
  0.0418892540037632,
  2.959105131594697e-06,
  -0.045613083988428116,
  -0.02513980306684971,
  0.008706944063305855,
  -0.00824669562280178,
  0.062028344720602036,
  0.01857295073568821,
  -0.047703519

In [23]:
from langchain_community.vectorstores import FAISS

vector = FAISS.from_documents(docs, embedding=hf)

In [24]:
groq_api_key=os.environ["GROQ_API_KEY"] = "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"

In [25]:
from langchain_groq import ChatGroq

llm = ChatGroq(groq_api_key=groq_api_key,
    model="mixtral-8x7b-32768",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [26]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [27]:
prompt_template = """Instruct: With this context\n\n{context}{input}\nOutput:"""

In [28]:
prompt = ChatPromptTemplate.from_template(prompt_template)
document_chain = create_stuff_documents_chain(llm, prompt)

In [29]:
from langchain.chains import create_retrieval_chain

retriever = vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [30]:
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001DF25116A10>), config={'run_name': 'retrieve_documents'})
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), config={'run_name': 'format_inputs'})
            | ChatPromptTemplate(input_variables=['context', 'input'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], template='Instruct: With this context\n\n{context}{input}\nOutput:'))])
            | ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x000001DF610BE560>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000001DF610BF220>, temperature=1e-08, gro

In [32]:
question="what are table of contents in supplier selection? ,name them"
response = retrieval_chain.invoke({"input": question})

In [33]:
response

{'input': 'what are table of contents in supplier selection? ,name them',
 'context': [Document(metadata={'source': 'C:\\Users\\Madhur.Gauri\\Desktop\\pdf_work\\Ref_doc2 2_english.pdf', 'file_path': 'C:\\Users\\Madhur.Gauri\\Desktop\\pdf_work\\Ref_doc2 2_english.pdf', 'page': 4, 'total_pages': 6, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'PDFium', 'producer': '', 'creationDate': 'D:20240812074418', 'modDate': '', 'trapped': ''}, page_content='Purchasing  \nDirector,  \nAmericas /  \nQuality  \nDirector  \nof  \nthe  \nProduction  \nSite  \nSMR /  \nProgram  \nManager /  \nCommodity  \nBuyers /  \nProgram  \nBuyer /  \nSDE(s)  \nof  \nthe  \nProduction  \nSite  \n4.0  \nReview  \nof  \nSourcing  \nTable,  \napproval  \nby  \nteam  \ncenter  \n-PLM  \nsystem  \nof  \nmultidisciplinary  \nteam.\nCommodity  \nBuyer  \n8.0  \nTransfer  \nrelevant  \nsupplier  \nand  \ncomponent  \ninformation  \nto  \nthe  \nProgram  \nBuyer\nProgram  \nBuyer 