<a href="https://colab.research.google.com/github/netmatze/mlmatze/blob/main/using_langchain_and_huggingface_embeddings_to_load_and_query_multible_pdf_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook relies on (IPython shell
# magics; run once per fresh runtime).
# langchain is imported directly by the next cell.
!pip install langchain
# NOTE(review): the remaining packages are not imported directly in this
# notebook; presumably they are backends required by the langchain
# classes used below (Hugging Face hub/embeddings, Chroma, the online PDF
# loader) -- confirm before removing any of them.
!pip install huggingface_hub
!pip install transformers
!pip install chromadb
!pip install sentence_transformers
!pip install unstructured
!pip install tiktoken
!pip install pdf2image

In [26]:
import os
from langchain.chains.question_answering import load_qa_chain

from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline, HuggingFaceHub
from langchain.embeddings import HuggingFaceHubEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# --- Question-answering over several online PDF files ----------------------
# Pipeline: build a hosted LLM client, download the PDFs, split their text
# into chunks, index the chunks in a Chroma vector store, retrieve the
# chunks most similar to the query and let the LLM answer from them.

model_name = "google/flan-t5-xxl"

# Never hard-code the API token in the notebook: anything committed to a
# repository must be treated as leaked and revoked. Read it from the
# environment instead (in Colab, set os.environ["HUGGINGFACEHUB_API_TOKEN"]
# in a cell that is not saved, or use the secrets manager).
hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not hf_api_token:
    raise RuntimeError(
        "Set the HUGGINGFACEHUB_API_TOKEN environment variable to a valid "
        "Hugging Face access token before running this cell."
    )

llm = HuggingFaceHub(
    repo_id=model_name,
    model_kwargs={"temperature": 0.9, "max_length": 256},
    huggingfacehub_api_token=hf_api_token,
)

query = "what is tensorflow library"

# Baseline: ask the LLM directly, without any document context.
llm_prompt = llm.generate([query])
print(llm_prompt.generations)

embeddings = HuggingFaceEmbeddings()

file_url_list = [
    'https://www.60leaders.com/_files/ugd/e7ba2f_e965e98b5ab3423f8b131d1a120dc1bf.pdf',
    'https://falksangdata.no/wp-content/uploads/2022/07/python-machine-learning-and-deep-learning-with-python-scikit-learn-and-tensorflow-2.pdf',
    'https://www.nrigroupindia.com/e-book/Introduction%20to%20Machine%20Learning%20with%20Python%20(%20PDFDrive.com%20)-min.pdf',
]

loaders = [OnlinePDFLoader(file_url) for file_url in file_url_list]

# Collect the non-empty page contents of every PDF and join them once;
# repeated `+=` on a string of this size copies the text over and over.
page_texts = []
for pdf_loader in loaders:
    for document in pdf_loader.load():
        if document.page_content:
            page_texts.append(document.page_content)
raw_text = "".join(page_texts)

# Split the concatenated text into chunks of at most 1000 characters with
# no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(raw_text)

# Index the chunks in a Chroma vector store using the embeddings above.
db = Chroma.from_texts(texts, embeddings)

# Show the embedding vector of the query for inspection.
embedding_values = embeddings.embed_query(query)
print(f"query {query}")
print(f"embeddings {embedding_values}")

# Retrieve the three chunks most similar to the query.
docs = db.similarity_search(query, k=3)
for doc in docs:
    print(doc)

# "stuff" chain: the retrieved documents are passed to the LLM together
# with the question in a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

print(f'query: {query}')

result = chain.run(input_documents=docs, question=query)

print(f'result: {result}')

generations=[[Generation(text='tensorflow application tensorflow library is open source framework for numerical computation and', generation_info=None)]] llm_output=None
query what is tensorflow library
embeddings [0.02386467531323433, 0.027008185163140297, -0.027076488360762596, 0.01098327524960041, -0.0199420303106308, 0.0178977120667696, 0.06192576885223389, 0.05044565349817276, -0.021566055715084076, -0.042529113590717316, 0.016376225277781487, 0.003491736249998212, -0.02225092239677906, -0.0014435286866500974, 0.05229076370596886, -0.03990481421351433, -0.014188673347234726, 0.00943312980234623, -0.025881486013531685, -0.028685245662927628, -0.018968649208545685, 0.0016313205705955625, 0.015178383328020573, -0.0011128104524686933, 0.04617966338992119, -0.004173655062913895, 0.001861890428699553, -0.023866115137934685, 0.04372298717498779, 0.06932688504457474, 0.031767237931489944, 0.026399677619338036, -0.01363446470350027, 0.09765766561031342, 1.5746626331747393e-06, 0.0517181791