<a href="https://colab.research.google.com/github/netmatze/mlmatze/blob/main/using_langchain_and_huggingface_embeddings_to_load_and_query_multible_pdf_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### installing the libraries we need with pip

In [None]:
!pip install langchain
!pip install huggingface_hub
!pip install transformers
!pip install chromadb
!pip install sentence_transformers
!pip install unstructured
!pip install tiktoken
!pip install pdf2image

### importing langchain classes

In [2]:
import os
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline, HuggingFaceHub
from langchain.embeddings import HuggingFaceHubEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

### create llm model google/flan-t5-xxl from huggingface hub and huggingface embeddings

In [3]:
model_name = "google/flan-t5-xxl"

llm = HuggingFaceHub(
        repo_id=model_name,
        model_kwargs={"temperature":0.9, "max_length":256},
        huggingfacehub_api_token=''
)

embeddings = HuggingFaceEmbeddings()

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

### define query

In [4]:
query = "what is tensorflow library"


### using raw llm prompt

In [5]:
llm_prompt = llm.generate([query])
raw_llm_prompt = llm_prompt.generations[0][0].text
print(f"llm prompt: {raw_llm_prompt}")

llm prompt: tensorflow library tensorflow is a library of computational models from 


### load pdf files

In [6]:
file_url_list = ['https://vincentqin.gitee.io/blogresource-2/cv-books/Building%20Machine%20Learning%20Projects%20with%20TensorFlow.pdf',
                 'https://falksangdata.no/wp-content/uploads/2022/07/python-machine-learning-and-deep-learning-with-python-scikit-learn-and-tensorflow-2.pdf',
                 'https://www.nrigroupindia.com/e-book/Introduction%20to%20Machine%20Learning%20with%20Python%20(%20PDFDrive.com%20)-min.pdf']

loaders = [OnlinePDFLoader(file_url) for file_url in file_url_list]

raw_text = ""
for onlinePdfLoader in loaders:
  data = onlinePdfLoader.load()
  for d in data:
    if d.page_content:
      raw_text += d.page_content

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### split the text of the pdfs  into 1000 token chunks

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(raw_text)

### save embeddings to chroma vector database using Huggingface embeddings

In [8]:
db = Chroma.from_texts(texts, embeddings)

### display embedding generated for the query to show how huggingface embeddings look like

In [9]:
embedding_values = embeddings.embed_query(query)
print(f"query {query}") 
print(f"embeddings {embedding_values}")

query what is tensorflow library
embeddings [0.02386470139026642, 0.027008192613720894, -0.02707648277282715, 0.010983308777213097, -0.01994200237095356, 0.017897753044962883, 0.06192578375339508, 0.050445642322301865, -0.02156602405011654, -0.04252909868955612, 0.01637614145874977, 0.0034918254241347313, -0.02225092612206936, -0.0014435311313718557, 0.05229073017835617, -0.03990481048822403, -0.014188623987138271, 0.009433185681700706, -0.02588149905204773, -0.02868521213531494, -0.018968671560287476, 0.0016312721418216825, 0.0151783786714077, -0.001112793106585741, 0.046179600059986115, -0.004173683002591133, 0.0018618633039295673, -0.023866135627031326, 0.04372302442789078, 0.06932692229747772, 0.03176731616258621, 0.026399753987789154, -0.01363446842879057, 0.09765776246786118, 1.574662746861577e-06, 0.05171817168593407, -0.01803399994969368, 0.04071325063705444, 0.05844952538609505, -0.030042173340916634, 0.08525554835796356, 0.019667815417051315, 0.0029854760505259037, -0.0208528

### perform a similarity search at the chroma vector database store to find the three most simmular chunks of text that are closesed to the given query

In [10]:
docs = db.similarity_search(query, k = 3)
for doc in docs:
  print(doc)

page_content='What is TensorFlow? TensorFlow is a scalable and multiplatform programming interface for implementing and running machine learning algorithms, including convenience wrappers for deep learning. TensorFlow was developed by the researchers and engineers from the Google Brain team. While the main development is led by a team of researchers and software engineers at Google, its development also involves many contributions from the open source community. TensorFlow was initially built for internal use at Google, but it was subsequently released in November 2015 under a permissive open source license. Many machine learning researchers and practitioners from academia and industry have adapted TensorFlow to develop deep learning solutions.\n\n[ 427 ]' metadata={}
page_content="Luckily for us as Python users, TensorFlow's Python API is currently the most complete API, thereby it attracts many machine learning and deep learning practitioners. Furthermore, TensorFlow has an official 

### load the question answering chain of langchain and initialize it with the google/flan-t5-xxl llm model

In [11]:
chain = load_qa_chain(llm, chain_type="stuff") #chain_type="stuff",

### use the chain that we have defined with the llm model with the k nearest chunks of documents and with the defined query
#### the llm model uses the chunks of documents information to create a more precise answert to the question

In [12]:
result = chain.run(input_documents=docs, question=query)

print(f'query: {query}')

print(f'result: {result}')

query: what is tensorflow library
result: open source library and can be freely used by everyone, its development is funded and supported by Google


### complete code

In [None]:
import os
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline, HuggingFaceHub
from langchain.embeddings import HuggingFaceHubEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

model_name = "google/flan-t5-xxl"

llm = HuggingFaceHub(
        repo_id=model_name,
        model_kwargs={"temperature":0.9, "max_length":256},
        huggingfacehub_api_token=''
)

query = "what is tensorflow library"

llm_prompt = llm.generate([query])
raw_llm_prompt = llm_prompt.generations[0][0].text
print(f"llm prompt: {raw_llm_prompt}")

embeddings = HuggingFaceEmbeddings()

file_url_list = ['https://vincentqin.gitee.io/blogresource-2/cv-books/Building%20Machine%20Learning%20Projects%20with%20TensorFlow.pdf',,
                 'https://falksangdata.no/wp-content/uploads/2022/07/python-machine-learning-and-deep-learning-with-python-scikit-learn-and-tensorflow-2.pdf',
                 'https://www.nrigroupindia.com/e-book/Introduction%20to%20Machine%20Learning%20with%20Python%20(%20PDFDrive.com%20)-min.pdf']

loaders = [OnlinePDFLoader(file_url) for file_url in file_url_list]

raw_text = ""
for onlinePdfLoader in loaders:
  data = onlinePdfLoader.load()
  for d in data:
    if d.page_content:
      raw_text += d.page_content

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(raw_text)

db = Chroma.from_texts(texts, embeddings)

embedding_values = embeddings.embed_query(query)
print(f"query {query}") 
print(f"embeddings {embedding_values}")

docs = db.similarity_search(query, k = 3)
for doc in docs:
  print(doc)

chain = load_qa_chain(llm, chain_type="stuff") #chain_type="stuff",

print(f'query: {query}')

result = chain.run(input_documents=docs, question=query)

print(f'result: {result}')

llm prompt: tensorflow library tensorflow is a library of computational models from 
