In [4]:
# Need some pre-requisites

!pip install langchain
!pip install huggingface_hub
!pip install sentence_transformers
!pip install unstructured[local-inference] -q
!pip install -q faiss-cpu

Collecting langchain
  Downloading langchain-0.0.346-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-core<0.1,>=0.0.10 (from langchain)
  Downloading langchain_core-0.0.10-py3-none-any.whl (178 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.2/178.2 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.69-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading ma

In [6]:
# FAISS stores the vector representations of the PDF text loaded by UnstructuredPDFLoader.
# The question-answering chain answers a given prompt using the content of the provided PDF.

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain


In [7]:
# A Hugging Face token is required so that Hugging Face hosted models can be used.
# The token is taken from the environment when it is already set; otherwise it is
# requested interactively (input is not echoed), so the secret is never written
# into the notebook source or its saved outputs.

import os
from getpass import getpass

if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [8]:
# Location of the source PDF (in this notebook: a clothes catalogue).
PDF_PATH = "/content/doc2.pdf"

# Read the PDF with UnstructuredPDFLoader; `document` holds whatever load() returns.
loader = UnstructuredPDFLoader(PDF_PATH)
document = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# Split the loaded document into chunks of at most 500 characters, without overlap,
# so that a large text can be embedded and retrieved in smaller pieces.
#
# RecursiveCharacterTextSplitter tries the separators in list order and uses the
# first one that occurs in the text, so they must be ordered from coarsest to
# finest: paragraphs, lines, comma-separated clauses, words. Listing " " first
# would make every split happen on spaces and never on line boundaries.
# The trailing "" lets the splitter cut a single token that is longer than chunk_size.

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    separators=["\n\n", "\n", ",", " ", ""],
)
docs = text_splitter.split_documents(document)

In [None]:
# HuggingFaceEmbeddings turns text into numerical vectors that capture its meaning.
# FAISS.from_documents embeds every chunk in `docs` and builds an index over the
# resulting vectors, which is what makes similarity search on the PDF possible.

from langchain.embeddings import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings()
db = FAISS.from_documents(documents=docs, embedding=embedding)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Build the question-answering chain on top of a model hosted on the Hugging Face Hub.
# The Hub client authenticates with the HUGGINGFACEHUB_API_TOKEN environment variable
# set earlier in the notebook.
#
# HuggingFaceHub is imported from langchain.llms; importing it from the package root
# is deprecated.
# The generated length is limited with `max_new_tokens`, the parameter the hosted
# text-generation API documents for this purpose (`max_length` belongs to other
# tasks such as summarization). 250 is the upper end of the documented range.

from langchain.llms import HuggingFaceHub

llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs={"temperature": 0.2, "max_new_tokens": 250},
)
# chain_type="stuff" puts all retrieved chunks into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")




In [None]:
# Ask a question: the chunks most similar to the query are looked up in the FAISS
# index and handed to the QA chain together with the question.
#
# The search hits get their own name instead of being assigned to `docs`, which
# holds the chunk list produced by the text splitter. Overwriting it would make a
# later re-run of the indexing cell index the search hits rather than the PDF chunks.

query = "I am a Man, looking for white shirt in summer. Give me type, brand and colors. Suggest one more. Do not say anything other than that"
matching_docs = db.similarity_search(query)

chain.run(input_documents=matching_docs, question=query)

'\nType: Short-Sleeve Linen Tee\nBrand: Uniqlo\nColors: White\n\nAnother suggestion:\nType: Linen Button-Down Shirt\nBrand: J.Crew\nColors: White, Light Blue, Striped Patterns.'