<a href="https://colab.research.google.com/github/noumantechie/langchain/blob/main/retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**STEP 1 Loading Document**

In [None]:
%pip install -qU pypdf langchain_community

In [None]:
from langchain.document_loaders import PyPDFLoader

# Load the PDF file
pdf_path = "/content/General_Instructions_candiadtes.pdf"  # Ensure the correct file path
loader = PyPDFLoader(pdf_path)

In [None]:
# Now use load_and_split()
pages = loader.load_and_split()

In [None]:
len(pages)

In [None]:
pages[1].page_content

**Step 2 Chunking**

In [None]:
%pip install -qU langchain-text-splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=200,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(pages)

In [None]:
chunks

In [25]:
len(chunks)

79

In [26]:
chunks[0]

Document(metadata={'producer': 'Microsoft速 Word 2016', 'creator': 'Microsoft速 Word 2016', 'creationdate': '2019-07-25T11:55:13+05:00', 'author': 'Rubina Mustafa', 'moddate': '2019-07-25T11:55:13+05:00', 'source': '/content/General_Instructions_candiadtes.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='NATIONAL TEXTILE UNIVERSITY \n \n \nGeneral Instructions for Candidates \n \nA. SUBMISSION OF APPLICATION \n \n1. All applications must be submitted online through NTU website: www.ntu.edu.pk. Anyhow the')

In [29]:
chunks[1]

Document(metadata={'producer': 'Microsoft速 Word 2016', 'creator': 'Microsoft速 Word 2016', 'creationdate': '2019-07-25T11:55:13+05:00', 'author': 'Rubina Mustafa', 'moddate': '2019-07-25T11:55:13+05:00', 'source': '/content/General_Instructions_candiadtes.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='candidates applying against posts at  Sr # 30, 31, 32, 33  of advertisement, may apply by \ndownloading the prescribed form from NTU website. All others need to apply online.')

**Step 3 Generate Embeddings **

In [None]:
!pip install langchain_huggingface

In [None]:
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# Load an open-source Sentence Transformer model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [40]:
docs = [
    "Thrilling Finale Awaits: The Countdown to the Cricket World Cup Championship",
    "Global Giants Clash: Footbal World Cup Semi-Finals Set the Stage for Epic Showdowns",
    "From Underdogs to Contenders: Football world Cup Suprises and Breakout Stars"
]

In [41]:
embedding_vectors = embeddings.embed_documents(docs)


In [42]:
len(embedding_vectors)

3

In [None]:
embedding_vectors[0]

**Step 4 Semantic Search and Storing into Database**

In [38]:
!pip install -qU "langchain-chroma>=0.1.2"

In [39]:
from langchain_chroma import Chroma

In [43]:
# Load an open-source Sentence Transformer model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [44]:
docs = [
    "Thrilling Finale Awaits: The Countdown to the Cricket World Cup Championship",
    "Global Giants Clash: Footbal World Cup Semi-Finals Set the Stage for Epic Showdowns",
    "From Underdogs to Contenders: Football world Cup Suprises and Breakout Stars"
]

In [45]:
vectorstore = Chroma.from_texts(texts=docs , embedding=embeddings)

In [50]:
vectorstore.similarity_search('Imran Khan' , 1)

[Document(id='e10c4b6b-fefa-42f6-964e-4f0f03d631bb', metadata={}, page_content='Thrilling Finale Awaits: The Countdown to the Cricket World Cup Championship')]

In [51]:
vectorstore.similarity_search('Lionel Messi' , 2)

[Document(id='14a3295e-f6e6-4eec-8ded-590b1c2dbc98', metadata={}, page_content='From Underdogs to Contenders: Football world Cup Suprises and Breakout Stars'),
 Document(id='8d38afc3-4a30-4e6d-b3b9-3965bd2f7535', metadata={}, page_content='Global Giants Clash: Footbal World Cup Semi-Finals Set the Stage for Epic Showdowns')]