## Retrieval Augmented Generation

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Later cells read GROQ_API_KEY and PINECONE_API_KEY from os.environ,
# presumably populated by this call.
load_dotenv()

True

In [2]:
# Fail fast, with an actionable message, if the Groq key is missing.
# (Re-assigning the variable to itself changed nothing; its only effect was a
# bare KeyError when the key was absent.)
if "GROQ_API_KEY" not in os.environ:
    raise KeyError(
        "GROQ_API_KEY is not set; add it to your .env file or export it "
        "before starting the notebook."
    )

In [3]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

In [4]:
def download_embedding_model(
    model_name="sentence-transformers/paraphrase-MiniLM-L6-v2",
):
    """Build the HuggingFace embedding wrapper used to embed documents and queries.

    Args:
        model_name: Model identifier passed straight to
            ``HuggingFaceEmbeddings``. Defaults to the MiniLM paraphrase model
            this notebook was written against (its repr in this notebook
            reports a word embedding dimension of 384).

    Returns:
        The ``HuggingFaceEmbeddings`` instance created for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [5]:
# Instantiate the embedding model once; later cells pass this same object to
# the Pinecone vector stores via `embedding=embedding`.
embedding = download_embedding_model()



In [6]:
# Display the wrapper's repr to confirm which model and settings were loaded.
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), model_name='sentence-transformers/paraphrase-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

#### CHROMADB: 

In [37]:
# Paths used by the Chroma example.

# Directory the notebook runs from. `__file__` is not defined inside a
# notebook, so use the working directory rather than a hard-coded absolute
# Windows path. (The previous literal also contained a backslash followed by
# "3", which Python reads as an octal escape for a control character, so the
# path was silently corrupted.)
current_dir = os.getcwd()

# Text file to index, relative to the working directory.
file_path = "./book/odyssey.txt"

# Where the Chroma database is persisted.
persistent_directory = os.path.join(current_dir, "db", "chromadb")

In [None]:
# NOTE: Chroma did not work in this environment, so the vector-store creation
# below is left commented out. This cell only loads and chunks the text.

# Only do the work when no persisted Chroma store exists yet.
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exists. Initializing vector store")

    # Ensure the text file exists before trying to load it.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} does not exist")

    # Read the text content from the file. The encoding is explicit (as in the
    # multi-book loader later in the notebook) so the platform default is not used.
    loader = TextLoader(file_path, encoding="utf-8")
    documents = loader.load()

    # Split the document into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    docs = text_splitter.split_documents(documents)

    # Information about the split documents
    print("_____________DOCUMENT CHUNKS INFORMATION________________")
    print(f"Number of chunks: {len(docs)}")
    print(f"Sample chunk: \n{docs[0].page_content}\n")

    # Create Embeddings
    print("\n_____________CREATING EMBEDDINGS________________")
    # embedding is already loaded in **embedding** variable

    # Creating the vector store and persisting it (disabled, see NOTE above).
    # The directory must be passed by keyword: the third positional parameter
    # of Chroma.from_documents is not the persist directory.
    # db = Chroma.from_documents(
    #     docs,
    #     embedding,
    #     persist_directory=persistent_directory,
    # )

    # Report what actually happened: nothing was written to disk.
    print("Vector store creation skipped (Chroma is disabled in this notebook)")
    

### PINECONE:


In [19]:
from langchain_pinecone import PineconeVectorStore

# The Pinecone key is expected in the environment (presumably loaded from
# .env). Check for it explicitly instead of re-assigning the variable to
# itself, which only produced a bare KeyError when the key was missing.
if "PINECONE_API_KEY" not in os.environ:
    raise KeyError(
        "PINECONE_API_KEY is not set; add it to your .env file or export it "
        "before starting the notebook."
    )
index_name = "odyssey-pinecone"

# Wrap the index with the embedding model used for queries; no records are
# added in this cell.
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embedding,
)

# Alternative: initialize the store while adding the records in one step.
# `docs` holds Document objects, so use from_documents rather than from_texts.
# vectorstore = PineconeVectorStore.from_documents(
#     docs,
#     index_name=index_name,
#     embedding=embedding,
# )

In [21]:
# Confirm which vector store class was instantiated.
type(vectorstore)

langchain_pinecone.vectorstores.PineconeVectorStore

In [None]:
# Add the records to the Pinecone vector store. Run this only once (re-running would presumably upload the same chunks again).
# vectorstore.add_documents(docs)

In [31]:
# Query the vector store directly (no retriever, no score threshold) and
# display the matching Document objects.
vectorstore.similarity_search("Who is Odysseus wife?")

[Document(page_content='Thus did she talk with her maids as she sat in her own room, and in\nthe meantime Ulysses was getting his dinner. Then she called for the\nswineherd and said, "Eumaeus, go and tell the stranger to come here,\nI want to see him and ask him some questions. He seems to have travelled\nmuch, and he may have seen or heard something of my unhappy husband."', metadata={'source': './books/odyssey.txt'}),
 Document(page_content='"Then I saw Chloris, whom Neleus married for her beauty, having given\npriceless presents for her. She was youngest daughter to Amphion son\nof Iasus and king of Minyan Orchomenus, and was Queen in Pylos. She\nbore Nestor, Chromius, and Periclymenus, and she also bore that marvellously\nlovely woman Pero, who was wooed by all the country round; but Neleus\nwould only give her to him who should raid the cattle of Iphicles\nfrom the grazing grounds of Phylace, and this was a hard task. The\nonly man who would undertake to raid them was a certain ex

In [33]:
# Question to ask of the indexed text.
query = "Who is Odysseus's wife?"

# Build a retriever in score-threshold mode: k is 3 and the threshold is 0.5.
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=3, score_threshold=0.5),
    search_type="similarity_score_threshold",
)

# Fetch the documents the retriever returns for the query.
relevant_docs = retriever.invoke(query)

In [34]:
# Display the raw list of retrieved Document objects.
relevant_docs

[Document(page_content='Thus did she talk with her maids as she sat in her own room, and in\nthe meantime Ulysses was getting his dinner. Then she called for the\nswineherd and said, "Eumaeus, go and tell the stranger to come here,\nI want to see him and ask him some questions. He seems to have travelled\nmuch, and he may have seen or heard something of my unhappy husband."', metadata={'source': './books/odyssey.txt'}),
 Document(page_content='"Then I saw Chloris, whom Neleus married for her beauty, having given\npriceless presents for her. She was youngest daughter to Amphion son\nof Iasus and king of Minyan Orchomenus, and was Queen in Pylos. She\nbore Nestor, Chromius, and Periclymenus, and she also bore that marvellously\nlovely woman Pero, who was wooed by all the country round; but Neleus\nwould only give her to him who should raid the cattle of Iphicles\nfrom the grazing grounds of Phylace, and this was a hard task. The\nonly man who would undertake to raid them was a certain ex

In [35]:
# Print each retrieved chunk, followed by its source when metadata is present.
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        # Single quotes inside the f-string: reusing the enclosing double
        # quotes is a SyntaxError on Python versions before 3.12.
        print(f"Metadata: {doc.metadata.get('source', 'unknown')}\n")

Document 1:
Thus did she talk with her maids as she sat in her own room, and in
the meantime Ulysses was getting his dinner. Then she called for the
swineherd and said, "Eumaeus, go and tell the stranger to come here,
I want to see him and ask him some questions. He seems to have travelled
much, and he may have seen or heard something of my unhappy husband."

Metadata: ./books/odyssey.txt

Document 2:
"Then I saw Chloris, whom Neleus married for her beauty, having given
priceless presents for her. She was youngest daughter to Amphion son
of Iasus and king of Minyan Orchomenus, and was Queen in Pylos. She
bore Nestor, Chromius, and Periclymenus, and she also bore that marvellously
lovely woman Pero, who was wooed by all the country round; but Neleus
would only give her to him who should raid the cattle of Iphicles
from the grazing grounds of Phylace, and this was a hard task. The
only man who would undertake to raid them was a certain excellent
seer, but the will of heaven was against h

### RAG with Metadata

In [49]:
# Folder holding the plain-text books to index.
books_dir = "book"

# Collect the .txt files in a stable, sorted order. os.listdir() makes no
# ordering guarantee, and later cells depend on the resulting order
# (for example, documents[0]).
book_files = sorted(f for f in os.listdir(books_dir) if f.endswith(".txt"))
book_files

['adventures_of_huckleberry_finn.txt',
 'adventures_of_sherlock_holmes.txt',
 'declaration_of_independence_of_the_united_states.txt',
 'frankenstein.txt',
 'iliad.txt',
 'langchain_demo.txt',
 'moby_dick.txt',
 'odyssey.txt',
 'pride_and_prejudice.txt',
 'romeo_and_juliet.txt',
 'scarlet_letter.txt',
 'strange_case_of_dr_jekyll_and_mr_hyde.txt',
 'tale_of_two_cities.txt',
 'ulysses.txt',
 'us_bill_of_rights.txt',
 'war_and_peace.txt']

In [48]:
# Sanity-check that a single book loads. The encoding is given explicitly
# (as in the multi-book loader below) so the platform default is not used.
loader = TextLoader("./book/odyssey.txt", encoding="utf-8")

# Show a short preview instead of echoing the entire book into the output.
loader.load()[0].page_content[:500]



In [51]:
# Read the text content of each book and tag every loaded Document with the
# file it came from.

documents = []

for book_file in book_files:
    # Loop-local name, so the `file_path` set in the Chroma section is not
    # silently overwritten by this cell.
    book_path = os.path.join(books_dir, book_file)
    # Explicit UTF-8 so the text is not decoded with the platform default.
    loader = TextLoader(book_path, encoding="utf-8")
    for doc in loader.load():
        # Replace the loader's metadata with just the file name as the source.
        doc.metadata = {"source": book_file}
        documents.append(doc)

In [55]:
# Number of loaded Document objects (the recorded output, 16, matches the
# number of files listed in book_files).
len(documents)

16

In [57]:
# Preview the first 100 characters of the first loaded book.
print(documents[0].page_content[:100])

The Project Gutenberg eBook of Adventures of Huckleberry Finn
    
This ebook is for the use of any


In [None]:
# Split the documents into chunks

# chunk_size and chunk_overlap are the settings passed to CharacterTextSplitter.
# The resulting `docs` list is what the Pinecone cell further down uploads.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = text_splitter.split_documents(documents)


In [63]:
# Summarize the chunking result: a banner, the chunk count, and the first chunk.
print("_____________DOCUMENT CHUNKS INFORMATION________________")
print("Number of chunks: {}".format(len(docs)))
print("Sample chunk: \n{}\n".format(docs[0].page_content))

_____________DOCUMENT CHUNKS INFORMATION________________
Number of chunks: 13248
Sample chunk: 
﻿The Project Gutenberg eBook of Adventures of Huckleberry Finn
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Adventures of Huckleberry Finn

Author: Mark Twain

Illustrator: E. W. Kemble

Release date: June 29, 2004 [eBook #76]
                Most recently updated: November 16, 2023

Language: English

Credits: David Widger


*** START OF THE PROJECT GUTENBERG EBOOK ADVENTURES OF HUCKLEBERRY FINN ***


ADVENTURES
OF
HUCKLEBERRY FINN

(Tom Sawyer’s Comrade)

By Mark Twain


CONTENTS.

CHAP

In [65]:
# Now that we have embeddings and documents, create the vector store that
# keeps each chunk's source metadata.

from langchain_pinecone import PineconeVectorStore

# The Pinecone key is expected in the environment (presumably loaded from
# .env). Check for it explicitly instead of re-assigning the variable to
# itself, which only produced a bare KeyError when the key was missing.
if "PINECONE_API_KEY" not in os.environ:
    raise KeyError(
        "PINECONE_API_KEY is not set; add it to your .env file or export it "
        "before starting the notebook."
    )
index_name = "books-pinecone-with-metadata"

# Build the store from every chunk in `docs`, using `embedding` to embed them.
# NOTE(review): re-running this cell presumably uploads the same chunks again;
# confirm before re-executing against a populated index.
vectorstore = PineconeVectorStore.from_documents(
    docs,
    index_name=index_name,
    embedding=embedding,
)

In [66]:
# Build a retriever in score-threshold mode: k is 3 and the threshold is 0.5.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": 3,
        "score_threshold": 0.5,
    }
)

relevant_docs = retriever.invoke("Who is Odysseus's wife?")

# Print each retrieved chunk, followed by the book file it came from.
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        # Single quotes inside the f-string: reusing the enclosing double
        # quotes is a SyntaxError on Python versions before 3.12.
        print(f"Metadata: {doc.metadata.get('source', 'unknown')}\n")

Document 1:
Thus did she talk with her maids as she sat in her own room, and in
the meantime Ulysses was getting his dinner. Then she called for the
swineherd and said, "Eumaeus, go and tell the stranger to come here,
I want to see him and ask him some questions. He seems to have travelled
much, and he may have seen or heard something of my unhappy husband."

Metadata: odyssey.txt

Document 2:
“That is Bezúkhova’s brother, Anatole Kurágin,” she said,
indicating a handsome officer of the Horse Guards who passed by them
with head erect, looking at something over the heads of the ladies.
“He’s handsome, isn’t he? I hear they will marry him to that rich
girl. But your cousin, Drubetskóy, is also very attentive to her. They
say she has millions. Oh yes, that’s the French ambassador himself!”
she replied to the countess’ inquiry about Caulaincourt. “Looks as
if he were a king! All the same, the French are charming, very charming.
No one more charming in society. Ah, here she is! Yes, she is 

In [68]:
# Ask a second question using the existing `retriever`.
relevant_docs = retriever.invoke("Where is the kingdom of Ithaca?")

# Print each retrieved chunk, followed by the book file it came from.
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        # Single quotes inside the f-string: reusing the enclosing double
        # quotes is a SyntaxError on Python versions before 3.12.
        print(f"Metadata: {doc.metadata.get('source', 'unknown')}\n")

Document 1:
Minerva answered, "Stranger, you must be very simple, or must have
come from somewhere a long way off, not to know what country this
is. It is a very celebrated place, and everybody knows it East and
West. It is rugged and not a good driving country, but it is by no
means a bid island for what there is of it. It grows any quantity
of corn and also wine, for it is watered both by rain and dew; it
breeds cattle also and goats; all kinds of timber grow here, and there
are watering places where the water never runs dry; so, sir, the name
of Ithaca is known even as far as Troy, which I understand to be a
long way off from this Achaean country." 

Ulysses was glad at finding himself, as Minerva told him, in his own
country, and he began to answer, but he did not speak the truth, and
made up a lying story in the instinctive wiliness of his heart.

Metadata: odyssey.txt

Document 2:
"'The third man,' he answered, 'is Ulysses who dwells in Ithaca. I
can see him in an island sorrowin