<a href="https://colab.research.google.com/github/mdeniz20/NLP-0/blob/main/RAG_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [1]:
!pip install langchain langchain-community chromadb cohere
!pip install langchain_cohere
!pip install -U langchain-chroma
!pip install -U sentence-transformers
!pip install huggingface
!pip install langchain_huggingface

Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting cohere
  Downloading cohere-5.6.2-py3-none-any.whl.metadata (3.3 kB)
Collecting langchain-core<0.3.0,>=0.2.23 (from langchain)
  Downloading langchain_core-0.2.24-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.93-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17

# Imports

In [2]:
import os
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_cohere import CohereEmbeddings
import requests
from google.colab import userdata
from langchain_huggingface import HuggingFaceEmbeddings

# Environment Variables

In [3]:
os.environ["COHERE_API_KEY"] = userdata.get("COHERE_API_KEY")

# Setting the Environment

In [126]:
os.system("rm -r ./db")
os.system("rm -r ./data")

0

In [134]:
books = {
    "odyssey": "https://www.gutenberg.org/cache/epub/1727/pg1727.txt",
    "romeo_and_juliet": "https://www.gutenberg.org/cache/epub/1513/pg1513.txt",
    "frankenstein": "https://www.gutenberg.org/cache/epub/84/pg84.txt",
    # "les_miserables": "https://www.gutenberg.org/cache/epub/135/pg135.txt"
}

directory_path = "./data"

os.makedirs(directory_path, exist_ok=True)
print(f"Directory '{directory_path}' created successfully.")

for book_name, book_url in books.items():
    print(f"Downloading {book_name}...")
    response = requests.get(book_url)
    if response.status_code == 200:
        with open(os.path.join(directory_path, f"{book_name}.txt"), "wb") as file:
            file.write(response.content)
    else:
        print(f"Failed to download {book_name}. Status code: {response.status_code}")

print("Book installation finished!")

if not os.path.exists("./db"):
    os.makedirs("./db")
    print("Directory './db' created successfully.")


Directory './data' created successfully.
Downloading odyssey...
Downloading romeo_and_juliet...
Downloading frankenstein...
Book installation finished!


# Embedder

In [17]:
class Embedder:
  model_name: str
  platform: str
  model: any

  platform_dict = {
      "cohere": CohereEmbeddings
  }

  def __init__(self, platform: str, model: str):
    self.model_name = model
    self.platform = platform
    self.model = self.platform_dict[self.platform](model = self.model_name)

# Chunker

In [27]:
class Chunker:
  chunk = None
  chunk_overlap = None
  chunker = None

  def __init__(self, chunk_size: int, chunk_overlap: int):
    """
    chunk_size: The number of characters in each chunk
    chunk_overlap: The number of characters to overlap between chunks
    """
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    self.chunker = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

  def get_chunked_documents(self, documents: list):
    return self.chunker.split_documents(documents)



# Vector Store

In [71]:
class VectorStore(Chroma):
  docs = None
  embedder = None
  persisten_directory = None
  store = None
  def __init__(self, docs, embedder, persisten_directory="./vector_database/chroma_db"):
    self.docs = docs
    self.embedder = embedder
    self.persisten_directory = persisten_directory
    self.set_environment()
    self.store = Chroma(persist_directory=persisten_directory, embedding_function=embedder)

  def set_environment(self):
    if not os.path.exists(self.persisten_directory):
      os.makedirs(self.persisten_directory)
    if len(os.listdir(self.persisten_directory)) == 0:
      Chroma.from_documents(self.docs, self.embedder, persist_directory=self.persisten_directory)

  def add_new_document(self, documents):
      for document in documents:
          self.docs.append(document)
          new_embedding = self.embedder.embed_query(document)
          self.add_documents([document], [new_embedding])
      self.save()
      return 1






# Retriver

In [23]:
"""
retriever = db.as_retriever(
    search_type = "mmr",
    search_kwargs = {"k": 3, "fetch_k": 20, "lambda_mult": 0.5}
    #0.4 means lower bound of similarity
    #"k": 3 means return the top 3 similar (most relevant) documents
)
"""
"""
retriever = db.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 3, "score_threshold": 0.3}
)
"""

"""
retriever = db.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": 2}
)
"""

class Retriver:
  class RetriveMethod:
    get: any
    def __init__(self, db, query):
      self.get = db.as_retriever(
        search_type = "similarity",
        search_kwargs = {"k": 2}
      ).invoke(query)


  class Chunker:
    chunk = None
    chunk_overlap = None
    chunker = None

    def __init__(self, chunk_size: int, chunk_overlap: int):
      self.chunk_size = chunk_size
      self.chunk_overlap = chunk_overlap
      self.chunker = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

  class VectorStore(Chroma):
      docs = None
      embedder = None
      persisten_directory = None
      def __init__(self, docs, embedder, persisten_directory):
        self.docs = docs
        self.embedder = embedder
        self.persisten_directory = persisten_directory
        self.set_environment()
        super().__init__(persist_directory=persisten_directory, embedding_function=embedder)

      def set_environment(self):
        if not os.path.exists(self.persisten_directory):
          os.makedirs(self.persisten_directory)
        if len(os.listdir(self.persisten_directory)) == 0:
          Chroma.from_documents(self.docs, self.embedder, persist_directory=persisten_directory)

  class Embedder:
      model_name = None
      platform = None
      model = None
      class Platform:
        cohere = CohereEmbeddings
      class Model:
        embed_english_v3_0 = "embed-english-v3.0"

      def __init__(self, platform: Platform, model: Model):
        self.model_name = model
        self.platform = platform
        self.model = self.platform(model = self.model_name)


  query: str

  embedder: any
  embedder_platform: Embedder.Platform
  embedder_model: Embedder.Model

  source_dir_path: str
  source_files: list
  documents: list

  chunker: any
  chunker_chunk_size: int
  chunker_chunk_overlap: int

  db: VectorStore
  database_directory: str

  def __init__(self,
          query:str,
          source_dir_path: str,
          embedder_platform: Embedder.Platform,
          embedder_model: Embedder.Model,
          chunk_size: int = 1000,
          chunk_overlap: int = 500,
          database_directory:str = "./vector_database/chroma_db"
          ):

    self.query = query

    self.embedder = Embedder(platform=embedder_platform, model=embedder_model).model

    self.chunker = Chunker(chunk_size, chunk_overlap).chunker
    self.chunker_chunk_size = chunk_size
    self.chunker_chunk_overlap = chunk_overlap

    self.documents = []
    self.source_dir_path = source_dir_path
    self.embedder_platform = embedder_platform
    self.embedder_model = embedder_model
    self.source_files = [file for file in os.listdir(source_dir_path)]

    self.database_directory = database_directory
    self.db = VectorStore(self.documents, self.embedder, self.database_directory)


  def retrive(self):
    for source_file in self.source_files:
      file_path = os.path.join(self.dir_path, source_file)
      loader = TextLoader(file_path)
      file_docs = loader.load()

      for doc in file_docs:
        doc.metadata["source"] = source_file
        self.documents.append(doc)

    chunked_source = self.chunker.split_documents(self.source_files)

    print("The number of chunks:", len(chunked_source))
    print("There is a sample chunk:\n", chunked_source[0].page_content)
    self.db = VectorStore(self.documents, self.embedder, self.database_directory)
    method = Retriver.RetriveMethod(self.db, query = self.query)
    return method.get


# RAG Initialize Vector Store

In [129]:
current_dir = os.getcwd()
books_dir = os.path.join(current_dir, "data")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata1")

book_files = [file for file in os.listdir(books_dir) if file.endswith(".txt")]
documents = []

for book_file in book_files:
  file_path = os.path.join(books_dir, book_file)
  loader = TextLoader(file_path)
  file_docs = loader.load()
  for doc in file_docs:
    doc.metadata["source"] = book_file
    documents.append(doc)

embedder = Embedder(platform="cohere", model="embed-english-v3.0").model
chunked_documents = Chunker(chunk_size=1000, chunk_overlap=500).get_chunked_documents(documents)
len(chunked_documents)
#db = VectorStore(chunked_documents, embedder, persistent_directory)
db = VectorStore(chunked_documents, embedder, persistent_directory).store

In [135]:
print(len(book_files))
raw_document = TextLoader("./data/frankenstein.txt").load()
chunked_doc = Chunker(chunk_size=1000, chunk_overlap=0).get_chunked_documents(raw_document)
print(len(chunked_doc))
db.add_documents(chunked_doc)

1
639


['a8b099f3-d5a3-4ad6-8ab9-fb5981abc962',
 '384b125f-0c33-406a-b4ac-f6be9b2d20de',
 'fca62642-2634-46be-aa58-be2d1e33450a',
 'e7066cd5-18cd-4a70-b3ae-b6c5046284dd',
 'e3c0866d-093d-43a2-899f-6186c5324dfa',
 '0f6227a7-2045-47fe-8edd-84bb6ddd0a19',
 '2559a976-a684-4fe5-ad3b-25e0efd6a117',
 'f08369d0-46d5-4f72-9a8a-e7908a5c0cba',
 'b19df134-ce61-43da-a254-8242ad25495d',
 'fc203f7c-977c-4a5f-a66b-63d8f166002a',
 'c8dea38b-3471-4e82-89c2-0112f8e9c046',
 '0856ada0-4c96-40af-a062-ee2adacad22a',
 'f2b9aba8-a568-418c-baeb-47edc4944dc3',
 '3b5501a9-1796-46cb-8858-cdc6f7e9f5e7',
 '528a9e2c-c400-4d0a-82cd-cea0be6285fd',
 '927683d8-6fec-4379-84bc-53b3f40b06cc',
 '675cfdcd-5847-492c-94ba-f1e7500d186e',
 '20562c9a-24b4-46ff-a7a6-f95ff8a3eb42',
 '4c7a12c0-8aba-4a6f-8e55-cf77c44f7db5',
 '1d9ea386-28e6-460e-bfac-97b651c170f0',
 'dd07139f-2a81-4737-a98e-86706b8fd0b7',
 'fbb007c0-5228-49d3-806b-d1d616d2351b',
 '38131f98-8f4f-4fe3-bebc-e0a16b8fa8f7',
 '0f9ec0b6-b5cf-48d8-9e23-a7bb2073a187',
 'b5ec47f6-0d19-

In [141]:
query = "Who is Frankenstein?"
relevant_docs = db.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 2, "score_threshold": 0.2}
).invoke(query)

print("These are the most relevant documents to your query:")
print(len(relevant_docs))
for i, doc in enumerate(relevant_docs, 1):
  print(f"Document {i}:\n{doc.page_content}\n")
  print(f"Source {doc.metadata['source']}\n")

These are the most relevant documents to your query:
2
Document 1:
“How can I move thee? Will no entreaties cause thee to turn a
favourable eye upon thy creature, who implores thy goodness and
compassion? Believe me, Frankenstein, I was benevolent; my soul glowed
with love and humanity; but am I not alone, miserably alone? You, my
creator, abhor me; what hope can I gather from your fellow creatures,
who owe me nothing? They spurn and hate me. The desert mountains and
dreary glaciers are my refuge. I have wandered here many days; the
caves of ice, which I only do not fear, are a dwelling to me, and the
only one which man does not grudge. These bleak skies I hail, for they
are kinder to me than your fellow beings. If the multitude of mankind
knew of my existence, they would do as you do, and arm themselves for
my destruction. Shall I not then hate them who abhor me? I will keep
no terms with my enemies. I am miserable, and they shall share my
wretchedness. Yet it is in your power to reco

In [24]:
current_dir = os.getcwd()
dir_path = os.path.join(current_dir, "data")
persisten_directory = os.path.join(current_dir, "test", "chorma_db_0.1")
query = "How did juliet die?"
relevant_docs = Retriver(query, dir_path, Retriver.Embedder.Platform.cohere, Retriver.Embedder.Model.embed_english_v3_0).retrive()
print("These are the most relevant documents to your query:")
len(relevant_docs)
for i, doc in enumerate(relevant_docs, 1):
  print(f"Document {i}:\n{doc.page_content}\n")
  if doc.metadata:
    print(f"Source {doc.metadata.get('score', doc.metadata['source'])}\n")






IndexError: list index out of range

Running Queries

In [16]:
"""
retriever = db.as_retriever(
    search_type = "mmr",
    search_kwargs = {"k": 3, "fetch_k": 20, "lambda_mult": 0.5}
    #0.4 means lower bound of similarity
    #"k": 3 means return the top 3 similar (most relevant) documents
)
"""
"""
retriever = db.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 3, "score_threshold": 0.3}
)
"""

retriever = db.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": 2}
)

NameError: name 'db' is not defined

In [None]:
query = "How did juliet die?"
relevant_docs = retriever.invoke(query)

print("These are the most relevant documents to your query:")
len(relevant_docs)
for i, doc in enumerate(relevant_docs, 1):
  print(f"Document {i}:\n{doc.page_content}\n")
  if doc.metadata:
    print(f"Source {doc.metadata.get('score', doc.metadata['source'])}\n")

In [None]:
class Embedder:


