<a href="https://colab.research.google.com/github/mdeniz20/NLP-0/blob/main/RAG_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [1]:
!pip install langchain langchain-community chromadb cohere
!pip install langchain_cohere
!pip install -U langchain-chroma


Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting cohere
  Downloading cohere-5.6.2-py3-none-any.whl.metadata (3.3 kB)
Collecting langchain-core<0.3.0,>=0.2.23 (from langchain)
  Downloading langchain_core-0.2.23-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.93-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17

In [2]:
!pip install -U sentence-transformers
!pip install huggingface

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [3]:
!pip install langchain_huggingface

Collecting langchain_huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Downloading langchain_huggingface-0.0.3-py3-none-any.whl (17 kB)
Installing collected packages: langchain_huggingface
Successfully installed langchain_huggingface-0.0.3


# Imports

In [4]:
import os
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_cohere import CohereEmbeddings
import requests
from google.colab import userdata
from langchain_huggingface import HuggingFaceEmbeddings

# Environment Variables

In [5]:
os.environ["COHERE_API_KEY"] = userdata.get("COHERE_API_KEY")

# Setting the Environment

In [55]:
os.system("rm -r ./db")

0

In [56]:
books = {
    "odyssey": "https://www.gutenberg.org/cache/epub/1727/pg1727.txt",
    "romeo_and_juliet": "https://www.gutenberg.org/cache/epub/1513/pg1513.txt"
}

directory_path = "./data"

os.makedirs(directory_path, exist_ok=True)
print(f"Directory '{directory_path}' created successfully.")

for book_name, book_url in books.items():
    print(f"Downloading {book_name}...")
    response = requests.get(book_url)
    if response.status_code == 200:
        with open(os.path.join(directory_path, f"{book_name}.txt"), "wb") as file:
            file.write(response.content)
    else:
        print(f"Failed to download {book_name}. Status code: {response.status_code}")

print("Book installation finished!")

if not os.path.exists("./db"):
    os.makedirs("./db")
    print("Directory './db' created successfully.")


Directory './data' created successfully.
Downloading odyssey...
Downloading romeo_and_juliet...
Book installation finished!
Directory './db' created successfully.


# Embedder

In [53]:
class Embedder:
  model_name = None
  platform = None
  model = None
  class Platform:
    cohere = CohereEmbeddings
  class Model:
    embed_english_v3_0 = "embed-english-v3.0"

  def __init__(self, platform: Platform, model: Model):
    self.model_name = model
    self.platform = platform
    self.model = platform(model = self.model_name)

# Chunker

In [83]:
class Chunker:
  chunk = None
  chunk_overlap = None
  chunker = None

  def __init__(self, chunk_size: int, chunk_overlap: int):
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    self.chunker = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

# Vector Store

In [106]:
class VectorStore(Chroma):
  docs = None
  embedder = None
  persisten_directory = None
  def __init__(self, docs, embedder, persisten_directory="./vector_database/chroma_db"):
    self.docs = docs
    self.embedder = embedder
    self.persisten_directory = persisten_directory
    self.set_environment()
    super().__init__(persist_directory=persisten_directory, embedding_function=embedder)

  def set_environment(self):
    if not os.path.exists(self.persisten_directory):
      os.makedirs(self.persisten_directory)
    if len(os.listdir(self.persisten_directory)) == 0:
      Chroma.from_documents(docs, self.embedder, persist_directory=persisten_directory)




# Retriver

In [116]:
"""
retriever = db.as_retriever(
    search_type = "mmr",
    search_kwargs = {"k": 3, "fetch_k": 20, "lambda_mult": 0.5}
    #0.4 means lower bound of similarity
    #"k": 3 means return the top 3 similar (most relevant) documents
)
"""
"""
retriever = db.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 3, "score_threshold": 0.3}
)
"""

retriever = db.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": 2}
)

class Retriver:
  class RetriveMethod:
    get: any
    def __init__(self, db, query):
      self.get = db.as_retriever(
        search_type = "similarity",
        search_kwargs = {"k": 2}
      ).invoke(query)


  class Chunker:
    chunk = None
    chunk_overlap = None
    chunker = None

    def __init__(self, chunk_size: int, chunk_overlap: int):
      self.chunk_size = chunk_size
      self.chunk_overlap = chunk_overlap
      self.chunker = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

  class VectorStore(Chroma):
      docs = None
      embedder = None
      persisten_directory = None
      def __init__(self, docs, embedder, persisten_directory):
        self.docs = docs
        self.embedder = embedder
        self.persisten_directory = persisten_directory
        self.set_environment()
        super().__init__(persist_directory=persisten_directory, embedding_function=embedder)

      def set_environment(self):
        if not os.path.exists(self.persisten_directory):
          os.makedirs(self.persisten_directory)
        if len(os.listdir(self.persisten_directory)) == 0:
          Chroma.from_documents(self.docs, self.embedder, persist_directory=persisten_directory)

  class Embedder:
      model_name = None
      platform = None
      model = None
      class Platform:
        cohere = CohereEmbeddings
      class Model:
        embed_english_v3_0 = "embed-english-v3.0"

      def __init__(self, platform: Platform, model: Model):
        self.model_name = model
        self.platform = platform
        self.model = self.platform(model = self.model_name)


  query: str

  embedder: any
  embedder_platform: Embedder.Platform
  embedder_model: Embedder.Model

  source_dir_path: str
  source_files: list
  documents: list

  chunker: any
  chunker_chunk_size: int
  chunker_chunk_overlap: int

  db: VectorStore
  database_directory: str

  def __init__(self,
          query:str,
          source_dir_path: str,
          embedder_platform: Embedder.Platform,
          embedder_model: Embedder.Model,
          chunk_size: int = 1000,
          chunk_overlap: int = 500,
          database_directory:str = "./vector_database/chroma_db"
          ):

    self.query = query

    self.embedder = Embedder(platform=embedder_platform, model=embedder_model).model

    self.chunker = Chunker(chunk_size, chunk_overlap).chunker
    self.chunker_chunk_size = chunk_size
    self.chunker_chunk_overlap = chunk_overlap

    self.documents = []
    self.source_dir_path = source_dir_path
    self.embedder_platform = embedder_platform
    self.embedder_model = embedder_model
    self.source_files = [file for file in os.listdir(source_dir_path)]

    self.database_directory = database_directory
    self.db = VectorStore(self.documents, self.embedder, self.database_directory)


  def retrive(self):
    for source_file in self.source_files:
      file_path = os.path.join(self.dir_path, source_file)
      loader = TextLoader(file_path)
      file_docs = loader.load()

      for doc in file_docs:
        doc.metadata["source"] = source_file
        self.documents.append(doc)

    chunked_source = self.chunker.split_documents(self.source_files)

    print("The number of chunks:", len(chunked_source))
    print("There is a sample chunk:\n", chunked_source[0].page_content)
    self.db = VectorStore(self.documents, self.embedder, self.database_directory)
    method = Retriver.RetriveMethod(self.db, query = self.query)
    return method.get


# RAG Initialize Vector Store

In [117]:
current_dir = os.getcwd()
dir_path = os.path.join(current_dir, "data")
persisten_directory = os.path.join(current_dir, "test", "chorma_db_0.1")
query = "How did juliet die?"
relevant_docs = Retriver(query, dir_path,
          Retriver.Embedder.Platform.cohere, Retriver.Embedder.Model.embed_english_v3_0).retrive()
print("These are the most relevant documents to your query:")
len(relevant_docs)
for i, doc in enumerate(relevant_docs, 1):
  print(f"Document {i}:\n{doc.page_content}\n")
  if doc.metadata:
    print(f"Source {doc.metadata.get('score', doc.metadata['source'])}\n")




AttributeError: 'Retriver' object has no attribute 'dir_path'

Running Queries

In [104]:
"""
retriever = db.as_retriever(
    search_type = "mmr",
    search_kwargs = {"k": 3, "fetch_k": 20, "lambda_mult": 0.5}
    #0.4 means lower bound of similarity
    #"k": 3 means return the top 3 similar (most relevant) documents
)
"""
"""
retriever = db.as_retriever(
    search_type = "similarity_score_threshold",
    search_kwargs = {"k": 3, "score_threshold": 0.3}
)
"""

retriever = db.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": 2}
)

In [105]:
query = "How did juliet die?"
relevant_docs = retriever.invoke(query)

print("These are the most relevant documents to your query:")
len(relevant_docs)
for i, doc in enumerate(relevant_docs, 1):
  print(f"Document {i}:\n{doc.page_content}\n")
  if doc.metadata:
    print(f"Source {doc.metadata.get('score', doc.metadata['source'])}\n")

These are the most relevant documents to your query:
Document 1:
What’s here? A cup clos’d in my true love’s hand?
Poison, I see, hath been his timeless end.
O churl. Drink all, and left no friendly drop
To help me after? I will kiss thy lips.
Haply some poison yet doth hang on them,
To make me die with a restorative.

 [_Kisses him._]

Thy lips are warm!

FIRST WATCH.
[_Within._] Lead, boy. Which way?

JULIET.
Yea, noise? Then I’ll be brief. O happy dagger.

 [_Snatching Romeo’s dagger._]

This is thy sheath. [_stabs herself_] There rest, and let me die.

 [_Falls on Romeo’s body and dies._]

 Enter Watch with the Page of Paris.

PAGE.
This is the place. There, where the torch doth burn.

FIRST WATCH.
The ground is bloody. Search about the churchyard.
Go, some of you, whoe’er you find attach.

 [_Exeunt some of the Watch._]

Pitiful sight! Here lies the County slain,
And Juliet bleeding, warm, and newly dead,
Who here hath lain this two days buried.
Go tell the Prince; run to the Capu

In [26]:
class Embedder:




<function Embedder.mro()>