In [10]:
import torch
# Report whether PyTorch can see a CUDA GPU (the recorded output below is False).
torch.cuda.is_available()

False

### Install LangChain and LangChain Community

In [11]:
# !pip install langchain langchain-community pypdf


In [12]:
# pip install langchain-openai


In [13]:
# pip install llama-index llama-parse


#### Install Cohere, which we'll use to generate embeddings.

In [14]:
# pip install cohere

In [15]:
import os
import shutil
import getpass

#### Create an account on Cohere's platform to get a free trial API key

In [16]:
# Prompt for the Cohere key only when it is not already set, so re-running this
# cell (or running the notebook with the variable exported) does not re-prompt
# or overwrite a key that is already in place. getpass keeps the key out of the
# notebook source and the rendered output.
if not os.environ.get('COHERE_API_KEY'):
    os.environ['COHERE_API_KEY'] = getpass.getpass('Cohere API Key:')

Cohere API Key: ········


In [17]:
import cohere
# Shared Cohere client, authenticated with the key stored in the environment above;
# the CohereEmbeddings class and the demo embed call below use this module-level `co`.
co = cohere.Client(os.environ['COHERE_API_KEY'])

#### Getting embeddings from dummy text

In [18]:
# Smoke test: embed one throwaway sentence to confirm the client and key work.
# The result is kept in `response` but not used by later cells.
response = co.embed(
    model='embed-english-v3.0',
    input_type='classification',
    texts=['Pavan is a developer evangelist'],
)
# print(response.embeddings)

In [19]:
# Directory that load_documents() scans for Markdown (*.md) source files.
DATA_PATH = "data"

In [20]:
 # pip install unstructured[md]  #to load docs

In [21]:
from langchain_community.document_loaders import DirectoryLoader

def load_documents():
    """Load every ``*.md`` file found directly under ``DATA_PATH``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for the matched files.
    """
    return DirectoryLoader(DATA_PATH, glob="*.md").load()

In [22]:
# print(load_documents()) #works

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

In [24]:
def split_text(documents: list[Document], chunk_size: int = 300, chunk_overlap: int = 100):
    """Split documents into overlapping character-based chunks and print a sample.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The list of chunks produced by the splitter. Each chunk's metadata
        carries a ``start_index`` entry because ``add_start_index=True``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Preview one chunk as a sanity check. Chunk 10 is shown when it exists;
    # for small corpora fall back to the last chunk instead of raising
    # IndexError, and skip the preview entirely when nothing was produced.
    if chunks:
        document = chunks[min(10, len(chunks) - 1)]
        print(document.page_content)
        print(document.metadata)

    return chunks

In [25]:
 # split_text(load_documents())

In [26]:
# Define a custom Cohere embeddings class
class CohereEmbeddings:
    """Small adapter that lets a vector store obtain embeddings from Cohere.

    It exposes the two methods the vector store is expected to call:
    ``embed_documents`` for a batch of texts and ``embed_query`` for a single
    query string.

    Args:
        model: Name of the Cohere embedding model to request.
        client: Optional object with an ``embed(texts=..., model=...)`` method.
            When omitted, the module-level ``co`` client is used at call time.
    """

    def __init__(self, model='embed-english-v2.0', client=None):
        self.model = model
        self.client = client

    def _embed(self, texts):
        """Send a list of strings to the embed endpoint and return the vectors."""
        # Resolve the client lazily so a `co` created or replaced after this
        # object was constructed is still picked up.
        client = self.client if self.client is not None else co
        response = client.embed(texts=texts, model=self.model)
        return response.embeddings

    def embed_documents(self, texts):
        """Embed a batch of texts.

        Returns:
            One embedding vector per input text, in order. An empty input
            returns an empty list without calling the API.
        """
        texts = list(texts)
        if not texts:
            return []
        return self._embed(texts)

    def embed_query(self, texts):
        """Embed a single query string and return its single vector.

        The parameter keeps its original name ``texts`` for compatibility.
        A plain string is wrapped in a list before the call (otherwise it
        would be sent as a sequence of characters) and the one resulting
        vector is unwrapped. A non-string sequence is still accepted and
        yields one vector per item, as before.
        """
        if isinstance(texts, str):
            return self._embed([texts])[0]
        return self._embed(list(texts))

#### Install chromadb (our vector database of choice)

In [27]:
# pip install chromadb

In [40]:
# pip install --upgrade chromadb==0.4.14

In [29]:
from langchain.vectorstores.chroma import Chroma

In [30]:
# Directory where the Chroma store is persisted; save_to_chroma() deletes and
# rebuilds it on every run.
CHROMA_PATH = 'chroma_r'

In [31]:
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from the given chunks.

    Any existing directory at ``CHROMA_PATH`` is removed first, so the store
    only ever contains the chunks passed to this call.
    """
    # Start from a clean slate: drop whatever was persisted previously.
    store_exists = os.path.exists(CHROMA_PATH)
    if store_exists:
        shutil.rmtree(CHROMA_PATH)

    # Embed the chunks with the Cohere adapter and write the new store to disk.
    embedder = CohereEmbeddings()
    store = Chroma.from_documents(chunks, embedder, persist_directory=CHROMA_PATH)
    store.persist()

    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [32]:
def generate_data_store():
    """Run the indexing pipeline: load documents, split them, save to Chroma."""
    save_to_chroma(split_text(load_documents()))

In [33]:
# def ensure_dir(directory):
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#     os.chmod(directory, 0o777)

In [34]:
def main():
    """Entry point: build the vector store from the source documents."""
    generate_data_store()

In [35]:
# Build the vector store: load the Markdown files, split them into chunks,
# embed the chunks and persist them under CHROMA_PATH.
main()

Split 1 documents into 2219 chunks.
Part V – Driving and other offences relating to the Use of Vehicles on Roads
{'source': 'data/The Traffic Act.md', 'start_index': 1876}
Saved 2219 chunks to chroma_r.


  warn_deprecated(


In [36]:
# List the working directory to check that the CHROMA_PATH directory now exists.
os.listdir()

['Getting Started.ipynb',
 'images',
 '.ipynb_checkpoints',
 'd2l-pytorch-sagemaker-studio-lab',
 'training_llama.ipynb',
 'data',
 'chroma_r']

In [41]:
# Reopen the store persisted under CHROMA_PATH, using the same embeddings class
# that save_to_chroma() used to build it.
embedding_function = CohereEmbeddings()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

In [42]:
# Natural-language question to look up in the indexed documents.
query_text = "Is using phone while driving illegal"

In [43]:
# Ask the store for the k=3 chunks most similar to query_text.
# NOTE(review): presumably returns (document, relevance score) pairs — confirm
# against the LangChain vector store API.
result = db.similarity_search_with_relevance_scores(query_text, k=3)

AttributeError: 'CohereEmbeddings' object has no attribute 'embed_query'

In [37]:
# from langchain_community.document_loaders import PyPDFLoader

# loader = PyPDFLoader("TrafficAct_Cap.403.pdf")
# pages = loader.load_and_split()
# print(type(pages))

In [38]:
# os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

In [39]:
# from langchain_community.vectorstores import FAISS
# from langchain_openai import OpenAIEmbeddings

# faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())
# docs = faiss_index.similarity_search("How will the community be engaged?", k=2)
# for doc in docs:
#     print(str(doc.metadata["page"]) + ":", doc.page_content[:300])