https://medium.com/@rubentak/unleashing-the-power-of-intelligent-chatbots-with-gpt-4-and-vector-databases-a-step-by-step-8027e2ce9e78

In [5]:
import dotenv
import openai
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader

In [3]:
# Pull variables from the local .env file into the process environment,
# then hand the OpenAI key (None when the variable is unset) to the client library.
dotenv.load_dotenv('.env')
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Documents

In [10]:
# Load the HTML files matched by the glob in the docs directory
# and display how many documents were read.
loader = DirectoryLoader('../../mmm/html/mmm', glob="./*.html")
doc = loader.load()
len(doc)

2

In [11]:
# Break the loaded documents into chunks of up to 1000 characters,
# with 200 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(doc)

# Display how many chunks were produced
len(texts)

6

# ChromaDB Database

In [12]:
# Directory where the Chroma store is kept
persist_directory = 'db'

# Embedding function backed by OpenAI
embedding = OpenAIEmbeddings()

# Embed every chunk and index it in a Chroma store bound to persist_directory
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [13]:
# Write the vector store to disk, then drop the in-memory handle
# (presumably so the following cell demonstrates reloading it from disk).
vectordb.persist()
vectordb = None

In [None]:
# Re-open the Chroma store from persist_directory, using the same embedding function
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)


In [38]:
# Load the persisted Chroma store if one exists on disk; otherwise build it
# from the HTML documents and persist it so the next run can reuse it.
persist_directory = 'db'
docs_directory    = '../../mmm/html/mmm'

# Create the embedding function up front so this cell does not depend on an
# earlier cell having defined it (the old bare `except:` silently hid that
# NameError and rebuilt the index instead).
embedding = OpenAIEmbeddings()

# Decide explicitly whether a store already exists. Relying on an exception
# from Chroma(...) is not a dependable signal: opening a missing directory
# is expected to yield a new, empty store rather than an error.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(
        persist_directory  = persist_directory,
        embedding_function = embedding
        )
else:
    loader = DirectoryLoader(docs_directory, glob="./*.html")
    doc = loader.load()

    # Splitting the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size    = 1000,
        chunk_overlap = 200
        )
    texts = text_splitter.split_documents(doc)

    vectordb = Chroma.from_documents(
        documents        = texts,
        embedding        = embedding,
        persist_directory= persist_directory
        )

    # Write the freshly built index to disk, as the earlier persist cell does
    vectordb.persist()

## Retriever

In [36]:
# Build a retriever over the vector store: MMR search, one document per query
retriever = vectordb.as_retriever(search_type='mmr', search_kwargs={"k": 1})

# Try it with a sample question (Spanish for "Which function computes the ad stock?")
docs = retriever.get_relevant_documents("Qué función calcula el ad stock?")

# Display how many documents came back
len(docs)

Number of requested results 20 is greater than number of elements in index 6, updating n_results = 6


1

In [37]:
# Print the text of each retrieved document. A plain loop replaces the list
# comprehension so the cell no longer builds and echoes a throwaway list of
# None values as its output.
for retrieved_doc in docs:
    print(retrieved_doc.page_content)

Index

Super-module

mmm

Functions

calculate_geom_ad_stock


[None]