In [2]:
"""
Documents are loaded into memory
"""

from typing import List
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.docstore.document import Document

def load_data(path: str = './docs/', glob: str = "**/*.txt") -> List[Document]:
    """
    Load the text files found under a directory into memory.

    Args:
        path: Directory handed to ``DirectoryLoader`` as the search root.
        glob: Pattern handed to ``DirectoryLoader`` to select the files.

    Returns:
        The documents returned by ``DirectoryLoader.load``.

    Raises:
        Exception: Any error raised while loading is re-raised unchanged
            after "Error loading documents." has been printed.
    """
    loader = DirectoryLoader(
        path,
        glob=glob,
        loader_cls=TextLoader,
        # Forwarded to every TextLoader instance; presumably lets it guess
        # the file encoding instead of assuming one -- see TextLoader docs.
        loader_kwargs={'autodetect_encoding': True},
    )
    try:
        docs = loader.load()
    except Exception:
        # `Exception` instead of a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) is not reported as a loading error.
        print("Error loading documents.")
        raise
    print(f"{len(docs)} documents loaded.")
    return docs


In [3]:
"""
This module contains the code to:
1. Split the data into chunks (sentences merged into pieces of roughly 500 characters).
2. Create vector embeddings of these sentences.
3. Store them in a vectorstore.
"""
from typing import List
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from chromadb.config import Settings
import chromadb


def process_data(docs: List[Document]):
    """
    Chunk the documents, embed the chunks and store them in a Chroma vectorstore.

    Args:
        docs: Documents whose ``page_content`` is split and whose
            ``metadata`` is copied onto every chunk produced from them.

    Returns:
        The vectorstore built by ``Chroma.from_documents`` on top of a
        chromadb client that uses ``./.vectorstore`` as persist directory.

    Note:
        The client is reset before the chunks are added, so whatever the
        client held before this call is discarded.
    """

    # Split on "." and pack the pieces into chunks targeting 500 characters.
    splitter = CharacterTextSplitter(
        separator=".", chunk_size=500, chunk_overlap=0)
    source_chunks = [
        # Each chunk gets its own metadata dict; sharing the parent's dict
        # would let a change on one chunk leak into all of its siblings.
        Document(page_content=chunk, metadata=dict(source.metadata))
        for source in docs
        for chunk in splitter.split_text(source.page_content)
    ]

    print('chunks created: ', len(source_chunks))

    # Create vector embeddings and store in vectorstore.
    print('Creating embeddings...')
    embedding = HuggingFaceEmbeddings()

    print('Creating vectorstore...')

    client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory="./.vectorstore"
    ))
    # NOTE(review): persist() on the freshly opened client looks redundant,
    # but it may be what guarantees the persist directory exists before
    # reset() runs -- confirm against the chromadb version in use before
    # removing it.
    client.persist()

    # Cleaning up the client
    client.reset()

    # Build the store once, with the HuggingFace embedding attached. The
    # extra `Chroma(client=client)` that used to precede this call was
    # overwritten immediately and was created without an embedding function.
    return Chroma.from_documents(
        documents=source_chunks, embedding=embedding, client=client)


In [4]:
"""
The main bot file.
"""

import textwrap
from langchain.llms import OpenAI
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.callbacks import get_openai_callback
from openai.error import AuthenticationError


class Bot:
    """
    Question-answering bot that drafts a short mail from retrieved context.

    Attributes:
        llm: The language model used to generate the answer.
        chain: The ``RetrievalQA`` chain built by the most recent ``ask``
            call, or ``None`` before the first call.
    """

    def __init__(self, llm=None):
        """
        Args:
            llm: Optional language model to use. When omitted, an ``OpenAI``
                model with temperature 0 and at most 200 tokens is created.
        """
        if llm is None:
            llm = OpenAI(temperature=0, max_tokens=200, verbose=True)
        self.llm = llm
        self.chain = None

    def ask(self, vectorstore, question):
        """
        Answer a question using the documents held in the vectorstore.

        The single most similar chunk decides which source document is
        used: retrieval for the answer is then restricted to chunks that
        carry the same ``source`` metadata value.

        Args:
            vectorstore: Store offering ``similarity_search`` and
                ``as_retriever``.
            question: The user's name followed by the question, as the
                prompt describes.

        Returns:
            The text produced by the chain, or "Invalid API key." when the
            OpenAI call fails with an ``AuthenticationError``.
        """

        # Using the LLMChain to make your own prompt.
        prompt = PromptTemplate(template=textwrap.dedent("""
        Provided is the name of a user, followed by a question.
        Write a short, crisp mail, answering the question based on the information provided in the context.
        Make sure to exclude any information that is not relevant to the question.
        Do not hallucinate. If you cant find the answer in the context, mention that you dont know.
        #####

        Context:
        {context}

        Question:
        {question}"""), input_variables=['context', 'question'])

        search_kwargs = {"k": 4}
        top_hits = vectorstore.similarity_search(question, k=1)
        # An empty store or a chunk without a 'source' entry used to raise
        # IndexError / KeyError here; retrieval now simply runs unfiltered.
        most_relevant_source = (
            top_hits[0].metadata.get('source') if top_hits else None)
        if most_relevant_source is not None:
            search_kwargs["filter"] = {'source': most_relevant_source}

        self.chain = RetrievalQA.from_chain_type(
            llm=self.llm, chain_type='stuff',
            retriever=vectorstore.as_retriever(search_kwargs=search_kwargs),
            chain_type_kwargs={"verbose": True, "prompt": prompt})

        with get_openai_callback() as callback:
            try:
                response = self.chain.run(question)
            except AuthenticationError:
                response = "Invalid API key."
            # Printed on success and on authentication failure alike.
            print(callback)

        return response


In [8]:
import os
from dotenv import load_dotenv

print('Running...')

# Loading environment variables for OpenAI.
load_dotenv()
# Fail early and readably when the key is missing. The previous
# `os.environ[...] = os.getenv(...)` was a no-op when the key was set and
# raised an opaque TypeError (None is not a str) when it was not.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file.")

# Read the source documents.
docs = load_data()

# Chunk, embed and index the documents.
vectorstore = process_data(docs)

bot = Bot()


Created a chunk of size 1103, which is longer than the specified 500


Running...
2 documents loaded.
chunks created:  11
Creating embeddings...


Using embedded DuckDB with persistence: data will be stored in: ./.vectorstore
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


Creating vectorstore...


In [9]:
# Sample customer questions, each written as "<user name>: <question>".
question1 = (
    "travis may: Is or will the app be compatible with the Samsung health app?"
)
question2 = (
    "Sofia: Hi, How can I change the calories goal? Thanks"
)

In [10]:
# Put the first sample question to the bot and print the drafted mail.
response = bot.ask(vectorstore, question1)
print(response)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Provided is the name of a user, followed by a question.
Write a short, crisp mail, answering the question based on the information provided in the context.
Make sure to exclude any information that is not relevant to the question.
Do not hallucinate. If you cant find the answer in the context, mention that you dont know.
#####

Context:
Currently, the following information is imported from Fitbit devices into the app:

- Steps
- Calories burned on activities

The following information is exported from the app to your Fitbit account:

- Foods consumed with the number of calories
- Water consumed


The app can be connected to Fibit, but this option is only available in the Plus version of the app.
Offer the user the code CODE1MONTH to try the Fitbit connection for free

The promo codes can be used inside the "My account" section of the setting menu of th

In [12]:
# Print the reply stored in `response` by the earlier bot.ask(...) cell.
print(response)



Hi Travis,

Thank you for your question. Unfortunately, the app is not currently compatible with Samsung Health. We are working on it, however, so please stay tuned for updates.

Regards,
[Your Name]
