In [79]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.evaluation import load_evaluator
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
import openai
import os
import shutil
from dotenv import load_dotenv
from openai import OpenAI
import argparse

# Config

In [80]:
load_dotenv()

os.environ["OPENAI_BASE_URL"] = os.getenv('OPENAI_BASE_URL')
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

In [81]:
CHROMA_PATH = "chroma"
Data_Path = 'data'

# Load Docs

In [94]:
#load docs
def load_documents():
  loader = DirectoryLoader(Data_Path, glob = '*.md')
  documents = loader.load()
  return documents

docs = load_documents()

# Text Splitter

In [95]:
#split docs into chunks
def split_text(documents: list[Document]):
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 500,
    length_function = len,
    add_start_index = True,)

  chunks = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

  return chunks

chunks = split_text(docs)

Split 1 documents into 286 chunks.


In [96]:
document = chunks[0]
print(document.page_content)
print('=======')
print(document.metadata)

The Project Gutenberg eBook of Alice's Adventures in Wonderland

This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.

Title: Alice's Adventures in Wonderland

Author: Lewis Carroll

Release date: June 27, 2008 [eBook #11] Most recently updated: March 30, 2021

Language: English

Credits: Arthur DiBianca and David Widger

START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND [Illustration]

Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents
{'source': 'data/alice_in_wonderland.md', 'start_index': 0}


# Embedding & Save to Vector Store

In [87]:
#test embedding
embedding_function = OpenAIEmbeddings()
vector = embedding_function.embed_query("apple")
print(vector)
print(len(vector))
print("")

evaluator = load_evaluator("pairwise_embedding_distance")
x = evaluator.evaluate_string_pairs(prediction = "apple", prediction_b = "orange")
y = evaluator.evaluate_string_pairs(prediction = "apple", prediction_b = "apple")
print(x)
print(y)

[0.0078030275, -0.023170143, -0.0075982236, -0.02775775, -0.004608087, 0.013046007, -0.021954974, -0.008376478, 0.018923877, -0.029600985, -0.002937229, 0.020043472, -0.0044135232, 0.00916156, -0.021640942, 0.001990011, 0.030775195, 7.461474e-05, 0.0020565721, -0.025532216, -0.021053838, -0.008260422, 0.021449791, -0.012486209, 0.0010999674, 0.005065482, 0.010192405, 8.304157e-05, 0.016015662, -0.012977738, 0.020480387, -0.01623412, -0.018377734, 0.005406822, -0.0193881, -0.009175213, -0.012049294, -0.008738298, -0.005741335, -0.006174837, 0.010520092, 0.0076255305, -0.0063898806, 0.00079404167, -0.023402255, 0.011195945, -0.021791132, -0.0005265166, -0.0127729345, 0.015264715, -0.005543358, -0.0020309717, -0.014855107, -0.005191778, -0.0029167484, 0.002703411, -0.019538289, 0.006427428, -0.010881912, 0.0050074547, 0.005328314, 0.015114525, -0.027921595, 0.0054375427, -0.009134253, 0.0038161788, -0.013496575, -0.008608589, 0.0131142745, 0.015783552, 0.033505913, 0.013168888, 0.01407685

In [98]:
#create vector database
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

os.makedirs(CHROMA_PATH)

In [99]:
embedding_function = OpenAIEmbeddings()
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embedding_function,
    persist_directory="chroma/"
)
vector_store.add_documents(documents=chunks)

['be253a27-7751-4ad8-9158-fd25909a353d',
 'abeccc93-c478-4ba1-817b-edf03fbd956b',
 '58bc0aff-e61e-457f-a132-ade7450fec04',
 '1a871a6a-994a-455c-8ed3-4e3ca3a86564',
 'd077c964-afd8-4980-9767-cfc126f80e2c',
 'd3c8bf54-19d7-48ca-bec7-4302640acbe5',
 'dd4dcc97-d982-465a-aa56-a7af6114fb4e',
 'fff87ca2-5c89-404a-8b01-ce471062d758',
 '55857746-11eb-49f0-8cee-3c7370bc5ba6',
 '2f9c8834-94ba-4aa1-a1ae-bd592a5574a1',
 '8d6a0a38-3374-4919-9df4-6c6524b06785',
 '0ffc99ae-add2-4911-9db2-8c387152baf8',
 'e421d373-de02-4eea-8ec3-47813b7c085e',
 'c9ceec00-fa61-46c5-85d8-b445a1038312',
 'b372105e-f7e8-4cd2-9cbf-c84a4635417a',
 'b2a20937-c728-4b30-ad7c-fcd6ae5edb37',
 '81755219-42a7-45ed-99e7-87a908cf18d1',
 '7b65fb32-0863-4397-bda9-2d53dfa20eae',
 '49f43c88-4314-4d70-8613-e615c64f196e',
 'c7a4f37d-43da-4dcd-b3d1-fbabda75414c',
 '7d69f674-7892-4ff3-b158-7e309ee43dba',
 '63f7fec0-808e-4d66-a572-68a92294b5f3',
 '19a5b310-2476-4f7c-b644-e6247c772c21',
 '296cef22-b4b4-497b-ae64-5cad2d5fb1e0',
 '891a6dc0-c14b-

# Query Data

In [71]:
#can only use in .py not .ipynb
# parser = argparse.ArgumentParser()
# parser.add_argument("query_text", type=str, help="The query text.")
# args = parser.parse_args()
# query_text = args.query_text

In [103]:
#search similar context first
query_text = "How does Alice meet the Mad Hatter"

results = vector_store.similarity_search_with_relevance_scores(query_text, k=3)
if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results.")

In [104]:
#create complete prompt 
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

Human: 
Answer the question based only on the following context:

The players all played at once without waiting for turns, quarrelling all the while, and fighting for the hedgehogs; and in a very short time the Queen was in a furious passion, and went stamping about, and shouting “Off with his head!” or “Off with her head!” about once in a minute.

Alice began to feel very uneasy: to be sure, she had not as yet had any dispute with the Queen, but she knew that it might happen any minute, “and then,” thought she, “what would become of me? They’re dreadfully fond of beheading people here; the great wonder is, that there’s any one left alive!”

She was looking about for some way of escape, and wondering whether she could get away without being seen, when she noticed a curious appearance in the air: it puzzled her very much at first, but, after watching it a minute or two, she made it out to be a grin, and she said to herself “It’s the Cheshire Cat: now I shall have somebody to talk to.”


In [107]:
#use that prompt to ask ChatOpenAI
model = ChatOpenAI()
response_text = model.invoke(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: content='Alice meets the Mad Hatter when she sees him having tea with the March Hare under a tree in front of a house.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 739, 'total_tokens': 765, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None} id='run-e924da03-d910-4799-874c-67ac8fc27f63-0' usage_metadata={'input_tokens': 739, 'output_tokens': 26, 'total_tokens': 765, 'input_token_details': {}, 'output_token_details': {}}
Sources: ['data/alice_in_wonderland.md', 'data/alice_in_wonderland.md', 'data/alice_in_wonderland.md']
