In [6]:
%pip install -U chromadb langchain langchain-core langchain-huggingface langchain_text_splitters langchain-community langchain-chroma

# Use CUDA enabled llama
%pip install -U llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

Collecting langchain-chroma
  Downloading langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_chroma-0.1.2-py3-none-any.whl (9.3 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-0.1.2
Note: you may need to restart the kernel to use updated packages.


# Starting up the Vector Store

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
# model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    # model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [4]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="docs",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not neccesary
)

# Loading Data into Vector Store

In [51]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./documents/02.txt")
document = loader.load()

In [52]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
split_doc = text_splitter.split_documents(document)

In [53]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(split_doc))]
vector_store.add_documents(documents=split_doc, ids=uuids)

['5b8761a5-9f7d-42f2-8db1-feae7bafbdcb',
 'cf783399-0a40-47c2-a12b-b60d881e06c6']

# Direct usage of llama.cpp

In [1]:
from llama_cpp import Llama

llm = Llama(
    #   model_path="./models/7B/llama-model.gguf",
      # model_path="../privateGPT/models/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    #   model_path="./models/llama-2-7b-chat.Q5_K_M.gguf",
      model_path="./models/Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf",
      n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      n_ctx=8192*4, # Uncomment to increase the context window,
      # n_batch=2048,
)

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from ./models/Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llama_model_loader: - kv   6:                            general.license str              = llama3.1
llama_model_loader: - kv  

In [2]:
# using chat
question = "Where and when is the second hand uniform sale?"
# question = "What is happening on August 14th?"
# question = "Is there anything parents need to be aware of to get their child ready for back to school?"
# question = "Where and when is the second hand uniform in South Horizons Campus?"
# question = "Who should I contact to check whether I registered for bus service?"
# question = "What are the name of the staff in IMS?"

# results = vector_store.similarity_search(question,k=1)
results = vector_store.max_marginal_relevance_search(question,k=3, fetch_k=5)

output = llm.create_chat_completion(
    messages=[
        # {"role": "system", "content": "You are a helpful assistant that outputs in text."},
        # {"role": "system", "content": "You are provide with the context information below\n------------------\n"+results[0].page_content},
        {"role": "system", "content": "You are a helpful assistent, only answer questions from facts that you can find from the context information below\n------------------\n"+results[0].page_content},
        {"role" : "users", "content": question}
         
    ],
    response_format={"type": "text"},
    temperature=0.7,
)
print("Answer:\n" + output['choices'][0]['message']['content'])

NameError: name 'vector_store' is not defined

In [11]:
# debug
output

{'id': 'chatcmpl-8ea7ae07-e4b7-4ce2-8443-47fdc7c9fe84',
 'object': 'chat.completion',
 'created': 1723391611,
 'model': './models/Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': "To get your child ready for the upcoming school year, here are some essential things to be aware of:\n\n1. **Establish a Back-to-School Routine**: Gradually adjust your child's wake-up time to align with the school schedule, starting a week or two before school begins. This will help them adjust to the new wake-up time and make mornings smoother.\n\n2. **Healthy Breakfast**: Ensure your child has a nutritious breakfast to fuel their day. A healthy breakfast can improve concentration, energy levels, and overall academic performance.\n\n3. **Lunch and Snack Planning**: Discuss meal options and plan healthy lunches and snacks with your child. Consider packing reusable containers and water bottles to reduce waste and promote sustainability.\n\n4

In [9]:

# Testing vector store

# question = "Who should I call to get information about my child's bus service? I need the person's name and phone number."
# question = "Where and when is the second hand uniform sale?"
# question = "Where and when is the second hand uniform sale?"
# question = "What are the name of the staff in IMS?"
question = "What is happening on August 14th?"

# results = vector_store.similarity_search(question,k=1)
results = vector_store.max_marginal_relevance_search(question,k=3, fetch_k=5)

for i in range(len(results)):
    doc = results[i]
    print(f"====Document {i}====\n{doc.page_content[:100]}")


====Document 0====
We look forward to seeing everyone back at school on Wednesday, 14 August!


Warmest regards,

Matth
====Document 1====
Itâ€™s time to prepare your child for the upcoming school year! All Elementary children will begin s
====Document 2====
PTA Second Hand Uniform Sale
From:
pta@ims.edu.hk
Date Sent:
Friday, Aug 09 at 12:42 PM
PTA SECOND H


# Use langchain-llama-cpp
It is a lot slower

In [23]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

lc_llm = LlamaCpp(
    model_path="./models/Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf",
    temperature=0,
    # max_tokens=2048,
    n_ctx=8192*4,
    top_p=1,
    callback_manager=callback_manager,
    stop=["NO_OUTPUT."],
    n_gpu_layers=-1, # adding this seems to make it even slower!
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from ./models/Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llama_model_loader: - kv   6:                            general.license str              = llama3.1
llama_model_loader: - kv  

In [25]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    lc_llm,
    retriever=vector_store.as_retriever(search_type = "mmr", kwargs= {"k": 2, "fetch_k": 3})
)

In [26]:
question = "Where and when is the second hand uniform sale?"
result = qa_chain({"query": question})

Number of requested results 20 is greater than number of elements in index 9, updating n_results = 9


 The second-hand uniform sale will be held at Stanley Campus on Wednesday, 14 August from 9:00 am - 12:00 pm & 1:00 pm - 3:00 pm outside the General Office on the G/F. 
Note: This answer is based on the information provided in the email. If you have any further questions or need clarification, please don't hesitate to ask!


llama_print_timings:        load time =    1727.16 ms
llama_print_timings:      sample time =     168.47 ms /    83 runs   (    2.03 ms per token,   492.65 tokens per second)
llama_print_timings: prompt eval time =  116966.01 ms /   839 tokens (  139.41 ms per token,     7.17 tokens per second)
llama_print_timings:        eval time =   25548.35 ms /    82 runs   (  311.57 ms per token,     3.21 tokens per second)
llama_print_timings:       total time =  143020.19 ms /   921 tokens


# More advance retriever

A lot slower still, because it is using the llm to "compress" the initial retrieved data from the vector store.

From the DeepLearning.ai classes [LangChain Chat with Your Data](https://learn.deeplearning.ai/courses/langchain-chat-with-your-data)

In [39]:
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(lc_llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vector_store.as_retriever(search_type = "mmr", kwargs= {"k": 2, "fetch_k": 3})
)

qa_chain = RetrievalQA.from_chain_type(
    lc_llm,
    retriever=compression_retriever
)

In [36]:
docs = compression_retriever.get_relevant_documents("Where and when is the second hand uniform sale?")

Number of requested results 20 is greater than number of elements in index 9, updating n_results = 9


 
- The first sale date is Wednesday, 14â€‹ August 2024
- The next sale date will be in November 2024. 
- If you have any clean, undamaged uniforms, please consider donating them by dropping off laundered uniforms at your campus’ admin desk.
- We accept summer uniforms, winter uniforms, PE uniforms, backpacks and hats. 
- If you have any questions about the second hand uniform sale, please email the PTA at pta@ims.edu.hk. 


llama_print_timings:        load time =    1481.62 ms
llama_print_timings:      sample time =     256.50 ms /   111 runs   (    2.31 ms per token,   432.75 tokens per second)
llama_print_timings: prompt eval time =   52688.98 ms /   358 tokens (  147.18 ms per token,     6.79 tokens per second)
llama_print_timings:        eval time =   38188.33 ms /   110 runs   (  347.17 ms per token,     2.88 tokens per second)
llama_print_timings:       total time =   91557.30 ms /   468 tokens


 

Llama.generate: prefix-match hit


 
- UNIFORMS
- Please ensure your child has the full uniform, including a backpack and a hat. When children are outdoors, they must wear a hat during school hours. Please see the attached IMS Uniform Guide for further details. If you have made a new uniform order from Aston Wilson before July 31, the uniform will be placed in your childâ€™s cubby, and teachers will ensure they take them home once school begins. Should you need to change the size, contact Campus Admin staff or visit Aston Wilson in Kowloon.
- The PTA will host a second-hand uniform sale at Stanley Campus on Stanley Campus on Wednesday, 14 August from 9:00 am - 12:00 pm & 1:00 pm - 3:00 pm outside the General Office on the G/F. 
> Answer: The second hand uniform sale is held at Stanley Campus on Wednesday, 14 August from 9:00 am - 12:00 pm & 1:00 pm - 3:00 pm outside the General Office on the G


llama_print_timings:        load time =    1481.62 ms
llama_print_timings:      sample time =     464.74 ms /   218 runs   (    2.13 ms per token,   469.08 tokens per second)
llama_print_timings: prompt eval time =   33158.79 ms /   224 tokens (  148.03 ms per token,     6.76 tokens per second)
llama_print_timings:        eval time =   70640.70 ms /   217 runs   (  325.53 ms per token,     3.07 tokens per second)
llama_print_timings:       total time =  105064.24 ms /   441 tokens
Llama.generate: prefix-match hit


 
*


llama_print_timings:        load time =    1481.62 ms
llama_print_timings:      sample time =      12.42 ms /     5 runs   (    2.48 ms per token,   402.45 tokens per second)
llama_print_timings: prompt eval time =   14857.01 ms /   116 tokens (  128.08 ms per token,     7.81 tokens per second)
llama_print_timings:        eval time =    1266.98 ms /     4 runs   (  316.75 ms per token,     3.16 tokens per second)
llama_print_timings:       total time =   16154.68 ms /   120 tokens


 

Llama.generate: prefix-match hit


 
1. The camp fee will be invoiced separately in your first month.
2. Further details will be included in a camp letter sent out in January.
3. Please remember to keep your contact information and other relevant details up-to-date. This includes your address, emergency contacts, and any medical information such as allergies. 
4. Parents can view and edit their family's details by clicking the "Update Household Profile" button on the Parent Portal.




llama_print_timings:        load time =    1481.62 ms
llama_print_timings:      sample time =     206.58 ms /    94 runs   (    2.20 ms per token,   455.02 tokens per second)
llama_print_timings: prompt eval time =   25434.74 ms /   183 tokens (  138.99 ms per token,     7.19 tokens per second)
llama_print_timings:        eval time =   29312.47 ms /    93 runs   (  315.19 ms per token,     3.17 tokens per second)
llama_print_timings:       total time =   55270.02 ms /   276 tokens


In [40]:
# # question = "Who should I call to get information about my child's bus service? I need the person's name and phone number."
question = "Where and when is the second hand uniform sale?"
result = qa_chain({"query": question})

Number of requested results 20 is greater than number of elements in index 9, updating n_results = 9


 
- The first Second Hand Uniform Sale for the 2024/2025 academic year is on Wednesday, 14â€‹ August 2024. 
- There will be a second sale date in November 2024. 
- The locations of the sales are

KeyboardInterrupt: 