This script was part of testing different approaches for an LLM chatbot for first responders to natural disasters, using Llama with RAG. The goal was to include location- and disaster-specific documents in various forms (e.g., after-action reports from previous disasters in the area, local disaster protocols, vulnerable population locations) in the RAG vector store, then allow first responders to query the LLM during an actual disaster to quickly help them make decisions (identify next steps, decide areas to target, etc.). The ultimate tool would be built and hosted in AWS, but packaged and downloadable locally so that responders could run it on their laptops in the field if Wi-Fi was not available. 

In this script, I was doing part of the testing for a local version of the tool - it is not in a final, production-clean state, as we were producing this proof-of-concept for a grant, but the result of this script is a functioning local RAG LLM (tested using the code in the final cell).

In [1]:
# in terminal, need to run (package names are space-separated, not comma-separated):
# pip install llama-index-vector-stores-faiss langchain-huggingface faiss-cpu

from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains.retrieval_qa.base import RetrievalQA

from langchain_community.llms import Ollama
from langchain_huggingface import HuggingFaceEmbeddings
#from langchain_community.embeddings import HuggingFaceEmbeddings

from sentence_transformers import SentenceTransformer

#from pymilvus import MilvusClient, DataType, utility
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core import VectorStoreIndex, StorageContext, ServiceContext
#from llama_index.embeddings.jinaai import JinaEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore

import faiss
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS

from transformers import AutoTokenizer

import cProfile, pstats
from pstats import SortKey
from datetime import datetime

import textwrap

  from tqdm.autonotebook import tqdm, trange


In [2]:
import os
# Display the notebook's current working directory.
os.getcwd()

'C:\\Users\\natra\\Documents\\CrisisReady\\Meta Grant'

In [2]:
# Root folder that holds the RAG source documents. The trailing backslash is kept
# so that sub-folder and file names can be appended to it directly.
source_dir = "\\".join(
    ["C:", "Users", "natra", "Documents", "CrisisReady", "Meta Grant", "rag-data", ""]
)

In [3]:
# Hugging Face model id of the embedding model used for the LangChain/FAISS experiment.
model_id = "Snowflake/snowflake-arctic-embed-l"
# Earlier attempts that loaded the model/tokenizer directly (kept for reference):
#model = SentenceTransformer(model_id)
#tokenizer = AutoTokenizer.from_pretrained(model_id)
#tokenizer.pad_token = tokenizer.eos_token
# LangChain embeddings wrapper (HuggingFaceEmbeddings comes from langchain_huggingface).
embeddings = HuggingFaceEmbeddings(model_name=model_id)
#tokenizer = embeddings.tokenizer
#tokenizer.pad_token = tokenizer.eos_token
# Background on the HuggingFace embedding import problem:
# https://www.reddit.com/r/LangChain/comments/16m1nee/not_being_able_to_use_huggingfaceembedding_from/




In [4]:
# Folder containing the NIDM case-study PDFs.
nidm_data = f"{source_dir}NIDM Case Studies\\"

# Timing note: on a 100-watt (not full-power) supply, loading the 65 PDFs in
# NIDM Case Studies took 24 min (largest file 161,000 KB, smallest 600 KB).
# The two timestamps printed below bracket the load.
print(datetime.now())
loader = PyPDFDirectoryLoader(nidm_data)
docs = loader.load()
print(datetime.now())

2024-06-22 17:49:14.728195


Advanced encoding /KSCms-UHC-H not implemented yet


2024-06-22 18:16:50.212803


In [5]:
# Reuse the LlamaIndex settings (chunk size 1024, default overlap 20) so the two
# pipelines are chunked with the same numbers.
# NOTE(review): this splitter measures length in characters by default, while the
# LlamaIndex figure is presumably in tokens -- confirm the two are comparable.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=1024)

# Timestamps before and after bracket the split.
print(datetime.now())
doc_chunks = text_splitter.split_documents(docs)
print(datetime.now())

# Number of chunks produced (shown as the cell's output).
len(doc_chunks)

2024-06-22 18:16:50.219953
2024-06-22 18:16:50.627938


13509

In [None]:
# Build the FAISS index from the chunked documents.
#
# History: a single FAISS.from_documents(...) call over all chunks ran for 4+ hours
# with no completion and a significant drain on the machine's resources, and was
# stopped. Embedding in batches gives visible progress, so a stalled run can be
# spotted early instead of after hours of silence.
embed_batch_size = 256  # chunks embedded per call; lower this if memory is tight

if not doc_chunks:
    raise ValueError("doc_chunks is empty - nothing to index")

print(datetime.now())
db = None
for batch_start in range(0, len(doc_chunks), embed_batch_size):
    batch = doc_chunks[batch_start:batch_start + embed_batch_size]
    if db is None:
        # The first batch creates the index (and fixes its dimensionality).
        db = FAISS.from_documents(documents=batch, embedding=embeddings)
    else:
        # Later batches are embedded and appended to the same index.
        db.add_documents(batch)
    done = batch_start + len(batch)
    print(f"{datetime.now()}  embedded {done}/{len(doc_chunks)} chunks")
print(datetime.now())

# Total number of vectors stored in the index.
print(db.index.ntotal)

In [None]:
# Persist the FAISS index to a folder next to the source documents.
faiss_local_loc = f"{source_dir}nidm_case_studies_faiss"
db.save_local(folder_path=faiss_local_loc)

In [None]:
# The top-of-file pymilvus import is commented out, so import the client here;
# without it this cell fails with NameError on MilvusClient.
from pymilvus import MilvusClient

# 1. Set up a Milvus client against the local server
client = MilvusClient(uri="http://localhost:19530")

# 2. Print the schema/configuration of the "meta_proto" collection
res = client.describe_collection(collection_name="meta_proto")
print(res)

# 3. Print the number of entities stored in the collection
res = client.query(collection_name="meta_proto", output_fields=["count(*)"])
print(res)

{'collection_name': 'meta_proto', 'auto_id': False, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}, 'is_primary': True}, {'field_id': 101, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}], 'aliases': [], 'collection_id': 449918578127474310, 'consistency_level': 0, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}
[{'count(*)': 10308}]


In [3]:
#JINA_AI_API_KEY = os.environ["JINA_AI_API_KEY"]  # key redacted: never commit secrets; the previously exposed key should be revoked
#
#embeddings = JinaEmbeddings(
#   jina_api_key=JINA_AI_API_KEY, model_name="jina-embeddings-v2-small-en",
#    embed_batch_size=16
#)

In [12]:
# The embedding model can be chosen from the HuggingFace MTEB leaderboard:
# https://huggingface.co/spaces/mteb/leaderboard

# First attempted Alibaba-NLP/gte-large-en-v1.5 on 5/21/2024 as the highest-ranking model for Retrieval
# that was small enough to run relatively easily on local machines.
# However, received ValueError: Loading Alibaba-NLP/gte-large-en-v1.5 requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

# Next, attempted Snowflake/snowflake-arctic-embed-l as the relatively small (< 500 mil params) next-highest performer.
# Trying a larger embedding batch size for speed, as suggested on the git issues page - requires more memory.
# NOTE: this is the LlamaIndex HuggingFaceEmbedding class, and it rebinds the name `embeddings`
# that the earlier LangChain cell assigned with HuggingFaceEmbeddings.
embeddings = HuggingFaceEmbedding(model_name="Snowflake/snowflake-arctic-embed-l",embed_batch_size=20)

You try to use a model that was created with version 2.7.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [13]:
# Register the global LlamaIndex defaults: the embedding model and the LLM.
# NOTE(review): Ollama here is the LangChain class imported at the top of the file --
# confirm LlamaIndex accepts it as Settings.llm in the installed version.
Settings.embed_model = embeddings
Settings.llm = Ollama(model="llama3")

In [14]:
# Load documents with LlamaIndex; the printed timestamps bracket the load.
print(datetime.now())

# Initial set of documents: the South Asia Disasters PDFs and NIDM Case Studies folders, plus
# AIDMI's Fact Finding Report - Cyclone Michaung Minor Text Edits.pdf and
# Cyclone Michaung Infographic - April 1 2024.pptx.pdf.
# These took 10 min to load into a documents object with SimpleDirectoryReader:
#documents = SimpleDirectoryReader(source_dir,recursive=True).load_data(num_workers=10)

# Next batch of documents: India cyclone Meta mobility data as csvs.
#   - took 4 min to load everything
#   - took 2 min to load the data without the mobility/ folder (VectorStoreIndex was getting hung up)
#   - took 40 sec to load a single population-density csv
#meta_data = source_dir + "meta_cyclone_mobility_india_2024\\"

# Current run: only the NIDM case studies folder.
nidm_data = f"{source_dir}NIDM Case Studies\\"
reader = SimpleDirectoryReader(nidm_data, recursive=True)
documents = reader.load_data(num_workers=10)

print(datetime.now())


2024-06-05 14:55:47.716071
2024-06-05 14:56:21.786331


In [15]:
# Sanity check: id of the first loaded document, then the number of documents loaded.
print(documents[0].doc_id, len(documents), sep="\n")

0cba0a29-94d1-4d14-a28b-61dbb7f9f6d3
2


In [16]:
# MilvusVectorStore is not imported anywhere else in this file; import it here so the
# cell does not fail with NameError.
from llama_index.vector_stores.milvus import MilvusVectorStore

#MilvusDB_local = "C:\\Users\\natra\\milvus"
# This address is for the Milvus DB running on Docker Compose.
# NOTE(review): confirm the installed MilvusVectorStore honours `address`; some versions
# take the server location through a `uri` argument instead.
vector_store = MilvusVectorStore(address="localhost:19530",
                                 collection_name="meta_proto",
                                 # vector dimension; matches the 'dim': 1024 reported for the
                                 # existing meta_proto collection
                                 dim=1024,
                                 # use overwrite if you want to replace the existing DB;
                                 # once loaded, I won't overwrite for now
                                 #overwrite=True
                                 )
# Storage context that routes LlamaIndex index data into the Milvus store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Run this cell when loading NEW documents into the vector store.
#
# Observed run times:
#   - PDFs in the initial load: 3.25 hrs
#   - non-mobility meta data: after 04:53 hrs it was estimating another 14:36 hours just for
#     the first step, 'Parsing nodes' (about 3 gb ram, 20+ % of my CPU)
#   - single population-density csv: 54 min
#   - 2 pop density csvs with embed_batch_size increased to 20: up to 100% cpu utilization
#     (across all cores); had run for 53 min and was starting the 2nd generating-embeddings
#     pass, likely another ~50 min to finalize
print(datetime.now())
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=embeddings,
    show_progress=True,
)
print(datetime.now())

In [6]:
# Run this cell instead of the build step when the documents are already in the
# vector store and the index is only needed for querying.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [None]:
# Ask the RAG-backed LLM a sample question; the printed timestamps bracket the query.
query_engine = index.as_query_engine()

print(datetime.now())
response = query_engine.query(
    "There is a monsoon approaching Tezpur, Assam. "
    "What are the key humanitarian needs which local organizations need to be aware of "
    "based on the history of response to monsoons in this area? "
    "And please contextualize this in terms of any policy advisory on best practices "
    "for monsoon respose. "
    "Thanks -- my boss will execute me if you don't get this exactly right"
)
# Wrap the answer at 100 columns so it is readable in the notebook output.
wrapped_answer = textwrap.fill(str(response), width=100)
print(wrapped_answer)
print(datetime.now())

### FROM HERE BELOW JUST SIMPLE EXAMPLE, with lite Milvus database
#### uses same embeddings

In [2]:
# Simple example: build a LangChain Milvus vector store from a single PDF.
# `Milvus` is not imported anywhere else in this file; import it here so the cell
# does not fail with NameError.
from langchain_community.vectorstores import Milvus

source_dir = 'C:\\Users\\natra\\Documents\\CrisisReady\\Meta Grant\\rag-data\\'
#file = source_dir + "AIDMI's Fact Finding Report - Cyclone Michaung Minor Text Edits.pdf"
file = source_dir + "Cyclone Michaung Infographic - April 1 2024.pptx.pdf"
#file = source_dir + "worldpop_ucdb_stats_india.csv"

# Load the PDF into LangChain documents.
loader = PyPDFLoader(file)
data = loader.load()

# Split into 500-character chunks with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

# LangChain's Milvus store needs a LangChain embeddings object. The global `embeddings`
# may have been rebound to the LlamaIndex HuggingFaceEmbedding by an earlier cell, so
# build a LangChain embedder for the same model here rather than relying on cell order.
lc_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")

vector_store = Milvus.from_documents(documents=all_splits, embedding=lc_embeddings)

In [6]:
# The streaming handler is not imported anywhere else in this file; import it here so
# the cell does not fail with NameError.
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Local Ollama LLM that streams generated tokens to stdout.
llm = Ollama(
    # Name the model explicitly: the stop token below is Llama 3's end-of-turn marker,
    # and the rest of the notebook uses llama3, so do not rely on the library default.
    model="llama3",
    # `callbacks` replaces the deprecated `callback_manager` argument.
    callbacks=[StreamingStdOutCallbackHandler()],
    stop=["<|eot_id|>"],
)


In [None]:
# `hub` is not imported anywhere else in this file; import it here so the cell does
# not fail with NameError.
from langchain import hub

# Read the question interactively and fetch the RAG prompt template from LangChain Hub.
query = input("\nQuery: ")
prompt = hub.pull("rlm/rag-prompt")

# Retrieval-augmented QA chain: retrieve from the vector store, answer with the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=vector_store.as_retriever(), chain_type_kwargs={"prompt": prompt}
)

# result limited by the limited number of context documents I stored in the RAG locally for testing
# Use .invoke(): calling the chain object directly is deprecated and emits a warning.
result = qa_chain.invoke({"query": query})
print(result)


Query:  How should Chennai prepare for the next cyclone?


  warn_deprecated(


