In [6]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.embeddings import HuggingFaceEmbedding
import chromadb

# From start to finish this notebook takes the better part of a day because the AWS documentation corpus is LARGE. TODO: look into ways to speed it up before going to prod.

In [13]:
# A smaller embedding model is used for now: SFR-mistral (top of MTEB) might
# take days to embed the initial dataset. The model can easily be changed
# later by editing the name below.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [8]:
# Planned ingestion pipeline: the user uploads files to an S3 bucket, then we
# retrieve each file from S3 and put it through this pipeline.
import os

# Location of the corpus on disk. Set the AWS_DOCS_DIR environment variable to
# point somewhere else; the default is the original local checkout path.
AWS_DOCS_DIR = os.environ.get(
    "AWS_DOCS_DIR", "C:/Users/Adel/Desktop/aws-documentation-main"
)

# Walk the corpus directory recursively and load only Markdown and PDF files.
documents = SimpleDirectoryReader(
    input_dir=AWS_DOCS_DIR,
    recursive=True,
    required_exts=[".md", ".pdf"],
).load_data()


In [9]:
from llama_index.llms import LlamaCPP

# Llama-2 13B chat (Q4_0 GGUF) is used until the whole pipeline is confirmed
# to work end to end. To move to 70B, just replace this link.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

# All LlamaCPP settings gathered in one place.
llama_cpp_settings = dict(
    # URL of the model file; it is downloaded automatically.
    model_url=model_url,
    # Alternatively, point this at a pre-downloaded model instead of model_url.
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Llama 2 has a 4096-token context window; stay below it for wiggle room.
    context_window=3900,
    # Extra kwargs forwarded to __call__().
    generate_kwargs={},
    # Extra kwargs forwarded to __init__(); n_gpu_layers must be at least 1
    # to use the GPU.
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

llm = LlamaCPP(**llama_cpp_settings)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 
Model metadata: {'general.name': 'LLaMA v2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'llama.rope.dimension_count': '128', 'llama.embedding_length': '5120', 'llama.block_count': '40', 'llama.feed_forward_length': '13824', 'llama.attention.head_count': '40', 'tokenizer.ggml.eos_token_id': '2', 'general.file_type': '2', 'llama.attention.head_count_kv': '40', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.unknown_token_id': '0'}


In [10]:
from llama_index.node_parser.text import SentenceSplitter

# Splitting takes a while, depending on machine spec and corpus size.
# The current chunking is acceptable but not great; a content-aware splitter
# is planned for a future MVP.
# TODO: test performance with smaller chunk sizes (128, 256, <1k).
text_parser = SentenceSplitter(chunk_size=1024)

# Split every document once, keeping the chunks grouped per document.
chunks_per_doc = [text_parser.split_text(doc.text) for doc in documents]

# Flat list of all chunks, in document order.
text_chunks = [chunk for doc_chunks in chunks_per_doc for chunk in doc_chunks]

# doc_idxs[i] is the position in `documents` of the document that produced
# text_chunks[i]; it is used later to inject the source document's metadata.
doc_idxs = [
    doc_idx
    for doc_idx, doc_chunks in enumerate(chunks_per_doc)
    for _ in doc_chunks
]

In [11]:
from llama_index.schema import TextNode

# Build one TextNode per chunk and attach the metadata of the document the
# chunk was split from (text_chunks and doc_idxs are parallel lists).
nodes = []
for text_chunk, doc_idx in zip(text_chunks, doc_idxs):
    node = TextNode(text=text_chunk)
    # Give each node its own copy of the metadata. Assigning the document's
    # dict directly would make every chunk of that document (and the document
    # itself) share one mutable dict, so editing one node's metadata would
    # silently change all the others.
    node.metadata = dict(documents[doc_idx].metadata)
    nodes.append(node)

In [None]:
# This step takes the most time: expect the better part of a day or more with
# a large model (tested on the AWS documentation with a beefy laptop; an HPC
# machine may be an option for future runs).
# Observed inference time ranges from 1ms to 100ms per input (most are <7ms);
# the discrepancy still needs investigating.
# If the logs appear frozen and you are unsure whether the code is still
# running, type anything in any cell and the logs get flushed in real time.

# Number of nodes handed to the embedding model per call. Working in slices
# keeps only one slice of rendered texts and vectors in flight at a time.
NODES_PER_EMBED_CALL = 256

for start in range(0, len(nodes), NODES_PER_EMBED_CALL):
    node_batch = nodes[start:start + NODES_PER_EMBED_CALL]
    # Embed the slice with the batch API instead of one call per node; each
    # text is the node content rendered with all of its metadata.
    batch_embeddings = embed_model.get_text_embedding_batch(
        [node.get_content(metadata_mode="all") for node in node_batch]
    )
    # Embeddings come back in input order, so pair them up positionally.
    for node, node_embedding in zip(node_batch, batch_embeddings):
        node.embedding = node_embedding

In [None]:
# The vector store is persisted to a local directory for the time being; it
# will move to a server or a Docker container closer to prod.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("aws_documentation")

# Wrap the Chroma collection so llama_index can use it as its vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Pass the local LlamaCPP model explicitly. Without `llm=`, from_defaults
# resolves its default LLM instead of the model loaded earlier in this
# notebook.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

In [None]:
# `nodes` are already chunked TextNodes with their embeddings attached, so
# build the index from them directly with the constructor. `from_documents`
# expects Document objects and runs them through the document-parsing
# pipeline, which is not what we want for pre-built nodes.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    service_context=service_context,
)