# Setup

Goal: Explore an on-prem RAG solution that is as cost-effective as possible in terms of infrastructure requirements, and integrate it with Vault to protect sensitive information. Use RHT Hybrid Cloud and HCP Terraform as much as possible to automate the full stack, starting from bare metal.

Source: https://github.com/mwright-pivotal/platform_automation_hcp_rhos 

References: 

https://news.microsoft.com/source/features/ai/the-phi-3-small-language-models-with-big-potential/
https://github.com/GURPREETKAURJETHRA/Phi-3-LLM-by-Microsoft/blob/main/Phi3_Testing.ipynb
https://python.langchain.com/docs/integrations/vectorstores/pgvector/#instantiation
https://www.pragnakalp.com/leverage-phi-3-exploring-rag-based-qna-with-microsofts-phi-3/
https://bugbytes.io/posts/vector-databases-pgvector-and-langchain/

Architecture:
- 3 physical hosts
- 2 NVIDIA A2 GPUs
- Red Hat OpenShift w/Ceph
- NVIDIA GPU Operator
- PostgreSQL w/pgvector extension (deployed to OpenShift)
- Developer workspace using Jupyter, allocated a single GPU (deployed to OpenShift)

Note: the latest version of the `pgvector` Python package is not compatible with `PGVector` from `langchain_postgres.vectorstores`, which is why the install below pins `pgvector==0.3.0`.

In [1]:
!pip install --quiet openai pandas numpy tiktoken pgvector==0.3.0 python-dotenv einops datasets sentence_transformers flash_attn
!pip install --upgrade --quiet  langchain-nvidia-ai-endpoints

In [2]:
!huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-q4.gguf --local-dir . --local-dir-use-symlinks False

Downloading 'Phi-3-mini-4k-instruct-q4.gguf' to '.cache/huggingface/download/Phi-3-mini-4k-instruct-q4.gguf.8a83c7fb9049a9b2e92266fa7ad04933bb53aa1e85136b7b30f1b8000ff2edef.incomplete'
Phi-3-mini-4k-instruct-q4.gguf: 100%|███████| 2.39G/2.39G [00:23<00:00, 102MB/s]
Download complete. Moving file to Phi-3-mini-4k-instruct-q4.gguf
Phi-3-mini-4k-instruct-q4.gguf


In [None]:
!CUDACXX="/usr/local/cuda-12/bin/nvcc" CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all-major" pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir

# Load the Terraform docs from GitHub as plain text

In [3]:
!git clone https://github.com/hashicorp/terraform

!mkdir text-docs
!cd terraform/docs; for filename in *.md; do pandoc $filename -f markdown -t plain -o ../../text-docs/$filename.txt; done

Cloning into 'terraform'...
remote: Enumerating objects: 305156, done.[K
remote: Counting objects: 100% (3758/3758), done.[K
remote: Compressing objects: 100% (1816/1816), done.[K
remote: Total 305156 (delta 2221), reused 3182 (delta 1849), pack-reused 301398 (from 1)[K
Receiving objects: 100% (305156/305156), 306.97 MiB | 18.55 MiB/s, done.
Resolving deltas: 100% (192729/192729), done.
Updating files: 100% (4672/4672), done.


In [1]:
#!git clone https://github.com/hashicorp/tutorials

!cd tutorials/content/tutorials; for filename in **/*.mdx; do pandoc $filename -f markdown -t plain -o $filename.txt; done

^C


In [2]:
from pathlib import Path
import openai
import os
import pandas as pd
import numpy as np
import json
import tiktoken
import ast
import pgvector
import math
from transformers import AutoTokenizer, AutoModel

# Load every converted Terraform doc under text-docs/ into a DataFrame with
# one row per file: title (the file path), content (the full text) and url.
rows = []
# Sort so the row order is reproducible between runs (glob order is not).
pathlist = sorted(Path('text-docs/').glob('**/*.txt'))
for path in pathlist:
    # Read explicitly as UTF-8 (the same files are loaded as UTF-8 by the
    # TextLoader cell further down) instead of the platform default encoding.
    content = path.read_text(encoding='utf-8')
    rows.append({'title': str(path), 'content': content, 'url': 'https://github.com/hashicorp/terraform/tree/main/docs'})

# Build the frame once instead of growing it row by row with df.loc[...].
df = pd.DataFrame(rows, columns=['title', 'content', 'url'])

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by get_embeddings_hf() for the psycopg2-based
# flow (Option 2). It is loaded with trust_remote_code=True and then moved to
# the GPU with .cuda(), so this cell needs a CUDA device.
embedding_model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True).cuda()

In [3]:
# Helper functions to help us create the embeddings

# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Count the tokens tiktoken produces for ``string``.

    Args:
        string: Text to tokenize. Falsy values (``""``, ``None``) count as 0
            and never reach tiktoken.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        The number of tokens in ``string`` under ``encoding_name``.
    """
    if string:
        return len(tiktoken.get_encoding(encoding_name).encode(string))
    return 0

# Helper function: calculate length of essay
def get_essay_length(essay):
    """Return the number of whitespace-separated words in ``essay``."""
    return len(essay.split())

# Helper function: calculate cost of embedding num_tokens
# Assumes we're using the text-embedding-ada-002 model
# See https://openai.com/pricing
def get_embedding_cost(num_tokens):
    """Estimate the price of embedding ``num_tokens`` tokens.

    Uses a flat rate of 0.0001 per 1000 tokens (the rate the surrounding
    comments attribute to text-embedding-ada-002).
    """
    thousands_of_tokens = num_tokens / 1000
    price_per_thousand = 0.0001
    return thousands_of_tokens * price_per_thousand

# Helper function: calculate total cost of embedding all content in the dataframe
def get_total_embeddings_cost():
    """Estimate the cost of embedding every document in the global ``df``.

    Sums the token counts of all values in ``df['content']`` and prices the
    total with ``get_embedding_cost``.

    Returns:
        The estimated cost for the whole frame (0 for an empty frame).
    """
    # Iterate the column values directly: looking rows up by label with
    # df['content'][i] only works while the index happens to be 0..n-1.
    total_tokens = sum(num_tokens_from_string(text) for text in df['content'])
    return get_embedding_cost(total_tokens)

# Helper function: get embeddings for a text
def get_embeddings(text):
    """Embed ``text`` with the global ``llm`` and return the embedding.

    Newlines are replaced by spaces before the text is sent to
    ``llm.create_embedding``; the first entry of the response's ``"data"``
    list supplies the embedding.

    NOTE(review): ``llm`` is read from notebook globals and must expose
    ``create_embedding`` -- presumably the llama_cpp ``Llama`` instance rather
    than the HuggingFacePipeline that is also bound to ``llm``; confirm which
    cell ran last before calling this.
    """
    single_line = text.replace("\n"," ")
    result = llm.create_embedding(input = single_line)
    return result["data"][0]["embedding"]

# Helper function: get embeddings for a text
def get_embeddings_hf(text):
    """Embed ``text`` with the global sentence-transformers ``embedding_model``.

    The text is encoded as a batch of one, so the result is the batch output
    of ``encode``; callers in this notebook take element ``[0]`` to get the
    single embedding.
    """
    batch = [text]
    return embedding_model.encode(batch)

In [4]:
###############################################################################
# Build new_list: the documents from df split into chunks of roughly
# MAX_CHUNK_TOKENS tokens each.
# Every row is [title, content, url, token_count]; embeddings are appended to
# these rows by a later cell.
# Token counts come from num_tokens_from_string (tiktoken), so they are only
# an approximation of what another model's tokenizer would produce.
###############################################################################
MAX_CHUNK_TOKENS = 512
# 1 token ~ 3/4 of a word, so this many words approximates MAX_CHUNK_TOKENS
WORDS_PER_CHUNK = MAX_CHUNK_TOKENS * 3 // 4

# list for chunked content and embeddings
new_list = []
for title, content, url in zip(df['title'], df['content'], df['url']):
    token_len = num_tokens_from_string(content)
    if token_len <= MAX_CHUNK_TOKENS:
        # Small enough already: keep the document as a single chunk.
        new_list.append([title, content, url, token_len])
        continue

    # str.split() with no argument already discards all whitespace, so no
    # further filtering of empty entries is needed.
    words = content.split()
    for start in range(0, len(words), WORDS_PER_CHUNK):
        chunk_text = ' '.join(words[start:start + WORDS_PER_CHUNK])
        chunk_tokens = num_tokens_from_string(chunk_text)
        if chunk_tokens > 0:
            new_list.append([title, chunk_text, url, chunk_tokens])

### Option 1: Interacting with PGVector using Langchain VectorStore & HuggingFacePipeline

Table creation and structure are handled by the LangChain API.
It expects the database (e.g. `langchain`) to have been created beforehand.

In [17]:
!pip install -qU langchain_postgres langchain-huggingface langchain langchain-community accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model for the LangChain vector store (Option 1). This is a
# different model from `embedding_model`, which Option 2 uses.
# Alternative that was tried: "sentence-transformers/all-mpnet-base-v2"
model_name = "flax-sentence-embeddings/stackoverflow_mpnet-base"
# Run the model on the GPU.
model_kwargs = {"device": "cuda", "trust_remote_code": True}
# Embeddings are stored as produced, without normalization.
encode_kwargs = {
    "normalize_embeddings": False,
}

embeddings_lc = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)



In [6]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load every converted doc under text-docs/ as LangChain documents and
# collect them all in document_list.
document_list = []
pathlist = Path('text-docs/').glob('**/*.txt')
for path in pathlist:
    loader = TextLoader(path, encoding='utf-8')
    documents = loader.load()
    document_list+=documents

#print(documents)  # would print only the documents of the last file loaded
print(len(document_list))  # total documents loaded across all files (192 in the recorded run)

192


In [7]:
# Split the loaded documents into chunks of at most 1000 characters with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(document_list)

#print(texts)
print(len(texts))  # number of chunks (5514 in the recorded run)

5514


In [8]:
# NOTE: re-running this cell inserts the same chunks again. Drop the
# langchain_pg_* tables first if you execute it more than once.
# NOTE(review): PGVector.from_documents appears to accept
# pre_delete_collection=True for the same purpose -- verify against the
# installed langchain_postgres version before relying on it.
import getpass
import os
from urllib.parse import quote_plus

from langchain_core.documents import Document
from langchain_postgres import PGVector

# Never commit the database password to the notebook: read it from the
# environment and fall back to an interactive prompt.
pgPassword = os.environ.get("PGVECTOR_PASSWORD") or getpass.getpass("PostgreSQL password: ")
# quote_plus keeps passwords containing '@', ':' or '/' from corrupting the URL.
connection = "postgresql+psycopg://postgres:" + quote_plus(pgPassword) + "@pgvector-postgresql/langchain"  # Uses psycopg3!
collection_name = "my_docs"

# Embed every chunk in `texts` with embeddings_lc and store the results in
# the "my_docs" collection of the `langchain` database.
vector_store = PGVector.from_documents(
    embedding=embeddings_lc,
    documents=texts,
    collection_name=collection_name,
    connection=connection,
)

In [9]:
# Sanity-check retrieval: fetch the 2 stored chunks closest to the query.
query = "What does an operation consist of?"
similar = vector_store.similarity_search_with_score(query, k=2)

# Each result is a (Document, score) pair, as the recorded output shows.
for doc in similar:
    print(doc, end="\n\n")

(Document(id='d885d10a-1f7f-43c2-8d19-d90262d03c04', metadata={'source': 'text-docs/kubernetes-scope-microservice.mdx.txt'}, page_content='how it works out'), 0.5332930048354858)

(Document(id='dfbb9bcb-1ad8-42d7-834e-edb1443049e9', metadata={'source': 'text-docs/architecture.md.txt'}, page_content='-   NodeDestroyResourceInstance.Execute, which handles the main destroy\n    operation.\n\nA vertex must complete successfully before the graph walk will begin\nevaluation for other vertices that have “happens after” edges.\nEvaluation can fail with one or more errors, in which case the graph\nwalk is halted and the errors are returned to the user.\n\nExpression Evaluation\n\nAn important part of vertex evaluation for most vertex types is\nevaluating any expressions in the configuration block associated with\nthe vertex. This completes the processing of the portions of the\nconfiguration that were not processed by the configuration loader.\n\nThe high-level process for expression evaluation

In [10]:
# A utility function for answer generation
def ask(question):
    """Answer ``question`` using documents retrieved from the vector store.

    Relies on the notebook globals ``retriever`` and ``chain`` (created in
    the following cell): the retriever supplies the context documents and the
    QA chain generates the answer from them.

    Returns:
        The chain's ``'output_text'`` value.
    """
    docs = retriever.invoke(question)
    chain_inputs = {"input_documents": docs, "question": question}
    outputs = chain(chain_inputs, return_only_outputs=True)
    return outputs['output_text']

In [11]:
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

# Prompt template using the Phi-3 chat tags (<|system|>, <|user|>,
# <|assistant|>, <|end|>). {context} receives the retrieved documents and
# {question} the user's question.
qna_prompt_template="""<|system|>
You have been provided with the context and a question, try to find out the answer to the question only using the context information. If the answer to the question is not found within the context, return "I dont know" as the response.<|end|>
<|user|>
Context:
{context}

Question: {question}<|end|>
<|assistant|>"""
PROMPT = PromptTemplate(
   template=qna_prompt_template, input_variables=["context", "question"]
)

# Uncomment to drop a previously bound `llm` before loading the model below:
#del llm
# Load Phi-3-mini onto the GPU and wrap it in a text-generation pipeline
# capped at 300 new tokens per answer.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", device_map='cuda', torch_dtype="auto", trust_remote_code=True,)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
# NOTE: `llm` is rebound to a llama_cpp Llama instance in the Option 2 cells.
llm = HuggingFacePipeline(pipeline=pipe)

# Define the QNA chain ("stuff": all retrieved documents go into one prompt).
# The recorded output shows load_qa_chain emitting a deprecation notice that
# points to the LangChain migration guides.
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)

# Retriever used by ask(): returns the 3 closest chunks for a question.
retriever = vector_store.as_retriever(search_kwargs = {"k" : 3})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)


In [13]:
# Take the user input and call the function to generate output
user_question = input("User: ")
answer = ask(user_question)
# Keep only the text after the last <|assistant|> tag -- presumably the
# pipeline output echoes the prompt before the generated answer.
answer = (answer.split("<|assistant|>")[-1]).strip()
print("Answer:", answer)

User:  what is waypoint?


Answer: HCP Waypoint is a HashiCorp-managed application deployment platform that simplifies the process of deploying applications into your infrastructure and helps you standardize your deployment process.


### Option 2: Interacting with PGVector with psycopg2 and Llama CPP

In [None]:
#Note: binary option works with running in containers...

!pip uninstall -y psycopg2
!pip install psycopg2-binary --no-binary :all:

In [None]:
# Create embeddings for each piece of content.
# Rows in new_list are [title, content, url, tokens]; the embedding becomes
# the fifth element of each row.
for row in new_list:
    # Slice-assign instead of append so that re-running this cell replaces
    # the previous embedding rather than adding a sixth element, which would
    # break the five-column DataFrame built from new_list afterwards.
    row[4:] = [get_embeddings_hf(row[1])]

In [None]:
# Leftover experiment, not used:
#query_embeddings = model.encode(queries, instruction=query_prefix, max_length=max_length)

# One row per chunk; requires the embeddings cell above to have run so every
# new_list row has all five fields.
df_new = pd.DataFrame(new_list, columns=['title', 'content', 'url', 'tokens', 'embeddings'])

In [None]:
from urllib.parse import quote

# Read the password from the environment instead of typing it into the
# notebook; an unset variable yields the same empty password as before.
pgPassword = os.environ.get('PGVECTOR_PASSWORD', '')

# Percent-encode the password so characters such as '@', ':' or '/' cannot
# corrupt the connection URI.
# NOTE(review): the URI names no database, while Option 1 connects to
# /langchain -- confirm which database this flow is meant to use.
os.environ['PGVECTOR_DB'] = 'postgres://postgres:' + quote(pgPassword, safe='') + '@pgvector-postgresql'

connection_string  = os.environ['PGVECTOR_DB']

In [None]:
from llama_cpp import Llama
import psycopg2  # was `from psycopg2`, which is a SyntaxError
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

# Load the quantized Phi-3 GGUF model with llama.cpp. This rebinds `llm`,
# replacing the HuggingFacePipeline bound to the same name in Option 1.
# NOTE(review): this path is absolute, while the later Llama cell loads
# ./Phi-3-mini-4k-instruct-q4.gguf -- confirm both point at the same file.
llm = Llama(
  model_path="/home/jovyan/Phi-3-mini-4k-instruct-q4.gguf",  # path to GGUF file
  n_ctx=4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=35, # The number of layers to offload to GPU, if you have GPU acceleration available. Set to 0 if no GPU acceleration is available on your system.
)

In [None]:
# Embed the sample query with the LangChain embedding model and inspect the
# raw vector. The model object is named `embeddings_lc`; no `embeddings`
# name is defined in this notebook.
vector = embeddings_lc.embed_query(query)
print(vector)

In [None]:
# Connect to the PostgreSQL database using the connection string
conn = psycopg2.connect(connection_string)
cur = conn.cursor()

# Enable the pgvector extension in this database (no-op if already enabled)
cur.execute("CREATE EXTENSION IF NOT EXISTS vector");
conn.commit()

# Register the vector type with psycopg2
register_vector(conn)

# Start from a clean slate: any existing embeddings table is dropped, so
# re-running this cell discards previously inserted rows.
table_drop_command = """
DROP TABLE IF EXISTS embeddings;
"""
cur.execute(table_drop_command)

# Create table to store embeddings and metadata
# NOTE(review): vector(1024) assumes embedding_model emits 1024-dimensional
# vectors -- confirm with the .shape check a few cells below.
table_create_command = """
CREATE TABLE IF NOT EXISTS embeddings (
            id bigserial primary key, 
            title text,
            url text,
            content text,
            tokens integer,
            embedding vector(1024)
            );
            """

cur.execute(table_create_command)
cur.close()
conn.commit()

In [None]:
# The previous cell closed its cursor; register the vector type again and
# open a fresh cursor for the insert and query cells below.
register_vector(conn)
cur = conn.cursor()

In [None]:
# Peek at the embeddings column of the first five rows
df_new.head()['embeddings']

In [None]:
# Shape of the first row's embedding array (sanity check before inserting)
df_new.loc[0]['embeddings'].shape

In [None]:

# Select only the chunks that came from the README document
df_temp = df_new.loc[df_new['title'] == 'text-docs/README.md.txt']

In [None]:
# Save the README subset (df_temp), embeddings included, as a CSV file.
# Note this is not the full df_new frame.
df_temp.to_csv('terraformdocs_data_and_embeddings.csv', index=False)
# It may also be useful to save as a json file, but we won't use this in the tutorial
#df_new.to_json('blog_data_and_embeddings.json')

In [None]:
# Batch insert embeddings and metadata from the dataframe into PostgreSQL.
# Prepare the list of tuples to insert, in the column order of the INSERT.
# row['embeddings'][0] takes the single vector out of the batch-of-one
# result that get_embeddings_hf returns.
data_list = [(row['title'], row['url'], row['content'], int(row['tokens']), row['embeddings'][0]) for index, row in df_new.iterrows()]
# Use execute_values to perform batch insertion
execute_values(cur, "INSERT INTO embeddings (title, url, content, tokens, embedding) VALUES %s", data_list)
# Commit after we insert all embeddings
conn.commit()

In [None]:
# Count the stored rows; num_records is reused below to size the index.
cur.execute("SELECT COUNT(*) as cnt FROM embeddings;")
num_records = cur.fetchone()[0]
print("Number of vector records in table: ", num_records,"\n")

In [None]:
# print the first record in the table, for sanity-checking
cur.execute("SELECT * FROM embeddings LIMIT 1;")
records = cur.fetchall()
print("First record in table: ", records)

In [None]:
# Create an IVFFlat index on the data for faster retrieval.
# This isn't really needed for a small table, but it shows the usage for
# larger datasets.
# Note: always create this type of index after the data has been inserted.

# Size the index following the pgvector guidance: rows / 1000 lists for up to
# 1M rows, sqrt(rows) above that, and never fewer than 10.
# `lists` must be an integer: true division or math.sqrt would put a float
# such as 12.345 into the SQL statement, which PostgreSQL rejects.
num_lists = max(10, num_records // 1000)
if num_records > 1000000:
    num_lists = int(math.sqrt(num_records))

# Use the cosine distance measure, which is what the queries below use (<=>).
cur.execute(f'CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = {num_lists});')
conn.commit()

In [None]:
from llama_cpp import Llama


# Load the quantized Phi-3 GGUF model again, this time from the current
# working directory, and rebind `llm` to it.
llm = Llama(
  model_path="./Phi-3-mini-4k-instruct-q4.gguf",  # path to GGUF file
  n_ctx=4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=35, # The number of layers to offload to GPU, if you have GPU acceleration available. Set to 0 if no GPU acceleration is available on your system.
)


In [None]:
# Sample queries, each prefixed with a retrieval instruction.
# NOTE(review): `queries` is not read by any later cell shown here -- the
# retrieval below embeds the raw question without this prefix; confirm
# whether the prefix was meant to be applied there too.
query_prompt = "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: "
queries = [
    "What is Terraform?",
    "What are the benefits of Terraform?",
]
queries = [query_prompt + query for query in queries]

In [None]:
# Helper function: Get top 3 most similar documents from the database
def get_top3_similar_docs(query_embedding, conn):
    """Return the 3 stored chunks closest to ``query_embedding``.

    Args:
        query_embedding: The query vector; converted to a NumPy array before
            being passed to the database.
        conn: An open psycopg2 connection to the database holding the
            ``embeddings`` table.

    Returns:
        A list of up to 3 rows, each a 1-tuple ``(content,)``, ordered from
        most to least similar.
    """
    embedding_array = np.array(query_embedding)
    # Make sure psycopg2 can adapt NumPy arrays to the vector type
    register_vector(conn)
    # The context manager closes the cursor even if the query raises; the
    # original left one open cursor behind per call.
    with conn.cursor() as cur:
        # <=> is pgvector's cosine-distance operator, matching the
        # vector_cosine_ops index created on the embedding column.
        cur.execute("SELECT content FROM embeddings ORDER BY embedding <=> %s LIMIT 3", (embedding_array,))
        return cur.fetchall()

In [None]:
# Question about Terraform we want the model to answer.
# NOTE(review): this rebinds the builtin input(), so re-running the earlier
# cell that calls input("User: ") will fail afterwards. Renaming it means
# also updating the cell that passes `input` to process_input_with_retrieval.
input = "What is the default planning behavior of terraform?"

In [None]:
# Function to process input with retrieval of most similar documents from the database
def process_input_with_retrieval(user_input):
    """Build the chat messages for ``user_input`` with retrieved context.

    Embeds the question with ``get_embeddings_hf``, fetches the closest
    stored chunks through ``get_top3_similar_docs`` (using the global
    ``conn``) and packs everything into a system/user/assistant message list.
    No model is called here; the caller decides what to do with the messages.

    Args:
        user_input: The user's question.

    Returns:
        A list of three ``{"role": ..., "content": ...}`` dicts.
    """
    delimiter = "```"

    # Step 1: Get documents related to the user input from the database.
    # get_embeddings_hf returns a batch of one, hence the [0].
    related_docs = get_top3_similar_docs(get_embeddings_hf(user_input)[0], conn)

    # Step 2: Assemble the prompt.
    # Set system message to help set appropriate tone and context for model
    system_message = """
    You are a friendly chatbot. \
    You can answer questions about terraform, its features and its use cases. \
    You respond in a concise, technically credible tone. \
    """

    # Join however many rows came back (each row is a 1-tuple) instead of
    # indexing [0]..[2], which raised IndexError when the table held fewer
    # than three rows. Every document now gets the same " \n " separator;
    # previously the second and third ran together.
    context = " \n ".join(row[0] for row in related_docs)

    # Prepare messages to pass to model
    # We use a delimiter to help the model understand where the user_input starts and ends
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"{delimiter}{user_input}{delimiter}"},
        {"role": "assistant", "content": f"Relevant Terraform information: \n {context}"},
    ]

    return messages

In [None]:
# Build the retrieval-augmented messages for the question and show them.
# `input` here is the question string assigned above, not the builtin.
response = process_input_with_retrieval(input)
print(input)
print(response)