In [None]:
import os
from pathlib import Path

import numpy as np
import psycopg2
import transformers
from dotenv import load_dotenv
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from pgvector.psycopg2 import register_vector
from sentence_transformers import SentenceTransformer
from torch import bfloat16, cuda

In [None]:
# Load variables from the .env file in the parent directory into the process
# environment, then read the Hugging Face access token (None when unset).
env_path = Path('..', '.env')
load_dotenv(dotenv_path=env_path)
hf_auth = os.getenv('HUGGING_FACE_TOKEN')

In [None]:
# Hugging Face Hub repository ID shared by the config, model and tokenizer loads.
model_id = "meta-llama/Llama-2-13b-chat-hf"

Quantization config with Bits and Bytes

In [None]:
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
)

Model config

In [None]:
# Fetch the model configuration for `model_id`, authenticating with the HF token.
model_config = transformers.AutoConfig.from_pretrained(model_id, token=hf_auth)

Initialize the model and move it to the GPU. The download can take about 10 minutes if the model is not already in the local Hugging Face cache.

In [None]:
# Load the causal-LM weights with the 4-bit quantization config defined above.
# `device_map='auto'` leaves weight placement to the loader.
#
# `trust_remote_code` is explicitly off: enabling it allows Python files hosted
# in the Hub repository to be executed locally, and the Llama architecture is
# implemented inside transformers itself, so no remote code is required.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=False,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=hf_auth,
)

In [None]:
# Name the current CUDA device when one is available; otherwise fall back to CPU.
if cuda.is_available():
    device = f"cuda:{cuda.current_device()}"
else:
    device = "cpu"

In [None]:
# Switch the model to evaluation mode, then report the device string chosen
# in the previous cell.
# NOTE(review): the model was loaded with device_map='auto', so `device` may
# not describe where every weight actually lives — confirm if this matters.
model.eval()
print(f"Model loaded on {device}")

Load the matching Llama 2 13B tokenizer.

In [None]:
# Load the tokenizer from the same Hub repository as the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_auth)

In [None]:
# Text-generation pipeline built from the loaded model and tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    # generation parameters
    max_new_tokens=512,  # upper bound on generated tokens
    temperature=0.01,
    repetition_penalty=1.1,
)

In [None]:
# Smoke-test the pipeline with one prompt and print the first result's text.
res = generate_text(
    "What is the difference between a desert and dessert?"
)
print(
    res[0]["generated_text"]
)

In [None]:
# Display the raw value returned by the pipeline call above for inspection.
res

In [None]:
# Wrap the transformers pipeline so it can be used as a LangChain LLM.
# HuggingFacePipeline is imported in the notebook's top import cell.
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
# Ask the same question through the LangChain wrapper.
llm(
    prompt="What is the difference between a desert and dessert?",
)

### Initializing a Retrieval QA chain

Redoing the embeddings with a Hugging Face model in a separate table to see whether there is much of a functional difference.

In [None]:
# Sentence-embedding model used by `get_embeddings` to encode text.
embed_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")


In [None]:
# Open the Postgres connection used by the retrieval cells.
# The connection string is resolved here, before its first use, so the
# notebook runs top to bottom on a fresh kernel. Set PG_CONNECTION_STRING
# (for example in the .env file loaded earlier) to override the
# local-development default.
connection_string = os.environ.get(
    "PG_CONNECTION_STRING",
    "postgresql://postgres:password@localhost:55432/ai_experiments",
)
conn = psycopg2.connect(connection_string)

In [None]:
# Sample question used to exercise retrieval and generation.
query = 'Are property setbacks for pools different in East Hampton?'

In [None]:
def get_top3_similar_docs(query_embedding, conn):
    """Return the three stored chunks closest to ``query_embedding``.

    Args:
        query_embedding: Embedding vector for the query. It is converted
            with ``np.asarray`` before being bound as the SQL parameter.
        conn: Open psycopg2 connection to the database that holds the
            ``documentchunk`` table.

    Returns:
        The rows from ``cursor.fetchall()``: at most three 1-tuples
        ``(chunk_text,)``, ordered by the ``<=>`` operator on the
        ``embedding`` column (pgvector's cosine-distance operator).
    """
    embedding_array = np.asarray(query_embedding)
    # Register pgvector's type adapters on this connection so the NumPy
    # array can be passed as a query parameter.
    register_vector(conn)
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cur:
        cur.execute(
            "SELECT chunk_text FROM documentchunk ORDER BY embedding <=> %s LIMIT 3",
            (embedding_array,),
        )
        return cur.fetchall()

In [None]:
def get_embeddings(user_input):
    """Encode ``user_input`` with the notebook-level ``embed_model``.

    Returns whatever ``embed_model.encode`` produces for the given input.
    """
    embedding = embed_model.encode(user_input)
    return embedding

In [None]:
# Retrieve the nearest stored chunks for the sample query.
get_top3_similar_docs(
    query_embedding=get_embeddings(query),
    conn=conn,
)

In [None]:
# Send the query straight to the generation pipeline, with no retrieved context.
generate_text(query)

In [None]:
def process_input_with_retrieval(user_input):
    """Run ``user_input`` through the LLM with retrieved chunks as context.

    The query is embedded with ``get_embeddings``, the closest chunks are
    fetched with ``get_top3_similar_docs`` over the notebook-level ``conn``,
    and a system/user/assistant message list is passed to ``generate_text``.

    Args:
        user_input: The user's question as a string.

    Returns:
        Whatever ``generate_text`` returns for the assembled messages.
    """
    delimiter = "```"

    # Step 1: Get documents related to the user input from database
    related_docs = get_top3_similar_docs(get_embeddings(user_input), conn)

    # Plain string: there are no placeholders, so no f-prefix is needed.
    system_message = """
    You are a friendly chatbot. \
    You can answer questions about timescaledb, its features and its use cases. \
    You respond in a concise, technically credible tone. \
    """

    # Each row is a 1-tuple ``(chunk_text,)``. Joining whatever came back
    # avoids an IndexError when the query yields fewer than three rows, and
    # puts the same separator between every chunk.
    context = " \n ".join(row[0] for row in related_docs)

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"{delimiter}{user_input}{delimiter}"},
        {"role": "assistant", "content": f"Relevant Timescale case studies information: \n {context}"},
    ]

    return generate_text(messages)

In [None]:
# Run the retrieval-augmented flow on the sample query and keep the pipeline output.
rag_test = process_input_with_retrieval(query)

In [None]:
# `embed_model_id` was not defined anywhere before this cell, which raises a
# NameError on a fresh kernel. It is set here to the Hub ID of the MiniLM
# checkpoint already used for the SentenceTransformer above.
# TODO confirm this is the intended model.
#
# NOTE(review): this rebinds `embed_model` from the SentenceTransformer to a
# LangChain embeddings wrapper. `get_embeddings` calls `embed_model.encode`,
# which this wrapper is not expected to provide — verify before calling
# `get_embeddings` after this cell has run.
embed_model_id = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

In [None]:
from langchain.vectorstores.pgvector import PGVector

In [None]:
# Postgres connection string. The PG_CONNECTION_STRING environment variable
# takes precedence so real credentials stay out of the notebook; the literal
# below is only a local-development fallback.
connection_string = os.environ.get(
    "PG_CONNECTION_STRING",
    "postgresql://postgres:password@localhost:55432/ai_experiments",
)

In [None]:
# Roll back the current transaction on the shared connection (presumably to
# clear a failed statement so `conn` can be reused — confirm intent).
conn.rollback()