### ⚙️ | Config

In [1]:
LOCAL_LLM = "gemma2"
LLM_MAX_TOKENS = 512
LLM_TEMPERATURE = 0

### 🔧 | Embeddings (Do not show progress)

In [2]:
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=False)

### 🛠️ | Vector Store 

In [3]:
from langchain_community.vectorstores import Chroma

CHROMA_COLLECTION_NAME = "blogposts"
CHROMA_PERSIST_DIRECTORY = "./db-chromadb"

db = Chroma(
    persist_directory=CHROMA_PERSIST_DIRECTORY,
    embedding_function=embeddings,
    # collection_name=CHROMA_COLLECTION_NAME
)

print(
    f"There are {db._collection.count()} in the <{CHROMA_COLLECTION_NAME}> collection."
)

There are 0 in the <blogposts> collection.


### 📥 | Ingest

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://lilianweng.github.io/posts/2024-04-12-diffusion-video/",
    "https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Initialize a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500, chunk_overlap=300, add_start_index=True
)

# Split the documents into chunks
doc_splits = text_splitter.split_documents(docs_list)

vectorstore = db.from_documents(
    documents=doc_splits,
    embedding=embeddings,
    persist_directory=CHROMA_PERSIST_DIRECTORY,
)

retriever = vectorstore.as_retriever(k=4)

print(
    f"There are {db._collection.count()} vectors in the <{db._collection.name}> collection."
)

USER_AGENT environment variable not set, consider setting it to identify your requests.


There are 182 vectors in the <langchain> collection.


### 📤 | Create Retriever

In [5]:
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

### 🧠 | LLM

In [6]:
from langchain_community.chat_models import ChatOllama

llm = ChatOllama(
    model=LOCAL_LLM,
    keep_alive="3h",
    max_tokens=LLM_MAX_TOKENS,
    temperature=LLM_TEMPERATURE,
)

### 🚀 | Prompt Templates 

In [7]:
from jinja2 import Environment, FileSystemLoader
from langchain_core.prompts import ChatPromptTemplate

PROMPTS_PATH = "prompts"
RAG_PROMPT_FILENAME = "rag-prompt.jinja"

env = Environment(loader=FileSystemLoader(PROMPTS_PATH))
template = env.get_template(RAG_PROMPT_FILENAME)
prompt = ChatPromptTemplate.from_template(template.render())

In [8]:
# Function to print the prompt for a runnable assign
def print_prompt(input_dict):
    formatted_prompt = prompt.format(**input_dict)
    print("Generated Prompt:")
    print(formatted_prompt)
    print("-" * 50)
    return input_dict

In [9]:
# Function to print and pass through the formatted prompt - string output
def print_and_pass_prompt(formatted_prompt):
    print("Generated Prompt:")
    print(formatted_prompt)
    print("-" * 50)
    return formatted_prompt

### 🔗 | Chain

In [10]:
from langchain.schema.runnable import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | print_and_pass_prompt
    | llm
)

### 💬 | Chat with RAG

In [11]:
def ask_question(question: str):
    print("Answer:", end=" ", flush=True)
    for chunk in rag_chain.stream(question):
        print(chunk.content, end="", flush=True)
    print("\n")

In [12]:
user_question = "What is 3D stable disfussion?"
answer = ask_question(user_question)

Answer: Generated Prompt:
messages=[HumanMessage(content='Answer the question based only on the following context:\n[Document(page_content=\'where $\\\\hat{\\\\mathbf{x}}^a_\\\\theta (\\\\mathbf{z}_t), \\\\hat{\\\\mathbf{x}}^b_\\\\theta (\\\\mathbf{z}_t)$ are reconstructions of $\\\\mathbf{x}^a, \\\\mathbf{x}^b$ provided by the denoising model. And $w_r$ is a weighting factor and a large one $w_r >1$ is found to improve sample quality. Note that it is also possible to simultaneously condition on low resolution videos to extend samples to be at the high resolution using the same reconstruction guidance method.\\nModel Architecture: 3D U-Net & DiT#\\nSimilar to text-to-image diffusion models, U-net and Transformer are still two common architecture choices. There are a series of diffusion video modeling papers from Google based on the U-net architecture and a recent Sora model from OpenAI leveraged the Transformer architecture.\\nVDM (Ho & Salimans, et al. 2022) adopts the standard diffus