In [1]:
%pip install requests
%pip install numpy
%pip install ollama

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from typing import Callable, List, Optional, Tuple

import numpy as np
import requests as r
import ollama
from ollama import chat
from ollama import ChatResponse

In [3]:
# Settings
# Base URL of the Ollama server; handed to both the LLM and Embeddings wrappers below.
BASE_URL = "http://localhost:11434"  # Ollama
# Model name given to the LLM wrapper for text generation.
GEN_MODEL = "Llama3.1"
# Model name given to the Embeddings wrapper; the validation cell expects
# 384-dimensional vectors from it.
EMBEDDINGS_MODEL = "all-minilm"

In [None]:
class LLM:
    """Thin wrapper around an Ollama text-generation model.

    Args:
        model: Name of the Ollama model to query.
        base_url: Base URL of the Ollama server that requests are sent to.
    """

    def __init__(self, model: str, base_url: str) -> None:
        self.model = model
        self.base_url = base_url
        # Use a dedicated client so that `base_url` actually takes effect; the
        # module-level `ollama.generate` helper always talks to the library's
        # default host and would silently ignore this setting.
        self._client = ollama.Client(host=base_url)

    def generate(self, prompt: str) -> str:
        """Send `prompt` to the model and return the generated text.

        Args:
            prompt: The full prompt to complete.

        Returns:
            The text of the (non-streaming) completion returned by the server.
        """
        result = self._client.generate(
            model=self.model, prompt=prompt, stream=False
        )
        return result.response


llm = LLM(model=GEN_MODEL, base_url=BASE_URL)

In [5]:
# Validate yourself with the test cases below
# NOTE(review): this compares the raw model output to an exact string, so it
# presumably fails whenever the model adds whitespace or extra words — confirm.
assert llm.generate("What is 2+2? Answer only numbers") == "4"

In [None]:
from transformers import AutoTokenizer

# Initialize the tokenizer for the specific model
# NOTE(review): this loads the tokenizer of the MiniLM sentence-embedding model,
# whereas the answer being measured came from GEN_MODEL. The counts below are
# presumably only an approximation of what the generative model consumed — confirm.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# The text to be tokenized
# NOTE(review): input_text holds the whole assert statement, not only the prompt
# string that was passed to llm.generate — verify this is intended.
input_text = 'assert llm.generate("What is 2+2? Answer only numbers") == "4"'
response_text = "4"

# Tokenize the input text (special tokens are excluded from the count)
input_tokens = tokenizer.encode(input_text, add_special_tokens=False)

# Tokenize the response text (special tokens are excluded from the count)
response_tokens = tokenizer.encode(response_text, add_special_tokens=False)

# Print the number of tokens for both input and response
print(f"Number of tokens in input: {len(input_tokens)}")
print(f"Number of tokens in response: {len(response_tokens)}")

  from .autonotebook import tqdm as notebook_tqdm


Number of tokens in input: 23
Number of tokens in response: 1


### Step 2: Generate embeddings using API

Here is another example of how to call LLM API to generate embeddings. Embeddings are a different type of model where you can get a vector representation of the text. This is useful for vector search. You can find more information here:

- https://platform.openai.com/docs/guides/embeddings 

In [7]:
class Embeddings:
    """Thin wrapper around an Ollama embedding model.

    Args:
        model: Name of the Ollama embedding model to query.
        base_url: Base URL of the Ollama server that requests are sent to.
    """

    def __init__(self, model: str, base_url: str) -> None:
        self.model = model
        # Kept as an attribute for backward compatibility; the requests
        # themselves go through the client created below.
        self.endpoint = f"{base_url}/api/embed"
        # Use a dedicated client so that `base_url` actually takes effect; the
        # module-level `ollama.embed` helper always talks to the default host.
        self._client = ollama.Client(host=base_url)

    def generate(self, texts: List[str]) -> List[np.ndarray]:
        """Embed each text and return one vector per input, in input order.

        Args:
            texts: The texts to embed.

        Returns:
            A list of 1-D NumPy arrays, one per input text. An empty input
            list yields an empty list without contacting the server.
        """
        if isinstance(texts, list) and not texts:
            return []

        response = self._client.embed(model=self.model, input=texts)
        return [np.array(vector) for vector in response.embeddings]

embeddings = Embeddings(model=EMBEDDINGS_MODEL, base_url=BASE_URL)

In [8]:
embeddings.generate(["Hello", "world"])

[array([-6.28815740e-02,  5.49236460e-02,  5.19983880e-02,  8.57613200e-02,
        -8.28843400e-02, -7.45355560e-02,  6.84593600e-02,  1.84834130e-02,
        -8.20334200e-02, -3.72907400e-02,  1.22046490e-02,  3.67584000e-03,
        -4.19703600e-03, -4.38255560e-02,  2.17877100e-02, -5.04259800e-03,
         1.95215440e-02, -4.22743970e-02, -1.10398280e-01,  5.23626430e-03,
        -5.59179900e-02,  2.79457740e-02, -2.31763100e-02,  2.85418300e-02,
        -5.37887850e-02, -5.25930670e-02,  3.37996300e-02,  4.53285430e-02,
         2.38065140e-02, -7.31337200e-02,  5.47934620e-02,  1.69697480e-02,
         8.13179700e-02, -2.77827450e-03,  1.19441310e-02,  7.34490700e-02,
        -9.43206200e-02, -8.13761700e-02,  4.01511100e-02,  6.95719900e-04,
        -1.33454520e-02, -5.44682260e-02,  5.13111050e-03, -2.61289300e-02,
         3.68050300e-02, -3.38917230e-02,  2.11916770e-02,  5.58499900e-02,
         5.78013960e-02, -5.37844000e-03, -6.83696050e-02, -9.03298560e-02,
        -4.2

In [9]:
# Validate yourself with the test cases below
test_embeddings = embeddings.generate(["Hello", "world"])

# One vector per input text, returned as a plain list of NumPy arrays.
assert len(test_embeddings) == 2
assert type(test_embeddings) == list
assert type(test_embeddings[0]) == np.ndarray
assert type(test_embeddings[1]) == np.ndarray
# 384 is the vector length this check expects from EMBEDDINGS_MODEL.
assert test_embeddings[0].shape == (384,)
assert test_embeddings[1].shape == (384,)
# Stricter than "the vectors differ": every component must be different.
assert np.all(test_embeddings[0] != test_embeddings[1])

### Step 3: Implement chunking in Python

In this step, we will implement chunking in Python. Chunking is a process of splitting text into smaller parts. This is useful when you have long text and you want to process it in parts. This is useful for vector search when you want to search for similar documents. For this implementation, we will split text by word count. Here you can implement more advanced chunking algorithms like splitting by sentences or paragraphs or adding some overlap between chunks.



In [10]:
def split_into_chunks(text: str, chunk_size: int = 30, overlap: int = 0) -> List[str]:
    """
    Split the input text into chunks of whitespace-separated words.

    Args:
    text (str): The input text to split.
    chunk_size (int): The maximum number of words in each chunk (must be >= 1).
    overlap (int): Number of words shared between consecutive chunks
        (0 <= overlap < chunk_size). The default of 0 gives disjoint chunks.

    Returns:
    List[str]: The chunks in order, each with its words joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
    ValueError: If chunk_size < 1 or overlap is outside [0, chunk_size).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap

    chunks: List[str] = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start : start + chunk_size]))
        # Stop once a chunk reaches the end of the text; with overlap > 0 the
        # next window would only repeat words already covered by this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

In [11]:
# Validate yourself with the test cases below
# Five whitespace-separated words at chunk_size=2 should give chunks of 2, 2 and 1 words.
test_chunks = split_into_chunks("hello world, how are you?", chunk_size=2)

assert len(test_chunks) == 3, f"Expected 3 chunks but got {len(test_chunks)}"
assert (
    len(test_chunks[0].split()) == 2
), f"Expected 2 words in first chunk but got {len(test_chunks[0].split())}"
# Punctuation is expected to stay attached to its word.
assert test_chunks == [
    "hello world,",
    "how are",
    "you?",
], f"Got different result {test_chunks}"

### Step 4: Create a similarity measure using cosine similarity

The core idea of vector search is to find similar vectors. This method is called KNN (k-nearest neighbors) search. In this step, we will implement cosine similarity. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. It is calculated as the dot product of the vectors divided by the product of the magnitudes of the vectors.

Here’s the cosine similarity formula:

$$
 \text{cosine\_similarity} = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|} 
$$
Where:

- $\mathbf{A} \cdot \mathbf{B}$  is the dot product of vectors  $\mathbf{A}$  and  $\mathbf{B}$.
- $\|\mathbf{A}\|$  is the norm (magnitude) of vector  $\mathbf{A}$.
- $\|\mathbf{B}\|$  is the norm (magnitude) of vector  $\mathbf{B}$.


In [12]:
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Calculate the cosine similarity between two vectors.

    Args:
    a (np.ndarray): The first vector.
    b (np.ndarray): The second vector.

    Returns:
    float: The dot product of the vectors divided by the product of their
        norms. If either vector has zero norm the similarity is undefined,
        and 0.0 is returned instead of NaN.
    """
    dot_product = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)

    if denominator == 0:
        # 0/0 would produce NaN, and np.argsort orders NaN last, so a
        # zero vector would be ranked as the *most* similar result.
        # Multiplying by 0.0 keeps whatever shape np.dot produced.
        return dot_product * 0.0

    return dot_product / denominator

In [13]:
# Validate yourself with the test cases below
# The two vectors are different but point in nearly the same direction.
test_distance = cosine_similarity(np.array([1, 2, 3]), np.array([4, 5, 6]))
assert (
    round(test_distance, 2) == 0.97
), "The cosine similarity between [1, 2, 3] and [4, 5, 6] should be 0.97"

### Step 5: Implement vector retrieval

In this step, we will implement vector retrieval. This is a process of finding similar vectors to the query vector. This is useful when you have a lot of documents and you want to find the most similar ones to the query. This is useful for building search engines. In this implementation, we will use brute force search. This is not efficient for large datasets, but it is good for educational purposes. In real-world scenarios, you would use specialized libraries like Faiss or Annoy or vector search engines like Milvus, Qdrant, and pgvector.

It includes the following steps:
- add_documents
  - for each document, generate embeddings using API
  - store the embeddings and document
- retrieve_most_similar
  - generate embeddings for the query
  - calculate cosine similarity between the query and all documents
  - return the most similar documents based on the cosine similarity

In [None]:
class Retriever:
    """Brute-force vector store: embeds documents and ranks them against a query.

    Args:
        vectorizer: Callable that maps a list of texts to one vector per text.
        similarity_metric: Callable that scores two vectors; higher means
            more similar.
    """

    def __init__(
        self,
        vectorizer: Callable[[List[str]], List[np.ndarray]],
        similarity_metric: Callable[[np.ndarray, np.ndarray], float],
    ):
        self.vectorizer = vectorizer
        self.similarity_metric = similarity_metric
        # `documents[i]` is the text whose embedding is `vectors[i]`.
        self.documents: List[str] = []
        self.vectors: List[np.ndarray] = []

    def add_documents(self, documents: List[str]):
        """
        Add documents to the retriever.

        May be called repeatedly; new documents are appended to the ones
        already stored.

        Args:
        documents (List[str]): The list of documents to add.

        Raises:
        ValueError: If the vectorizer does not return one vector per document.
        """
        documents = list(documents)
        if not documents:
            return

        # Embed first so a vectorizer failure leaves the store untouched.
        new_vectors = list(self.vectorizer(documents))
        if len(new_vectors) != len(documents):
            raise ValueError(
                f"vectorizer returned {len(new_vectors)} vectors "
                f"for {len(documents)} documents"
            )

        # Extend both lists together so that indices stay aligned.
        self.documents.extend(documents)
        self.vectors.extend(new_vectors)

    def retrieve_most_similar(
        self,
        query: str,
        vectors: Optional[List[np.ndarray]] = None,
        top_k: int = 1,
    ) -> List[dict]:
        """
        Retrieve the documents most similar to the query.

        Args:
        query (str): The text to search for.
        vectors (Optional[List[np.ndarray]]): Document vectors to rank, with
            `vectors[i]` belonging to `self.documents[i]`. Defaults to the
            vectors stored by `add_documents`.
        top_k (int): Maximum number of results to return.

        Returns:
        List[dict]: Up to `top_k` items, best match first, each of the form
            {"document": str, "similarity": float}. Empty when `top_k` is
            not positive or there is nothing to rank.
        """
        candidates = self.vectors if vectors is None else vectors
        # A plain `[-top_k:]` slice would return *everything* for top_k == 0.
        if top_k <= 0 or len(candidates) == 0:
            return []

        # The vectorizer works on batches: wrap the query, then unwrap the result.
        query_vector = self.vectorizer([query])[0]
        similarities = [
            float(self.similarity_metric(query_vector, doc_vector))
            for doc_vector in candidates
        ]

        # argsort is ascending: reverse for best-first, then keep the top_k.
        ranked_indices = np.argsort(similarities)[::-1][:top_k]
        return [
            {"document": self.documents[i], "similarity": similarities[i]}
            for i in ranked_indices
        ]

# Step 6: Putting it all together, building RAG (Retrieval-Augmented Generation)

How can we use all these components to build the RAG model? RAG is a model that generates answers based on retrieved documents. It consists of two parts:
- Retriever: find the most relevant documents
- Generator: generate the answer based on the documents


The whole process contains two stages: offline and online. In the offline stage, you split documents into chunks, generate embeddings for all chunks and store them. In the online stage, you generate embeddings for the query and find the most similar chunks. Then you generate the answer based on the most similar documents.

### Let's take the story and build an assistant that answers questions based on the story

In [15]:
story = """In a bustling city, tucked away in a cozy apartment, lived a small cat named Whiskers. Whiskers was a curious and adventurous feline, always exploring nooks and crannies, but one day, his attention was caught by something entirely new—a shiny, sleek iPhone that belonged to his owner, Emma.

Emma was a tech enthusiast who loved her gadgets, especially her iPhone. She used it for everything, from ordering groceries to video chatting with friends. One lazy afternoon, Emma left her iPhone on the coffee table and stepped out for a quick run to the store, leaving Whiskers alone in the apartment.

Whiskers, intrigued by the glowing screen, padded over to the device. With a gentle paw, he nudged it, causing the screen to light up. The cat’s eyes widened as he saw the moving images and colorful icons. As he batted at the screen, the phone unlocked, revealing a treasure trove of apps and pictures.

The first interesting pivot occurred when Whiskers accidentally opened Emma’s video editing app. As he pounced on the screen, he unknowingly began creating a video. Clips of Emma dancing, snippets of her travels, and hilarious moments of Whiskers himself filled the timeline. With a few more taps and swipes, the video was complete and, unbeknownst to Whiskers, uploaded to Emma’s social media.

The next twist came when Emma’s friends started commenting and sharing the video, amazed at the unexpected compilation. Emma, still unaware of the viral sensation, returned home to find her phone on the floor and Whiskers curled up beside it, purring contentedly.

As Emma picked up her phone, notifications flooded the screen. Confused, she opened the video app and saw the masterpiece Whiskers had inadvertently created. She couldn’t help but laugh at the serendipity of it all. Emma decided to embrace the moment and posted a thank you message to Whiskers on her social media, giving him full credit for the unexpected entertainment.

However, the story didn’t end there. The final pivot happened when a local news channel got wind of the viral video. They reached out to Emma, eager to feature Whiskers on their evening segment. Emma agreed, and soon, Whiskers was a local celebrity. The small cat who had simply been curious about an iPhone had now become a beloved figure in the community.

Whiskers enjoyed the attention, but more than that, he relished the extra treats and cuddles from Emma. The iPhone, once just a gadget, became a bridge to a new adventure, reminding Emma and Whiskers that sometimes, the most unexpected moments bring the greatest joy."""

### Before building RAG, let's first check that the LLM by itself cannot answer the question 

In [17]:
# Test LLM without context
# `query` is defined at notebook scope and is reused by the retrieval cell further down.
query = "Who does iPhone belong to?"

llm.generate(query)

"The iPhone is a product of Apple Inc., an American multinational technology company based in Cupertino, California. As such, the rights and ownership of the iPhone are held by Apple.\n\nApple designs, manufactures, markets, and sells iPhones through its own retail stores and various carriers around the world. The company holds the intellectual property (IP) rights to the iPhone brand, including trademarks and patents related to the device's design, functionality, and operating system.\n\nHowever, it's worth noting that when you purchase an iPhone from Apple or a carrier, you become the owner of the physical device. You are entitled to use the phone as you see fit, install your own apps and software, and sell or trade it in when you're ready to upgrade.\n\nBut even after purchasing an iPhone, Apple still retains some rights and responsibilities related to the device, such as:\n\n1. Software updates: Apple continues to provide software updates, security patches, and new features for you

> **The answer is correct in general, but it is not based on our story. Let's use the story as context and build RAG to answer questions based on the story.**

As a first step, we will build a retriever and check that it works well. We will use the chunks of the story as the knowledge base and the question as the query, and we will use cosine similarity to find the chunks most similar to the query.

In [None]:
# Create retriever to use for the knowledge base
# chunk_size is measured in words (see split_into_chunks).
chunk_size = 30
chunks = split_into_chunks(story, chunk_size=chunk_size)
print(f"Len of chunks: {len(chunks)}")
# The retriever embeds text with embeddings.generate and scores with cosine_similarity.
retriever = Retriever(
    vectorizer=embeddings.generate, similarity_metric=cosine_similarity
)
# Embeds every chunk and stores the texts alongside their vectors.
retriever.add_documents(chunks)

In [None]:
## Retrieve the most similar chunk to the query
# `query` comes from the earlier "Test LLM without context" cell, which must have run first.
# Requests the five best-scoring chunks, best match first.
top_similar_documents = retriever.retrieve_most_similar(
    query, retriever.vectors, top_k=5
)
top_similar_documents

We can see some chunks most similar to the query according to the cosine similarity.


😁 Let's build the RAG model to answer the questions based on the story.

In [None]:
question = "Who does iPhone belong to?"

# Retrieve relevant context from the knowledge base.
# Use `question` (defined in this cell) rather than a variable left over from
# an earlier cell, so that editing the question here actually changes the answer.
top_similar_documents = retriever.retrieve_most_similar(
    question, retriever.vectors, top_k=5
)
# Concatenate the retrieved chunks, separated by blank lines, best match first.
context = "\n\n".join([doc["document"] for doc in top_similar_documents])

# Generate the prompt
prompt = f"""You goal is to answer the following question: {question} based on the context provided below.

{context}
--- 
Answer:"""

# Generate the answer
answer = llm.generate(prompt)
answer

## Evaluation and extra tasks 

- `10 points` Base task: Implement the RAG model
- `2 points`: Calculate how many tokens you used for the prompt and for the response 
- `1 point`: Add several documents to the context.
- `3 points`: When generating a response, add links to the document or chunk it is based on. 
- `3 points`: Add prompt injection protection to the response. 
- `5-10 points`: Add pre- and post-validation using any guardrails framework to protect from prompt injection, 