# Complete RAG Flow - 7 steps

In [10]:
import os
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from fastembed import TextEmbedding
import re

load_dotenv()

True

## STEP 1: Chunking by Section

In [11]:
def chunk_by_section(text):
    """Split a markdown document into one chunk per level-2 (``## ``) section.

    Args:
        text: Full markdown source of the document.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Any text before the
        first ``## `` header becomes the first chunk, and the ``## `` marker is
        removed from every section it introduces.
    """
    # Anchor on line starts (MULTILINE) rather than on a literal "\n## " so a
    # header on the very first line of the document is recognised as well.
    sections = re.split(r"^## ", text, flags=re.MULTILINE)
    return [section.strip() for section in sections if section.strip()]

# Read the report as UTF-8 explicitly: without `encoding`, open() decodes with
# the platform's locale encoding, which can garble or reject non-ASCII text.
with open('./report.md', 'r', encoding='utf-8') as f:
    text = f.read()

# One chunk per section of the report.
chunks = chunk_by_section(text)

## STEP 2: Getting Embeddings for each chunk using fastembed

In [12]:
# Load the default fastembed model once; every embedding in this notebook
# (chunks and the user question) goes through this single instance.
embedding_model = TextEmbedding()


def get_embedding(texts):
    """Embed a batch of texts with the shared ``embedding_model``.

    Args:
        texts: The documents to embed, passed straight through to
            ``TextEmbedding.embed``.

    Returns:
        A list holding the embeddings produced for ``texts``. ``embed`` is
        consumed via ``list(...)`` here so callers get a concrete list.
    """
    return list(embedding_model.embed(texts))


# Embed every chunk in one batched call instead of invoking the model once per
# chunk. Each entry of `chunk_embeddings` lines up with the chunk at the same
# position in `chunks`.
chunk_embeddings = get_embedding(chunks)

## STEP 3: Creating Vector store & inserting all chunks

In [13]:
class VectorIndex:
    """Minimal in-memory vector store with brute-force cosine-similarity search."""

    def __init__(self):
        self.vectors = []  # stored embeddings, each flattened to a 1-D numpy array
        self.metadata = []  # metadata dict for the vector at the same position

    def add_vector(self, embeddings, metadata):
        """Add a vector and its metadata to the index.

        Args:
            embeddings: Array-like embedding. It is flattened to 1-D, so a
                one-element batch such as ``[vector]`` is accepted as well.
            metadata: Dict stored alongside the vector; ``search`` reads its
                ``"content"`` key.
        """
        self.vectors.append(np.array(embeddings).flatten())
        self.metadata.append(metadata)

    def search(self, query_embedding, top_k=2):
        """Find the stored vectors most similar to the query (cosine similarity).

        Args:
            query_embedding: Array-like query vector. It is flattened to 1-D,
                mirroring what ``add_vector`` does for stored vectors.
            top_k: Maximum number of results to return; values <= 0 yield an
                empty list.

        Returns:
            A list of dicts ordered from most to least similar, each with
            ``"distance"`` (1 - similarity), ``"content"`` (taken from the
            stored metadata) and ``"similarity"``. Both numbers are plain
            Python floats.
        """
        # Flatten so a (1, dim) query behaves like a (dim,) one and every
        # similarity below is a scalar rather than a one-element array.
        query_emb = np.array(query_embedding).flatten()
        # The query norm does not depend on the stored vector: compute it once.
        query_norm = np.linalg.norm(query_emb)

        similarities = []
        for i, vec in enumerate(self.vectors):
            denom = query_norm * np.linalg.norm(vec)
            # Cosine similarity is undefined when either norm is zero; score
            # such pairs as 0.0 instead of letting 0/0 produce NaN, which
            # would also make the sort order below unreliable.
            similarity = float(np.dot(query_emb, vec) / denom) if denom > 0 else 0.0
            similarities.append((similarity, i))

        # Sort by similarity (highest first)
        similarities.sort(reverse=True)

        # Clamp top_k so a negative value cannot turn into a "drop the last k"
        # slice.
        results = []
        for sim, idx in similarities[:max(top_k, 0)]:
            results.append(
                {
                    "distance": 1 - sim,  # Convert similarity to distance
                    "content": self.metadata[idx]["content"],
                    "similarity": sim,
                }
            )
        return results


# Build the in-memory index: one entry per chunk, with the chunk text kept as
# metadata so search results can return the original content.
store = VectorIndex()

for chunk_vector, chunk_text in zip(chunk_embeddings, chunks):
    store.add_vector(chunk_vector, metadata={"content": chunk_text})

## STEP 4: User asks a question (create embedding)

In [14]:
# The question is embedded with the same model as the chunks so that the query
# and the stored vectors are directly comparable.
user_qn = "What did the software engineering department do last year?"
question_embedding = get_embedding([user_qn])[0]

## STEP 5: Searching for the most relevant chunks

In [15]:
# Retrieve the two chunks closest to the question and preview each of them.
results = store.search(question_embedding, top_k=2)

for rank, hit in enumerate(results, start=1):
    preview = hit['content'][:150]  # first 150 characters are enough for a preview
    print(f"    Result {rank}:")
    print(f"    Distance: {hit['distance']:.3f}")
    print(f"    Content: {preview}...")
    print()

    Result 1:
    Distance: 0.349
    Content: Methodology

The insights compiled within this Annual Interdisciplinary Research Review represent a synthesis of findings drawn from standard departme...

    Result 2:
    Distance: 0.366
    Content: Executive Summary

This report synthesizes the key findings and ongoing research efforts across the organization's diverse operational and R&D departm...



## STEP 6: Building the final prompt for LLM

In [16]:
# Concatenate the retrieved chunks, separated by a horizontal rule, so the
# model can tell where one source passage ends and the next begins.
context = "\n\n---\n\n".join(hit['content'] for hit in results)

# The final instruction restricts the model to the retrieved context and gives
# it an explicit fallback sentence for questions the context cannot answer.
prompt = f"""You are a helpful assistant answering questions based on provided context.

CONTEXT:
{context}

USER QUESTION:
{user_qn}

Answer the question based ONLY on the context provided. If the answer cannot be found in the context, say "I cannot find this information in the provided documents."
"""

In [17]:
# LLM connection settings. The key is read from the environment and is None
# when OPENROUTER_API_KEY is not set.
API_KEY = os.environ.get("OPENROUTER_API_KEY")
BASE_URL = "https://openrouter.ai/api/v1"
MODEL = "stepfun/step-3.5-flash:free"

In [18]:
def get_llm_ans(prompt):
    """Send ``prompt`` to the configured chat model and return its reply.

    The prompt is sent as a single ``user`` message to ``MODEL`` through an
    OpenAI-compatible client pointed at ``BASE_URL``.

    Args:
        prompt: Full prompt text to send.

    Returns:
        The message content of the first choice in the completion.

    Raises:
        RuntimeError: If ``API_KEY`` is unset or empty.
    """
    # Fail fast on a missing key: with api_key=None the OpenAI client falls
    # back to the OPENAI_API_KEY environment variable, which would send an
    # unrelated credential to BASE_URL or fail with a less specific error.
    if not API_KEY:
        raise RuntimeError(
            "OPENROUTER_API_KEY is not set; add it to the environment or the .env file."
        )

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
    )

    return completion.choices[0].message.content



# Send the assembled prompt to the LLM; the bare `ans` on the last line makes
# the reply the displayed output of this cell.
ans = get_llm_ans(prompt)
ans

'Based on the provided context, the software engineering department tackled persistent stability issues and implemented key fixes identified through error code analysis, with a specific example being the error code `ERR_MEM_ALLOC_FAIL_0x8007000E`.'