In [1]:
pip install --quiet openai tiktoken python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Core imports
from openai import OpenAI
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (kept out of version control) into the
# process environment.
load_dotenv()

# Fail fast, with a readable message, when the key is not set.
assert os.environ.get("OPENAI_API_KEY"), "Missing OPENAI_API_KEY in your environment"

# No key is passed in code; the client is expected to read it from the environment.
client = OpenAI()

print("Hello LLM 👋")


Hello LLM 👋


In [10]:
import tiktoken

def count_tokens(text: str, model_hint: str = "gpt-4.1-mini") -> int:
    """
    Count the tokens in `text` using tiktoken.

    Args:
        text: The string to tokenize. Special-token markers that appear in it
            (for example "<|endoftext|>") are counted as ordinary text.
        model_hint: Model name used to select the tokenizer. If tiktoken cannot
            provide an encoding for it, `cl100k_base` is used instead.

    Returns:
        The number of tokens the selected encoding produces for `text`.
    """
    try:
        enc = tiktoken.encoding_for_model(model_hint)
    except Exception:
        # Best-effort fallback: an unrecognised model name should not prevent a
        # rough count, so use a fixed general-purpose encoding.
        enc = tiktoken.get_encoding("cl100k_base")
    # By default encode() raises ValueError when the text contains a special
    # token string; disallowed_special=() tokenizes it as plain text instead,
    # so arbitrary user text can always be counted.
    return len(enc.encode(text, disallowed_special=()))

def estimate_cost(tokens: int, model: str = "gpt-4.1-mini", io: str = "input") -> float:
    """
    Return an approximate USD charge for `tokens` tokens.

    The rate table holds example figures expressed in dollars per one million
    tokens, keyed first by model name and then by direction.

    Args:
        tokens: Number of tokens to price.
        model: Key into the rate table.
        io: "input" or "output", selecting which rate of the model applies.

    Returns:
        `tokens` multiplied by the per-token rate for the chosen model and direction.

    Raises:
        KeyError: If `model` or `io` is not present in the rate table.
    """
    usd_per_million = {
        "gpt-4.1-mini": dict(input=0.15, output=0.60),
        "gpt-4.1": dict(input=2.50, output=10.00),
        "gpt-5": dict(input=10.00, output=30.00),
    }
    model_rates = usd_per_million[model]
    # Convert the per-million rate to a per-token rate before scaling.
    usd_per_token = model_rates[io] / 1_000_000
    return tokens * usd_per_token


In [4]:
# First call: send a free-form prompt through the Responses API.
user_prompt = "Say hello from inside VS Code and list three key topics about RAG."

response = client.responses.create(
    model="gpt-4.1-mini",
    input=user_prompt,
)

# output_text is the SDK's aggregate of the text in the response; it avoids
# assuming the first output item is a message whose first part is text.
print(response.output_text)

# OPTIONAL: tiny Q&A over a small text (simulating 'context window' usage)
context = """RAG = Retrieval-Augmented Generation. Key parts:
- Indexing chunks in a vector DB (embeddings like OpenAI text-embedding-3-large).
- Retrieving top-k most similar chunks for a query.
- Prompting the LLM with the retrieved chunks + user question.
"""

# The context is inlined into the prompt and the model is told to rely on it alone.
qa_prompt = f"""You are a helpful assistant. Use ONLY the context below.
Context:
{context}

Question: What are the two main steps in RAG before generation?
Answer in one sentence:"""

resp2 = client.responses.create(model="gpt-4.1-mini", input=qa_prompt)
print("\nQ&A:", resp2.output_text)


Hello from inside VS Code! 👋

Here are three key topics about Retrieval-Augmented Generation (RAG):

1. **Integration of Retrieval with Generation:** RAG combines a retrieval system (like a search engine or database) with a generative model (such as GPT) to provide more accurate, context-aware, and up-to-date responses.

2. **Use of External Knowledge Sources:** Instead of relying solely on pre-trained knowledge, RAG fetches relevant documents or data in real-time to enhance the generation process.

3. **Applications and Performance:** RAG is used in tasks like open-domain question answering, customer support, and knowledge management, improving the factual correctness and relevance of generated text.

If you'd like, I can help you set up an environment in VS Code for experimenting with RAG as well!

Q&A: The two main steps in RAG before generation are indexing chunks in a vector database using embeddings and retrieving the top-k most similar chunks for a given query.
