In [13]:
# Load variables from a local .env file into the process environment so that
# later cells can pick up credentials without hardcoding them (a later cell
# reads OPENAI_API_KEY via os.getenv). The call is the last expression in the
# cell, so its return value is what the notebook displays below.
from dotenv import load_dotenv
load_dotenv()


True

In [5]:
# pip install fastembed scikit-learn openai numpy
from fastembed import TextEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai import OpenAI

# ---------------------------------------------------------------------------
# Manual RAG walkthrough: embed locally, rank with cosine similarity, then
# ask an OpenAI chat model to answer using the best-matching document.
# ---------------------------------------------------------------------------

# SETUP
# The client is built without arguments, so no credentials appear in this cell
# (presumably they are taken from the environment - confirm for your setup).
client = OpenAI()

# A tiny in-memory corpus to search over.
documents = [
    "LlamaIndex is a framework for connecting data to LLMs.",
    "FastEmbed is a high-performance embedding generation library by Qdrant.",
    "Qdrant is a vector database written in Rust.",
]

# 1. EMBED (Local & Free)
embed_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
# embed() hands back a generator; materialise it so the vectors can be stacked later.
doc_embeddings = [vector for vector in embed_model.embed(documents)]

# 2. RETRIEVE (Manual Math with Sklearn)
# NOTE(review): "FastMbed" looks like a typo for "FastEmbed". It is kept as-is
# because the recorded output below was generated with exactly this spelling.
query = "What is FastMbed?"
# Exactly one text goes in, so exactly one vector comes out.
[query_embedding] = embed_model.embed([query])

# sklearn wants 2-D inputs: one row for the query, one row per document.
query_matrix = query_embedding[np.newaxis, :]
doc_matrix = np.vstack(doc_embeddings)
scores = cosine_similarity(query_matrix, doc_matrix)[0]

# The highest-scoring document becomes the context for generation.
best_doc_index = scores.argmax()
retrieved_doc = documents[best_doc_index]

print(f"Retrieved: {retrieved_doc} (Score: {scores[best_doc_index]:.4f})")

# 3. GENERATE (OpenAI)
prompt = f"Context: {retrieved_doc}\nQuestion: {query}\nAnswer:"
messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model="gpt-5-mini", messages=messages)
answer = response.choices[0].message.content
print("AI Answer:", answer)

Retrieved: FastEmbed is a high-performance embedding generation library by Qdrant. (Score: 0.6500)
AI Answer: There’s no widely known project called “FastMbed” — you probably meant FastEmbed.  

FastEmbed is Qdrant’s high-performance embedding-generation library. It’s designed to produce vector embeddings efficiently (high throughput, low latency), integrate with model backends, and fit into vector-search workflows (e.g., with Qdrant). If you meant something else by FastMbed, tell me where you saw it and I’ll look into it.


In [7]:
# pip install llama-index llama-index-llms-openai
# import os
from llama_index.core import VectorStoreIndex, Document

# SETUP
# No credentials are configured in this cell; nothing here sets an API key.
corpus_texts = ["LlamaIndex connects data to LLMs."]
documents = [Document(text=text) for text in corpus_texts]

# 1. INDEX
# from_documents() embeds and stores the documents in one call. The original
# author's note says this goes through the OpenAI API by default.
# NOTE(review): another cell in this notebook assigns Settings.embed_model
# globally; if that cell ran earlier in the same kernel, this index would use
# that model instead - confirm the run order.
index = VectorStoreIndex.from_documents(documents)

# 2. RETRIEVE & GENERATE
question = "What does LlamaIndex do?"
query_engine = index.as_query_engine()
response = query_engine.query(question)

print(response)

LlamaIndex connects data to LLMs.


In [10]:
# pip install llama-index llama-index-embeddings-fastembed
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.embeddings.fastembed import FastEmbedEmbedding


# --- THE MAGIC SWITCH ---
# Settings.embed_model is assigned once here; it is a global setting, so it
# stays in effect for the rest of the kernel session, not just this cell.
# Per the original author's note, FastEmbed runs locally on the CPU.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
Settings.embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL_NAME)

corpus_texts = (
    "FastEmbed runs locally and saves API costs.",
    "LlamaIndex orchestrates the retrieval flow.",
)
documents = [Document(text=text) for text in corpus_texts]

# 1. INDEX (Local & Free)
# Embeddings for the index now come from the FastEmbed model configured above.
index = VectorStoreIndex.from_documents(documents)

# 2. RETRIEVE & GENERATE
# Retrieval uses the local vectors. This cell does not set Settings.llm, so
# the answer is generated by whichever LLM the Settings object currently
# resolves to (NOTE(review): not necessarily GPT-4 - confirm).
question = "Why use FastEmbed?"
query_engine = index.as_query_engine()
response = query_engine.query(question)

print(response)

To save API costs.


'<REDACTED: an OpenAI API key was captured in this cell output. Clear the output before sharing and revoke/rotate the exposed key.>'

In [20]:
import os
import qdrant_client
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings,
)
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.openai import OpenAI

# 1. SETUP: LLM & Embeddings
# OpenAI is used for generation (requires a key); FastEmbed is used for embeddings.
# Fail fast with a readable message. The previous self-assignment
# `os.environ[...] = os.getenv(...)` was a no-op when the key existed and
# raised "TypeError: str expected, not NoneType" when it did not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this cell."
    )
Settings.llm = OpenAI(model="gpt-4")
Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

# 2. CONNECT: Qdrant
# Note: For local docker use host="localhost". For in-memory (testing) use location=":memory:"
client = qdrant_client.QdrantClient(location=":memory:")

# 3. STORAGE: Configure LlamaIndex to use Qdrant
vector_store = QdrantVectorStore(client=client, collection_name="uv_demo")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# 4. INDEX: Load, Embed, and Store (in Qdrant)
# Make sure there is something to index. The directory is created if missing,
# and a sample file is written whenever the directory is empty - not only when
# the directory itself is absent, so an existing-but-empty ./data still works.
DATA_DIR = "./data"
os.makedirs(DATA_DIR, exist_ok=True)
if not os.listdir(DATA_DIR):
    with open(os.path.join(DATA_DIR, "test.txt"), "w", encoding="utf-8") as f:
        f.write("uv is an extremely fast Python package installer written in Rust.")

documents = SimpleDirectoryReader(DATA_DIR).load_data()
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

# 5. QUERY
query_engine = index.as_query_engine()
response = query_engine.query("What is uv?")
print(response)

2025-12-19 16:39:53,036 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


UV is an extremely fast Python package installer that is written in Rust.
