In [2]:
from fastembed import TextEmbedding

# Load the Jina small English embedding model used for the first experiments.
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# embed() produces one vector per input text; a single text goes in, so
# destructure the single resulting vector.
query = "I just discovered the course. Can I join now?"
[query_embedding] = embedding_model.embed([query])

# Report the dimensionality of the resulting vector.
print(f"Shape of embedding: {query_embedding.shape}")  # Should be (512,)

# Smallest component of the query vector.
min_value = query_embedding.min()
print(f"Minimum value in the embedding: {min_value}")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Fetching 5 files: 100%|██████████| 5/5 [00:29<00:00,  5.90s/it]


Shape of embedding: (512,)
Minimum value in the embedding: -0.11726373885183883


In [3]:
import numpy as np

# Embed a single candidate document with the same model as the query.
doc = 'Can I still join the course after the start date?'
[doc_embedding] = embedding_model.embed([doc])

# For unit-length vectors the dot product equals the cosine similarity.
cosine_similarity = np.dot(query_embedding, doc_embedding)

# Sanity check: both norms should be close to 1.0, which is what makes the
# dot product above a valid cosine similarity.
query_norm = np.linalg.norm(query_embedding)
doc_norm = np.linalg.norm(doc_embedding)
print(f"Query vector norm: {query_norm}")  # Should be close to 1.0
print(f"Document vector norm: {doc_norm}")  # Should be close to 1.0

print(f"Cosine similarity: {cosine_similarity}")

Query vector norm: 1.0
Document vector norm: 1.0
Cosine similarity: 0.9008528895674548


In [6]:
# Toy corpus: five FAQ entries, all from the 'General course-related questions'
# section of the 'data-engineering-zoomcamp' course. Each entry carries the
# answer ('text'), its 'section', the 'question' it answers, and the 'course'.
documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first \"Office Hours\" live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon't forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

import numpy as np

# Embed only the answer text of every document, in one batched call.
doc_texts = [entry['text'] for entry in documents]
doc_embeddings = list(embedding_model.embed(doc_texts))

# Stack the per-document vectors into a matrix with one row per document.
V = np.stack(doc_embeddings, axis=0)

# One matrix-vector product yields the similarity of every document
# to the query.
similarities = V @ query_embedding

# Pick the best-scoring row.
highest_idx = similarities.argmax()
highest_similarity = similarities[highest_idx]

print(f"Document with highest similarity: {highest_idx}")
print(f"Similarity score: {highest_similarity}")
print(f"Document text: {documents[highest_idx]['text'][:100]}...")

Document with highest similarity: 1
Similarity score: 0.8182378150042889
Document text: Yes, we will keep all the materials after the course finishes, so you can follow the course at your ...


In [7]:
# Build the text to embed as "<question> <answer text>" for each document.
full_texts = [' '.join((entry['question'], entry['text'])) for entry in documents]

# Embed the combined texts in one batched call.
full_text_embeddings = list(embedding_model.embed(full_texts))

# One row per document.
V_full = np.stack(full_text_embeddings, axis=0)

# Similarity of every combined-text embedding to the query.
similarities_full = V_full @ query_embedding

# Pick the best-scoring row.
highest_idx_full = similarities_full.argmax()
highest_similarity_full = similarities_full[highest_idx_full]

print(f"Document with highest similarity (full text): {highest_idx_full}")
print(f"Similarity score: {highest_similarity_full}")
print(f"Document question: {documents[highest_idx_full]['question']}")

Document with highest similarity (full text): 0
Similarity score: 0.8514543236908068
Document question: Course - Can I still join the course after the start date?


In [8]:
from fastembed import TextEmbedding

# Enumerate every model fastembed supports; each entry is a metadata dict
# (the printed output shows keys such as 'model' and 'dim').
available_models = TextEmbedding.list_supported_models()
print("Available models:")
for model in available_models:
    print(model)

# Confirm the dimensionality of a smaller model empirically by embedding a
# throwaway string and reading the length of the resulting vector.
small_model = TextEmbedding(model_name="BAAI/bge-small-en")
[test_embedding] = small_model.embed(["test"])
print(f"Dimension of BAAI/bge-small-en: {test_embedding.shape[0]}")

Available models:
{'model': 'BAAI/bge-base-en', 'sources': {'hf': 'Qdrant/fast-bge-base-en', 'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz', '_deprecated_tar_struct': True}, 'model_file': 'model_optimized.onnx', 'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.', 'license': 'mit', 'size_in_GB': 0.42, 'additional_files': [], 'dim': 768, 'tasks': {}}
{'model': 'BAAI/bge-base-en-v1.5', 'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q', 'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz', '_deprecated_tar_struct': True}, 'model_file': 'model_optimized.onnx', 'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.', 'license': 'mit', 'size_in_GB': 0.21, 'additional_files': [], 'dim': 768, 'tasks': {}}
{'model': 'BAAI/bge-large-en-v1.5', 'sou

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 5 files:  80%|████████  | 4/5 [03:50<00:57, 57.59s/it] 
[32m2025-06-17 15:16:28.300[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m430[0m - [31m[1mCould not download model from HuggingFace: [WinError 1314] A required privilege is not held by the client: '..\\..\\blobs\\37fca74771bc76a8e01178ce3a6055a0995f8093' -> 'C:\\Users\\MIKITA~1\\AppData\\Local\\Temp\\fastembed_cache\\models--Qdrant--bge-small-en\\snapshots\\8791246cc2a79c7949a4dc0d4a018cbd7d024879\\tokenizer_config.json' Falling back to other sources.[0m
100%|██████████| 77.7M/77.7M [00:51<00:00, 1.49MiB/s]


Dimension of BAAI/bge-small-en: 384


In [9]:
import requests
from fastembed import TextEmbedding  # imported here so the cell runs on a fresh kernel
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

# Download documents. A timeout keeps the cell from hanging indefinitely and
# raise_for_status() surfaces HTTP errors instead of failing later in .json().
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Filter for ML Zoomcamp documents, tagging each one with its course name.
# NOTE: this rebinds `documents` (previously the 5-entry toy corpus).
documents = []
for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

print(f"Number of ML Zoomcamp documents: {len(documents)}")

# Initialize the small embedding model
small_model = TextEmbedding(model_name="BAAI/bge-small-en")

# Connect to Qdrant (expects a server listening on localhost:6333)
client = QdrantClient(host="localhost", port=6333)

# Collection settings; the vector size is measured from the model itself
# rather than hard-coded.
collection_name = "ml_zoomcamp_faq"
vector_size = list(small_model.embed(["test"]))[0].shape[0]  # Should be 384

# Create the collection only if it is missing. An explicit existence check
# replaces the former bare `except:`, which treated *any* failure (connection
# refused, auth error, Ctrl-C) as "collection does not exist".
if client.collection_exists(collection_name):
    print(f"Collection {collection_name} already exists")
else:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )
    print(f"Created collection {collection_name}")

# Prepare documents for indexing: embed "<question> <text>" for every document
# in a single batched embed() call (one call instead of one per document),
# pairing each embedding back with its source document by position.
full_texts = [entry['question'] + ' ' + entry['text'] for entry in documents]
points = []
for i, (doc, embedding) in enumerate(zip(documents, small_model.embed(full_texts))):
    points.append(
        PointStruct(
            id=i,
            vector=embedding.tolist(),
            payload=doc,
        )
    )

# Upload points in batches
batch_size = 100
for i in range(0, len(points), batch_size):
    batch = points[i:i+batch_size]
    client.upsert(
        collection_name=collection_name,
        points=batch
    )
    print(f"Uploaded batch {i//batch_size + 1}/{(len(points)-1)//batch_size + 1}")

# Query the collection.
# NOTE: this rebinds `query_embedding` to a vector from the small model;
# it is not comparable with embeddings produced by `embedding_model`.
query = "I just discovered the course. Can I join now?"
query_embedding = list(small_model.embed([query]))[0]

# query_points() supersedes the deprecated search(); the scored hits are on
# the response's `.points` attribute.
query_response = client.query_points(
    collection_name=collection_name,
    query=query_embedding.tolist(),
    limit=5
)
search_results = query_response.points

# Print the top result and its score (guarding against an empty result set)
if search_results:
    print(f"Top result score: {search_results[0].score}")
    print(f"Question: {search_results[0].payload['question']}")
else:
    print("No results returned")

Number of ML Zoomcamp documents: 375


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 13.41it/s]


Created collection ml_zoomcamp_faq
Uploaded batch 1/4
Uploaded batch 2/4
Uploaded batch 3/4
Uploaded batch 4/4


  search_results = client.search(


Top result score: 0.8703172
Question: The course has already started. Can I still join it?
