In [1]:
from datetime import datetime

In [2]:
from qdrant_client import QdrantClient
# Connect to the Qdrant server on localhost and list its collections
# as a quick connectivity check (the bare expression is displayed).
qdrant_url = 'http://localhost:6333'
client = QdrantClient(qdrant_url)
client.get_collections()

  from .autonotebook import tqdm as notebook_tqdm


CollectionsResponse(collections=[CollectionDescription(name='sparse'), CollectionDescription(name='sparse+dense'), CollectionDescription(name='zoomcamp-rag')])

In [3]:
import requests

# Download the FAQ documents: a JSON list with one entry per course.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever if the server stops responding.
docs_response = requests.get(docs_url, timeout=30)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [4]:
# Peek at the data layout: keys of a course entry, keys of a single document,
# and the list of course names.
first_course = documents_raw[0]
print(first_course.keys())
print(first_course['documents'][0].keys())

course_names = []
for entry in documents_raw:
	course_names.append(entry['course'])
print(course_names)

dict_keys(['course', 'documents'])
dict_keys(['text', 'section', 'question'])
['data-engineering-zoomcamp', 'machine-learning-zoomcamp', 'mlops-zoomcamp']


In [5]:
from qdrant_client import models

In [6]:
# Recreate the 'sparse' collection from scratch: drop it, then create it with
# a single sparse vector named 'bm25' that uses the IDF modifier.
client.delete_collection(collection_name='sparse')

bm25_params = models.SparseVectorParams(modifier=models.Modifier.IDF)
client.create_collection(
	collection_name='sparse',
	sparse_vectors_config={'bm25': bm25_params},
)

True

In [7]:
import uuid

# One point per FAQ document across all courses. The answer text is wrapped
# in a Document for the 'Qdrant/bm25' model under the 'bm25' vector name, and
# the text, section and course name are kept as payload.
points = [
	models.PointStruct(
		id=uuid.uuid4().hex,
		vector={
			'bm25': models.Document(text=doc['text'], model='Qdrant/bm25'),
		},
		payload={
			'text': doc['text'],
			'section': doc['section'],
			'course': course['course'],
		},
	)
	for course in documents_raw
	for doc in course['documents']
]

In [8]:
# Upload every prepared point into the 'sparse' collection; the returned
# update result is displayed as the cell output.
client.upsert(collection_name='sparse', points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [9]:
def search(query, limit=1):
	"""Query the 'sparse' collection through its 'bm25' sparse vector.

	Args:
		query: Text wrapped in a Document for the 'Qdrant/bm25' model.
		limit: Maximum number of points to request (default 1).

	Returns:
		The `points` attribute of the query response; payloads are requested.
	"""
	bm25_query = models.Document(text=query, model='Qdrant/bm25')
	response = client.query_points(
		collection_name='sparse',
		query=bm25_query,
		using='bm25',
		limit=limit,
		with_payload=True,
	)
	return response.points

In [10]:
# Search the FAQ index for 'Qdrant'; the bare name displays the returned hits.
results = search(query='Qdrant')
results

[]

In [11]:
# Search for 'pandas' and print the stored text of the top hit.
results = search(query='pandas')
best_match = results[0]
print(best_match.payload['text'])

You can use round() function or f-strings
round(number, 4)  - this will round number up to 4 decimal places
print(f'Average mark for the Homework is {avg:.3f}') - using F string
Also there is pandas.Series. round idf you need to round values in the whole Series
Please check the documentation
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.round.html#pandas.Series.round
Added by Olga Rudakova


In [12]:
# Score of the first point in `results`.
top_hit = results[0]
top_hit.score

6.0392046

In [13]:
import random
import json

# Fixed seed so the same FAQ entry is sampled on every run.
random.seed(202506)

# Pick a course first, then one of that course's documents.
course = random.choice(documents_raw)
course_piece = random.choice(course['documents'])

pretty_piece = json.dumps(course_piece, indent=2)
print(pretty_piece)


{
  "text": "Even though the upload works using aws cli and boto3 in Jupyter notebook.\nSolution set the AWS_PROFILE environment variable (the default profile is called default)",
  "section": "Module 4: Deployment",
  "question": "Uploading to s3 fails with An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.\""
}


In [14]:
# Use the sampled entry's question as the query and print the top hit's text.
results = search(query=course_piece['question'])
top_payload = results[0].payload
print(top_payload['text'])

The trial dbt account provides access to dbt API. Job will still be needed to be added manually. Airflow will run the job using a python operator calling the API. You will need to provide api key, job id, etc. (be careful not committing it to Github).
Detailed explanation here: https://docs.getdbt.com/blog/dbt-airflow-spiritual-alignment
Source code example here: https://github.com/sungchun12/airflow-toolkit/blob/95d40ac76122de337e1b1cdc8eed35ba1c3051ed/dags/examples/dbt_cloud_example.py


In [15]:
# Recreate the 'sparse+dense' collection: drop it, then create it with a
# 512-dimensional cosine dense vector ('jina-small') and a sparse vector
# ('bm25') that uses the IDF modifier.
client.delete_collection(collection_name='sparse+dense')

dense_params = models.VectorParams(size=512, distance=models.Distance.COSINE)
sparse_params = models.SparseVectorParams(modifier=models.Modifier.IDF)
client.create_collection(
	collection_name='sparse+dense',
	vectors_config={'jina-small': dense_params},
	sparse_vectors_config={'bm25': sparse_params},
)

True

In [16]:
# Only the first `num_docs_per_course_used` documents of each course are used.
num_docs_per_course_used = 75

# Each point carries two named vectors built from the same answer text
# ('jina-small' and 'bm25'), plus the text, section and course name as payload.
points = [
	models.PointStruct(
		id=uuid.uuid4().hex,
		vector={
			'jina-small': models.Document(
				text=doc['text'],
				model='jinaai/jina-embeddings-v2-small-en',
			),
			'bm25': models.Document(text=doc['text'], model='Qdrant/bm25'),
		},
		payload={
			'text': doc['text'],
			'section': doc['section'],
			'course': course['course'],
		},
	)
	for course in documents_raw
	for doc in course['documents'][:num_docs_per_course_used]
]

In [17]:
# Print wall-clock timestamps before and after the upload to show how long
# the upsert into 'sparse+dense' takes.
upload_started = datetime.now()
print(upload_started)

client.upsert(collection_name='sparse+dense', points=points)

upload_finished = datetime.now()
print(upload_finished)

2025-06-27 02:17:02.837287
2025-06-27 02:17:41.366770


In [18]:
def multi_stage_search(query, limit=1):
	"""Query 'sparse+dense' with a dense prefetch followed by a BM25 query.

	The prefetch requests up to 10 * limit points using the 'jina-small'
	vector; the outer query uses the 'bm25' vector and requests `limit` points.

	Args:
		query: Text used for both the dense and the BM25 Document.
		limit: Maximum number of points requested by the outer query (default 1).

	Returns:
		The `points` attribute of the query response; payloads are requested.
	"""
	dense_query = models.Document(
		text=query,
		model='jinaai/jina-embeddings-v2-small-en',
	)
	bm25_query = models.Document(text=query, model='Qdrant/bm25')

	dense_prefetch = models.Prefetch(
		query=dense_query,
		using='jina-small',
		limit=10*limit,
	)

	response = client.query_points(
		collection_name='sparse+dense',
		prefetch=[dense_prefetch],
		query=bm25_query,
		using='bm25',
		limit=limit,
		with_payload=True,
	)
	return response.points

In [19]:
# Show the sampled FAQ entry again as pretty-printed JSON.
pretty_piece = json.dumps(course_piece, indent=2)
print(pretty_piece)

{
  "text": "Even though the upload works using aws cli and boto3 in Jupyter notebook.\nSolution set the AWS_PROFILE environment variable (the default profile is called default)",
  "section": "Module 4: Deployment",
  "question": "Uploading to s3 fails with An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.\""
}


In [20]:
# Run the two-stage search on the sampled question and print the top hit's text.
results = multi_stage_search(query=course_piece['question'])
top_payload = results[0].payload
print(top_payload['text'])

My SSH connection to AWS cannot last more than a few minutes, whether via terminal or VS code.
My config:
# Copy Configuration in local nano editor, then Save it!
Host mlops-zoomcamp                                         # ssh connection calling name
User ubuntu                                             # username AWS EC2
HostName <instance-public-IPv4-addr>                    # Public IP, it changes when Source EC2 is turned off.
IdentityFile ~/.ssh/name-of-your-private-key-file.pem   # Private SSH key file path
LocalForward 8888 localhost:8888                        # Connecting to a service on an internal network from the outside, static forward or set port user forward via on vscode
StrictHostKeyChecking no
Added by Muhammed Çelik
The disconnection will occur whether I SSH via WSL2 or via VS Code, and usually occurs after I run some code, i.e. “import mlflow”, so not particularly intense computation.
I cannot reconnect to the instance without stopping and restarting with a new 

In [21]:
def rrf_search(query, limit=1):
	"""Query 'sparse+dense' by fusing dense and BM25 prefetches with RRF.

	Two prefetches, each requesting up to 5 * limit points, are issued: one
	with the 'jina-small' vector and one with the 'bm25' vector. Their results
	are combined by a FusionQuery using Fusion.RRF.

	Args:
		query: Text used for both the dense and the BM25 Document.
		limit: Maximum number of fused points to return (default 1).

	Returns:
		The `points` attribute of the query response; payloads are requested.
	"""
	results = client.query_points(
		collection_name='sparse+dense',
		prefetch=[
			models.Prefetch(
				query=models.Document(
					text=query,
					model='jinaai/jina-embeddings-v2-small-en'
				),
				using='jina-small',
				limit=5*limit
			),
			models.Prefetch(
				query=models.Document(
					text=query,
					model='Qdrant/bm25'
				),
				using='bm25',
				limit=5*limit
			)
		],
		query=models.FusionQuery(fusion=models.Fusion.RRF),
		# Forward `limit` to the fused query; without it the parameter only
		# sized the prefetches and the final result count was left to the
		# client default.
		limit=limit,
		with_payload=True
	)
	return results.points

In [22]:
# Run the RRF search on the sampled question, then print the sampled entry
# followed by the text of the top fused hit.
results = rrf_search(query=course_piece['question'])

pretty_piece = json.dumps(course_piece, indent=2)
print(pretty_piece)

top_payload = results[0].payload
print(top_payload['text'])

{
  "text": "Even though the upload works using aws cli and boto3 in Jupyter notebook.\nSolution set the AWS_PROFILE environment variable (the default profile is called default)",
  "section": "Module 4: Deployment",
  "question": "Uploading to s3 fails with An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.\""
}
My SSH connection to AWS cannot last more than a few minutes, whether via terminal or VS code.
My config:
# Copy Configuration in local nano editor, then Save it!
Host mlops-zoomcamp                                         # ssh connection calling name
User ubuntu                                             # username AWS EC2
HostName <instance-public-IPv4-addr>                    # Public IP, it changes when Source EC2 is turned off.
IdentityFile ~/.ssh/name-of-your-private-key-file.pem   # Private SSH key file path
LocalForward 8888 localhost:8888                        # Connecting t