In [1]:
from openai import OpenAI
from pydantic import BaseModel
import json
import numpy as np
from tqdm.auto import tqdm
from dotenv import load_dotenv
load_dotenv()

openai_client = OpenAI()

In [2]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

In [None]:
Install Docker (Docker Desktop on Mac/Windows; Docker Engine on Linux).

Run Elasticsearch (this creates a container from the official image):

docker run -it --rm --name elasticsearch \
  -m 4GB \
  -p 9200:9200 -p 9300:9300 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -v es9_data:/usr/share/elasticsearch/data \
  docker.elastic.co/elasticsearch/elasticsearch:9.1.1


This pulls the image if you don’t have it yet, then starts a container.

The named volume es9_data keeps your data between restarts.

Security is off here for local testing (don’t do this in prod).

Check it’s up:

curl http://localhost:9200


You should see version info in JSON.

Use it from Python (install the client and try a ping):

pip install elasticsearch==9.1.1
python - <<'PY'
from elasticsearch import Elasticsearch
es = Elasticsearch("http://localhost:9200")
print("Ping:", es.ping())
PY


Stop it (and remove the container):

docker stop elasticsearch


Your data is still in the es9_data volume. Start again with the same docker run and it reuses the data.

In [3]:
! curl http://localhost:9200

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
curl: (7) Failed to connect to localhost port 9200 after 2222 ms: Could not connect to server


In [3]:
! curl http://localhost:9200

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
curl: (52) Empty reply from server


In [6]:
es_client = Elasticsearch('http://localhost:9200') 

In [8]:
! curl http://localhost:9200

{
  "name" : "08e5a81db022",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "fwrm0e9mSViPmADLW4Vcqw",
  "version" : {
    "number" : "9.1.1",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "5e94055934defa56e454868b7783b2a3b683785e",
    "build_date" : "2025-08-05T01:07:31.959947279Z",
    "build_snapshot" : false,
    "lucene_version" : "10.2.2",
    "minimum_wire_compatibility_version" : "8.19.0",
    "minimum_index_compatibility_version" : "8.0.0"
  },
  "tagline" : "You Know, for Search"
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   540  100   540    0     0   2236      0 --:--:-- --:--:-- --:--:--  2231


In [9]:
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-d embeddings

# Prep Data

In [10]:
from youtube_transcript_api import YouTubeTranscriptApi

video_id = 'ph1PxZIkz1o'

ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch(video_id)

In [11]:
type(transcript)

youtube_transcript_api._transcripts.FetchedTranscript

In [12]:
len(transcript)

1407

In [13]:
for i in range(5):
    print(transcript[i])

FetchedTranscriptSnippet(text='So hi everyone. Uh today we are going to', start=0.0, duration=5.04)
FetchedTranscriptSnippet(text='talk about our upcoming course. The', start=2.96, duration=3.52)
FetchedTranscriptSnippet(text='upcoming course is called machine', start=5.04, duration=5.92)
FetchedTranscriptSnippet(text='learning zoom camp. And um this is', start=6.48, duration=5.92)
FetchedTranscriptSnippet(text='already I put the link in the', start=10.96, duration=3.599)


In [14]:
# format the transcript into a single string
def format_timestamp(seconds: float) -> str:
    """Convert seconds to H:MM:SS if > 1 hour, else M:SS"""
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{hours}:{minutes:02}:{secs:02}"
    else:
        return f"{minutes}:{secs:02}"

def make_subtitles(transcript) -> str:
    lines = []

    for entry in transcript:
        ts = format_timestamp(entry.start)
        text = entry.text.replace('\n', ' ')
        lines.append(ts + ' ' + text)

    return '\n'.join(lines)


In [15]:

subtitles = make_subtitles(transcript)

In [16]:
print(subtitles[:500])

0:00 So hi everyone. Uh today we are going to
0:02 talk about our upcoming course. The
0:05 upcoming course is called machine
0:06 learning zoom camp. And um this is
0:10 already I put the link in the
0:12 description. So if you're watching um
0:14 this video in recording or you're
0:17 watching it live, you go here in the
0:19 description after under this video and
0:21 then you see a link course. uh click on
0:25 that link and this bring you will bring
0:27 you to
0:29 this website this GitHub


## User def function (sliding window)

In [19]:
def sliding_window(seq, size, step):
    """Create overlapping chunks using sliding window approach."""
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        batch = seq[i:i+size]
        result.append(batch)
        if i + size >= n:
            break

    return result


def join_lines(transcript) -> str:
    """Join transcript entries into continuous text."""
    lines = []

    for entry in transcript:
        text = entry.text.replace('\n', ' ')
        lines.append(text)

    return ' '.join(lines)

def format_chunk(chunk):
    """Format a chunk with start/end timestamps and text."""
    time_start = format_timestamp(chunk[0].start)
    time_end = format_timestamp(chunk[-1].start)
    text = join_lines(chunk)

    return {
        'start': time_start,
        'end': time_end,
        'text': text
    }

    


In [20]:
chunks = []

# Experiment with different values: try (30, 10) for more granular chunks
for chunk in sliding_window(transcript, 60, 30):
    processed = format_chunk(chunk)
    chunks.append(processed)

print(f"Created {len(chunks)} chunks")

Created 46 chunks


In [22]:
chunks[0]

{'start': '0:00',
 'end': '2:38',
 'text': "So hi everyone. Uh today we are going to talk about our upcoming course. The upcoming course is called machine learning zoom camp. And um this is already I put the link in the description. So if you're watching um this video in recording or you're watching it live, you go here in the description after under this video and then you see a link course. uh click on that link and this bring you will bring you to this website this GitHub page. This GitHub page is the main entry point to our course and um yeah I think it's more or less self-explanatory. If you want to sign up this is the button you click and the actual course starts in on September 15th. it means that it's uh slightly less than one one month before the course starts and the purpose of today's um session is to just answer your questions. So you have some questions and uh you can ask these questions using uh you can ask your questions using the pinned link. So there's a pinned link in

# Lexical search (BM25)

## Index

In [36]:
index_name = "docs_lex"

# delete if exists (for repeatable demos)
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(
    index=index_name,
    settings={
        "analysis": {
            "analyzer": {
                "my_english": {
                    "type": "standard",
                    "stopwords": "_english_"
                }
            }
        }
    },
    mappings={
        "properties": {
            "start":  {"type": "keyword"},
            "end": { "type": "text"},
            "text": {"type": "text",  "analyzer": "my_english"}
        }
    }
)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'docs_lex'})

In [37]:
for d in tqdm(chunks):
    es_client.index(index=index_name, id= d['start'],document=d, refresh=True)



  0%|          | 0/46 [00:00<?, ?it/s]

## Search

In [38]:
q = "How can I access old videos?"
resp = es_client.search(
    index=index_name,
    query={"match": {"text": q}},
    size=5
)
for hit in resp["hits"]["hits"]:
    print(hit["_score"], hit["_source"]["text"])


5.574486 want to build tools for trading. Um then we also have a AI dev tools zoom camp which is not here yet because this is still work in progress. This is course for engineers who want to use AI to become uh more proficient. Okay, let me share this link too. Okay, I think I was answering this question. Now, are you going to teach something new this time or we just have to watch old videos? Um, I think I explained it. uh yes you'll have to watch old videos but there is also new material so I uh this year I am updating some of the uh things some of the modules so you will have to watch well I don't like to say have to nobody's forcing you but if you wish to learn something you can decide to watch all videos yes but uh trust me these videos are going to be very useful to you um as they have been very useful to many people before you. Uh but also yeah there will be new things too. Uh what do you recommend for taking the course for the official repo and working or uh I would recommend cr

In [39]:
def elastic_search(query, num_results=5):
    es_query = {
        "size": num_results,
        "query": {
            "multi_match": {
                "query": query,
                "type": "best_fields",
                "fields": ["start", "end", "text^3"],
            }
        }
    }

    response = es_client.search(index=index_name, body=es_query)

    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [40]:
query = "When would the next class start?"
elastic_search(query, num_results=5)

[{'start': '5:13',
  'end': '7:44',
  'text': "update this to pytorch so this year you'll have two options you can go with tensorflow you can also go with pytorch um but yeah it will be basics so we will not go deep so this is not a computer vision class this is a machine learning engineering class so we will just have one lesson And we will focus a lot on deployment not on the theory part. We do not cover rack at all. So for rack if you're interested in rack there is another course called LLM Zoom camp. It has almost finished but all the materials are available for self-studying. Um so yeah here's you can check it. So we don't cover a rock in machine learning course but there's another course where we go pretty deep into this maybe not super deep but like we cover a lot of things when it comes to AI engineering. Uh are there any prerequisites to get the most out of the boot camp? Yes, you need to be uh comfortable with programming and with command line. So some basic linear algebra wi

## How to append new files

In [43]:
es_client.index(index=index_name, id="500", document={"start":"500",'end':"6000","text":"Nelson is from nigeria Nelson is from nigeria Nelson is from nigeria"}, refresh=True)


ObjectApiResponse({'_index': 'docs_lex', '_id': '500', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 46, '_primary_term': 1})

In [45]:
query = "Where is nelson from?"
elastic_search(query, num_results=2)

[{'start': '500',
  'end': '6000',
  'text': 'Nelson is from nigeria Nelson is from nigeria Nelson is from nigeria'},
 {'start': '58:58',
  'end': '1:01:34',
  'text': "haven't experimented with that but typically for me I just hey this is not detailed enough. I have an example actually from one of the workshops I gave. So let's say this coding agent. So in this um new workshop I was showing how to create a Django template even though like it wasn't about Django. So, I didn't want to spend time uh on explaining jungle stuff. And I think I just showed where is it? Think somewhere is a link build from scratch. Yeah. I don't know. chat GPT did I not post it? Okay, weird. Uh I think it was a different document. Anyways, um so what I did is I just um posted where so all these things like building the jungle app from scratch and I just edited this copied this to jungo and said hey this is the tutorial I don't understand explain it line by line and then it gave me some explanation but I say o

# KNN Search (vector search)

## Index

In [None]:
VINDEX = "docs_vec"

if es_client.indices.exists(index=VINDEX):
    es_client.indices.delete(index=VINDEX)

es_client.indices.create(
    index=VINDEX,
    mappings={
        "properties": {
            "start":   {"type": "keyword"},
            "end":   {"type": "text"},
            "text": {"type": "text"},  # optional if you also want lexical
            "emb":  {
                "type": "dense_vector",
                "dims": 384,
                "index": True,                # enable k-NN
                "similarity": "cosine"        # or "dot_product" / "l2_norm"
            }
        }
    }
)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'docs_vec'})

In [33]:
for d in chunks:
    vec = model.encode(d["text"]).tolist()  # 384 floats
    body = {"id": d["start"], "text": d["text"], "emb": vec}
    es_client.index(index=VINDEX, id=d["start"], document=body, refresh=True)


## How to search

In [48]:
q = "What tool will this course teach?"
qv = model.encode(q).tolist()

resp = es_client.search(
    index=VINDEX,
    knn={
        "field": "emb",
        "query_vector": qv,
        "k": 5,                 # return top-k
        "num_candidates": 100   # search breadth (↑ recall, ↑ latency)
    }
)
for hit in resp["hits"]["hits"]:
    print(hit["_score"], hit["_source"]["text"])


0.7220858 zoom camp, machine learning zoom camp, llm zoom camp. So ML uh ML zoom camp this course is for ML engineers and data scientists. Um MLOps uh zoom camp is for ML engineers again slightly more advanced course than this one. Um so there is also a role called MLOps engineer but it may mean very different things for different companies. So um yeah we teach some uh automation some things like that like make files and so on. Um but yeah so in some cases MLOps engineer is somebody who builds platform for ML engineers then this is not about building platform but it will it talks about different steps and procedures of the ML process. Uh data engineer zoom camp is of course for data engineers uh also for data scientists who want to become better at building pipelines and then LM zoom camp is for AI engineers and data scientists. Then we also have analytics and stock market zoom camp. This is not a course I teach. So this is uh this is done by Ivan who's u like who's leading the course.

## How to append

In [47]:
text = "Nelson is from Nigeria"
emb  = model.encode(text).tolist()
es_client.index(index=VINDEX, id="5000", document={"id":"4","text":text,"emb":emb}, refresh=True)


ObjectApiResponse({'_index': 'docs_vec', '_id': '5000', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 46, '_primary_term': 1})

Deletions/updates: use es.delete(...) or re-index the doc with the same id.
For high-throughput ingestion, prefer bulk API.

In [49]:
q = "Who is from Nigeria?"
qv = model.encode(q).tolist()

resp = es_client.search(
    index=VINDEX,
    knn={
        "field": "emb",
        "query_vector": qv,
        "k": 5,                 # return top-k
        "num_candidates": 100   # search breadth (↑ recall, ↑ latency)
    }
)
for hit in resp["hits"]["hits"]:
    print(hit["_score"], hit["_source"]["text"])

0.82654715 Nelson is from Nigeria
0.55061954 like if you're uh an engineer, if you know how to program, then for you this is enough. Do I need a efficient workstation? Like probably asked you asked, do you need a good computer to uh work on this um course? No. Um actually what you can use is um a code space I think. Oh wait, I think code spaces GitHub code spaces. If you go to here environment here I show how to um configure GitHub code spaces and actually the environment you get there for code spaces is relatively is very not powerful it just have has two CPUs and I don't know how I don't remember how many how much RAM but yeah this is not um very powerful um so then what you need to have is just a computer with Python or computer with internet if you don't if you cannot install some things or maybe you I know some students uh from Nigeria uh that we have that we had in the past they didn't even have a computer they just had a tablet they could still finish the course so they could us

How persistence works

Elasticsearch persists data to disk automatically (indices live on data nodes). You don’t “save” an index from your app.

For backups / disaster recovery, use the Snapshot & Restore API:

Register a snapshot repository (e.g., to an S3 bucket).

Create snapshots on a schedule.

Restore from a snapshot when needed.

In [None]:
# # 4.1 register a repo (example: shared filesystem; for S3 use the S3 repo plugin)
# PUT _snapshot/my_repo
# {
#   "type": "fs",
#   "settings": { "location": "/mnt/es_backups" }
# }

# # 4.2 take a snapshot of both indices
# PUT _snapshot/my_repo/snap_2025_10_18?wait_for_completion=true
# {
#   "indices": "docs_lex,docs_vec",
#   "ignore_unavailable": true,
#   "include_global_state": false
# }

# # 4.3 list snapshots
# GET _snapshot/my_repo/_all

# # 4.4 restore later
# POST _snapshot/my_repo/snap_2025_10_18/_restore
# {
#   "indices": "docs_lex,docs_vec",
#   "include_global_state": false
# }


# Hybrid search (BM25 + vector) in Elasticsearch

Run both searches, then fuse results with Reciprocal Rank Fusion (RRF) or a weighted sum.

In [50]:
import numpy as np

q = "Who is the speaker?"
qv = model.encode(q).tolist()

bm25 = es_client.search(index=index_name, query={"match": {"text": q}}, size=5)
knn  = es_client.search(index=VINDEX, knn={"field":"emb","query_vector":qv,"k":5,"num_candidates":200})


In [51]:
def rankmap(hits, score_key="_score"):
    return {h["_id"]: r+1 for r, h in enumerate(sorted(hits["hits"]["hits"], key=lambda x: -x[score_key]))}

r_bm25 = rankmap(bm25)
r_knn  = rankmap(knn)

In [52]:
r_bm25

{'20:46': 1, '22:08': 2, '47:40': 3, '10:26': 4, '48:54': 5}

In [53]:
r_knn

{'5000': 1, '30:30': 2, '0:00': 3, '27:30': 4, '31:59': 5}

In [54]:
def rrf(doc_id, k=7):
    return (1/(k + r_bm25.get(doc_id, 10**9))) + (1/(k + r_knn.get(doc_id, 10**9)))

ids = {h["_id"] for h in bm25["hits"]["hits"]} | {h["_id"] for h in knn["hits"]["hits"]}
fused = sorted(ids, key=lambda i: -rrf(i))[:10]

In [55]:
ids

{'0:00',
 '10:26',
 '20:46',
 '22:08',
 '27:30',
 '30:30',
 '31:59',
 '47:40',
 '48:54',
 '5000'}

In [56]:
fused

['20:46',
 '5000',
 '30:30',
 '22:08',
 '47:40',
 '0:00',
 '27:30',
 '10:26',
 '48:54',
 '31:59']

In [60]:
# fetch docs if needed
from elasticsearch import NotFoundError

for _id in fused:
    try:
        src = es_client.get(index=index_name, id=_id)
    except NotFoundError:
        try:
            src = es_client.get(index=VINDEX, id=_id)
        except NotFoundError:
            src = None
    print(_id, (src or {}).get("_source", {}))


20:46 {'start': '20:46', 'end': '23:24', 'text': "but yeah so this is for ML engineering. Uh for data engineering uh do you recommend taking this course depends on your goals. If your goal is to learn data engineering then no I do not recommend. If you're a data engineer who work with machine learners, machine learning engineers and with data scientists and you want to understand what what is happening, what they are doing, this course will be very helpful. When it comes to data engineering, we actually have a course called data engineering zoom camp that is focused specifically on data engineering. So this is what you should take instead if you are interested in data engineering. So this course will be tangentally useful just to broaden your horizons. Uh if you work with ML teams uh yes it will be helpful to understand what they are doing but for you as data engineer for your career uh unless you plan to work with ML it will not be very useful. Please explain what jobs are suitable fo

# Index & Mapping Cheatsheet

## A1) Lexical (BM25) index

In [None]:

{
  "settings": {
    "analysis": {
      "filter": {
        "en_stem": { "type": "stemmer", "language": "light_english" }
      },
      "analyzer": {
        "my_english": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase","stop","en_stem"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id":   { "type": "keyword" },
      "title": {
        "type": "text",
        "analyzer": "my_english",
        "fields": { "raw": { "type": "keyword", "ignore_above": 2048 } }
      },
      "body": {
        "type": "text",
        "analyzer": "my_english",
        "fields": { "raw": { "type": "keyword", "ignore_above": 4096 } }
      },
      "tags": { "type": "keyword" },        # for filters
      "created_at": { "type": "date" }      # for ranges/sorts
    }
  }
}


## A2) Vector (k-NN) index

In [None]:

{
  "mappings": {
    "properties": {
      "id":   { "type": "keyword" },
      "text": { "type": "text" },       # optional if you also want lexical
      "emb":  {
        "type": "dense_vector",
        "dims": 384,                    # match your model
        "index": true,                  # enable ANN
        "similarity": "cosine"          # or "dot_product" / "l2_norm"
      },
      "tags": { "type": "keyword" }
    }
  }
}


## A3) One index for Hybrid (lexical + vector)

Goal: keep text + vector side-by-side so you can use single-request

In [None]:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_english": { "type":"custom", "tokenizer":"standard", "filter":["lowercase","stop"] }
      }
    }
  },
  "mappings": {
    "properties": {
      "id":   { "type": "keyword" },
      "text": { "type": "text", "analyzer":"my_english" },
      "emb":  { "type": "dense_vector", "dims": 384, "index": true, "similarity": "cosine" },
      "tags": { "type": "keyword" },
      "created_at": { "type": "date" }
    }
  }
}


# Query Cheatsheet (Parameters that matter)

## B1) Lexical (BM25)

In [None]:
# Basic relevance
{
  "size": 10,
  "query": { "match": { "body": "machine learning zoom camp" } }
}

In [None]:
# Filtering (exact fields don’t affect score)
{
  "query": {
    "bool": {
      "must": { "match": { "body": "registration details" } },
      "filter": [
        { "term": { "tags": "course" } },
        { "range": { "created_at": { "gte": "2025-01-01" } } }
      ]
    }
  },
  "sort": [{ "created_at": "desc" }]
}


In [None]:
# Highlighting
"highlight": { "fields": { "body": {} } }


## B2) Vector (k-NN)
ANN (k-NN) query

In [None]:

{
  "knn": {
    "field": "emb",
    "query_vector": [/* 384-d numbers */],
    "k": 10,                 # results to return
    "num_candidates": 200    # breadth; ↑ for recall (slower)
  },
  "_source": ["id","text","tags"]
}


Knobs

k: top-K by similarity to return.

num_candidates: how many candidates to consider before picking top-k (↑ recall, ↑ latency).

similarity: set at mapping (cosine/dot/l2).

In [None]:
# Vector + filter (narrow the candidate pool)
{
  "knn": {
    "field": "emb",
    "query_vector": [/* ... */],
    "k": 10,
    "num_candidates": 300,
    "filter": { "term": { "tags": "course" } }
  }
}
