In [50]:
# Downloading the dataset 
import requests

docs_url = 'https://raw.githubusercontent.com/oluwafemidan/search_engine/main/documents.json'
docs_response = requests.get(docs_url)
docs_response.raise_for_status()

docs_content = docs_response.json()

documents = []

for course in docs_content:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [51]:
documents[5]

{'text': "There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:\nData-Engineering (Jan - Apr)\nMLOps (May - Aug)\nMachine Learning (Sep - Jan)\nThere's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.\nThey follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.",
 'section': 'General course-related questions',
 'question': 'Course - how many Zoomcamps in a year?',
 'course': 'data-engineering-zoomcamp'}

In [52]:
# Creating the DataFrame and rearranging the the columns of the dataset
import pandas as pd
df = pd.DataFrame(documents,columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [53]:
# filtering the course for only "data-engineering-zoomcamp"
df[df.course=='data-engineering-zoomcamp'].head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


# Term Frequency - Inverse Document Frequency(TF-IDF)


In [54]:
#Applying TF-IDF on the document to convert the words into vectors
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english',min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [55]:
#Testing the search engine with TF-IDF
query = "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [56]:
X.dot(q.T)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 294 stored elements and shape (948, 1)>

In [57]:
X.dot(q.T).todense()

matrix([[0.19464486],
        [0.        ],
        [0.        ],
        [0.06011641],
        [0.04932915],
        [0.        ],
        [0.        ],
        [0.13477565],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15899187],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.07431408],
        [0.        ],
        [0.        ],
        [0.05779673],
        [0.07243428],
        [0.        ],
        [0.05174293],
        [0.16373635],
        [0.08076031],
        [0.        ],
        [0.09755254],
        [0.        ],
        [0.21069625],
        [0.12067781],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.06381749],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.00910541],
        [0.02835681],
        [0.05480112],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [58]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X,q)

array([[0.19464486],
       [0.        ],
       [0.        ],
       [0.06011641],
       [0.04932915],
       [0.        ],
       [0.        ],
       [0.13477565],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.15899187],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.07431408],
       [0.        ],
       [0.        ],
       [0.05779673],
       [0.07243428],
       [0.        ],
       [0.05174293],
       [0.16373635],
       [0.08076031],
       [0.        ],
       [0.09755254],
       [0.        ],
       [0.21069625],
       [0.12067781],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.06381749],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00910541],
       [0.02835681],
       [0.05480112],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.024

In [59]:
score = cosine_similarity(X,q).flatten()
score

array([0.19464486, 0.        , 0.        , 0.06011641, 0.04932915,
       0.        , 0.        , 0.13477565, 0.        , 0.        ,
       0.        , 0.15899187, 0.        , 0.        , 0.        ,
       0.07431408, 0.        , 0.        , 0.05779673, 0.07243428,
       0.        , 0.05174293, 0.16373635, 0.08076031, 0.        ,
       0.09755254, 0.        , 0.21069625, 0.12067781, 0.        ,
       0.        , 0.        , 0.        , 0.06381749, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00910541,
       0.02835681, 0.05480112, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02469964, 0.05129386, 0.06013439,
       0.05252658, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04169018, 0.        , 0.        , 0.        , 0.0075293 ,
       0.        , 0.        , 0.01971463, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [60]:
import numpy as np
np.argsort(score)

array([473, 563, 564, 566, 567, 568, 569, 570, 571, 572, 574, 575, 576,
       578, 579, 580, 581, 582, 583, 584, 562, 561, 560, 559, 530, 532,
       533, 534, 535, 536, 537, 538, 542, 585, 544, 548, 549, 550, 551,
       552, 553, 555, 556, 558, 546, 586, 590, 594, 634, 635, 636, 637,
       638, 640, 641, 643, 644, 631, 645, 647, 649, 650, 651, 652, 653,
       654, 655, 657, 646, 528, 630, 627, 595, 597, 600, 601, 602, 604,
       605, 606, 607, 628, 608, 612, 613, 614, 615, 616, 618, 621, 622,
       626, 611, 527, 526, 525, 422, 423, 426, 427, 428, 429, 430, 432,
       437, 421, 441, 443, 444, 447, 453, 460, 461, 462, 463, 466, 442,
       467, 420, 418, 385, 386, 387, 389, 390, 392, 397, 399, 400, 419,
       402, 405, 407, 408, 409, 410, 412, 414, 416, 417, 404, 658, 468,
       472, 499, 501, 504, 505, 506, 507, 508, 509, 510, 498, 512, 514,
       515, 516, 517, 518, 519, 520, 523, 524, 513, 471, 497, 495, 946,
       474, 475, 476, 477, 478, 479, 480, 481, 496, 482, 486, 48

In [61]:
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445])

In [62]:
# Here is the response from the tested query: "Do I need to know Python to sign up for the January course?"
df.iloc[445].text

'Check this article. If you know everything in this article, you know enough. If you don’t, read the article and join the coursIntroduction to Pythone too :)\nIntroduction to Python – Machine Learning Bookcamp\nYou can follow this English course from the OpenClassrooms e-learning platform, which is free and covers the python basics for data analysis: Learn Python Basics for Data Analysis - OpenClassrooms . It is important to know some basics such as: how to run a Jupyter notebook, how to import libraries (and what libraries are), how to declare a variable (and what variables are) and some important operations regarding data analysis.\n(Mélanie Fouesnard)'

In [63]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


## Applying TF-IDF to Vectorize all the documents

In [64]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [65]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

# BERT
The problem with the Term Frequency-Inverse Document Frequency(TF-IDF) approach is that it doesn't take into account the word order, sparsity, and out of vocabulary. 

BERT and other transformer models don't have this problem.

Let's create embeddings with BERT. We will use the Hugging Face library for that

In [66]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [68]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [69]:
from tqdm import tqdm
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [70]:
# Indexing the documents
document_texts = df['text'].tolist()
document_embeddings = compute_embeddings(document_texts)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [10:52<00:00,  5.48s/it]


In [71]:
# Encode query
def encode_query(query):
    encoded_input = tokenizer([query], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**encoded_input)
        query_embedding = outputs.last_hidden_state.mean(dim=1)
    return query_embedding.cpu().numpy()

# Perform search
def search(query, document_embeddings, top_k=5):
    query_embedding = encode_query(query)
    similarities = cosine_similarity(query_embedding, document_embeddings)
    similarities = similarities.flatten()
    top_k_indices = similarities.argsort()[-top_k:][::-1]
    return top_k_indices, similarities[top_k_indices]

# Display the top-k results with document content
def display_top_k_results(query, document_texts, document_embeddings, top_k=5):
    top_k_indices, top_k_similarities = search(query, document_embeddings, top_k=top_k)
    print(f"Top {top_k} search results for query: '{query}'\n")
    for index, similarity in zip(top_k_indices, top_k_similarities):
        print(f"Document: {document_texts[index]}")
        print(f"Similarity: {similarity:.4f}\n")

# Perform search and display results
query = "I just signed up. Is it too late to join the course?"
display_top_k_results(query, document_texts, document_embeddings, top_k=5)


Top 5 search results for query: 'I just signed up. Is it too late to join the course?'

Document: There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:
Data-Engineering (Jan - Apr)
MLOps (May - Aug)
Machine Learning (Sep - Jan)
There's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.
They follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.
Similarity: 0.6459

Document: Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.
Click on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube 