In [1]:
import requests

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']
    for doc in course['documents']:
      doc['course'] = course_name
      documents.append(doc)

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [4]:
df.shape

(948, 4)

In [5]:
df[df.course == 'data-engineering-zoomcamp'].shape

(435, 4)

When a boost factor greater than 1.0 is provided for a specific field, the similarity score for that field is multiplied by the boost factor. This means that the field with the boost will have a greater influence on the overall score, making it more important in determining the final ranking of the results. Conversely, if a field is given a boost less than 1.0, it will have less influence on the final

In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class TextSearch:
    def __init__(self, text_fields):
        """
        Initialize the TextSearch class.

        Parameters:
        text_fields (list of str): List of text fields to be used for searching.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}
        self.df = None

    def fit(self, records, vectorizer_params=None):
        """
        Fit the TF-IDF vectorizer to the text fields.

        Parameters:
        records (list of dict): List of records to be indexed.
        vectorizer_params (dict, optional): Parameters to pass to the TfidfVectorizer.
        """
        if vectorizer_params is None:
            vectorizer_params = {}

        # Convert records to a DataFrame
        self.df = pd.DataFrame(records)

        # Fit TF-IDF vectorizers for each text field
        for field in self.text_fields:
            vectorizer = TfidfVectorizer(**vectorizer_params)
            X = vectorizer.fit_transform(self.df[field])
            self.matrices[field] = X
            self.vectorizers[field] = vectorizer

    def search(self, query, n_results=10, boost=None, filters=None):
        """
        Perform a search on the indexed records.

        Parameters:
        query (str): The search query.
        n_results (int): Number of results to return.
        boost (dict, optional): Field-specific boost factors.
        filters (dict, optional): Filters to apply on the records.

        Returns:
        list of dict: The top n matching records.
        """
        if boost is None:
            boost = {}
        if filters is None:
            filters = {}

        # Initialize the score array
        score = np.zeros(len(self.df))

        # Compute the score for each field
        for field in self.text_fields:
            b = boost.get(field, 1.0)
            q_vec = self.vectorizers[field].transform([query])
            field_score = cosine_similarity(self.matrices[field], q_vec).flatten()
            score += b * field_score

        # Apply filters if any
        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score *= mask

        # Get top n results
        top_indices = np.argsort(-score)[:n_results]
        results = self.df.iloc[top_indices]

        return results.to_dict(orient='records')


In [45]:
fileds = ['section', 'question', 'text']
index = TextSearch(text_fields = fileds)
index.fit(documents)

In [51]:
query = "Do I need to know python to sign up for the January course?"
# query = "I just singned up. Is it too late to join the course?"
# query = "Are late submissions of homework allowed?"

In [53]:

index.search(

             query=query,
             n_results=2,
             boost={'question':3.0},
             filters={'course': 'data-engineering-zoomcamp'}
)



[{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. Here is a great tutorial that shows you how to do 

In [41]:
df.course.unique()

array(['data-engineering-zoomcamp', 'machine-learning-zoomcamp',
       'mlops-zoomcamp'], dtype=object)

## SVD

In [56]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X


In [58]:
from sklearn.decomposition import TruncatedSVD
X = matrices['text']
cv = transformers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

query = 'I just singned up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)

X_emb[0]

array([ 0.08799806, -0.07510375,  0.1001265 ,  0.05129153,  0.05299514,
       -0.05883441,  0.02497097,  0.05939274, -0.1875083 ,  0.34031394,
       -0.01492214,  0.09686582, -0.11706412,  0.08004287, -0.03899647,
       -0.0201448 ])

In [61]:
np.dot(X_emb[0], Q_emb[0])
score = cosine_similarity(X_emb, Q_emb).flatten()

score

array([ 9.87373140e-01,  4.28699699e-02,  9.71729238e-01,  9.30435587e-01,
        3.82337014e-02,  6.78537171e-01,  7.02535707e-01,  9.83322171e-01,
        9.16688500e-01,  6.17443631e-01,  7.88127243e-01,  9.83988778e-01,
        9.23961111e-01,  9.64769917e-01,  2.71092504e-01,  9.85269941e-01,
        4.36956285e-01,  7.92657639e-01,  7.79787829e-01,  4.19052691e-01,
        4.99581180e-01, -4.65499864e-02,  8.58137307e-01,  7.14492562e-01,
        1.26453382e-01,  1.26037688e-01,  2.99642835e-01,  8.81904867e-01,
        6.83595776e-01,  8.69915683e-01,  3.64812025e-01,  4.03946715e-01,
        7.62263116e-01,  5.84838017e-01,  1.08227161e-01,  9.00169007e-01,
        4.03219714e-01,  1.83937346e-01,  6.81596295e-01,  5.10013524e-01,
        5.99769040e-01,  1.60858824e-01,  3.20401280e-01,  1.19360829e-01,
       -4.32494365e-03,  2.09254871e-02, -1.78857201e-01,  3.51380997e-02,
        4.58674064e-02,  1.78822728e-01,  1.35782710e-01, -1.50161602e-01,
        3.71421648e-01,  

In [63]:
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are rec

## NMF

In [64]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.00617239e-04, 0.00000000e+00,
       0.00000000e+00, 3.12782928e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [65]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister be

## Transformer

In [66]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [68]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [69]:
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [70]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state



In [71]:
hidden_states.shape

torch.Size([2, 15, 768])

In [72]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape


torch.Size([2, 768])

In [73]:
sentence_embeddings.numpy()


array([[ 0.3599924 , -0.16072305,  0.35452363, ...,  0.04289253,
         0.03482319, -0.03822242],
       [ 0.17849939, -0.5000251 ,  0.25277585, ..., -0.11413134,
        -0.33608466,  0.4109512 ]], dtype=float32)

In [74]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result




In [75]:
from tqdm.auto import tqdm
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)

    all_embeddings = []

    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state

            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)

    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings



embeddings = {}

for f in fields:
    print(f'computing embeddings for {f}...')
    embeddings[f] = compute_embeddings(df[f].tolist())

computing embeddings for section...


  0%|          | 0/119 [00:00<?, ?it/s]

computing embeddings for question...


  0%|          | 0/119 [00:00<?, ?it/s]

computing embeddings for text...


  0%|          | 0/119 [00:00<?, ?it/s]