## Download the data

In [1]:
import requests  # Import the requests library to handle HTTP requests

# URL of the JSON file containing course documents
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Send a GET request to the URL to retrieve the JSON file
docs_response = requests.get(docs_url)

# Parse the JSON response into a Python list of dictionaries
documents_raw = docs_response.json()

# Initialize an empty list to store processed documents
documents = []

# Iterate through each course in the raw documents
for course in documents_raw:
    # Extract the course name
    course_name = course['course']

    # Iterate through each document in the course
    for doc in course['documents']:
        # Add the course name to the document dictionary
        doc['course'] = course_name
        # Append the updated document to the documents list
        documents.append(doc)


In [2]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
# Import the pandas library to work with dataframes
import pandas as pd

# Create a dataframe from the list of documents
# Specify the columns to include in the dataframe
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

# Display the first few rows of the dataframe to verify its content
df.head()


Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [4]:
df.tail()

Unnamed: 0,course,section,question,text
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...
947,mlops-zoomcamp,Module 6: Best practices,How to destroy infrastructure created via GitH...,Problem description\nInfrastructure created in...


### Implementing Basic Text Search

In [5]:
# Filter the dataframe to show only the rows where the course is 'data-engineering-zoomcamp'
df[df.course == 'data-engineering-zoomcamp'].head()


Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


### Vectorzation

Vector spaces

    - turn the docs into vectors
    - term-document matrix:
        - rows: documents
        - columns: word/tokens
    - bag of words
        - word order is lost
        - sparse matrix

In [6]:
from sklearn.feature_extraction.text import CountVectorizer # turns document/text into vectors

In [7]:
cv = CountVectorizer() # intitalizes the count vectorization for 

In [8]:
cv.fit(df.text)

In [9]:
cv.get_feature_names_out()

array(['00', '00000000e', '0002', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      dtype=object)

In [10]:
cv.get_feature_names_out().shape

(6711,)

In [11]:
cv = CountVectorizer(min_df=5) # Intricues a minimal document frequency of 5 

In [12]:
cv.fit(df.text)

In [13]:
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [14]:
cv.get_feature_names_out().shape

(1524,)

For Count Vectorizer and TF-IDF we will first use a simple example

In [15]:
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [16]:
cv = CountVectorizer()

In [17]:
cv.fit(doc_examples)

In [18]:
cv.get_feature_names_out().shape

(25,)

In [19]:
X = cv.transform(doc_examples)

In [20]:
X.todense() # To see what's insside X

matrix([[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
         0, 0, 0, 0],
        [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 0, 0, 0]])

In [21]:
X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 27 stored elements and shape (5, 25)>

In [22]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out()).T # Put X and xcv into datafram and Transposes it

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
after,0,0,1,0,0
and,0,0,0,0,1
before,0,0,0,0,1
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
for,0,0,0,1,0
github,0,1,0,0,0


In the dataframe above, we see the presence of stop words like, "for", "and", "not", "on". These words are not superior elements so we disregard them and introduce a `"stop_words"` when instantiating `CountVectorizer`

In [23]:
cv = CountVectorizer(stop_words='english')

In [24]:
cv.fit(doc_examples)

In [25]:
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [26]:
X = cv.transform(doc_examples)

In [27]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out()).shape 

(5, 19)

In [28]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out()).T 

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


This representation is called **"bag of words"** - here we ignore the order of words, and just focus on the words themselves. In many cases, this is sufficient and gives pretty good results already.

I'll use count vectorizer of the whole dataframe.

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,y_val,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
944,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
945,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
946,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The concept here is the less frequently a word appears the more important it is. For example, the word **'yml'** is more important than the word **'yes'** even though **'yes'** may appear more in words than **'yml'**. This is where **TfidVEctorizer** comes into play; which focuses on more important terms.

### TfidfVectorizer

Now let's replace **`CountVectorizer`** with **`TfidfVectorizer`**

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs.round(2)

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,y_val,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.43
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.28,0.00,0.0,0.0,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.11,0.0,0.0,0.00
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.17,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00


It's now evident that `TfidVectorizer` grades the importance of words by how less frequently they appear in the search words. Even though in this dataset, the word **`yes`** is less frequently used than the word **`yml`**.|

### Query-Document Similarity

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text) # X is the matrix

In [32]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23808 stored elements and shape (948, 1333)>

In [33]:
query = "I just discovered the course, is it too late to join?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [34]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'01': 0.0,
 '02': 0.0,
 '03': 0.0,
 '04': 0.0,
 '05': 0.0,
 '06': 0.0,
 '09': 0.0,
 '10': 0.0,
 '100': 0.0,
 '11': 0.0,
 '12': 0.0,
 '127': 0.0,
 '13': 0.0,
 '14': 0.0,
 '15': 0.0,
 '16': 0.0,
 '17': 0.0,
 '19': 0.0,
 '1st': 0.0,
 '20': 0.0,
 '2019': 0.0,
 '2020': 0.0,
 '2021': 0.0,
 '2022': 0.0,
 '2023': 0.0,
 '2024': 0.0,
 '21': 0.0,
 '22': 0.0,
 '24': 0.0,
 '25': 0.0,
 '2pacx': 0.0,
 '30': 0.0,
 '35': 0.0,
 '403': 0.0,
 '42': 0.0,
 '50': 0.0,
 '5000': 0.0,
 '5431': 0.0,
 '5432': 0.0,
 '60': 0.0,
 '600': 0.0,
 '7077': 0.0,
 '80': 0.0,
 '8080': 0.0,
 '8888': 0.0,
 '9696': 0.0,
 'abhijit': 0.0,
 'able': 0.0,
 'abolade': 0.0,
 'absolute': 0.0,
 'accept': 0.0,
 'access': 0.0,
 'accordingly': 0.0,
 'account': 0.0,
 'accuracy': 0.0,
 'action': 0.0,
 'activate': 0.0,
 'actual': 0.0,
 'actually': 0.0,
 'add': 0.0,
 'added': 0.0,
 'adding': 0.0,
 'addition': 0.0,
 'additional': 0.0,
 'additionally': 0.0,
 'address': 0.0,
 'admin': 0.0,
 'advani': 0.0,
 'ahmed': 0.0,
 'ai': 0.0,
 'airflow': 0

In [35]:
X.dot(q.T).todense() # This is cosine similarity

matrix([[0.48049682],
        [0.        ],
        [0.        ],
        [0.2083882 ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.17557272],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15870689],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.09680922],
        [0.        ],
        [0.        ],
        [0.07529201],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.29986763],
        [0.10520675],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.27447476],
        [0.12828407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.05163407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.03156309],
        [0.04914818],
        [0.07138962],
        [0.        ],
        [0.04329773],
        [0.        ],
        [0

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

# It's a very fast and efficient method of computing similarities. 
score = cosine_similarity(X, q).flatten() # Flatten turns it to a 1D array

In [37]:
import numpy as np

np.argsort(score) # Sorts the scores of the array and returs the indexes of the scores

array([473, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613,
       614, 615, 616, 617, 618, 620, 621, 622, 624, 625, 627, 601, 628,
       600, 596, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580,
       581, 582, 583, 584, 585, 586, 589, 590, 591, 592, 594, 595, 597,
       569, 629, 632, 665, 666, 667, 668, 670, 671, 673, 674, 675, 676,
       677, 678, 679, 680, 682, 683, 684, 686, 687, 688, 689, 690, 691,
       664, 631, 663, 661, 633, 634, 635, 636, 637, 638, 640, 641, 642,
       643, 644, 645, 646, 647, 649, 650, 651, 652, 653, 655, 658, 659,
       660, 662, 692, 567, 564, 478, 479, 480, 481, 482, 483, 484, 486,
       487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499,
       500, 501, 477, 504, 476, 474, 433, 434, 437, 438, 441, 442, 443,
       444, 446, 447, 453, 459, 460, 461, 462, 463, 466, 467, 468, 469,
       471, 472, 946, 475, 566, 505, 507, 539, 540, 541, 542, 543, 544,
       545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 55

It should be noted that th above are note the actual scoes but the indexes of the documents with the highest scores in descending order

In [38]:
np.argsort(score)[-5:] # Looks at the last five scores

array([ 22, 448, 449, 440,   0])

In [39]:
df.iloc[22]

course                              data-engineering-zoomcamp
section                      General course-related questions
question    Environment - Do we really have to use GitHub ...
text        It's up to you which platform and environment ...
Name: 22, dtype: object

In [40]:
df.iloc[22].text

"It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."

In [41]:
df.iloc[448].text

"Here’s how you join a in Slack: https://slack.com/help/articles/205239967-Join-a-channel\nClick “All channels” at the top of your left sidebar. If you don't see this option, click “More” to find it.\nBrowse the list of public channels in your workspace, or use the search bar to search by channel name or description.\nSelect a channel from the list to view it.\nClick Join Channel.\nDo we need to provide the GitHub link to only our code corresponding to the homework questions?\nYes. You are required to provide the URL to your repo in order to receive a grade"

In [42]:
df.iloc[449].text

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

In [43]:
df.iloc[440].text

"The process is automated now, so you should receive the email eventually. If you haven’t, check your promotions tab in Gmail as well as spam.\nIf you unsubscribed from our newsletter, you won't get course related updates too.\nBut don't worry, it’s not a problem. To make sure you don’t miss anything, join the #course-ml-zoomcamp channel in Slack and our telegram channel with announcements. This is enough to follow the course."

In [44]:
df.iloc[0].text

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

Kindly note that not al of the indexes aboveare relevant to the query. Some of them are just matches to the text from the Tfidvectorizer

### Vectorizing all the documents

In [45]:
fields = ['section', 'question', 'text']

In [46]:
matrices = {}   # Matrices are the X; X = cv.fit_transform(df.text) # X is the matrix
vectorizers = {}   # Vectorizers are the cv; cv = TfidfVectorizer(stop_words='english', min_df=5)

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[field])
    matrices[field] = X
    vectorizers[field] = cv

In [47]:
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 23808 stored elements and shape (948, 1333)>}

In [48]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

To look across all the fields, First, I'll create an array with all 0's and then I'll llop over all the fields.
For each field, I'll then copy the similarity and then sum similarity across the fields

In [49]:
n = len(df)  # The size of the matrix should be the size of our dataframe

### Search
I'll now do search with the text field:

In [50]:
score = np.zeros(n)

query = "I just discovered the course, is it too late to join?"

for field in fields:
    q = vectorizers[field].transform([query])
    X = matrices[field]

    field_score = cosine_similarity(X, q).flatten()

    score = score + field_score
    

In [51]:
idx = np.argsort(score)[-5:]

In [52]:
idx

array([453,   5,   7, 448,   0])

In [53]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
453,machine-learning-zoomcamp,General course-related questions,What are the deadlines in this course?,"For the 2023 cohort, you can see the deadlines..."
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
448,machine-learning-zoomcamp,General course-related questions,I’m new to Slack and can’t find the course cha...,Here’s how you join a in Slack: https://slack....
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


There's a problem, above. We see that the result returns multiple results from diffrent courses. We need to add a filter to ensure that it only return answers from the **'data-engineering-zoomcamp'**

In [54]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

In [55]:
for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask   # This ensures that results with 1 remain as 0 * 1 = 0

In [56]:
idx = np.argsort(score)[-5:]

In [57]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


One more thing to be done, for more accurate result that matches the query, is to make the `question` field more relevant than the `text`. I'll adopt the concept of booting from **`elastic search`**

### Search with all the fields & boosting + filtering

I can do it for all the fields. I'll also boost one of the fields - `question` - to give it more importance than to others.

In [58]:
score = np.zeros(n)

query = "I just discovered the course, is it too late to join?"

boosts = {
    'question': 3.0 #, 
    # 'text': 0.5
}

for field in fields:
    q = vectorizers[field].transform([query])
    X = matrices[field]

    field_score = cosine_similarity(X, q).flatten()

    boost = boosts.get(field, 1.0)  # This get a boost value from the boost or assigns 1 if it is not present
    
    score = score +  boost * field_score
    

In [59]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask   # This ensures that results with 1 remain as 0 * 1 = 0

In [60]:
idx = np.argsort(-score)[:5] # Either this or idx = np.argsort(score)[-5:] gets the most relevant

df.iloc[idx]

Unnamed: 0,course,section,question,text
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."


### Putting it all together
I'll create a class for us to use:

In [61]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

Using the class

In [62]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

### Embeddings and Vector Search

I'll use the vectorizer for the "text" field and turn it into embeddings

In [63]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = vectorizers['text']

In [64]:
svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

In [65]:
X_emb.shape

(948, 16)

In [66]:
X_emb[0]

array([ 0.09652885, -0.08198954, -0.10169548, -0.07996557,  0.06898671,
       -0.06270984,  0.02203515, -0.14103113, -0.23880195,  0.29628329,
        0.05049173,  0.06213355,  0.07237429, -0.10297324, -0.00503199,
        0.02695145])

In [67]:
query = 'I just singned up. Is it too late to join the course?'

Q = cv.transform([query])  # Applying the same transform as above creates a sparse matrix
Q_emb = svd.transform(Q)   # Turns the document into dense vecorizer
Q_emb[0]

array([ 0.05790074, -0.03844997, -0.05605559, -0.0283301 ,  0.04037123,
       -0.06409979,  0.01269419, -0.09143724, -0.16035333,  0.19264608,
        0.04322008,  0.06521423,  0.05451981, -0.0771471 ,  0.01555452,
        0.01997809])

I'll check for `similarity` between query and the document:

In [68]:
np.dot(X_emb[0], Q_emb[0])

0.15064244004657765

Checking the similarity across all the docment

In [69]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(score)[-5:]
list(df.loc[idx].text)

['No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'If you have subm

### Non-Negative Matrix Factorization

While its difficult to interpet SVD creates values with negative numbers, **NMF** (Non-Negative Matrix Factorization)- using a similar concept, produces non-negative results; except for non-negative input matrices.

We can interpret each of the columns (features) of the embeddings as different topic/concents and to what extent this document is about this concept.

In [70]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.27767195, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [71]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.00224951, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.18345422, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.0018632 ,
       0.        ])

We again compute the cosine similarity

In [72]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
df.loc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
437,machine-learning-zoomcamp,General course-related questions,What if I miss a session?,"Everything is recorded, so you won’t miss anyt..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."


### BERT
The problem with the previous two approaches is that they don't take into account the word order. They just treat all the words separately (that's why it's called "Bag-of-Words")

BERT and other transformer models don't have this problem. 

I'll create embeddings with BERT and use the Hugging Face library for that.

In [73]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

Above, I downloaded two things:
- The `text`
- the `tokenizer`

First, I have to tokenize the text:

In [75]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

Then I compute the embeddings:

In [76]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [77]:
hidden_states.shape

torch.Size([2, 15, 768])

In [80]:
hidden_states[0]

tensor([[ 0.1010,  0.0181,  0.1303,  ..., -0.2932,  0.1863,  0.6615],
        [ 1.0608, -0.1242,  0.1370,  ..., -0.1605,  1.0429,  0.3532],
        [ 0.1802,  0.0776,  0.3941,  ..., -0.1379,  0.5974,  0.1704],
        ...,
        [ 0.4738, -0.0184,  0.2186,  ..., -0.0013, -0.0833, -0.2170],
        [ 0.6516,  0.1216, -0.2494,  ...,  0.1557, -0.5632, -0.4310],
        [ 0.7164,  0.2157, -0.0281,  ...,  0.2281, -0.6725, -0.3245]])

In [81]:
hidden_states[0].shape

torch.Size([15, 768])

Now, I need to compress the embeddings from the averages of the rows

In [82]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [83]:
sentence_embeddings

tensor([[ 0.3600, -0.1607,  0.3545,  ...,  0.0429,  0.0348, -0.0382],
        [ 0.1785, -0.5000,  0.2528,  ..., -0.1141, -0.3361,  0.4110]])

The above shos the embeddings for the documents numbers 1 and 2

Now I need to compute it for all the documents. 

**Note:**
Kindly note that **`BERT`** is an overkill for this project, because of the dataset size. It, however, is more powerful for more complex data and performs better in bigger documents or datasets.

First, I'll convert the documents to a numpy array

In [86]:
X_emb = sentence_embeddings.numpy()

In [87]:
sentence_embeddings_cpu = sentence_embeddings.cpu()

Now, I'll compute it for the texts. I'll do it in batches

In [88]:
def make_batches(seq, n):
    """
    Split a sequence into batches of size n.

    Args:
        seq (list): The input sequence to be split into batches.
        n (int): The batch size.

    Returns:
        list of lists: A list containing the batches, where each batch is a list of n elements from the input sequence.
    """
    # Initialize an empty list to store the batches
    result = []
    
    # Iterate over the sequence in steps of n
    for i in range(0, len(seq), n):
        # Get the current batch of size n
        batch = seq[i:i+n]
        # Append the current batch to the result list
        result.append(batch)
    
    return result


In [90]:
from tqdm import tqdm

texts = df['text'].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state
        
        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

final_embeddings = np.vstack(all_embeddings)

100%|██████████████████████████████████████████████████████| 119/119 [11:02<00:00,  5.57s/it]


In [92]:
final_embeddings.shape

(948, 768)

### Putting the above in a Function

In [93]:
def compute_embeddings(texts, batch_size=8):
    """
    Compute embeddings for a list of texts using a pre-trained transformer model.

    Args:
        texts (list of str): The input texts to compute embeddings for.
        batch_size (int, optional): The number of texts to process in each batch. Default is 8.

    Returns:
        numpy.ndarray: A 2D array containing the computed embeddings for all input texts.
    """
    # Split the input texts into batches of specified size
    text_batches = make_batches(texts, batch_size)
    
    # Initialize an empty list to store embeddings for all batches
    all_embeddings = []
    
    # Iterate over each batch of texts
    for batch in tqdm(text_batches, desc="Computing embeddings"):
        # Tokenize the current batch of texts
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        
        # Disable gradient calculation to speed up inference
        with torch.no_grad():
            # Pass the tokenized input through the model to get the outputs
            outputs = model(**encoded_input)
            # Get the hidden states from the model's last layer
            hidden_states = outputs.last_hidden_state
            
            # Compute the mean of the hidden states across the sequence length dimension
            batch_embeddings = hidden_states.mean(dim=1)
            # Convert the batch embeddings to a NumPy array
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            # Append the current batch embeddings to the list of all embeddings
            all_embeddings.append(batch_embeddings_np)
    
    # Stack all batch embeddings vertically to form the final embeddings array
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings


In [94]:
X_text = compute_embeddings(df['text'].tolist())

Computing embeddings: 100%|████████████████████████████████| 119/119 [10:57<00:00,  5.53s/it]


In [97]:
X_text.shape

(948, 768)

In [99]:
# Your query text
q_text = "Can I still join the course after the start date?"

# Compute embedding for the query
q_emb = compute_embeddings([q_text])


Computing embeddings: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 16.36it/s]


In [102]:
# Assuming `document_embeddings` contains the embeddings of the documents
score = cosine_similarity(q_embedding, X_text)

# Flatten the similarities array for easy indexing
score = score.flatten()


In [103]:
# Get the indices of the top N most similar documents
top_idx = score.argsort()[-5:][::-1]  # Top 5 documents

# Retrieve the corresponding rows from the DataFrame
top_idx = df.iloc[top_idx]

# Display the top N documents
top_idx

Unnamed: 0,course,section,question,text
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...


### Class-Based Implementation
Here’s how you can refactor the script into a class-based implementation:

In [104]:
import torch
import requests
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity


class CourseFAQBot:
    def __init__(self, model_name="bert-base-uncased", docs_url=None, batch_size=8):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()  # Set the model to evaluation mode if not training
        self.batch_size = batch_size
        self.df = self._download_and_process_documents(docs_url)
        self.document_embeddings = self.compute_embeddings(self.df['text'].tolist())

    def _download_and_process_documents(self, docs_url):
        """
        Download and process the document data.
        """
        docs_response = requests.get(docs_url)
        documents_raw = docs_response.json()
        
        documents = []
        for course in documents_raw:
            course_name = course['course']
            for doc in course['documents']:
                doc['course'] = course_name
                documents.append(doc)
        
        # Create the DataFrame
        return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

    def make_batches(self, seq, n):
        """
        Split a sequence into batches of size n.
        """
        result = []
        for i in range(0, len(seq), n):
            batch = seq[i:i+n]
            result.append(batch)
        return result

    def compute_embeddings(self, texts):
        """
        Compute embeddings for a list of texts using a pre-trained transformer model.
        """
        text_batches = self.make_batches(texts, self.batch_size)
        all_embeddings = []
        
        for batch in tqdm(text_batches, desc="Computing embeddings"):
            encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                outputs = self.model(**encoded_input)
                hidden_states = outputs.last_hidden_state
                batch_embeddings = hidden_states.mean(dim=1)
                batch_embeddings_np = batch_embeddings.cpu().numpy()
                all_embeddings.append(batch_embeddings_np)
        
        final_embeddings = np.vstack(all_embeddings)
        return final_embeddings

    def query(self, query_text, top_n=5):
        """
        Perform a query to find the most relevant documents.
        """
        query_embedding = self.compute_embeddings([query_text])
        similarities = cosine_similarity(query_embedding, self.document_embeddings).flatten()
        top_n_indices = similarities.argsort()[-top_n:][::-1]
        top_n_documents = self.df.iloc[top_n_indices]
        return top_n_documents


In [105]:
# Importing the document from the url
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
faq_bot = CourseFAQBot(docs_url=docs_url)

# Your query text
query_text = "Can I still join the course after the start date?"

# Get top 5 most relevant documents
top_documents = faq_bot.query(query_text)
top_documents

Computing embeddings:   0%|          | 0/119 [00:00<?, ?it/s]

Computing embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,course,section,question,text
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...
