### Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Implementing text search with scikit-learn

In [2]:
# Download the FAQ documents of the various DataTalks courses. The JSON was already
# parsed and published by the course instructor; it is the knowledge base for the
# search engine built in the rest of this notebook.

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps "Run All" from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list of records and tag each record with
# the course it belongs to. A new dict is built per record, so `documents_raw` is
# left untouched and this cell can be re-run safely.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
# peek at one flattened record to confirm that the 'course' key was attached
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Turn the flat list of records into a DataFrame with an explicit column order.

column_order = ['course', 'section', 'question', 'text']
df = pd.DataFrame(data=documents, columns=column_order)
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [5]:
# The next step is to vectorise the text column. To understand how vectorisation
# works, start with a toy example: a short list of texts called `sample`.

sample = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [6]:
# Fit a CountVectorizer on the toy texts: it learns the vocabulary (English stop
# words removed) and encodes every text as a vector of term counts.

cv = CountVectorizer(stop_words='english')
cv.fit(sample)
X = cv.transform(sample)

names = cv.get_feature_names_out()

# This is a "bag of words": only the counts survive, word order is discarded.
# Rows are vocabulary terms, columns are the positions of the texts in `sample`.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [7]:
# Apply the same bag-of-words encoding to the real FAQ answers. This is the shape of
# data we need, but raw counts carry no notion of how important a term is.
# min_df=5 drops terms that occur in fewer than 5 documents.

cv = CountVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text) # sparse matrix

In [8]:
# Hence we use TF-IDF (Term Frequency - Inverse Document Frequency): a term's count
# is scaled down the more documents it appears in, so words found almost everywhere
# carry less weight than rare, distinctive ones.

tf = TfidfVectorizer(stop_words='english', min_df=5)
X = tf.fit_transform(df.text) # sparse matrix

In [9]:
# inspect the TF-IDF matrix: one row per document, one column per vocabulary term
X

<948x1333 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [10]:
# Represent the query in the same vector space as the documents, i.e. transform it
# with the SAME fitted vectorizer (transform only - never re-fit on the query).

query = "I just discovered the course, is it too late to join?"

q = tf.transform([query])
# dense view for display; most entries are 0 because the query only has a few words
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# For every term shared by the query and a document, multiply the term's TF-IDF
# weight in the query by its weight in the document, then sum over all matching
# terms. The sum measures how similar the query is to that document. Doing this for
# every document gives one relevance score per document, which we can rank by.
# The sparse matrix product below does exactly that for all documents at once.

X.dot(q.T).todense()

matrix([[0.48049682],
        [0.        ],
        [0.        ],
        [0.2083882 ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.17557272],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15870689],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.09680922],
        [0.        ],
        [0.        ],
        [0.07529201],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.29986763],
        [0.10520675],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.27447476],
        [0.12828407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.05163407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.03156309],
        [0.04914818],
        [0.07138962],
        [0.        ],
        [0.04329773],
        [0.        ],
        [0

In [12]:
# What we derived above is in fact cosine similarity: TfidfVectorizer L2-normalises
# every row by default (norm='l2'), so the dot product of two TF-IDF vectors already
# equals the cosine of the angle between them. cosine_similarity gives one number per
# (document, query) pair; flatten() turns the single-column result into a 1-D array
# with one score per document.

score = cosine_similarity(X, q).flatten()

In [13]:
# argsort sorts ascending, so the last five entries are the indexes of the five
# highest-scoring documents, with the best match in the FINAL position

np.argsort(score)[-5:]

array([ 22, 448, 449, 440,   0])

In [14]:
# NOTE: argsort is ascending, so in the array above the best match is the LAST index
# (document 0), not 22. Document 22 is the fifth-best match; its text is shown here.
# Use df.iloc[0].text to look at the top-scoring document instead.

df.iloc[22].text

"It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."

In [15]:
# Build one TF-IDF index per searchable field instead of for `text` alone.

fields = ['section', 'question', 'text']

# every field gets its own vectorizer, i.e. its own vocabulary and IDF weights
vectorizers = {
    field: TfidfVectorizer(stop_words='english', min_df=5)
    for field in fields
}

# fitting happens here; each value is the sparse document-term matrix of one field
matrices = {
    field: vectorizers[field].fit_transform(df[field])
    for field in fields
}

In [16]:
# one sparse document-term matrix per field
matrices

{'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>}

In [17]:
# the fitted vectorizer that produced each field's matrix
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [18]:
# Combine the per-field similarities into a single relevance score per document.

n = len(df)

# a match in the question field counts three times as much as in any other field
boosts = {'question': 3}

score = np.zeros(n)

for f in fields:
    # the query must be encoded with the vectorizer of the field it is compared to
    field_query = vectorizers[f].transform([query])
    field_similarity = cosine_similarity(matrices[f], field_query).flatten()

    # fields without an explicit boost keep a weight of 1
    score += boosts.get(f, 1) * field_similarity

In [19]:
# It is important to add filters so that only documents from the relevant course
# are retrieved.

filters = {
    'course' : 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).values
    # Send filtered-out documents to -inf rather than 0. Multiplying by a 0/1 mask
    # would leave them tied with matching documents whose similarity is 0, so they
    # could still show up in the ranking; -inf always sorts below every real score.
    score = np.where(mask, score, -np.inf)

In [20]:
# negate the scores so that argsort's ascending order puts the best match first,
# then keep the ten highest-scoring rows
idx = np.argsort(-score)[:10]
results = df.iloc[idx]
# plain list of dicts, one per hit, in ranking order
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcem

In [21]:
# same hits as a table; the index column is the document's row position in df
results

Unnamed: 0,course,section,question,text
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
10,data-engineering-zoomcamp,General course-related questions,Course - ​​How many hours per week am I expect...,It depends on your background and previous exp...
411,data-engineering-zoomcamp,Workshop 1 - dlthub,Edit Course Profile.,The display name listed on the leaderboard is ...
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...


### Putting it all together - Let's create a class for us to use

In [22]:
class TextSearch:
    """Small in-memory search index built on per-field TF-IDF and cosine similarity.

    Each text field gets its own TfidfVectorizer. A query is scored against
    every field, the per-field similarities are combined as a weighted sum,
    and the best-scoring records are returned.
    """

    def __init__(self, text_fields):
        """Create an empty index.

        Parameters
        ----------
        text_fields : list of str
            Names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index ``records``.

        Parameters
        ----------
        records : list of dict
            Records to index; they are loaded into a DataFrame and must
            contain every field listed in ``text_fields``.
        vectorizer_params : dict, optional
            Keyword arguments forwarded to every ``TfidfVectorizer``.
            ``None`` (the default) means no extra arguments.
        """
        # None instead of a `{}` default: a mutable default is shared between calls.
        params = vectorizer_params if vectorizer_params is not None else {}
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the records that best match ``query``.

        Parameters
        ----------
        query : str
            Free-text query.
        n_results : int, default 10
            Maximum number of records to return.
        boost : dict, optional
            Maps a field name to the weight of that field's similarity in
            the combined score. Fields that are not listed use 1.0.
        filters : dict, optional
            Maps a field name to the exact value a record must have in that
            field. Records failing any filter are never returned.

        Returns
        -------
        list of dict
            Matching records, best first. Fewer than ``n_results`` records
            are returned when fewer records pass the filters.
        """
        boost = boost if boost is not None else {}
        filters = filters if filters is not None else {}

        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Filters decide which records are candidates at all. Zeroing the score of
        # excluded records is not enough: they would tie with matching records whose
        # similarity is 0 and could leak into the results.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= np.asarray(self.df[field] == value, dtype=bool)

        candidates = np.flatnonzero(keep)
        # stable sort: records with equal scores keep their original relative order
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        results = self.df.iloc[candidates[order]]
        return results.to_dict(orient='records')

In [23]:
# Using it: index the flattened FAQ records on three text fields, then run a query
# in which the question field is boosted 3x and hits are limited to one course.
# fit() is called without vectorizer_params, so TfidfVectorizer runs with its own
# defaults here, unlike the manual walkthrough above, which passed
# stop_words='english' and min_df=5.

index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin