## Environment Preparation

In [1]:
# Install packages
!pip install requests pandas scikit-learn jupyter



You should consider upgrading via the 'D:\learning\llm\search_engine\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [7]:
import pandas as pd
import requests

In [9]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

docs_response = requests.get(docs_url)
docs_raw = docs_response.json()
# print(docs_raw)

In [10]:
documents = []
for course in docs_raw:
    for docs in course['documents']:
        # print(docs)
        docs['course'] = course['course']
        documents.append(docs)
print(documents[0])

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'}


In [11]:
# Create a pandas df
import pandas as pd

df = pd.DataFrame(documents,columns=['course','section','question','text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [12]:
df.course.unique()

array(['data-engineering-zoomcamp', 'machine-learning-zoomcamp',
       'mlops-zoomcamp'], dtype=object)

## Text Search 
- Information retrieved from large dataset
- **vector space** - A mathematical representation where text is converted into vectors (points in space) allowing for quantitative comparison.
- **Bag of Words** - A simple text representation model treating each document as a collection of words disregarding grammar and word order but keeping multiplicity.
- **TF-IDF** (Term Frequency-Inverse Document Frequency) - A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.

In [13]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [14]:
# cv = CountVectorizer()
# cv.fit(df.text)

In [15]:
# cv.get_feature_names_out()

In [16]:
doc_text_samples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

 - Creating the bag of words

In [17]:
# cv = CountVectorizer(stop_words='english')
# x = cv.fit_transform(doc_text_samples)

# names = cv.get_feature_names_out()

In [18]:
# # Investigating the vectorized words - Sparse Matrix
# x = cv.transform(doc_text_samples)
# x.todense()
# pd.DataFrame(x.todense(),columns = cv.get_feature_names_out()).T

In [19]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(doc_text_samples)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [20]:
# TfidfVectorizer - Classifies words based on importance 

#Vectorizers
cv = TfidfVectorizer(stop_words='english')
# X = cv.fit_transform(df.text)

#Matrices
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs.round(2)

Unnamed: 0,00,00000000e,0002,00021,001,009s,01,02,020,028879,...,zoompcamp,zshrc,ángel,çelik,开启屏幕阅读器支持,斜杠,查找和替换,要了解键盘快捷键,要启用屏幕阅读器支持,请按ctrl
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Query document similarity

In [21]:
query = "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# View the array

query_dict = dict(zip(names,q.toarray()[0]))
query_dict

doc_dict = dict(zip(names,X.toarray()[1]))
doc_dict

{'00': np.float64(0.0),
 '00000000e': np.float64(0.0),
 '0002': np.float64(0.0),
 '00021': np.float64(0.0),
 '001': np.float64(0.0),
 '009s': np.float64(0.0),
 '01': np.float64(0.0),
 '02': np.float64(0.0),
 '020': np.float64(0.0),
 '028879': np.float64(0.0),
 '02d': np.float64(0.0),
 '03': np.float64(0.0),
 '0315': np.float64(0.0),
 '04': np.float64(0.0),
 '04d': np.float64(0.0),
 '05': np.float64(0.0),
 '051': np.float64(0.0),
 '054': np.float64(0.0),
 '06': np.float64(0.0),
 '06_spark_sql': np.float64(0.0),
 '07': np.float64(0.0),
 '07cd': np.float64(0.0),
 '08': np.float64(0.0),
 '09': np.float64(0.0),
 '0ms': np.float64(0.0),
 '0x3c947bc5': np.float64(0.0),
 '0x7efe331cf790': np.float64(0.0),
 '0x7f797010a590': np.float64(0.0),
 '0x7fbaf2666280': np.float64(0.0),
 '0x800701bc': np.float64(0.0),
 '0xa0': np.float64(0.0),
 '0xff': np.float64(0.0),
 '0zw04wdetqo': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '1000': np.float64(0.0),
 '100000': np.float64(0.0),
 

In [23]:
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

(df_qd['query'] * df_qd['doc']).sum()

np.float64(0.0)

In [24]:
# Compute the similarity product (Document and vector)
X.dot(q.T).todense()

matrix([[0.16865333],
        [0.        ],
        [0.        ],
        [0.03733748],
        [0.04085934],
        [0.        ],
        [0.        ],
        [0.09807025],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.13021262],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.05696417],
        [0.        ],
        [0.        ],
        [0.04230602],
        [0.05451619],
        [0.        ],
        [0.04440487],
        [0.16373635],
        [0.07091479],
        [0.        ],
        [0.0880943 ],
        [0.        ],
        [0.17574325],
        [0.0960077 ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.05523241],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.0075487 ],
        [0.02332155],
        [0.0492298 ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

### Compute cosine similarity

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
score = cosine_similarity(X,q).flatten()

In [27]:
# Sort the documents
import numpy as np

In [28]:
np.argsort(score)[:-5]

array([ 43, 916, 917, 918, 919, 920, 921, 922, 923, 925,  16,  17, 931,
       933, 934,  10,  34,   1,   2,  20,  24, 926, 927, 928, 929, 930,
        35,  36,  37,  38, 766, 937,  12,  13,  14,  51,  52,  53,  54,
        55,  56,  57,  58,  59,  61,  62,  63, 769,  65,  66, 770,  68,
        69,  70,  71,  72,  73,  74,  75,  77, 692,  79,  80,  81,  82,
        83, 695,  85,  86,  87,  88,  89,  92,  93,  94,  95, 696,  97,
        98,  99, 697, 698, 104, 107, 108, 109, 110, 111, 699, 113, 114,
       901, 902, 903, 904, 700, 906, 907, 910, 911, 912, 913, 701, 115,
        26, 117, 119, 120, 121,  29, 124,  30, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146,
       147, 149, 151, 152, 153, 155, 156, 157,  31, 160, 161, 162, 163,
       164, 166, 167, 168, 169, 170, 171, 173,  32, 175, 176, 177, 178,
       179, 182, 183, 185, 187, 188, 936, 190,  44, 192, 883, 884, 885,
       886, 887, 888, 890, 892, 893,  45, 195, 196, 197, 200, 20

In [29]:
df.iloc[591].text

'Open terminal and type the code below to check the version on your laptop\npython3 --version\nFor windows,\nVisit the official python website at  https://www.python.org/downloads/ to download the python version you need for installation\nRun the installer and  ensure to check the box that says “Add Python to PATH” during installation and complete the installation by following the prompts\nOr\nFor Python 3,\nOpen your command prompt or terminal and run the following command:\npip install --upgrade python\nAminat Abolade'

##### Search across fields

In [30]:
fields = ['section','question','text']

In [31]:
matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer(stop_words='english',min_df=5)
    X = cv.fit_transform(df[f])
    matrices[f] = X
    vectorizers[f] = cv

In [32]:
matrices

{'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>}

In [33]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [34]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [35]:
#Score the query across all fields

n = len(df)

score = np.zeros(n)

query = "I just discovered the course, is it too late to join?"

#Add a boost 
boosts = {
    'question':3
}

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X,q).flatten()
    boost = boosts.get(f,1.0)
    score  = score + boost*f_score

In [36]:
score

array([3.52985023, 3.49512426, 2.70735166, 2.96614194, 3.49512426,
       3.49512426, 1.93689291, 3.67069698, 2.67242848, 3.49512426,
       3.10198469, 2.46096752, 0.49512426, 0.49512426, 0.49512426,
       0.59193348, 0.49512426, 2.63772182, 0.57041627, 0.49512426,
       0.49512426, 0.49512426, 0.79499188, 0.60033101, 0.49512426,
       0.49512426, 0.49512426, 0.76959902, 0.62340833, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.78972334, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.49512426, 0.52668735,
       0.54427244, 2.00115141, 0.49512426, 0.53842198, 0.        ,
       0.        , 0.        , 0.        , 0.02804374, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06739038, 0.        , 0.00980845,
       0.        , 0.        , 0.        , 0.        , 0.05820102,
       0.        , 0.        , 0.        , 0.        , 0.     

In [37]:
# Filter only data engineering content

filter = {
    'course': 'data-engineering-zoomcamp'
}

for field,value in filter.items():
    mask = (df[field] == value).astype(int).values
    score = score*mask
mask

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [38]:
score

array([3.52985023, 3.49512426, 2.70735166, 2.96614194, 3.49512426,
       3.49512426, 1.93689291, 3.67069698, 2.67242848, 3.49512426,
       3.10198469, 2.46096752, 0.49512426, 0.49512426, 0.49512426,
       0.59193348, 0.49512426, 2.63772182, 0.57041627, 0.49512426,
       0.49512426, 0.49512426, 0.79499188, 0.60033101, 0.49512426,
       0.49512426, 0.49512426, 0.76959902, 0.62340833, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.78972334, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.49512426, 0.52668735,
       0.54427244, 2.00115141, 0.49512426, 0.53842198, 0.        ,
       0.        , 0.        , 0.        , 0.02804374, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06739038, 0.        , 0.00980845,
       0.        , 0.        , 0.        , 0.        , 0.05820102,
       0.        , 0.        , 0.        , 0.        , 0.     

In [39]:
idx = np.argsort(-score)[:5]

df.iloc[idx]

Unnamed: 0,course,section,question,text
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...


In [45]:
from sklearn.decomposition import TruncatedSVD
#Reduce the dimensions of the document
X = matrices['text']
cv = vectorizers['text']

In [46]:
cv

In [47]:
svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

In [48]:
X_emb.shape

(948, 16)

In [49]:
X_emb[0]

array([ 0.0965283 , -0.08192636, -0.10221648, -0.08020549,  0.06972527,
       -0.05777981,  0.01476954, -0.17502583,  0.19766853,  0.26589271,
        0.11207353,  0.06613429,  0.09548724, -0.00749689,  0.00389691,
        0.02570728])

In [50]:
"""This captures semantic similarities in the text"""

query = "I just signed up. Is it too late to join the course?"

Q = cv.transform([query])

Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.05789996, -0.03857729, -0.05613362, -0.02904863,  0.04039729,
       -0.05999122,  0.01146904, -0.11421196,  0.13575358,  0.17691828,
        0.08291762,  0.06909046,  0.0671059 , -0.00384287, -0.01249337,
        0.01933966])

In [51]:
# Compute the cosine similarity btn the arrays - compute score - use similarity\
np.dot(X_emb[0],Q_emb[0])

np.float64(0.1378822153077418)

In [52]:
score = cosine_similarity(X_emb,Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 'The course is available in the self-paced mode too, so you can go through the materials at any time. But if you want to do it as a cohort with other students, the next iterations will happen in September 2023, September 2024 (and potentially other Septembers as well).',
 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office h

In [54]:
df.loc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
440,machine-learning-zoomcamp,General course-related questions,"I filled the form, but haven't received a conf...","The process is automated now, so you should re..."
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
