## In Memory implementation

In [52]:
import pandas as pd
import numpy as np

In [53]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [54]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [55]:
df = pd.DataFrame(documents, columns=['course','section','question','text'])

In [56]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [57]:
from sklearn.feature_extraction.text import CountVectorizer

In [58]:
cv = CountVectorizer(max_df=5)

In [59]:
cv.fit(df.text)

In [60]:
cv.get_feature_names_out().shape

(5390,)

In [61]:
doc_ex = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [91]:
CV = TfidfVectorizer(stop_words='english')
CV.fit(doc_ex)

In [63]:
CV.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [64]:
X = CV.transform(doc_ex)

In [65]:
pd.DataFrame(X.todense(),columns=CV.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [66]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

CV = TfidfVectorizer(stop_words='english', min_df=5)
CV.fit(df.text)

X = CV.transform(df.text)

names = CV.get_feature_names_out()

df_doc = pd.DataFrame(X.toarray(),columns=names).T
df_doc.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [67]:
X

<948x1333 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [68]:
query = "Do I need to know python to sign up for the January course?"

q = CV.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [69]:
from sklearn.metrics.pairwise import cosine_similarity

In [70]:
#X.dot(q.T).todense()..->gives same result as cosine_similarity(X,q)----> cosine_similarity is just a normalized dot product

In [71]:
#falatten to change from 2 dim to just one dim numpy array
score = cosine_similarity(X,q).flatten()

In [72]:
#this gives the indices of the documents sorted, in increasing order
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445])

In [73]:
df.iloc[27].text

'You can do most of the course without a cloud. Almost everything we use (excluding BigQuery) can be run locally. We won’t be able to provide guidelines for some things, but most of the materials are runnable without GCP.\nFor everything in the course, there’s a local alternative. You could even do the whole course locally.'

In [74]:
fields = ['section', 'question','text']

In [101]:
matrices = {}
vecterizor = {}

for f in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[f])
    matrices[f] = X
    vecterizor[f] = cv

In [102]:
matrices

{'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>}

In [98]:
vecterizor

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [99]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [126]:
n = len(df)

In [154]:
score = np.zeros(n)
boosts = {
    'question': 3,
    'test': 0.5
}

query = "Do I need to know python to sign up for the January course?"

for f in fields:
    q = vecterizor[f].transform([query])
    x = matrices[f]

    boost = boosts.get(f, 1.0)
    f_Score = cosine_similarity(x,q).flatten()

    score =  score + boost * f_Score

In [155]:
filters =  {
    'course' : 'data-engineering-zoomcamp'
}

In [156]:
for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask

In [157]:
idx = np.argsort(-score)[:5]

In [158]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
18,data-engineering-zoomcamp,General course-related questions,Leaderboard - I am not on the leaderboard / ho...,When you set up your account you are automatic...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...


In [159]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [162]:
index = TextSearch(
    text_fields= ['section', 'question','text']
)
index.fit(documents)

index.search(
    query = "Do I need to know python to sign up for the January course?",
    n_results= 5,
    boost= {'question': 3.0},
    filters = {'course' : 'data-engineering-zoomcamp'}
)


[{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. Here is a great tutorial that shows you how to do 

#### Embeddings  --- bag of words to --SVD-->embeddings
- all the above was with bag of words, no order and exact match no synomous matches, and embeddings are dense vectors

In [171]:
from sklearn.decomposition import TruncatedSVD

x = matrices['text']
cv = vecterizor['text']

svd = TruncatedSVD(n_components=16)
x_emb = svd.fit_transform(x)
x_emb.shape

(948, 16)

In [172]:
x_emb[0]

array([ 0.09653025, -0.08240087, -0.1024234 , -0.07781641,  0.06756842,
       -0.05545656,  0.02082613, -0.13235549, -0.22213398,  0.28464306,
        0.06624199,  0.06478287, -0.10447577,  0.09465672,  0.0257452 ,
       -0.03320595])

In [173]:
query = "Do I need to know python to sign up for the January course?"

Q = cv.transform([query])
Q_emb = svd.transform(Q)

In [174]:
Q_emb[0]

array([ 0.09945384, -0.05293029,  0.07116615, -0.04231482,  0.02449468,
       -0.04278687,  0.00923252, -0.11531137, -0.10207268,  0.14537863,
        0.08442247,  0.08780715,  0.0231239 , -0.06553705, -0.01117111,
       -0.02321346])

In [175]:
np.dot(Q_emb[0], x_emb[0])

0.09664711674010008

In [187]:
score = cosine_similarity(x_emb, Q_emb).flatten()
idx = np.argsort(-score)[:5]
list(df.loc[idx].text)
df.loc[idx]

Unnamed: 0,course,section,question,text
806,machine-learning-zoomcamp,Miscellaneous,"Can I do the course in other languages, like R...","Technically, yes. Advisable? Not really. Reaso..."
455,machine-learning-zoomcamp,General course-related questions,The course videos are from the previous iterat...,We won’t re-record the course videos. The focu...
19,data-engineering-zoomcamp,General course-related questions,Environment - Is Python 3.9 still the recommen...,"Yes, for simplicity (of troubleshooting agains..."
503,machine-learning-zoomcamp,2. Machine Learning for Regression,The answer I get for one of the homework quest...,That’s normal. We all have different environme...
443,machine-learning-zoomcamp,General course-related questions,Will I get a certificate?,"Yes, if you finish at least 2 out of 3 project..."
