## In Memory implementation

In [14]:
import pandas as pd
import numpy as np

In [15]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [16]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [17]:
df = pd.DataFrame(documents, columns=['course','section','question','text'])

In [18]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [28]:
cv = CountVectorizer(max_df=5)

In [29]:
cv.fit(df.text)

In [30]:
cv.get_feature_names_out().shape

(5390,)

In [31]:
doc_ex = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [32]:
CV = TfidfVectorizer(stop_words='english')
CV.fit(doc_ex)

In [33]:
CV.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [34]:
X = CV.transform(doc_ex)

In [35]:
pd.DataFrame(X.todense(),columns=CV.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4
15th,0.463693,0.0,0.0,0.0,0.0
2024,0.463693,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.463693
course,0.374105,0.0,0.0,0.0,0.374105
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.57735,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.463693
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.463693,0.0,0.0,0.0,0.0
listed,0.0,0.57735,0.0,0.0,0.0


In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

CV = TfidfVectorizer(stop_words='english', min_df=5)
CV.fit(df.text)

X = CV.transform(df.text)

names = CV.get_feature_names_out()

df_doc = pd.DataFrame(X.toarray(),columns=names).T
df_doc.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [37]:
X

<948x1333 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [38]:
query = "Do I need to know python to sign up for the January course?"

q = CV.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
#X.dot(q.T).todense()..->gives same result as cosine_similarity(X,q)----> cosine_similarity is just a normalized dot product

In [41]:
#falatten to change from 2 dim to just one dim numpy array
score = cosine_similarity(X,q).flatten()

In [42]:
#this gives the indices of the documents sorted, in increasing order
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445])

In [43]:
df.iloc[27].text

'You can do most of the course without a cloud. Almost everything we use (excluding BigQuery) can be run locally. We won’t be able to provide guidelines for some things, but most of the materials are runnable without GCP.\nFor everything in the course, there’s a local alternative. You could even do the whole course locally.'

In [44]:
fields = ['section', 'question','text']

In [45]:
matrices = {}
vecterizor = {}

for f in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[f])
    matrices[f] = X
    vecterizor[f] = cv

In [46]:
matrices

{'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>}

In [47]:
vecterizor

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [48]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [49]:
n = len(df)

In [50]:
score = np.zeros(n)
boosts = {
    'question': 3,
    'test': 0.5
}

query = "Do I need to know python to sign up for the January course?"

for f in fields:
    q = vecterizor[f].transform([query])
    x = matrices[f]

    boost = boosts.get(f, 1.0)
    f_Score = cosine_similarity(x,q).flatten()

    score =  score + boost * f_Score

In [51]:
filters =  {
    'course' : 'data-engineering-zoomcamp'
}

In [52]:
for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask

In [53]:
idx = np.argsort(-score)[:5]

In [54]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
18,data-engineering-zoomcamp,General course-related questions,Leaderboard - I am not on the leaderboard / ho...,When you set up your account you are automatic...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...


In [55]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [56]:
index = TextSearch(
    text_fields= ['section', 'question','text']
)
index.fit(documents)

index.search(
    query = "Do I need to know python to sign up for the January course?",
    n_results= 5,
    boost= {'question': 3.0},
    filters = {'course' : 'data-engineering-zoomcamp'}
)


[{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. Here is a great tutorial that shows you how to do 

#### Embeddings  --- bag of words to --SVD-->embeddings
- all the above was with bag of words, no order and exact match no synomous matches, and embeddings are dense vectors

In [57]:
from sklearn.decomposition import TruncatedSVD

x = matrices['text']
cv = vecterizor['text']

svd = TruncatedSVD(n_components=16)
x_emb = svd.fit_transform(x)
x_emb.shape

(948, 16)

In [58]:
x_emb[0]

array([ 0.09652667, -0.08188362, -0.10276487, -0.07853007,  0.06824269,
       -0.06096377,  0.01425251, -0.10983963, -0.2325963 ,  0.32992239,
        0.01646184,  0.05449125, -0.08009826, -0.05815949, -0.02350436,
       -0.00450797])

In [79]:
query = "Do I need to know python to sign up for the January course?"
query_1 = "where to learn about Docker?"

Q = cv.transform([query_1])
Q_emb = svd.transform(Q)

In [80]:
Q_emb[0]

array([ 0.20789224,  0.34198914, -0.10113737,  0.01658993,  0.1734254 ,
        0.0503992 ,  0.01886665, -0.18844338, -0.00203279, -0.07107651,
       -0.18436967,  0.01116522,  0.03385963, -0.08373382, -0.00918037,
       -0.09713003])

In [81]:
np.dot(Q_emb[0], x_emb[0])

0.015565189737192987

In [82]:
score = cosine_similarity(x_emb, Q_emb).flatten()
idx = np.argsort(-score)[:5]
list(df.loc[idx].text)
df.loc[idx]

Unnamed: 0,course,section,question,text
300,data-engineering-zoomcamp,Module 4: analytics engineering with dbt,​​VS Code: NoPermissions (FileSystemError): Er...,If you have problems editing dbt_project.yml w...
21,data-engineering-zoomcamp,General course-related questions,Environment - Is GitHub codespaces an alternat...,GitHub Codespaces offers you computing Linux r...
90,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker-Compose - Which docker-compose binary t...,To figure out which docker-compose you need to...
58,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker - cs to store all code in your default ...,More info in the Docker Docs on Best Practises
91,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker-Compose - Error undefined volume in Win...,If you wrote the docker-compose.yaml file exac...


In [64]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=16)
x_emb = nmf.fit_transform(x)
x_emb[0]

array([0.00589172, 0.00589251, 0.        , 0.        , 0.08418423,
       0.        , 0.00104489, 0.        , 0.00222125, 0.01235737,
       0.00030938, 0.        , 0.        , 0.00400841, 0.00765089,
       0.00976496])

In [77]:
query = "where to learn about Docker?"
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.26720567, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.17367807, 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [83]:
score = cosine_similarity(x_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)
df.loc[idx]

Unnamed: 0,course,section,question,text
300,data-engineering-zoomcamp,Module 4: analytics engineering with dbt,​​VS Code: NoPermissions (FileSystemError): Er...,If you have problems editing dbt_project.yml w...
21,data-engineering-zoomcamp,General course-related questions,Environment - Is GitHub codespaces an alternat...,GitHub Codespaces offers you computing Linux r...
90,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker-Compose - Which docker-compose binary t...,To figure out which docker-compose you need to...
58,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker - cs to store all code in your default ...,More info in the Docker Docs on Best Practises
91,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker-Compose - Error undefined volume in Win...,If you wrote the docker-compose.yaml file exac...
86,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker-Compose - docker-compose still not avai...,This is happen to me after following 1.4.1 vid...
916,mlops-zoomcamp,Module 4: Deployment,WARNING: The requested image's platform (linux...,To resolve this make sure to build the docker ...
371,data-engineering-zoomcamp,Module 6: streaming with kafka,Could not start docker image “control-center” ...,Check Docker Compose File:\nEnsure that your d...
723,machine-learning-zoomcamp,9. Serverless Deep Learning,Docker Temporary failure in name resolution,Add the next lines to vim /etc/docker/daemon.j...
383,data-engineering-zoomcamp,Module 6: streaming with kafka,Python Kafka: ‘KafkaTimeoutError: Failed to up...,Restarting all services worked for me:\ndocker...


In [85]:
import torch

from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [86]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [87]:
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [90]:
with torch.no_grad():
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

hidden_states.shape

torch.Size([2, 15, 768])

In [89]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [91]:
sentence_embeddings

tensor([[ 0.3600, -0.1607,  0.3545,  ...,  0.0429,  0.0348, -0.0382],
        [ 0.1785, -0.5000,  0.2528,  ..., -0.1141, -0.3361,  0.4110]])