In [None]:
Build Your Own Search Engine

Code for the "Build Your Own Search Engine" workshop

What we will do:

Use Zoomcamp FAQ documents
DE Zoomcamp
ML Zoomcamp
MLOps Zoomcamp
Create a search engine for retrieving these documents
Later the results can be used for a Q&A RAG system
Reference implementation for text search
Workshop Outline
Preparing the Environment
Basics of Text Search
Basics of Information Retrieval
Introduction to vector spaces, bag of words, and TF-IDF
Implementing Basic Text Search
TF-IDF scoring with sklearn
Keyword filtering using pandas
Creating a class for relevance search
Embeddings and Vector Search
Vector embeddings
Word2Vec and other approaches for word embeddings
LSA (Latent Semantic Analysis) for document embeddings
Implementing vector search with LSA
BERT embeddings
Combining Text and Vector Search (5 minutes)
Practical Implementation Aspects and Tools (10 minutes)
Real-world implementation tools:
inverted indexes for text search
LSH for vector search (using random projections)
Technologies:
Lucene/Elasticsearch for text search
FAISS and other vector databases

In [None]:
# Create the list of documents:
# download the data/documents from GitHub
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Fail fast: the timeout keeps the cell from hanging forever, and
# raise_for_status() turns an HTTP error response into an exception here
# instead of a confusing JSON decoding error on the next line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of documents, tagging each
# document with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)



In [2]:
# Push documents into a pandas dataframe
# columns - course, section, question, text
import pandas as pd

# One row per document; the column list fixes which fields are kept and their order
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)
df.head(n=5)

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [3]:
# Basic Text Mining Dictionary 
# - Information Retrieval - The process of obtaining relevant information from large datasets based on user queries.
# - Vector Spaces - A mathematical representation where text is converted into vectors (points in space) allowing for quantitative comparison.
# - Bag of Words - A simple text representation model treating each document as a collection of words disregarding grammar and word order but keeping multiplicity.
# - Stop Words - A set of words like (for, on, no etc) that do not add context
# to a document or piece of text
# - TF-IDF (Term Frequency-Inverse Document Frequency) - A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.


In [4]:
# Manual keyword filtering of the dataframe: keep only the rows whose
# `course` column is exactly equal to the given course name, then preview
# the first few of them
df.loc[df['course'].eq('data-engineering-zoomcamp')].head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [5]:
# Five short toy documents; the vectorizer cells below are fitted on this list
# to illustrate bag-of-words and TF-IDF on a small, readable example.
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [6]:
import sklearn

In [7]:
# Vectorization of a document - converts each document
# into a matrix whereby each cell has a word or a representation
# of the word in a numeric/binary form
# stopwords are excluded in the vectorizer too

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that drops English stop words
cv = CountVectorizer(stop_words='english')

# Learn the vocabulary from the example documents and vectorize them
X = cv.fit_transform(docs_example)

# Vocabulary terms (= feature names), in the same order as the columns of X
names = cv.get_feature_names_out()

# Build a dataframe from the bag of words: one row per document, one column
# per term, each cell holding how many times the term occurs in that document
# (0 when it does not occur). The frame is then transposed so that terms
# become rows, which is easier to read.
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [8]:
# Replace CountVectorizer with TfidfVectorizer
# provides a weight of 'importance' to each word - infrequent words
# will have higher importance than frequent words
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer, again excluding English stop words: it applies a
# statistical formula so that each term's value reflects how relevant the
# term is to the document, instead of being a raw count
cv = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and the weights from the example documents
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

# Terms as rows, documents as columns; rounded only for display
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs.round(decimals=2)

Unnamed: 0,0,1,2,3,4
15th,0.46,0.0,0.0,0.0,0.0
2024,0.46,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.46
course,0.37,0.0,0.0,0.0,0.37
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.46
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.46,0.0,0.0,0.0,0.0
listed,0.0,0.58,0.0,0.0,0.0


In [9]:
# Query-Document Similarity
# We represent the query in the same vector space,
# i.e. using the same (already fitted) vectorizer:
query = "Do I need to know python to sign up for the January course?"

# transform() takes a list of documents, so the single query is wrapped in a list
q = cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77828292, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [10]:
# We can see the words of the query and the words of some document.
# Terms of our query, each mapped to its weight in the query vector:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

# Terms of one example document (row 1 of X, i.e. the second entry of
# docs_example), each mapped to its weight, for comparison with the query.
# Note: only this last expression is displayed as the cell output.
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'15th': 0.0,
 '2024': 0.0,
 'cloud': 0.0,
 'course': 0.0,
 'date': 0.0,
 'github': 0.5773502691896258,
 'google': 0.0,
 'homeworks': 0.0,
 'jan': 0.0,
 'listed': 0.5773502691896258,
 'participation': 0.0,
 'prerequisites': 0.5773502691896258,
 'python': 0.0,
 'registration': 0.0,
 'required': 0.0,
 'setup': 0.0,
 'start': 0.0,
 'starts': 0.0,
 'submit': 0.0}

In [11]:
# We compare the words of the query vs the words of a document:
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T
# Multiply the query and document weights term by term and sum the products
# (i.e. the dot product of the two vectors). A term contributes only when its
# weight is non-zero in both, so a result of 0 means the query and this
# document share no vocabulary terms.
(df_qd['query'] * df_qd['doc']).sum()

0.0

In [12]:
# Compute dot-product and view the score in a matrix - a simple way of 
# computing similarities (url:https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/01-intro/08-linear-algebra.md)
X.dot(q.T).toarray()

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
#'similarity score' between query and doc (document and vector)
score = cosine_similarity(X, q).flatten()
score

array([0.23490553, 0.        , 0.        , 0.        , 0.59579005])

In [14]:
import numpy as np
# indices of the documents ordered by score, from lowest to highest
np.argsort(score)

array([1, 2, 3, 0, 4])

In [15]:
# Vectorizing all documents: one TF-IDF vectorizer (and one matrix) per text field
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    # min_df=3 ignores terms that appear in fewer than 3 documents
    cv = TfidfVectorizer(min_df=3, stop_words='english')
    X = cv.fit_transform(df[field])
    transformers[field], matrices[field] = cv, X

transformers['text'].get_feature_names_out()
matrices['text']

<948x2118 sparse matrix of type '<class 'numpy.float64'>'
	with 26463 stored elements in Compressed Sparse Row format>

In [16]:
transformers

{'section': TfidfVectorizer(min_df=3, stop_words='english'),
 'question': TfidfVectorizer(min_df=3, stop_words='english'),
 'text': TfidfVectorizer(min_df=3, stop_words='english')}

In [17]:
# search via the text field
query = "I just signed up. Is it too late to join the course?"

# vectorize the query with the vectorizer fitted on the 'text' field, then
# compare it against the 'text' matrix; flatten() turns the result into a
# 1-D array of scores
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [18]:
score

array([0.3336047 , 0.        , 0.        , 0.1328874 , 0.        ,
       0.        , 0.        , 0.12722114, 0.        , 0.        ,
       0.        , 0.10830554, 0.        , 0.        , 0.        ,
       0.23530268, 0.        , 0.        , 0.04595339, 0.        ,
       0.        , 0.        , 0.22668   , 0.07952931, 0.        ,
       0.        , 0.        , 0.1894954 , 0.08310739, 0.        ,
       0.        , 0.        , 0.        , 0.03724346, 0.        ,
       0.        , 0.        , 0.        , 0.16484429, 0.0231432 ,
       0.03424155, 0.05174621, 0.        , 0.03167699, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     

In [19]:
# Restrict the results to one specific course: the boolean mask is False for
# every other course, so multiplying by it zeroes those scores
mask = df['course'].eq('data-engineering-zoomcamp').to_numpy()
score = mask * score
score

array([0.3336047 , 0.        , 0.        , 0.1328874 , 0.        ,
       0.        , 0.        , 0.12722114, 0.        , 0.        ,
       0.        , 0.10830554, 0.        , 0.        , 0.        ,
       0.23530268, 0.        , 0.        , 0.04595339, 0.        ,
       0.        , 0.        , 0.22668   , 0.07952931, 0.        ,
       0.        , 0.        , 0.1894954 , 0.08310739, 0.        ,
       0.        , 0.        , 0.        , 0.03724346, 0.        ,
       0.        , 0.        , 0.        , 0.16484429, 0.0231432 ,
       0.03424155, 0.05174621, 0.        , 0.03167699, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     

In [20]:
# get the top results: negating the scores makes argsort order them from
# highest to lowest; keep the positions of the first 10
idx = np.argsort(-score)[:10]

In [21]:
# get the top 10 docs
df.iloc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
22,data-engineering-zoomcamp,General course-related questions,Environment - Do we really have to use GitHub ...,It's up to you which platform and environment ...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
38,data-engineering-zoomcamp,General course-related questions,Project - What is Project Attemp #1 and Projec...,You will have two attempts for a project. If t...
287,data-engineering-zoomcamp,Module 4: analytics engineering with dbt,CREATE TABLE has columns with duplicate name l...,This error could result if you are using some ...
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
113,data-engineering-zoomcamp,Module 1: Docker and Terraform,"Postgres - ""Column does not exist"" but it actu...","In the join queries, if we mention the column ..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."


In [22]:
# Search across all fields, with boosting (and filtering afterwards)

# Boosting gives some fields a higher importance than others (a feature that
# also exists in Elasticsearch). In this example the 'question' field gets a
# 3x weight; every field not listed in `boost` keeps the default weight 1.0.
boost = {'question': 3.0}

score = np.zeros(df.shape[0])

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).ravel()
    score += b * s

In [23]:
# Filter the search results: a document keeps its score only when each
# filtered field has the requested value
# (here: course == data-engineering-zoomcamp); otherwise its score becomes 0
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = df[field].eq(value).to_numpy()
    score = mask * score

In [24]:
# show results: take the positions of the 10 highest scores, look those rows
# up in df and convert them to a list of dicts (one dict per document)
idx = np.argsort(-score)[:10]
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.'},
 {'course': 'data-eng

In [25]:
#Putting it all together
class TextSearch:
    """In-memory text search over records using TF-IDF and cosine similarity.

    One TF-IDF vectorizer is fitted per text field. A query is scored against
    every field, the per-field scores are combined as a weighted sum
    (boosting), and records can be restricted with exact-match filters.
    """

    def __init__(self, text_fields):
        """
        Args:
            text_fields: names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index the records.

        Args:
            records: data accepted by ``pd.DataFrame`` (e.g. a list of dicts);
                it must contain every field listed in ``text_fields``.
            vectorizer_params: optional dict of keyword arguments passed to
                each ``TfidfVectorizer``.
        """
        # None instead of a mutable `{}` default, which is shared between calls.
        if vectorizer_params is None:
            vectorizer_params = {}

        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the records most relevant to the query.

        Args:
            query: free-text query string.
            n_results: maximum number of records to return.
            boost: optional dict mapping field name -> weight; fields that
                are not listed get weight 1.0.
            filters: optional dict mapping field name -> required value;
                only records matching every filter are returned.

        Returns:
            A list of dicts (one per record), best match first. The list is
            shorter than ``n_results`` when fewer records pass the filters.
        """
        if boost is None:
            boost = {}
        if filters is None:
            filters = {}

        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Build a boolean mask of the records that pass every filter. Merely
        # zeroing the score of the other records is not enough: they would
        # still be returned whenever fewer than n_results records match, and
        # would outrank matching records with a negative (boosted) score.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= (self.df[field] == value).values

        # Rank only the records that passed; a stable sort keeps ties in
        # their original order so results are deterministic.
        candidates = np.flatnonzero(keep)
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        idx = candidates[order]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [26]:
# Build the index over the three text fields and fit it on all documents
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

# Same query as in the cells above ("signed" was misspelled here, so that
# word could not match the vocabulary)
index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [27]:
# 4. Embeddings and Vector Search

# What are Embeddings?
# Conversion to Numbers: Embeddings transform different words, sentences and 
#documents into dense vectors (arrays with numbers).
# Capturing Similarity: They ensure similar items have similar numerical vectors, 
#illustrating their closeness in terms of characteristics.
# Dimensionality Reduction: Embeddings reduce complex characteristics into vectors.
# Use in Machine Learning: These numerical vectors are used in machine 
#learning models for tasks such as recommendations, text analysis, 
#and pattern recognition.

In [28]:
# Singular Value Decomposition (SVD) is the simplest way to turn Bag-of-Words 
# representation into embeddings

# This way we still don't preserve the word order (because it wasn't
# in the Bag-of-Words representation) but we reduce dimensionality and 
# capture synonyms.

# We won't go into mathematics, it's sufficient to know that SVD
# "compresses" our input vectors in such a way that as much as 
# possible of the original information is retained.

# This compression is lossy compression - meaning that we won't 
# be able to restore the 100% of the original vector, 
# but the result is close enough
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = transformers['text']

# Fix the seed: TruncatedSVD uses a randomized solver by default, so without
# random_state the embeddings (and their signs) can change from run to run.
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

# Dense embedding of a document
X_emb[0]

array([ 0.08800338, -0.07502007, -0.10123079,  0.04931358,  0.05390868,
       -0.05925045,  0.02443221,  0.04426507, -0.20895382,  0.33615457,
        0.09004844,  0.11762374, -0.09676248,  0.01665059, -0.04075185,
       -0.05252142])

In [29]:
X_emb.shape

(948, 16)

In [30]:
# Embedding for the query: vectorize it with the 'text' TF-IDF vectorizer,
# then project it with the fitted SVD so that it lands in the same
# 16-dimensional space as the document embeddings
query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.04353774, -0.03060714, -0.04435312,  0.01184352,  0.02518917,
       -0.05045197,  0.01208353,  0.02640784, -0.1180829 ,  0.17873943,
        0.06010436,  0.07859567, -0.05460703,  0.0081158 , -0.04885947,
       -0.02874882])

In [31]:
# Similarity (dot product) between the query and the first document:
np.dot(X_emb[0], Q_emb[0])

0.12534800873587068

In [32]:
# Find similarity score across all documents
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# argsort returns positions, so use positional indexing (iloc) as in the
# text-search cells; loc only works here because df has a default RangeIndex
list(df.iloc[idx].text)

['Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) 

In [33]:
# Non-Negative Matrix Factorization
# SVD creates values with negative numbers. It's difficult to interpret them.

# NMF (Non-Negative Matrix Factorization) is a similar concept, 
# except for non-negative input matrices it produces non-negative results.
from sklearn.decomposition import NMF
# We can interpret each of the columns (features) of the embeddings 
# as different topics/concepts and to what extent this document is about this concept.
# random_state makes the factorization reproducible across runs
nmf = NMF(n_components=16, random_state=42)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.31300564,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [34]:
# NMF for the query
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.00114369, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17519878,
       0.        , 0.        , 0.        , 0.00072698, 0.        ,
       0.        ])

In [35]:
# compute similarity
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# argsort returns positions, so use positional indexing (iloc) as in the
# text-search cells; loc only works here because df has a default RangeIndex
list(df.iloc[idx].text)

["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'No, it’s not possible. The form is closed a

In [36]:
# idx holds positions produced by argsort, hence positional indexing
df.iloc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
437,machine-learning-zoomcamp,General course-related questions,What if I miss a session?,"Everything is recorded, so you won’t miss anyt..."
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."


In [37]:
# BERT 
# The problem with the previous two approaches is that they don't 
# take into account the word order. They just treat all the words separately 
# (that's why it's called "Bag-of-Words"). BERT and other transformer models 
# don't have this problem.
# pip install transformers tqdm

In [40]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [41]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [42]:
# compute the embeddings
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [43]:
# compress the embeddings: average the hidden states along dim=1 so that each
# sentence is represented by a single fixed-size vector
# NOTE(review): the inputs were tokenized with padding=True, so this plain mean
# presumably also averages over the padding positions of the shorter text —
# confirm whether an attention-mask-weighted mean is wanted instead
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [44]:
# convert to a NumPy array; .cpu() first so this also works when the tensor
# lives on a GPU (it is a no-op for CPU tensors), as compute_embeddings does
X_emb = sentence_embeddings.cpu().numpy()

In [45]:
# move the sentence embeddings to the CPU
sentence_embeddings_cpu = sentence_embeddings.cpu()

In [46]:
# compute the embeddings for the documents
def make_batches(seq, n):
    """Split ``seq`` into consecutive slices of at most ``n`` items each.

    The last slice may be shorter than ``n``; an empty ``seq`` gives an
    empty list.
    """
    return [seq[start:start + n] for start in range(0, len(seq), n)]

In [62]:
from tqdm import tqdm

def compute_embeddings(texts, batch_size=8):
    """Embed texts with the loaded model by mean-pooling its last hidden state.

    Args:
        texts: sliceable sequence of strings to embed (the notebook passes a
            list).
        batch_size: number of texts tokenized and sent through the model at
            a time.

    Returns:
        A NumPy array with one row per input text, in input order.

    Raises:
        ValueError: from ``np.vstack`` when ``texts`` is empty, because there
            are no batches to stack.
    """
    # Honour the batch_size argument (it used to be ignored in favour of a
    # hard-coded 8).
    text_batches = make_batches(texts, batch_size)

    all_embeddings = []

    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state

            # One vector per text: average along dim=1.
            # NOTE(review): with padding=True this presumably includes padding
            # positions in the average — confirm whether that is intended.
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)

    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [63]:
X_text = compute_embeddings(df['text'].tolist())

100%|███████████████████████████████████████████████████████████████████████| 119/119 [01:06<00:00,  1.79it/s]


In [64]:
X_text

array([[-0.0045632 , -0.11667518,  0.62747186, ..., -0.03659187,
         0.10031684,  0.02927118],
       [-0.14233613, -0.19853905,  0.28455386, ..., -0.01139046,
        -0.15399753,  0.09535087],
       [ 0.19672222, -0.08461297,  0.28200483, ...,  0.11395872,
        -0.06448034, -0.01282615],
       ...,
       [-0.28217435, -0.33324352,  0.29784992, ..., -0.35042742,
         0.03266045,  0.09537277],
       [-0.42807105, -0.39468765,  0.30942005, ..., -0.05943286,
        -0.12965187,  0.0788707 ],
       [-0.16892162, -0.25146297,  0.4784332 , ..., -0.18535407,
        -0.16108921,  0.2727294 ]], dtype=float32)

TypeError: '_LazyModule' object is not callable