In [27]:
from datetime import timedelta
import pandas as pd
from datetime import datetime
from sentence_transformers import SentenceTransformer
import torch
import faiss
import numpy as np
import joblib
import unicodedata
import json
from pathlib import Path

In [28]:
def show_df_as_markdown(df: pd.DataFrame):
    """Print ``df`` to stdout rendered as a markdown table.

    Args:
        df: Frame to render; only its ``to_markdown()`` method is used.

    Returns:
        None. The table text is written to stdout.
    """
    markdown_table = df.to_markdown()
    print(markdown_table)

In [29]:
def timestamp_to_start_seconds(start_time):
    """Convert a clock-style timestamp string into whole seconds.

    Two layouts are accepted, tried in this order:
    ``HH:MM:SS`` and ``HH:MM:SS.ffffff`` (fractional seconds).
    Leading/trailing whitespace is ignored. Any fractional part is
    discarded, because only the hour, minute and second fields are used.

    Args:
        start_time: Timestamp text such as ``"00:12:34"`` or ``"00:12:34.500"``.

    Returns:
        int: ``hours * 3600 + minutes * 60 + seconds``.

    Raises:
        ValueError: If the text matches neither accepted layout.
    """
    # Only strip real strings; anything else is handed to strptime unchanged
    # so that it keeps raising exactly what it raised before.
    cleaned = start_time.strip() if isinstance(start_time, str) else start_time

    for layout in ("%H:%M:%S", "%H:%M:%S.%f"):
        try:
            parsed = datetime.strptime(cleaned, layout)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
        return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

    raise ValueError(
        f"Unrecognized timestamp {start_time!r}; expected HH:MM:SS or HH:MM:SS.ffffff"
    )

In [30]:
def vector_search(query: list[str], model: SentenceTransformer, index, num_results=10):
    """Embed one or more text queries and look up their nearest neighbours.

    Args:
        query: The query texts. A single bare string is also accepted and is
            treated as one query (rather than being split into characters).
        model: Model whose ``encode`` method turns the texts into vectors.
        index: Object exposing ``search(vectors, k=...)`` that returns two
            values; in this notebook it is a FAISS index.
        num_results: Number of neighbours requested per query (passed as ``k``).

    Returns:
        The two values returned by ``index.search``, in the same order. For a
        FAISS index these are the distance matrix and the matching row-index
        matrix, one row per query.
    """
    # A str is itself iterable, so list("abc") would become ["a", "b", "c"]
    # and silently run one search per character.
    queries = [query] if isinstance(query, str) else list(query)

    # The search call is given float32 vectors, as in the original code.
    query_vectors = np.asarray(model.encode(queries), dtype="float32")

    distances, neighbour_ids = index.search(query_vectors, k=num_results)
    return distances, neighbour_ids

In [31]:
# Collect every transcript CSV under ./lecture_transcripts (relative to the working directory).
transcript_files = [csv_path for csv_path in Path().glob("lecture_transcripts/*.csv")]

# Derive the video name from each file name: keep the part of the stem before
# the " (192kbit_AAC)" suffix and normalize it to Unicode NFD form
# (presumably so it matches the keys in youtube_links.json — verify).
youtube_video_names = [
    unicodedata.normalize("NFD", csv_path.stem.split(" (192kbit_AAC)")[0])
    for csv_path in transcript_files
]

In [32]:
# Load the video-name -> YouTube URL mapping.
# The encoding is given explicitly: without it, open() falls back to the
# platform default (not UTF-8 on many Windows setups), which can corrupt or
# reject non-ASCII video names used as keys.
with open("youtube_links.json", "r", encoding="utf-8") as f:
    youtube_links = json.load(f)

In [33]:
# Build one DataFrame from all transcript CSVs, adding for every row the
# start offset in seconds and a YouTube link that jumps to that offset.
dfs = []
for file, video_name in zip(transcript_files, youtube_video_names):
    # Resolve the video's base URL once per file instead of once per row;
    # a missing mapping now fails before any parsing work is done.
    base_link = youtube_links[video_name]

    temp_df = pd.read_csv(file, sep=";")

    # "Timestamp" holds "<start> - <end>"; only the start part is converted.
    temp_df["start_seconds"] = temp_df["Timestamp"].apply(
        lambda span: timestamp_to_start_seconds(span.split(" - ")[0])
    )
    temp_df["start_link"] = temp_df["start_seconds"].apply(
        lambda seconds: base_link + f"?t={seconds}"
    )

    dfs.append(temp_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df = pd.concat(dfs, ignore_index=True)

In [34]:
# Persist the combined transcript table to a Parquet file in the working directory.
df.to_parquet("knowledge_base.parquet")

In [35]:
# Pull the "Speech" column out as a plain Python list; this is what gets encoded below.
sentences = df["Speech"].tolist()

In [36]:
# Load the embedding model by its hub name.
# NOTE(review): the recorded cell output reports that no sentence-transformers
# configuration was found for this checkpoint, so the library built a new model
# with MEAN pooling on top of the BERT weights — confirm that is intended.
model = SentenceTransformer("TamedWicked/MathBERT_hr")

No sentence-transformers model found with name C:\Users\mpajas.DESKTOP-0MU8OSB/.cache\torch\sentence_transformers\TamedWicked_MathBERT_hr. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at C:\Users\mpajas.DESKTOP-0MU8OSB/.cache\torch\sentence_transformers\TamedWicked_MathBERT_hr were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you e

In [37]:
# Move the model to the GPU when CUDA is available; otherwise leave it where it is.
model = model.to(torch.device("cuda")) if torch.cuda.is_available() else model
# Display the device the model ended up on.
model.device

device(type='cuda', index=0)

In [38]:
# Encode every transcript line with the model, showing a progress bar and
# requesting the result as a NumPy array (convert_to_numpy=True).
embeddings = model.encode(sentences, show_progress_bar=True, convert_to_numpy=True)

Batches:   0%|          | 0/599 [00:00<?, ?it/s]

In [39]:
# Write the embeddings array to disk with joblib.
# NOTE: joblib files are pickle-based — only load them back from trusted sources.
joblib.dump(embeddings, "knowledge_base_embeddings.pkl")

['knowledge_base_embeddings.pkl']

In [40]:
# Create a flat FAISS index using L2 distance, sized to the embedding
# dimensionality (second axis of the embeddings array).
index = faiss.IndexFlatL2(embeddings.shape[1])
# Disabled alternative: wrap the index in an IndexIDMap (presumably to attach
# custom ids instead of positional ones — not used by the cells below).
# index = faiss.IndexIDMap(index)

In [41]:
# Add all embeddings to the index.
# NOTE(review): add() appends to the index, so re-running this cell without
# re-creating the index first would store every vector a second time — confirm
# the index-creation cell is always re-run together with this one.
index.add(embeddings)

In [42]:
# Sanity check: search the index with one of its own stored vectors (row 4000)
# and ask for the 10 nearest entries. D and I are overwritten by the query
# cells that follow.
D, I = index.search(embeddings[4000].reshape(1, -1), k=10)

In [43]:
# Search the index for the 10 transcript lines closest to the question, then
# print their text and timestamped video links as a markdown table.
D, I = vector_search(query=["Koje strukture su izomorfne?"], model=model, index=index, num_results=10)
# I[0] holds the row positions of the matches for the first (only) query.
result = df.iloc[I[0]][["Speech", "start_link"]]
show_df_as_markdown(result)

|       | Speech                                                                          | start_link                          |
|------:|:--------------------------------------------------------------------------------|:------------------------------------|
|  7092 | ali strukture koje su elementarno ekvivalente ne moraju biti izomorfne.         | https://youtu.be/3YoHR8CGU-4?t=2308 |
|  8856 | Dakle, mogu trivijalno uzeti dvije strukture koje jesu izomorfne,               | https://youtu.be/Y4sFFGs6rt4?t=490  |
|  8926 | Dakle, to je zapravo pravi izomorfizam dvije strukture.                         | https://youtu.be/Y4sFFGs6rt4?t=804  |
|  6689 | koji je izomorfan ovoj manjej strukturi.                                        | https://youtu.be/3YoHR8CGU-4?t=470  |
|  6181 | Dakle, ne mora biti, postoje i strukture prvog rada koje nisu normalne,         | https://youtu.be/QWzjU9Tdt4Y?t=825  |
|  6698 | Svakako važan teorem je da izomorfne strukture,                                 

In [44]:
# Search the index for the 10 transcript lines closest to the question, then
# print their text and timestamped video links as a markdown table.
D, I = vector_search(query=["Gdje smo zapeli prvi put?"], model=model, index=index, num_results=10)
# I[0] holds the row positions of the matches for the first (only) query.
result = df.iloc[I[0]][["Speech", "start_link"]]
show_df_as_markdown(result)

|       | Speech                                                                          | start_link                          |
|------:|:--------------------------------------------------------------------------------|:------------------------------------|
|     0 | Prvi put smo bili zapeli u jednom dijelu samo kod kregove interpolacijske leme. | https://youtu.be/fMMhJYNpHfM?t=0    |
|  9005 | ali to ti je zapravo model za S u kojem ne vrijedi F.                           | https://youtu.be/Y4sFFGs6rt4?t=1202 |
| 16843 | Zašto na D od N1 ne vrijedi ovo? Zato što S nije ispunjev,                      | https://youtu.be/hn1JzNNQzpY?t=1698 |
|  4061 | Znači, kad imamo dvije negacije ispred A, onda možemo ih maknuti.               | https://youtu.be/4MVlcNiz7CI?t=776  |
|  6212 | ne znam, na elementarnoj matematici ili gdje već.                               | https://youtu.be/QWzjU9Tdt4Y?t=965  |
|  1825 | Znači, mogu zaključiti da postoji ovaj V na kojem bi bio psi. Dobro?            

In [45]:
# Search the index for the 10 transcript lines closest to the question, then
# print their text and timestamped video links as a markdown table.
D, I = vector_search(query=["Što je elementarno preslikavanje?"], model=model, index=index, num_results=10)
# I[0] holds the row positions of the matches for the first (only) query.
result = df.iloc[I[0]][["Speech", "start_link"]]
show_df_as_markdown(result)

|       | Speech                                                                             | start_link                          |
|------:|:-----------------------------------------------------------------------------------|:------------------------------------|
|  6728 | onda zovemo elementarno preslikavanje.                                             | https://youtu.be/3YoHR8CGU-4?t=632  |
|  6753 | šta je elementarno preslikavanje, šta je jakih homomorfiza,                        | https://youtu.be/3YoHR8CGU-4?t=718  |
|  6786 | Dakle, da ovo elementarno preslikavanje mora nekako drugačije preslikati elemente, | https://youtu.be/3YoHR8CGU-4?t=863  |
|  6748 | elementarno preslikavanje jakih homomorfiza.                                       | https://youtu.be/3YoHR8CGU-4?t=701  |
|  6732 | Izomorfizmi uvijek jesu elementarna preslikavanja,                                 | https://youtu.be/3YoHR8CGU-4?t=646  |
|  6730 | pa niti jaki homomorfizam, ne mora biti elementarno preslik

In [46]:
# Same search as the singular phrasing, but with the plural form of the
# question: fetch the 10 closest transcript lines and print them as markdown.
D, I = vector_search(query=["Što su elementarna preslikavanja?"], model=model, index=index, num_results=10)
# I[0] holds the row positions of the matches for the first (only) query.
result = df.iloc[I[0]][["Speech", "start_link"]]
show_df_as_markdown(result)

|      | Speech                                                                             | start_link                         |
|-----:|:-----------------------------------------------------------------------------------|:-----------------------------------|
| 6728 | onda zovemo elementarno preslikavanje.                                             | https://youtu.be/3YoHR8CGU-4?t=632 |
| 6732 | Izomorfizmi uvijek jesu elementarna preslikavanja,                                 | https://youtu.be/3YoHR8CGU-4?t=646 |
| 6786 | Dakle, da ovo elementarno preslikavanje mora nekako drugačije preslikati elemente, | https://youtu.be/3YoHR8CGU-4?t=863 |
| 6785 | i da imate elementarno preslikavanje, ali da nemate elementarni podmodel.          | https://youtu.be/3YoHR8CGU-4?t=858 |
| 6748 | elementarno preslikavanje jakih homomorfiza.                                       | https://youtu.be/3YoHR8CGU-4?t=701 |
| 6722 | Neka preslikavanja nisu u onom smislu,                                    

In [47]:
# Search the index for the 10 transcript lines closest to the question, then
# print their text and timestamped video links as a markdown table.
D, I = vector_search(query=["Što su izomorfizmi?"], model=model, index=index, num_results=10)
# I[0] holds the row positions of the matches for the first (only) query.
result = df.iloc[I[0]][["Speech", "start_link"]]
show_df_as_markdown(result)

|       | Speech                                                                                                        | start_link                          |
|------:|:--------------------------------------------------------------------------------------------------------------|:------------------------------------|
| 15557 | po nekakvom izomorfizmu F.                                                                                    | https://youtu.be/dt5g46wavvQ?t=78   |
| 15555 | i oni su izomorfni,                                                                                           | https://youtu.be/dt5g46wavvQ?t=72   |
| 17305 | bi zapravo bio izomorfan sa M, a onda bi naravno imao tu nekakav izomorfizam i onda bi taj A koji postoji u N | https://youtu.be/CEEm6l5UEHw?t=1766 |
|  8848 | Ako su konačno izomorfne.                                                                                     | https://youtu.be/Y4sFFGs6rt4?t=459  |
| 17682 | naravno, opet sad, do na izomo

In [48]:
# Keyword-style query (not a full question): fetch the 10 closest transcript
# lines and print their text and timestamped video links as a markdown table.
D, I = vector_search(query=["Elementarno ekvivalentne strukture"], model=model, index=index, num_results=10)
# I[0] holds the row positions of the matches for the first (only) query.
result = df.iloc[I[0]][["Speech", "start_link"]]
show_df_as_markdown(result)

|       | Speech                                                                  | start_link                          |
|------:|:------------------------------------------------------------------------|:------------------------------------|
| 17793 | I to su elementarno ekvivalentne strukture.                             | https://youtu.be/8yu-sTrLbyw?t=1739 |
|  6701 | Izomorfne strukture jesu elementarno ekvivalente.                       | https://youtu.be/3YoHR8CGU-4?t=509  |
|  8366 | Dakle, dvije strukture su elementarno ekvivalentne                      | https://youtu.be/hF6Zm4IUYGQ?t=476  |
|  7091 | Znači, strukture koje su elementarni jesu elementarno ekvivalentne,     | https://youtu.be/3YoHR8CGU-4?t=2305 |
| 17772 | ali moraš biti u nekoj elementarno ekvivalentnoj strukturi.             | https://youtu.be/8yu-sTrLbyw?t=1632 |
| 12278 | Ali ima, ima pisanja. Dvije sigma strukture su elementarno ekvalentne,  | https://youtu.be/UXihhCqIjkg?t=1720 |
|  5278 | Elementarno je