In [15]:
from datetime import timedelta
from glob import glob
import pandas as pd
from datetime import datetime
from sentence_transformers import SentenceTransformer
import torch
import faiss
import numpy as np
import joblib

In [16]:
def timestamp_to_start_seconds(start_time):
    """Convert a clock-style timestamp string to whole seconds.

    Two layouts are accepted: ``H:MM:SS`` and ``H:MM:SS.ffffff``. Any
    fractional part is dropped (truncated), not rounded. Leading and trailing
    whitespace is ignored. Parsing relies on ``%H``, so hours must be 0-23.

    Args:
        start_time (str): Timestamp such as ``"0:45:35"`` or ``"0:45:35.250"``.

    Returns:
        int: Seconds elapsed since ``0:00:00``.

    Raises:
        ValueError: If ``start_time`` matches neither supported layout.
    """
    cleaned = start_time.strip()

    # Try the plain layout first, then the one carrying fractional seconds.
    for fmt in ("%H:%M:%S", "%H:%M:%S.%f"):
        try:
            parsed = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        # Microseconds are deliberately ignored: callers want whole seconds.
        return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

    raise ValueError(
        f"Unsupported timestamp {start_time!r}; expected H:MM:SS or H:MM:SS.ffffff"
    )

In [17]:
# Collect every transcript CSV in the lecture_transcripts directory.
# NOTE(review): glob() returns paths in arbitrary (filesystem-dependent) order, while a
# later cell pairs these files with `youtube_links` by position — confirm the order
# matches on every machine this notebook runs on.
transcript_files = glob("lecture_transcripts/*.csv")

In [18]:
# One video URL per transcript file, matched to the transcripts by list position.
# The date after each URL is kept from the original notes (presumably the
# lecture date, written D.M.YYYY — TODO confirm).
youtube_links = [
    "https://youtu.be/fMMhJYNpHfM",  # 22.12.2022.
    "https://youtu.be/QWzjU9Tdt4Y",  # 3.11.2022.
    "https://youtu.be/3YoHR8CGU-4",  # 3.11.2022.
    "https://youtu.be/jebx2t2NX6U",  # 1.12.2022.
    "https://youtu.be/Q19taQtrlR0",  # 1.12.2022.
    "https://youtu.be/MASKzjeZyLM",  # 2.2.2023.
    "https://youtu.be/SpSD84Kgqio",  # 2.2.2023.
]

In [19]:
# Build one DataFrame per transcript file, adding the start offset in seconds and a
# timestamped video link for every row.
# NOTE(review): files and links are paired purely by position, and glob() does not
# guarantee any particular file order — confirm the pairing is the intended one.
if len(transcript_files) != len(youtube_links):
    raise ValueError(
        f"Found {len(transcript_files)} transcript files but {len(youtube_links)} "
        "YouTube links; each transcript needs exactly one link."
    )

dfs = []
for file, video_url in zip(transcript_files, youtube_links):
    temp_df = pd.read_csv(file, sep=";")
    # "Timestamp" holds "<start> - <end>"; only the start part is converted.
    temp_df["start_seconds"] = temp_df["Timestamp"].apply(
        lambda x: timestamp_to_start_seconds(x.split(" - ")[0])
    )
    # Append the start offset as the `t` query parameter of the video URL.
    temp_df["start_link"] = video_url + "?t=" + temp_df["start_seconds"].astype(str)
    dfs.append(temp_df)

In [20]:
# dfs = [pd.read_csv(file, sep=";") for file in transcript_files]

In [21]:
# Stack the per-lecture frames row-wise into one table with a fresh 0..n-1 index.
df = pd.concat(dfs, axis=0, ignore_index=True)

In [22]:
df

Unnamed: 0,Speech,Timestamp,start_seconds,start_link
0,Prvi put smo bili zapeli u jednom dijelu samo ...,0:00:00 - 0:00:20,0,https://youtu.be/fMMhJYNpHfM?t=0
1,Nije baš bilo jasno kako primijeniti ovaj post...,0:00:20 - 0:00:27,20,https://youtu.be/fMMhJYNpHfM?t=20
2,"Imali smo niz formula A0, A1 itd.",0:00:27 - 0:00:32,27,https://youtu.be/fMMhJYNpHfM?t=27
3,"I tako dalje, onda smo od njih nekako gradili ...",0:00:32 - 0:00:46,32,https://youtu.be/fMMhJYNpHfM?t=32
4,Uzimajući ili te formule Aove ili njihove nega...,0:00:46 - 0:00:52,46,https://youtu.be/fMMhJYNpHfM?t=46
...,...,...,...,...
4138,postoji tzv. močkinom u Polinom koji ima to sv...,0:45:35 - 0:45:42,2735,https://youtu.be/SpSD84Kgqio?t=2735
4139,ali nije zbroj kvadrata.,0:45:42 - 0:45:44,2742,https://youtu.be/SpSD84Kgqio?t=2742
4140,"Ali kažem, nećemo ništa više o tome reći, bar ...",0:45:44 - 0:45:48,2744,https://youtu.be/SpSD84Kgqio?t=2744
4141,"Dobro, to je to. Ima možda nekih pitanja online?",0:45:50 - 0:45:54,2750,https://youtu.be/SpSD84Kgqio?t=2750


In [23]:
# Plain Python list of the transcript lines, in the same order as the rows of `df`.
sentences = df["Speech"].to_list()

In [25]:
# Load the previously serialized sentence-embedding model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load "mathbert_model.pkl" when it comes from a trusted source.
model = joblib.load("mathbert_model.pkl") #SentenceTransformer("MathBERT_hr")

In [26]:
# Move the model to the GPU when CUDA is available; otherwise leave it where it was loaded.
if torch.cuda.is_available():
    cuda_device = torch.device("cuda")
    model = model.to(cuda_device)

# Display the device the model currently lives on.
model.device

device(type='cuda', index=0)

In [27]:
# Encode every transcript line into a vector; the result is a NumPy array
# (convert_to_numpy=True) that the FAISS index is built from.
embeddings = model.encode(
    sentences,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

In [28]:
# Flat L2-distance FAISS index whose dimensionality matches the embedding width.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
# index = faiss.IndexIDMap(index)

In [29]:
# Empty the index first so that re-running this cell does not append the same vectors a
# second time; duplicated vectors would produce result ids that no longer line up with
# the rows the embeddings were computed from.
index.reset()
index.add(embeddings)

# Every embedding must be stored exactly once.
assert index.ntotal == len(embeddings)

In [30]:
# Sanity check: fetch the 10 nearest neighbours of an already-indexed vector (row 4000).
probe_vector = embeddings[4000].reshape(1, -1)
D, I = index.search(probe_vector, k=10)

In [31]:
def vector_search(query: list[str], model: "SentenceTransformer", index, num_results=10):
    """Embed one or more text queries and look up their nearest neighbours in an index.

    Args:
        query (list[str] | str): Query texts. A single string is treated as one
            query; it is not split into characters.
        model: Object whose ``encode(list_of_str)`` returns one embedding per
            text, e.g. a ``sentence_transformers.SentenceTransformer``.
        index: FAISS index, or any object exposing a compatible
            ``search(vectors, k=...)`` method.
        num_results (int): Number of neighbours to return per query.

    Returns:
        D: Distances as returned by ``index.search``; one row per query.
        I: Positions of the matched vectors inside the index; one row per query.

    Raises:
        ValueError: If ``query`` contains no texts.
    """
    # list("some text") would yield single characters, so wrap a bare string instead.
    queries = [query] if isinstance(query, str) else list(query)
    if not queries:
        raise ValueError("query must contain at least one string")

    vectors = model.encode(queries)
    # The index is searched with C-contiguous float32 input.
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)

    D, I = index.search(vectors, k=num_results)
    return D, I

In [32]:
# Alternative queries tried during exploration (uncomment one to run it instead):
# D, I = vector_search(["Gdje smo zapeli prvi put?"], model, index, num_results=10)
# D, I = vector_search(["Levenhim-Skulemov teorem"], model, index, num_results=10)
# D, I = vector_search(["Čemu služe atomarne formule?"], model, index, num_results=10)
# D, I = vector_search(["Zašto je to korisno?"], model, index, num_results=10)

# Query (Croatian): "Which structures are isomorphic?"
D, I = vector_search(
    ["Koje strukture su izomorfne?"],
    model,
    index,
    num_results=10,
)

# Show the matched transcript lines and their timestamped links, in the order returned.
df.iloc[I[0]][["Speech", "start_link"]]

Unnamed: 0,Speech,start_link
1520,ali strukture koje su elementarno ekvivalente ...,https://youtu.be/3YoHR8CGU-4?t=2308
1117,koji je izomorfan ovoj manjej strukturi.,https://youtu.be/3YoHR8CGU-4?t=470
609,"Dakle, ne mora biti, postoje i strukture prvog...",https://youtu.be/QWzjU9Tdt4Y?t=825
1703,"Dakle, ali naravno, ove dvije stvari su izomor...",https://youtu.be/jebx2t2NX6U?t=412
1126,"Svakako važan teorem je da izomorfne strukture,",https://youtu.be/3YoHR8CGU-4?t=500
1014,"Dakle, imamo dvije strukture i onda zapravo ho...",https://youtu.be/3YoHR8CGU-4?t=10
1129,Izomorfne strukture jesu elementarno ekvivalente.,https://youtu.be/3YoHR8CGU-4?t=509
967,"Svakako, homomorfizam, dakle, struktura koje s...",https://youtu.be/QWzjU9Tdt4Y?t=2555
1833,recimo Z2 i Z3 sigurno nisu izomorfne.,https://youtu.be/jebx2t2NX6U?t=1156
1571,Što su relacijske strukture? To su naravno one...,https://youtu.be/3YoHR8CGU-4?t=2528


In [33]:
# Query (Croatian): "Where did we get stuck the first time?"
D, I = vector_search(
    ["Gdje smo zapeli prvi put?"],
    model,
    index,
    num_results=10,
)

# Show the matched transcript lines and their timestamped links, in the order returned.
df.iloc[I[0]][["Speech", "start_link"]]

Unnamed: 0,Speech,start_link
0,Prvi put smo bili zapeli u jednom dijelu samo ...,https://youtu.be/fMMhJYNpHfM?t=0
640,"ne znam, na elementarnoj matematici ili gdje već.",https://youtu.be/QWzjU9Tdt4Y?t=965
4113,"Dobro, i šta sad? Činjenica na f vrijedige nap...",https://youtu.be/SpSD84Kgqio?t=2578
339,"ali zasad samo konačno mnogo, ako uzmem i pres...",https://youtu.be/fMMhJYNpHfM?t=2215
302,za tu tvrdnju koju vi postavite pred njih. Nek...,https://youtu.be/fMMhJYNpHfM?t=2008
1560,Ne mora nikakav uvjet biti zadovoljen.,https://youtu.be/3YoHR8CGU-4?t=2469
876,"to zapravo znači da ja moram sve variable, pri...",https://youtu.be/QWzjU9Tdt4Y?t=2170
300,"tvrdnje, ona će glasati ili za jednu ili za dr...",https://youtu.be/fMMhJYNpHfM?t=1986
3911,"Znači za početak moramo nešto reći o idealima,...",https://youtu.be/SpSD84Kgqio?t=1571
2748,kad smo gledali šta to zapravo znači za,https://youtu.be/MASKzjeZyLM?t=186


In [34]:
# Query (Croatian): "What is an elementary mapping?"
D, I = vector_search(
    ["Što je elementarno preslikavanje?"],
    model,
    index,
    num_results=10,
)

# Show the matched transcript lines and their timestamped links, in the order returned.
df.iloc[I[0]][["Speech", "start_link"]]

Unnamed: 0,Speech,start_link
1156,onda zovemo elementarno preslikavanje.,https://youtu.be/3YoHR8CGU-4?t=632
1181,"šta je elementarno preslikavanje, šta je jakih...",https://youtu.be/3YoHR8CGU-4?t=718
1214,"Dakle, da ovo elementarno preslikavanje mora n...",https://youtu.be/3YoHR8CGU-4?t=863
1160,Izomorfizmi uvijek jesu elementarna preslikava...,https://youtu.be/3YoHR8CGU-4?t=646
1176,elementarno preslikavanje jakih homomorfiza.,https://youtu.be/3YoHR8CGU-4?t=701
1158,"pa niti jaki homomorfizam, ne mora biti elemen...",https://youtu.be/3YoHR8CGU-4?t=638
1170,Elementarno preslikavanje općenito ne mora bit...,https://youtu.be/3YoHR8CGU-4?t=678
1431,"Dakle, to je kao elementarno preslikavanje, sa...",https://youtu.be/3YoHR8CGU-4?t=1820
1213,"i da imate elementarno preslikavanje, ali da n...",https://youtu.be/3YoHR8CGU-4?t=858
1150,"Neka preslikavanja nisu u onom smislu,",https://youtu.be/3YoHR8CGU-4?t=600


In [35]:
# Query (Croatian): "What are elementary mappings?" (plural form of the previous query)
D, I = vector_search(
    ["Što su elementarna preslikavanja?"],
    model,
    index,
    num_results=10,
)

# Show the matched transcript lines and their timestamped links, in the order returned.
df.iloc[I[0]][["Speech", "start_link"]]

Unnamed: 0,Speech,start_link
1160,Izomorfizmi uvijek jesu elementarna preslikava...,https://youtu.be/3YoHR8CGU-4?t=646
1156,onda zovemo elementarno preslikavanje.,https://youtu.be/3YoHR8CGU-4?t=632
1214,"Dakle, da ovo elementarno preslikavanje mora n...",https://youtu.be/3YoHR8CGU-4?t=863
1176,elementarno preslikavanje jakih homomorfiza.,https://youtu.be/3YoHR8CGU-4?t=701
1213,"i da imate elementarno preslikavanje, ali da n...",https://youtu.be/3YoHR8CGU-4?t=858
1150,"Neka preslikavanja nisu u onom smislu,",https://youtu.be/3YoHR8CGU-4?t=600
1181,"šta je elementarno preslikavanje, šta je jakih...",https://youtu.be/3YoHR8CGU-4?t=718
1170,Elementarno preslikavanje općenito ne mora bit...,https://youtu.be/3YoHR8CGU-4?t=678
1158,"pa niti jaki homomorfizam, ne mora biti elemen...",https://youtu.be/3YoHR8CGU-4?t=638
1199,"na prvi pogled, isti uvjet kao kod elementarno...",https://youtu.be/3YoHR8CGU-4?t=808


In [36]:
# Query (Croatian): "What are isomorphisms?"
D, I = vector_search(
    ["Što su izomorfizmi?"],
    model,
    index,
    num_results=10,
)

# Show the matched transcript lines and their timestamped links, in the order returned.
df.iloc[I[0]][["Speech", "start_link"]]

Unnamed: 0,Speech,start_link
1703,"Dakle, ali naravno, ove dvije stvari su izomor...",https://youtu.be/jebx2t2NX6U?t=412
1553,i na njemu definirati izomorfizam.,https://youtu.be/3YoHR8CGU-4?t=2444
2295,ako dva vektorska prostora imaju istu dimenzij...,https://youtu.be/Q19taQtrlR0?t=952
1160,Izomorfizmi uvijek jesu elementarna preslikava...,https://youtu.be/3YoHR8CGU-4?t=646
1411,koji je zapravo čak i izomorfizam.,https://youtu.be/3YoHR8CGU-4?t=1710
3375,što to zapravo znači?,https://youtu.be/MASKzjeZyLM?t=1954
1833,recimo Z2 i Z3 sigurno nisu izomorfne.,https://youtu.be/jebx2t2NX6U?t=1156
1520,ali strukture koje su elementarno ekvivalente ...,https://youtu.be/3YoHR8CGU-4?t=2308
1830,"hoću reći ne izomorflog, se može modelirati u ...",https://youtu.be/jebx2t2NX6U?t=1143
1129,Izomorfne strukture jesu elementarno ekvivalente.,https://youtu.be/3YoHR8CGU-4?t=509


In [37]:
# Query (Croatian): "Elementarily equivalent structures"
D, I = vector_search(
    ["Elementarno ekvivalentne strukture"],
    model,
    index,
    num_results=10,
)

# Show the matched transcript lines and their timestamped links, in the order returned.
df.iloc[I[0]][["Speech", "start_link"]]

Unnamed: 0,Speech,start_link
1129,Izomorfne strukture jesu elementarno ekvivalente.,https://youtu.be/3YoHR8CGU-4?t=509
1519,"Znači, strukture koje su elementarni jesu elem...",https://youtu.be/3YoHR8CGU-4?t=2305
1520,ali strukture koje su elementarno ekvivalente ...,https://youtu.be/3YoHR8CGU-4?t=2308
912,"nad istom signaturom, to je važno, su elementa...",https://youtu.be/QWzjU9Tdt4Y?t=2331
1135,pa tako je elementarno ekvivalenciju.,https://youtu.be/3YoHR8CGU-4?t=533
923,"bez obzira na valuaciju, onda kažemo da su te ...",https://youtu.be/QWzjU9Tdt4Y?t=2370
2311,"Ali, možemo razne druge stvari tu reći i pokaz...",https://youtu.be/Q19taQtrlR0?t=1067
3407,ekvivalentno s time da zapravo kažete,https://youtu.be/MASKzjeZyLM?t=2030
2680,"po definiciji ekvivalentne, pa se tu nema šta ...",https://youtu.be/Q19taQtrlR0?t=2531
3128,onda je to ekvivalentno sa,https://youtu.be/MASKzjeZyLM?t=1256
