In [142]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [143]:
import requests 

# Download the FAQ corpus and flatten it into one record per question.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Copy each record while tagging it with its course, so the raw
        # payload in `documents_raw` is left unmodified.
        documents.append({**doc, 'course': course_name})

In [144]:
# Peek at the first flattened record to check which keys it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [145]:
# Tabulate the flattened records with a fixed column order, then preview
# the first rows that belong to the data-engineering course.
df = pd.DataFrame(
    documents,
    columns=['course', 'section', 'question', 'text'],
)
df.loc[df['course'] == 'data-engineering-zoomcamp'].head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [146]:
# Five short hand-written example sentences.
# NOTE(review): `doc_examples` is not read by any other cell visible in this
# notebook — confirm whether it is still needed.
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [147]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the answer text: English stop words are dropped
# and a term must occur in at least 5 documents to be kept.
cv = CountVectorizer(min_df=5, stop_words='english')
X = cv.fit_transform(df['text'])

names = cv.get_feature_names_out()

# One row per vocabulary term, one column per document.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same vocabulary settings as the count-based cell, but with TF-IDF weights
# in place of raw counts.
cv = TfidfVectorizer(min_df=5, stop_words='english')
X = cv.fit_transform(df['text'])

names = cv.get_feature_names_out()

# One row per vocabulary term, one column per document; rounded for display.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [149]:
# Encode a free-text question with the vectorizer currently bound to `cv`.
query = "Do I need to know python to sign up for the January course?"
q = cv.transform([query])  # transform() expects an iterable of documents, hence the one-element list
q.toarray()  # dense copy of the query vector, for display only

array([[0., 0., 0., ..., 0., 0., 0.]])

In [150]:
# Pair every vocabulary term with its weight in the query vector.
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}
query_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

In [151]:
# Multiply the document-term matrix by the transposed query vector, giving
# one score per document row.
# NOTE(review): the values shown match the cosine_similarity output further
# down, presumably because TfidfVectorizer L2-normalises rows by default — confirm.
dot_product = X.dot(q.T).toarray()
dot_product

array([[0.19464486],
       [0.        ],
       [0.        ],
       [0.06011641],
       [0.04932915],
       [0.        ],
       [0.        ],
       [0.13477565],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.15899187],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.07431408],
       [0.        ],
       [0.        ],
       [0.05779673],
       [0.07243428],
       [0.        ],
       [0.05174293],
       [0.16373635],
       [0.08076031],
       [0.        ],
       [0.09755254],
       [0.        ],
       [0.21069625],
       [0.12067781],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.06381749],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00910541],
       [0.02835681],
       [0.05480112],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.024

In [152]:
from sklearn.metrics.pairwise import cosine_similarity

In [153]:
# Cosine similarity between every row of X and the query vector q.
# NOTE(review): `cosine` is not read by any later cell visible here; the next
# cell recomputes the same value.
cosine = cosine_similarity(X,q)

In [154]:
# Per-document cosine similarity to the query, flattened to a 1-D array.
score = cosine_similarity(X,q).flatten()

In [155]:
# Display the per-document similarity scores.
score

array([0.19464486, 0.        , 0.        , 0.06011641, 0.04932915,
       0.        , 0.        , 0.13477565, 0.        , 0.        ,
       0.        , 0.15899187, 0.        , 0.        , 0.        ,
       0.07431408, 0.        , 0.        , 0.05779673, 0.07243428,
       0.        , 0.05174293, 0.16373635, 0.08076031, 0.        ,
       0.09755254, 0.        , 0.21069625, 0.12067781, 0.        ,
       0.        , 0.        , 0.        , 0.06381749, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00910541,
       0.02835681, 0.05480112, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02469964, 0.05129386, 0.06013439,
       0.05252658, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04169018, 0.        , 0.        , 0.        , 0.0075293 ,
       0.        , 0.        , 0.01971463, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [156]:
import numpy as np

In [157]:
# Row positions that would sort `score` in ascending order (lowest score first).
np.argsort(score)

array([718, 699, 700, 701, 703, 704, 707, 710, 711, 712, 717, 698, 476,
       477, 478, 479, 480, 481, 482, 483, 486, 487, 833, 820, 821, 822,
       823, 824, 825, 827, 828, 830, 832, 488, 834, 467, 468, 471, 472,
       473, 474, 475, 697, 526, 514, 515, 516, 517, 518, 519, 520, 523,
       524, 525, 513, 527, 528, 530, 532, 533, 534, 535, 536, 537, 538,
       499, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 819, 501,
       504, 505, 506, 507, 508, 509, 510, 512, 864, 386, 851, 853, 854,
       855, 859, 860, 861, 862, 863, 385, 865, 387, 389, 390, 392, 397,
       399, 400, 402, 404, 369, 358, 359, 360, 361, 362, 363,  32, 366,
       367, 368, 405, 370, 371, 376, 377, 379, 380, 382, 383, 384, 442,
       839, 840, 841, 843, 846, 847, 848, 850, 437, 441, 836, 443, 444,
       447, 453, 460, 461, 462, 463, 466, 420, 407, 408, 409, 410, 412,
       414, 416, 417, 418, 419, 542, 421, 422, 423, 426, 427, 428, 429,
       430, 432, 654, 643, 644, 645, 646, 647, 649, 650, 651, 65

In [158]:
# Tail of the ascending argsort: the five highest-scoring rows, best match last.
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445])

In [159]:
# Full `text` value of the first row (selected by position).
df.iloc[0].text



"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

In [160]:
# Display the whole frame (the rendered output below elides the middle rows).
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [161]:
# Text columns that each get their own vectorizer and matrix in the cells below.
fields = ["section","question","text"]

In [162]:
# Fit an independent bag-of-words model for every searchable column and keep
# both the fitted vectorizer and its document-term matrix, keyed by column name.
matrices, vectorizers = {}, {}

for f in fields:
    cv = CountVectorizer(min_df=5, stop_words='english')
    X = cv.fit_transform(df[f])
    vectorizers[f] = cv
    matrices[f] = X
    


In [163]:
# Show the fitted sparse matrices, one per field.
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'int64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'int64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'int64'
 	with 23808 stored elements and shape (948, 1333)>}

In [164]:
n = len(df)
score = np.zeros(n)

query = "I just discovered the course, is it too late to join?"

# Per-field weights; a field that is not listed here counts with weight 1.0.
boosts = {
    'question': 3
}

# For each field:
#   1. take the vectorizer that was fitted on that field and use it to
#      transform the query into the same vocabulary space;
#   2. take the already-vectorized column for that field (its matrix) and
#      compute the cosine similarity between every row and the query;
#   3. add the (boosted) per-field similarity to the running total.
# The result is one summed similarity per document — a ranking score, not a
# probability.
for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]
    f_score = cosine_similarity(X,q).flatten()

    boost = boosts.get(f, 1.0)  # falls back to 1.0 when the field has no entry in `boosts`

    score  = score + boost * f_score

In [165]:
# Display the summed, boosted per-document scores.
score

array([3.70073646, 3.5       , 2.94948974, 2.92992704, 3.5       ,
       3.5       , 2.23205081, 3.77216553, 2.94948974, 3.5       ,
       3.18328157, 2.8570226 , 0.5       , 0.5       , 0.5       ,
       0.65430335, 0.5       , 2.62132034, 0.60369517, 0.5       ,
       0.5       , 0.5       , 0.9330127 , 0.64433757, 0.5       ,
       0.5       , 0.5       , 0.85355339, 0.67817416, 0.5       ,
       0.5       , 0.5       , 0.5       , 1.78629062, 3.5       ,
       1.84164079, 0.5       , 0.5       , 0.5       , 0.55053987,
       0.58006408, 2.0949158 , 0.5       , 0.57332356, 0.        ,
       0.        , 0.        , 0.        , 0.04778185, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.10910895, 0.        , 0.01340141,
       0.        , 0.        , 0.        , 0.        , 0.10050378,
       0.        , 0.        , 0.        , 0.        , 0.     

In [166]:
# Exact-match constraints: column name -> required value.
# NOTE(review): the name `filter` shadows Python's built-in filter() for the
# rest of the kernel session; the next cell reads it under this name.
filter = {
    'course': 'data-engineering-zoomcamp'
}

In [167]:
# Zero out the score of every row that does not satisfy a filter.
for field,value in filter.items():
    mask = (df[field] == value).astype(int).values # the comparison yields a pandas Series; astype(int) maps True/False to 1/0 and .values exposes it as a NumPy array
    score = score * mask
score

array([3.70073646, 3.5       , 2.94948974, 2.92992704, 3.5       ,
       3.5       , 2.23205081, 3.77216553, 2.94948974, 3.5       ,
       3.18328157, 2.8570226 , 0.5       , 0.5       , 0.5       ,
       0.65430335, 0.5       , 2.62132034, 0.60369517, 0.5       ,
       0.5       , 0.5       , 0.9330127 , 0.64433757, 0.5       ,
       0.5       , 0.5       , 0.85355339, 0.67817416, 0.5       ,
       0.5       , 0.5       , 0.5       , 1.78629062, 3.5       ,
       1.84164079, 0.5       , 0.5       , 0.5       , 0.55053987,
       0.58006408, 2.0949158 , 0.5       , 0.57332356, 0.        ,
       0.        , 0.        , 0.        , 0.04778185, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.10910895, 0.        , 0.01340141,
       0.        , 0.        , 0.        , 0.        , 0.10050378,
       0.        , 0.        , 0.        , 0.        , 0.     

In [168]:
# Positions of the five highest scores. argsort is ascending, so the best
# match is the LAST element of `idx`.
idx = np.argsort(score)[-5:]

In [169]:
# Look the selected rows up by position; they appear in ascending score order
# (best match at the bottom).
df.iloc[idx]

Unnamed: 0,course,section,question,text
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."


In [170]:
# Inspect the document-term matrix stored for the 'text' field.
xd = matrices['text']
xd

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 23808 stored elements and shape (948, 1333)>

In [171]:
class TextSearch:
    """Keyword search over a collection of records using per-field TF-IDF.

    Every field listed in ``text_fields`` gets its own ``TfidfVectorizer`` and
    document-term matrix. A query is compared with each field by cosine
    similarity, and the (optionally boosted) per-field similarities are summed
    into one score per record.
    """

    def __init__(self, text_fields):
        """Remember which fields to index; nothing is fitted yet.

        Args:
            text_fields: names of the record keys / DataFrame columns to vectorize.
        """
        self.text_fields = text_fields
        self.matrices = {}     # field name -> fitted document-term matrix
        self.vectorizers = {}  # field name -> fitted TfidfVectorizer

    def fit(self, records, vectorizer_params=None):
        """Build the index from ``records``.

        Args:
            records: anything ``pd.DataFrame`` accepts (e.g. a list of dicts);
                it must contain every field named in ``text_fields``.
            vectorizer_params: optional keyword arguments forwarded to every
                ``TfidfVectorizer``, e.g. ``{"max_features": 100,
                "stop_words": "english"}``. ``None`` means library defaults.
        """
        # ``None`` sentinel instead of a shared mutable ``{}`` default.
        params = {} if vectorizer_params is None else vectorizer_params
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            self.matrices[f] = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the best-matching records for ``query``.

        Args:
            query: free-text query string.
            n_results: maximum number of records to return.
            boost: optional mapping field name -> weight applied to that
                field's similarity; fields not listed use 1.0.
            filters: optional mapping column name -> required value; only
                records whose column equals the value are eligible.

        Returns:
            A list of at most ``n_results`` record dicts, highest score first.
            Records that fail a filter are never returned, even when fewer
            than ``n_results`` records pass.
        """
        boost = {} if boost is None else boost
        filters = {} if filters is None else filters

        n_docs = len(self.df)
        score = np.zeros(n_docs)

        for f in self.text_fields:
            weight = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            score = score + weight * cosine_similarity(self.matrices[f], q).flatten()

        # Combine all filters into one boolean eligibility mask.
        keep = np.ones(n_docs, dtype=bool)
        for field, value in filters.items():
            keep &= np.asarray((self.df[field] == value).values, dtype=bool)

        # Multiplying the score by the mask would only zero excluded rows,
        # which then tie with eligible zero-score rows and can leak into the
        # output. Push them to -inf for ranking and drop them explicitly.
        ranked = np.where(keep, score, -np.inf)
        # Stable sort: ties keep their original document order.
        idx = np.argsort(-ranked, kind="stable")[:n_results]
        idx = idx[keep[idx]]

        return self.df.iloc[idx].to_dict(orient='records')

In [172]:
# Build a TextSearch index over the three text fields of the flattened records.
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

# Ask for the five best matches, weighting the 'question' field 3x and
# filtering on the data-engineering course.
# NOTE(review): "singned" in the query is misspelled; left unchanged here.
index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [180]:
from sklearn.decomposition import TruncatedSVD

# Reduce the 'text' count matrix to a dense 16-dimensional embedding.
X = matrices['text']
cv = vectorizers['text']  # fitted vectorizer that produced X; reused later to encode queries

# TruncatedSVD's default solver is randomized: pin the seed so the embedding
# (and every ranking derived from it) is reproducible across runs.
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.53652588, -0.29608035, -0.21851638,  0.19393098, -0.50442142,
       -0.23258523,  0.08417124,  0.06102404, -0.14208574,  0.04710989,
       -0.18670818, -0.14600749, -0.01231612, -0.04376446, -0.24149636,
       -0.41845344])

In [174]:
# Encode the query with `cv`, then project it with the fitted `svd`.
# NOTE(review): this relies on `cv` and `svd` from the kernel state; the
# execution counts here are out of order, so confirm on a fresh Run All.
query = 'I just singned up. Can I still join the course?'
Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.05509125, -0.06357324, -0.01495708, -0.00684086, -0.05777341,
       -0.06069735, -0.01507002, -0.01134177, -0.0293562 ,  0.02414019,
       -0.0140307 , -0.00804881,  0.05972193,  0.02133949, -0.01714831,
       -0.06568253])

In [175]:
# Unnormalised dot product between the first document embedding and the query embedding.
np.dot(X_emb[0], Q_emb[0])

np.float64(0.13605255583754772)

In [176]:
# Rank all documents by cosine similarity to the query in embedding space and
# keep the ten best (negating the score makes argsort descending).
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# argsort returns row POSITIONS, so select with .iloc; .loc is label-based and
# only coincides with positions while the index is the default 0..n-1 range.
list(df.iloc[idx].text)

['Answer: Ideally midterms up to module-06, capstones include all modules in that cohort’s syllabus. But you can include anything extra that you want to feature. Just be sure to document anything not covered in class.\nAlso watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nMore discussions:\n[source1] [source2] [source3]',
 'The course is available in the self-paced mode too, so you can go through the materials at any time. But if you want to do it as a cohort with other students, the next iterations will happen in September 2023, September 2024 (and potentially other Septembers as well).',
 "Yes! We'll cover some linear algebra in the course, but in general, there will be very few formulas, mostly code.\nHere are some interesting videos covering linear algebra that you can already watch: ML Zoomcamp 1.8 - Linear Algebra Refresher from Alexey Grigorev or the excellent playlis

In [177]:
# `idx` holds row positions from argsort, so select by position (.iloc) rather than by label.
df.iloc[idx]

Unnamed: 0,course,section,question,text
758,machine-learning-zoomcamp,Projects (Midterm and Capstone),"What modules, topics, problem-sets should a mi...","Answer: Ideally midterms up to module-06, caps..."
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...
439,machine-learning-zoomcamp,General course-related questions,I don't know math. Can I take the course?,Yes! We'll cover some linear algebra in the co...
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
456,machine-learning-zoomcamp,General course-related questions,Submitting learning in public links,When you post about what you learned from the ...
761,machine-learning-zoomcamp,Projects (Midterm and Capstone),Computing the hash for project review,See the answer here.
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
440,machine-learning-zoomcamp,General course-related questions,"I filled the form, but haven't received a conf...","The process is automated now, so you should re..."
452,machine-learning-zoomcamp,General course-related questions,I just joined. What should I do next? How can ...,Welcome to the course! Go to the course page (...
850,mlops-zoomcamp,Module 1: Introduction,Running out of memory,Problem: The output of DictVectorizer was taki...


In [178]:
# Inspect the row with index label 449, which appears in the ranked table above.
df.loc[449]

course                              machine-learning-zoomcamp
section                      General course-related questions
question    The course has already started. Can I still jo...
text        Yes, you can. You won’t be able to submit some...
Name: 449, dtype: object

In [183]:
from sklearn.decomposition import NMF

# Factorize X into a 16-component non-negative embedding.
# A fixed seed makes the factorization reproducible across runs.
nmf = NMF(n_components=16, random_state=42)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.00045566, 0.05929197, 0.01828719, 0.00489922, 0.00720578,
       0.01437048, 0.00794671, 0.00486221, 0.        , 0.02728313,
       0.        , 0.02625698, 0.00172451, 0.        , 0.00398525,
       0.00233463])

In [None]:
# Encode the current `query` with `cv` and project it with the fitted NMF model.
# NOTE(review): this cell has no execution count or output; the next cell
# needs the `Q_emb` it defines — confirm it runs on a fresh kernel.
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

In [187]:
# Rank all documents by cosine similarity to the query in the current
# embedding space and keep the ten best (negated score => descending order).
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# argsort returns row positions, so select by position (.iloc), not by label.
df.iloc[idx]

Unnamed: 0,course,section,question,text
69,data-engineering-zoomcamp,Module 1: Docker and Terraform,Docker - build error checking context: can’t s...,Found the issue in the PopOS linux. It happene...
140,data-engineering-zoomcamp,Module 1: Docker and Terraform,GCP VM - Port forwarding from GCP without usin...,"You can easily forward the ports of pgAdmin, p..."
96,data-engineering-zoomcamp,Module 1: Docker and Terraform,WSL - Permissions too open at Windows,Issue when trying to run the GPC VM through SS...
361,data-engineering-zoomcamp,Module 5: pyspark,Jupyter Notebook or SparkUI not loading proper...,Possible solution - Try to forward the port us...
351,data-engineering-zoomcamp,Module 5: pyspark,How to port forward outside VS Code,"I don’t use visual studio, so I did it the old..."
827,mlops-zoomcamp,Module 1: Introduction,Is the AWS free tier enough for doing this cou...,For many parts - yes. Some things like kinesis...
136,data-engineering-zoomcamp,Module 1: Docker and Terraform,GCP VM - mkdir: cannot create directory ‘.ssh’...,I am trying to create a directory but it won't...
937,mlops-zoomcamp,Module 6: Best practices,Could not reconfigure pytest from zero after g...,No option to remove pytest test\nRemove .vscod...
138,data-engineering-zoomcamp,Module 1: Docker and Terraform,. GCP VM - VM connection request timeout,Question: I connected to my VM perfectly fine ...
828,mlops-zoomcamp,Module 1: Introduction,AWS EC2: this site can’t be reached,When I click an open IP-address in an AWS EC2 ...
