In [32]:
# get data (FAQ documents)
import requests 
import json

# Download the FAQ documents (a JSON list with one entry per course) and keep
# a local copy next to the notebook.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle to be cleaned up whenever the GC got to it.
with open('documents.json', 'w') as f_out:
    json.dump(documents_raw, f_out, indent=4)


In [34]:

# Flatten the per-course structure into one list of FAQ entries, each tagged
# with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']
    course_docs = course['documents']

    # The entries are tagged in place, so the dicts held by documents_raw
    # gain a 'course' key as well.
    for doc in course_docs:
        doc['course'] = course_name

    documents.extend(course_docs)

In [35]:
# create dataframe from the docs data
import pandas as pd

# Build the FAQ table; the explicit column list fixes both which keys are
# kept and the order they appear in.
faq_columns = ['course', 'section', 'question', 'text']
df = pd.DataFrame(data=documents, columns=faq_columns)
df.head(5)

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [36]:
# Sanity checks on the flattened table: row count for one course, then the
# distinct course names present in the data.
de_course_mask = df["course"] == 'data-engineering-zoomcamp'

print("number of documents in data-engineering-zoomcamp course: ")
print(len(df[de_course_mask]))

print("\ncourses in the data: ")
print(df["course"].unique())


number of documents in data-engineering-zoomcamp course: 
435

courses in the data: 
['data-engineering-zoomcamp' 'machine-learning-zoomcamp' 'mlops-zoomcamp']


In [38]:
# Small hand-written corpus.
# NOTE(review): docs_example is defined here but not used in this cell; the
# vectorizer below is fitted on df.text.
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
    "I love DataTalks.Club, DataTalks.Club is the best, DataTalks.Club is amazing"
]

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the FAQ answer text, with English stop words
# removed and min_df=5 passed to prune rare terms.
cv = CountVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

# Terms as rows, documents as columns: transpose the dense counts and label
# the rows with the vocabulary.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over the FAQ answer text. This rebinds cv, X, names and
# df_docs, so cells that use those names see whichever vectorizer cell ran
# most recently.
# NOTE(review): the output saved under this cell has only 6 document columns,
# which does not match a fit on the 948-row df.text -- it looks stale;
# re-run top to bottom to confirm.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

# Terms as rows, documents as columns; rounded only for display.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5
15th,0.46,0.0,0.0,0.0,0.0,0.0
2024,0.46,0.0,0.0,0.0,0.0,0.0
amazing,0.0,0.0,0.0,0.0,0.0,0.22
best,0.0,0.0,0.0,0.0,0.0,0.22
cloud,0.0,0.0,0.0,0.0,0.46,0.0
club,0.0,0.0,0.0,0.0,0.0,0.65
course,0.38,0.0,0.0,0.0,0.38,0.0
datatalks,0.0,0.0,0.0,0.0,0.0,0.65
date,0.0,0.0,0.5,0.0,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0,0.0


In [27]:
# Peek at the raw answer text that the vectorizers are fitted on.
df["text"]

0      The purpose of this document is to capture fre...
1      GitHub - DataTalksClub data-engineering-zoomca...
2      Yes, even if you don't register, you're still ...
3      You don't need it. You're accepted. You can al...
4      You can start by installing and setting up all...
                             ...                        
943    Problem description\nThis is the step in the c...
944    Problem description\nWhen a docker-compose fil...
945    Problem description\nIf you are having problem...
946    Problem description\nPre-commit command was fa...
947    Problem description\nInfrastructure created in...
Name: text, Length: 948, dtype: object

In [28]:
# Encode a user question with the already-fitted vectorizer so it lives in
# the same term space as the document matrix X.
query = "Do I need to know python to sign up for the January course?"

# transform() takes a collection of documents, hence the one-element list.
query_batch = [query]
q = cv.transform(query_batch)
q.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.6340862 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77326237, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [30]:
# Pair every vocabulary term with its weight so the query vector and one
# document vector (row 1 of X) can be compared term by term.
query_dict = dict(zip(names, q.toarray()[0]))
print(query_dict)

# Slice the row while X is still sparse and densify only that row; the
# previous X.toarray()[1] materialised the whole document-term matrix just to
# read a single row. ravel() yields a 1-D vector whether the sliced row comes
# back as 1 x n or as 1-D.
doc_dict = dict(zip(names, X[1].toarray().ravel()))
print(doc_dict)

{'15th': np.float64(0.0), '2024': np.float64(0.0), 'amazing': np.float64(0.0), 'best': np.float64(0.0), 'cloud': np.float64(0.0), 'club': np.float64(0.0), 'course': np.float64(0.6340862024337309), 'datatalks': np.float64(0.0), 'date': np.float64(0.0), 'github': np.float64(0.0), 'google': np.float64(0.0), 'homeworks': np.float64(0.0), 'jan': np.float64(0.0), 'listed': np.float64(0.0), 'love': np.float64(0.0), 'participation': np.float64(0.0), 'prerequisites': np.float64(0.0), 'python': np.float64(0.7732623667832087), 'registration': np.float64(0.0), 'required': np.float64(0.0), 'setup': np.float64(0.0), 'start': np.float64(0.0), 'starts': np.float64(0.0), 'submit': np.float64(0.0)}
{'15th': np.float64(0.0), '2024': np.float64(0.0), 'amazing': np.float64(0.0), 'best': np.float64(0.0), 'cloud': np.float64(0.0), 'club': np.float64(0.0), 'course': np.float64(0.0), 'datatalks': np.float64(0.0), 'date': np.float64(0.0), 'github': np.float64(0.5773502691896257), 'google': np.float64(0.0), 'hom

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity of every document row in X to the query vector q; the result has
# one row per document and a single column for the one query.
cosine_similarity(X, Y=q)

array([[0.24054627],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59827661],
       [0.        ]])