### Importing the necessary libraries

In [19]:
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Implementing text search with scikit-learn

In [7]:
# Download the FAQ documents for the DataTalks courses and flatten them into a
# single list of records; this list is the knowledge base for the search engine
# that the notebook builds next.

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout stops the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

# The raw JSON is grouped per course; copy the course name onto each document
# so every record stays self-describing once the nesting is removed.
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [10]:
# Spot-check one flattened record: besides its original fields it should now
# carry the 'course' key added while flattening.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [11]:
# Load the flattened FAQ records into a pandas DataFrame, pinning the column
# order so the course/section context precedes the question and answer text.

faq_columns = ['course', 'section', 'question', 'text']
df = pd.DataFrame(documents, columns=faq_columns)
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [14]:
# The next step is to vectorise the `text` column. To see how vectorisation
# works first, start from a tiny hand-written corpus of five short sentences.
# NOTE(review): this rebinds `documents`, which until now held the FAQ records
# loaded from documents.json. Any earlier cell that reads `documents` will see
# these strings if it is re-run after this one; a distinct name would avoid that.

documents = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [15]:
# Fit a CountVectorizer on the example sentences (English stop words removed)
# and encode each sentence as a vector of term counts.

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(documents)

# Vocabulary learned during fitting, in the same order as the matrix columns.
names = cv.get_feature_names_out()

# Bag of words: only per-document term counts are kept, word order is lost.
# Laid out with one row per term and one column per sentence for readability.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs  # shown purely as an illustration

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [18]:
# Apply the same bag-of-words encoding to the real FAQ answers. This is the
# representation we need, but raw counts carry no notion of term importance.
# min_df=5 drops terms that occur in fewer than five documents.

cv = CountVectorizer(min_df=5, stop_words='english')
X = cv.fit_transform(df['text'])  # sparse matrix

In [None]:
# Switch to TF-IDF (Term Frequency-Inverse Document Frequency) so that terms
# appearing in many documents are down-weighted relative to rarer ones.
# Note that `X` is rebound here, replacing the raw-count matrix.

tf = TfidfVectorizer(min_df=5, stop_words='english')
X = tf.fit_transform(df['text'])  # sparse matrix