In [1]:
from sentence_transformers import SentenceTransformer
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location handed to LanceDB when opening the database connection.
uri = "llm-zoomcap-lancedb"
db = lancedb.connect(uri=uri)

In [3]:
# Look up the "sentence-transformers" entry in the embedding registry and
# create the embedding model from it, pinned to the CPU.
model = (
    get_registry()
    .get("sentence-transformers")
    .create(name="multi-qa-distilbert-cos-v1", device="cpu")
)

In [4]:
# Build the URL of the documents-with-ids JSON file in the llm-zoomcamp repo.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error
# response reach .json() below.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Keep only the entries that belong to the Machine Learning Zoomcamp course.
documents = [doc for doc in documents if doc['course'] == "machine-learning-zoomcamp"]

# Rename the 'text' key to 'answer' in every remaining document (in place).
for doc in documents:
    if 'text' in doc:
        doc['answer'] = doc.pop('text')

# Show the first document as a sanity check of the resulting keys.
documents[0]

{'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872',
 'answer': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork'}

#### Using LanceDB auto embedding

In [5]:
class CourseQuestions(LanceModel):
    """Table schema for the course FAQ entries plus their embedding vector.

    ``question`` and ``answer`` are declared as source fields of ``model`` and
    ``vector`` as its vector field, sized to ``model.ndims()``.

    NOTE(review): two source fields are declared for a single vector field.
    Confirm which of them LanceDB actually embeds (or whether both are used)
    before relying on the search results.
    """
    section: str
    question: str = model.SourceField()  # declared as embedding source text
    course: str
    id: str
    answer: str = model.SourceField()  # also declared as embedding source text
    vector: Vector(model.ndims()) = model.VectorField()  # dimension taken from the model

In [6]:
# Course-questions table: created from the CourseQuestions schema in
# "overwrite" mode, then filled with the filtered documents.
table = db.create_table(
    "course_questions_auto",
    mode="overwrite",
    schema=CourseQuestions,
)
table.add(documents)

In [7]:
user_question = "I just discovered the course. Can I still join it?"

# Assemble the vector search step by step: cosine metric, the same nprobes
# setting as before, three output columns, top five hits.
search_query = table.search(user_question, query_type="vector")
search_query = search_query.metric("cosine")
search_query = search_query.nprobes(10000)
search_query = search_query.select(["id", "question", "answer"])
search_query = search_query.limit(5)

# Materialise the hits as a DataFrame and display it.
results = search_query.to_pandas()
results

Unnamed: 0,id,question,answer,_distance
0,636f55d5,When does the next iteration start?,The course is available in the self-paced mode...,0.525028
1,6ba259b1,"I filled the form, but haven't received a conf...","The process is automated now, so you should re...",0.527779
2,ee58a693,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some...",0.529869
3,e7ba6b8a,The course videos are from the previous iterat...,We won’t re-record the course videos. The focu...,0.539758
4,39fda9f0,Is it going to be live? When?,"The course videos are pre-recorded, you can st...",0.615941
