In [17]:
# Imports

from qdrant_client import QdrantClient, models
from fastembed import  TextEmbedding

from openai import OpenAI
import requests
import json

In [18]:

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

docs = []
for c in documents_raw:
    for d in c['documents']:
        d['course'] = c['course']
        docs.append(d)

docs[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [19]:
qd_client = QdrantClient("http://localhost:6333") 

EMBEDDING_DIMENSIONS = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [None]:
collection_name = "zoomcamp-faq"
qd_client.delete_collection(collection_name)

In [None]:


qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONS,
        distance=models.Distance.COSINE
    )
)


True

In [21]:
# qd_client.delete_collection(collection_name)


In [22]:
docs[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [28]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact match on string metadata field
)

UpdateResult(operation_id=6, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
points = []

for i, doc in enumerate(docs):
    
    q_a = doc['question'] + ' ' + doc['text']  # Concatenate question and text for embedding
    vector=models.Document(text=q_a, model=model_handle)

    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:

def vector_search(question, course="data-engineering-zoomcamp", limit=5):
    print(f"Using Vector Search with filter: {course}. Results limit: {limit}")
    
    q_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit,
        with_payload=True
    )

    results = []
    for point in q_points.points:
        results.append(point.payload)

    return results

In [27]:
vector_search("How to install Kafka?", course="dataengineering-zoomcamp", limit=3)

Using Vector Search with filter: dataengineering-zoomcamp. Results limit: 3


[]

In [8]:
# LLM API client

%load_ext dotenv
%dotenv /Users/sethurama/DEV/LM/course-llm-zc/.env

In [9]:
llm_client = OpenAI()

def build_prompt(q_question, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT.
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE


QUESTION: {question} 

CONTEXT: {context}
""".strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer:  {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=q_question, context=context).strip()
    return prompt

# Query the LLM with the modified prompt
def query_llm(mod_prompt):
    response = llm_client.chat.completions.create(
        model = 'gpt-4o-mini',
        messages = [{"role": "user", "content": mod_prompt}]
    )
    
    return response.choices[0].message.content

In [10]:
def rag(query):
    search_results = vector_search(query)
    prompt = build_prompt(query, search_results)
    answer = query_llm(prompt)
    return answer

    

In [11]:
rag("How do I run kafka?")

Using Vector Search with filter: dataengineering-zoomcamp. Results limit: 5


TypeError: tuple indices must be integers or slices, not str