In [1]:
import os
import requests
from tqdm import tqdm
from openai import OpenAI
from elasticsearch import Elasticsearch

In [2]:
# Download the FAQ data: a JSON list with one entry per course, each entry
# carrying the course name and that course's list of FAQ documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever if the server never answers.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten into a single list, tagging every document with its course name.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)
len(documents)

948

In [3]:
# Client for the Elasticsearch node at localhost:9200; the index-management,
# indexing and search cells of this notebook all go through it.
es_client = Elasticsearch('http://localhost:9200')

In [4]:
# Drop the 'course-questions' index if it exists. ignore_status=[400, 404]
# makes the client return the error response instead of raising, so a missing
# index (404) is not treated as a failure.
es_client.options(ignore_status=[400,404]).indices.delete(index='course-questions')

ObjectApiResponse({'error': {'root_cause': [{'type': 'index_not_found_exception', 'reason': 'no such index [course-questions]', 'resource.type': 'index_or_alias', 'resource.id': 'course-questions', 'index_uuid': '_na_', 'index': 'course-questions'}], 'type': 'index_not_found_exception', 'reason': 'no such index [course-questions]', 'resource.type': 'index_or_alias', 'resource.id': 'course-questions', 'index_uuid': '_na_', 'index': 'course-questions'}, 'status': 404})

In [6]:
index_name = "course-questions"

index_settings = {
    # One primary shard and no replica copies.
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Free-text fields, mapped as "text" for full-text matching.
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            # "keyword" keeps the course name as one exact value, which is
            # what exact-match `term` filters operate on.
            "course": {"type": "keyword"},
        }
    },
}

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
# Index the documents one at a time (one `index` call per document);
# tqdm wraps the list to show a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

100%|██████████| 948/948 [00:34<00:00, 27.60it/s]


In [54]:
def search(query, size=5, search_words=None):
    """Run a filtered full-text search against the course FAQ index.

    Args:
        query: Free-text query, matched against the ``question`` and ``text``
            fields; ``question`` is boosted by a factor of 4.
        size: Maximum number of hits to return.
        search_words: Optional iterable of single-field mappings such as
            ``{"course": "some-course"}``. Each one becomes a ``term`` filter
            clause. ``None`` or an empty iterable applies no filter.

    Returns:
        The response of ``es_client.search`` on the ``index_name`` index.
    """
    # A None default avoids sharing one mutable list object across calls.
    term_filters = [{"term": dict(word)} for word in (search_words or [])]

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": term_filters
            }
        }
    }

    return es_client.search(index=index_name, body=search_query)

In [60]:
def build_prompt(query, search_results):
    """Build the LLM prompt from the user question and the retrieved FAQ hits.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Search response whose ``['hits']['hits']`` entries
            each carry ``_source.question`` and ``_source.text``.

    Returns:
        The prompt string: instructions, the question, then one ``Q:``/``A:``
        pair per hit separated by blank lines, with surrounding whitespace
        stripped.
    """
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    # Written as a one-line literal so that source-code indentation cannot
    # leak into the prompt as leading spaces before "A:".
    context_template = "Q: {question}\nA: {text}"

    context = "\n\n".join(
        context_template.format(
            question=hit["_source"]["question"],
            text=hit["_source"]["text"],
        )
        for hit in search_results['hits']['hits']
    )

    # The final strip drops the dangling newline left when there are no hits.
    return prompt_template.format(question=query, context=context).strip()

In [23]:
# OpenAI-compatible client pointed at a local endpoint (localhost:11434)
# instead of the hosted OpenAI API; presumably an Ollama server, given the
# placeholder api_key — TODO confirm.
# NOTE(review): OpenAI is already imported in the first cell, so this
# re-import is redundant.
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [66]:
def llm(prompt, model='phi3'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text; sent as the only message of the chat.
        model: Name of the chat model to query. Defaults to ``'phi3'``.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [67]:
def rag(query):
    """Answer ``query`` end to end: search, build the prompt, ask the LLM.

    Returns whatever ``llm`` returns for the prompt built from the hits
    that ``search`` found for ``query``.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [68]:
# End-to-end check: run the whole pipeline on a sample FAQ-style question.
rag("I just discovered the course. Can I still join it?")

' As a course teaching assistant, I\'d like to clarify that even though it seems most courses are not taking new students after they start and we recommend registering as soon as possible, there is flexibility in enrolling at the end of November for this particular course. If you decide to join nearer to or right until the final deadlines while focusing on completing two projects out of three required ones before submitting your assignments, it will still be eligible for a certificate upon completion as per our FAQ guidelines (http://mlzoomcamp.com/).\n\nTo access course materials and get started: Go to the course page at http://mlzoomcamp.com/, scroll down from where you landed on top of recent activities, start going through available material there including cohort-specific content which can be found in our folders for respective years under "Cohort". Watching videos is recommended and office hours provided by previous students could provide helpful insights as well. In case the cou

In [69]:
# `_` is IPython's "last cell output" variable; printing it renders the
# newlines of the answer string shown in the previous cell.
# NOTE(review): this only works when run directly after the rag(...) cell;
# assigning the answer to a named variable there would be more robust.
print(_)

 As a course teaching assistant, I'd like to clarify that even though it seems most courses are not taking new students after they start and we recommend registering as soon as possible, there is flexibility in enrolling at the end of November for this particular course. If you decide to join nearer to or right until the final deadlines while focusing on completing two projects out of three required ones before submitting your assignments, it will still be eligible for a certificate upon completion as per our FAQ guidelines (http://mlzoomcamp.com/).

To access course materials and get started: Go to the course page at http://mlzoomcamp.com/, scroll down from where you landed on top of recent activities, start going through available material there including cohort-specific content which can be found in our folders for respective years under "Cohort". Watching videos is recommended and office hours provided by previous students could provide helpful insights as well. In case the course 