In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [1]:
import minsearch
import json
import requests 

In [2]:
# Fetch the FAQ dump: a JSON list of courses, each holding its own list of documents.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
# Fail fast on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into a single list, tagging every document
# with the name of the course it came from (used as the filter field in search).
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
# Peek at the first flattened document to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [None]:
# Set up the minsearch index: 'question', 'text' and 'section' are registered
# as text fields, 'course' as a keyword field.
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)

In [None]:
# Fit the index on the flattened FAQ documents so it can be queried via index.search.
index.fit(documents)

In [None]:
from openai import OpenAI
# Create the OpenAI API client used by llm().
# NOTE(review): no API key is passed here — presumably the library picks it up
# from the environment (e.g. OPENAI_API_KEY); confirm before running.
client = OpenAI()

In [17]:
def search(query):
    """Look up ``query`` in the minsearch index, restricted to one course.

    Args:
        query: Free-text question to search for.

    Returns:
        Whatever ``index.search`` returns for at most 5 results; the rest of
        the notebook iterates over it as documents with 'section', 'question'
        and 'text' keys.
    """
    # Boost the 'question' field with a factor of 3.0.
    boost = {
        'question': 3.0
    }

    results = index.search(
        query=query
        , boost_dict=boost
        , filter_dict={ 'course': 'data-engineering-zoomcamp' }
        , num_results=5
    )

    # Hand the hits back to the caller; without this the function returns None
    # and build_prompt fails when it tries to iterate over the results.
    return results
    

In [18]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.

    Raises:
        KeyError: If a document is missing one of the three keys.
    """
    prompt_template = """
    You are a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contain the answer, output NONE
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """

    # Separate entries with a blank line so one document's answer does not run
    # straight into the next document's 'section:' label.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [20]:
def llm(prompt):
    """Send ``prompt`` to the chat model as a single user message.

    Args:
        prompt: The complete prompt text.

    Returns:
        The message content of the first choice in the response.
    """
    # `completions` and `response` were misspelled, so every call failed
    # (AttributeError, then NameError) before reaching the return value.
    response = client.chat.completions.create(
        model="gpt-4o"
        , messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content
    

In [21]:
query = "how do I set up docker"

def rag(query):
    """Answer ``query`` by chaining retrieval, prompt building and generation.

    The documents come from ``search``, are turned into a prompt by
    ``build_prompt``, and the prompt is sent to ``llm``; its reply is returned.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))

In [None]:
# Run the full search -> prompt -> LLM pipeline on the sample query.
rag(query)

In [4]:
from elasticsearch import Elasticsearch

In [5]:
# Client for an Elasticsearch node addressed at localhost, port 9200.
es_client = Elasticsearch('http://localhost:9200')

In [9]:
index_name = "course-questions"

# Index definition: a single shard with no replicas; the three free-text
# fields are mapped as "text" and the course name as "keyword".
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

# NOTE(review): re-running this cell against a cluster that already holds this
# index will presumably fail — confirm, and delete the index first if needed.
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Send the documents to Elasticsearch one request per document, with a progress bar.
# NOTE(review): no explicit id is passed — presumably Elasticsearch generates one
# per call, so re-running this cell would store duplicates; confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:06<00:00, 141.99it/s]


In [12]:
# Sample question used to try out the Elasticsearch-backed search.
query = "how do I set up docker"

In [21]:
def elastic_search(query):
    """Query the Elasticsearch index for documents matching ``query``.

    The request asks for 5 hits, matches ``query`` across 'question'
    (written as ``question^3``), 'text' and 'section' using a
    ``best_fields`` multi_match, and filters on the
    'data-engineering-zoomcamp' course.

    Args:
        query: Free-text question to search for.

    Returns:
        A list with the ``_source`` of every hit, in response order.
    """
    match_clause = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields",
        }
    }
    course_filter = {"term": {"course": "data-engineering-zoomcamp"}}
    request_body = {
        "size": 5,
        "query": {"bool": {"must": match_clause, "filter": course_filter}},
    }

    response = es_client.search(index=index_name, body=request_body)

    # Keep only the stored documents; the rest of each hit is dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [22]:
# Show the documents Elasticsearch returns for the sample query.
elastic_search(query)

[{'text': 'This tutorial shows you how to set up the Chrome Remote Desktop service on a Debian Linux virtual machine (VM) instance on Compute Engine. Chrome Remote Desktop allows you to remotely access applications with a graphical user interface.\nTaxi Data - Yellow Taxi Trip Records downloading error, Error no or XML error webpage\nWhen you try to download the 2021 data from TLC website, you get this error:\nIf you click on the link, and ERROR 403: Forbidden on the terminal.\nWe have a backup, so use it instead: https://github.com/DataTalksClub/nyc-tlc-data\nSo the link should be https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-01.csv.gz\nNote: Make sure to unzip the “gz” file (no, the “unzip” command won’t work for this.)\n“gzip -d file.gz”g',
  'section': 'Module 1: Docker and Terraform',
  'question': 'Set up Chrome Remote Desktop for Linux on Compute Engine',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub 

In [23]:
def rag(query):
    """Answer ``query`` using Elasticsearch for the retrieval step.

    Documents are fetched with ``elastic_search``, formatted into a prompt by
    ``build_prompt``, and the prompt is passed to ``llm``; its reply is returned.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))