In [1]:
import groq

In [3]:
import elasticsearch

In [4]:
from elasticsearch import Elasticsearch

# Open a client against the Elasticsearch node on localhost and display the
# cluster info as a quick connectivity check.
ES_URL = "http://localhost:9200"
es = Elasticsearch(ES_URL)
es.info()

ObjectApiResponse({'name': 'ca147a9c8921', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'O18eJGitR2yFpSEFNRCu2g', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-11-26 09:04:46--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-11-26 09:04:46--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-11-26 09:04:46 (82.7 MB/s) - ‘documents.json’ saved [658332/658332]



In [6]:
# Peek at the first lines of the download to see how the JSON is laid out.
!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {


In [7]:
import json

# Pin the encoding so that parsing does not depend on the platform's default
# locale (JSON files are UTF-8).
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flatten the per-course groups into one list of FAQ entries, tagging every
# entry with the name of the course it belongs to. Each entry is copied, so
# the parsed `documents_file` structure is not mutated.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_file
    for doc in course['documents']
]

In [9]:
# Index layout: `text`, `section` and `question` are analysed full-text
# fields; `course` is a keyword so it can be matched exactly in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run. Without this, re-running the
# notebook fails here because the index already exists, and the indexing cell
# would add every document a second time.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
from tqdm.auto import tqdm

# Send the FAQ entries to Elasticsearch one request at a time, letting tqdm
# draw a progress bar over the whole list.
progress = tqdm(documents)
for doc in progress:
    es.index(document=doc, index=index_name)

  0%|          | 0/948 [00:00<?, ?it/s]

In [20]:
def retrieve_documents(query, index_name="course-questions", max_results=5, course_name="data-engineering-zoomcamp"):
    """Search the FAQ index and return the stored documents of the best hits.

    Args:
        query: Free-text question to match against the indexed entries.
        index_name: Elasticsearch index to search.
        max_results: Upper bound on the number of hits requested.
        course_name: Only entries whose `course` field equals this value
            are considered.

    Returns:
        A list with the `_source` dict of each hit, in the order
        Elasticsearch returned them.
    """
    # Full-text part: the question field is boosted 3x over text and section.
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    # Exact-match restriction to a single course.
    course_filter = {"term": {"course": course_name}}

    request_body = {
        "size": max_results,
        "query": {"bool": {"must": text_match, "filter": course_filter}}
    }

    # `es` is the module-level Elasticsearch client.
    result = es.search(index=index_name, body=request_body)

    sources = []
    for hit in result['hits']['hits']:
        sources.append(hit['_source'])
    return sources

In [17]:
# Build the request in this cell: `search_query` is otherwise only a local
# variable inside retrieve_documents(), so the cell would raise NameError on a
# fresh kernel. The body mirrors the one retrieve_documents() sends with its
# default arguments.
# NOTE(review): the question text is taken from the `user_question` value
# displayed later in the notebook - confirm it is the query that produced the
# scores recorded below.
sample_question = 'How do I join the course after it has started?'

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": sample_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

response = es.search(index=index_name, body=search_query)

# Unlike retrieve_documents(), keep the relevance score of every hit visible.
for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Score: {hit['_score']}")
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Score: 53.20021
Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Score: 47.001244
Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Score: 35.42595
Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Score: 33.921974
Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Score: 27.163462
Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [21]:
import os

# Read the Groq key from the environment instead of embedding it in the
# notebook. The literal key that used to be here has been exposed in this
# file and should be revoked.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this notebook.")

In [22]:
from groq import Groq
# Groq client shared by every chat-completion call below.
client = Groq(api_key=api_key)

In [24]:
# Baseline: send the question to the model on its own, without any FAQ
# context in the message.
baseline_messages = [
    {"role": "user", "content": "The course already started. Can I still join?"}
]
response = client.chat.completions.create(
    messages=baseline_messages,
    model="llama3-8b-8192",
)
print(response.choices[0].message.content)

While it's natural to feel hesitant about joining a course that's already underway, it's not uncommon for students to join classes mid-semester or mid-year. Whether you can still join depends on various factors, such as:

1. **Class capacity**: If the course has limited seats and is already full, it might be challenging to accommodate you. In this case, the instructor or course administrator might not be able to add you to the class.
2. **Course type and structure**: If the course is designed to be taken from start to finish, joining late might put you at a disadvantage or make it difficult to fully understand the material.
3. **Instructor's policy**: Some instructors might be open to accepting new students mid-semester, while others might have a strict policy against it.
4. **You and your goals**: Even if you can't join the course, you might still be able to pursue the subject matter or skills you're interested in outside of the class, such as through online resources, readings, or me

In [27]:
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# Bind the question in this cell: no earlier cell assigns `user_question`, so
# on a fresh kernel the retrieval call below would raise NameError.
user_question = 'How do I join the course after it has started?'

context_docs = retrieve_documents(user_question)

# Render every retrieved entry with the template and separate the entries with
# a blank line. Keys the template does not mention (e.g. `course`) are ignored
# by str.format.
context = "\n\n".join(
    context_template.format(**doc) for doc in context_docs
).strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [26]:
# Echo the question that the retrieval and prompt cells operate on.
user_question

'How do I join the course after it has started?'

In [28]:
# Single-shot RAG prompt: the model is told to answer only from the retrieved
# FAQ entries in `context` and to reply "NONE" when they do not cover the
# question. (Fixes the "contan" typo in the instruction text.)
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contain the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [29]:
# print() rather than a bare expression so the newlines render as line breaks.
print(prompt)

You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: How do I join the course after it has started?

CONTEXT:

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final 

In [31]:
# Send the context-augmented prompt as a single user message and keep only the
# text of the first choice.
rag_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(
    messages=rag_messages,
    model="llama3-groq-8b-8192-tool-use-preview",
)
answer = response.choices[0].message.content
answer

"Based on the context, you can join the course after it has started. You'll still be eligible to submit homeworks, but be aware of the deadlines for final projects."

In [32]:
# How one retrieved FAQ entry is rendered inside the prompt.
context_template = (
    "Section: {section}\n"
    "Question: {question}\n"
    "Answer: {text}"
)

# Full prompt skeleton; {user_question} and {context} are filled in by
# build_prompt(). The third line deliberately keeps its two trailing spaces so
# the text is identical to the original template.
prompt_template = (
    "You're a course teaching assistant.\n"
    "Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.\n"
    "Don't use other information outside of the provided CONTEXT.  \n"
    "\n"
    "QUESTION: {user_question}\n"
    "\n"
    "CONTEXT:\n"
    "\n"
    "{context}"
)


def build_context(documents):
    """Render the retrieved FAQ entries as one text block for the prompt.

    Args:
        documents: Iterable of dicts providing at least the `section`,
            `question` and `text` keys used by `context_template`.

    Returns:
        The rendered entries separated by blank lines, with surrounding
        whitespace stripped; an empty string when there are no documents.
    """
    rendered = (context_template.format(**doc) for doc in documents)
    return "\n\n".join(rendered).strip()


def build_prompt(user_question, documents):
    """Fill `prompt_template` with the question and the rendered documents.

    Args:
        user_question: The question as asked by the user.
        documents: Retrieved FAQ entries, passed on to build_context().

    Returns:
        The complete prompt string.
    """
    return prompt_template.format(
        context=build_context(documents),
        user_question=user_question,
    )

def ask_groq(prompt, model="llama3-groq-70b-8192-tool-use-preview"):
    """Send `prompt` as a single user message and return the model's reply.

    Args:
        prompt: Full prompt text.
        model: Model identifier passed to the chat-completions call.

    Returns:
        The message content of the first choice in the response.
    """
    user_turn = {"role": "user", "content": prompt}
    # `client` is the module-level Groq client.
    completion = client.chat.completions.create(messages=[user_turn], model=model)
    return completion.choices[0].message.content

# Pass model="llama3-8b-8192" to ask_groq to use the Llama 3 8B model instead of the 70B default.

def qa_bot(user_question):
    """Answer a question end to end: retrieve, build the prompt, ask the model.

    Args:
        user_question: The question as asked by the user.

    Returns:
        The answer text returned by ask_groq() with its default model.
    """
    retrieved = retrieve_documents(user_question)
    return ask_groq(build_prompt(user_question, retrieved))

In [33]:
# End-to-end check: retrieve FAQ entries, build the prompt, and ask the model.
qa_bot("Is there homework?")

'Yes, there is homework in the course. You can find the homework URL in the homework link, which is typically your repository on platforms like GitHub, GitLab, or Bitbucket.'