In [42]:
import numpy as np
import pandas as pd



In [43]:
import matplotlib.pyplot as plt
import seaborn as sns

## Load JSON File

In [4]:
# download the json data here: but we already have it in the same directory as this notebook
# wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

In [44]:
import json

with open('./documents.json', 'rt') as f_in:
    documents_file = json.load(f_in)

documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [45]:
!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {


In [46]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

## R - RETRIEVAL

R - "retrieval", is one of the components of RAG. For retrieval, we need a search system. In our example, we will use `elasticsearch` for searching.

In [47]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

# You should see the same response as earlier with curl.
es.info()

ObjectApiResponse({'name': '7981acfa981a', 'cluster_name': 'docker-cluster', 'cluster_uuid': '0OYZGaylT26xAwW9Fv50AQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Indexing Document

Using the keys from each dictionaries in documents objects. 

Before we can index the documents (one time document), we need to create an index (an index in `elasticsearch` is like a table in a "usual" databases):

In [11]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} # keyword is to restrict our search to the course we want ot retrieve from.
        }
    }
}

index_name = "course-questions"

response = es.indices.create(index=index_name, body=index_settings)

response

### Import all the document into `elasticsearch`

In [48]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

Consider the example below to understand how `elasticsearch` works. From the user query example below, elasticsearch will look for the key words: join, course, after and started and then return documents related to this words:

In [49]:
user_query = "How do I join the course after it has started?"

The limitation is when we have the word 'program' in place of 'course', it won't be able to identify the difference between program and course. 

We can turn this query into a vector and then use `Word2Vec` to add semantic to it.

For now, I will query `elasticsearch`:

This query:

* Retrieves top 5 matching documents by default.
* Searches in the "question", "text", "section" fields, prioritizing "question" using multi_match query with type best_fields (see here for more information)
* Matches user query "How do I join the course after it has started?".
* Shows results only for the "data-engineering-zoomcamp" course.

Let's see the output:

In [50]:
es = Elasticsearch("http://localhost:9200")
    
def retrieve_documents(query, index_name="course-questions", max_results=5):
    
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    
    response = es.search(index=index_name, body=search_query)
    
    documents = [hit['_source'] for hit in response['hits']['hits']]
    
    return documents

In [51]:
response = retrieve_documents(user_query)

for doc in response:
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...



## G - GENERATION

Now that we have retrieved ourput based on query from database, it is the time generated a prompt from this retrieved result and put it into **LLM**.

### Testing Google Gemini

In [52]:
import textwrap

import google.generativeai as genai

import os

from IPython.display import display
from IPython.display import Markdown

In [53]:
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [54]:
GOOGLE_API_KEY = os.getenv("GOOGLE_AI_API_KEY")

genai.configure(api_key=GOOGLE_API_KEY)

What version of the model is available:

In [55]:
# print the available model

for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest


TypeError: Model.__init__() got an unexpected keyword argument 'max_temperature'

In [56]:
# using the text model
model = genai.GenerativeModel('gemini-pro')

In [57]:
%%time
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 37.7 ms, sys: 7.39 ms, total: 45.1 ms
Wall time: 7.24 s


> The meaning of life is a deeply personal and subjective question that has been pondered by philosophers, theologians, and individuals throughout history. There is no one definitive answer, and different people may find meaning in different aspects of their lives. Some common themes that have been suggested include:
> 
> * **Purpose:** Many people find meaning in a sense of purpose or calling. This could involve a career, a hobby, or a personal goal that they feel passionate about.
> * **Relationships:** Strong relationships with family, friends, and loved ones can provide a sense of meaning and fulfillment.
> * **Experiences:** Some people find meaning in simply living life to the fullest and embracing new experiences. This could involve travel, adventure, or trying new things.
> * **Contribution:** Making a positive impact on the world can give people a sense of meaning. This could involve volunteering, helping others, or creating something meaningful.
> * **Growth and learning:** Personal growth and learning can provide a sense of meaning as people develop their skills and knowledge.
> * **Values:** Living in accordance with one's values can provide a sense of meaning and direction. This could involve being kind, honest, or compassionate.
> * **Spirituality:** For some people, spirituality provides a sense of meaning and purpose. This could involve following a particular religion, practicing meditation, or connecting with nature.
> 
> Ultimately, the meaning of life is what each individual makes it. It is a personal journey of discovery and reflection that can lead to a fulfilling and meaningful existence.

**Retrieve Documents from `elasticsearch`**

In [63]:
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

context_docs = retrieve_documents(user_query)

context_result = ""

for doc in context_docs:
    doc_str = context_template.format(**doc)
    context_result += ("\n\n" + doc_str)

context = context_result.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: G

### Building the Prompt

* first we put all the documents together in a string.

In [64]:
prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

prompt_template.format(
        user_question=user_query,
        context=context # context from the previous result
    )
    

"You're a course teaching assistant.\nAnswer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.\nDon't use other information outside of the provided CONTEXT.  \n\nQUESTION: How do I join the course after it has started?\n\nCONTEXT:\n\nSection: General course-related questions\nQuestion: Course - Can I still join the course after the start date?\nAnswer: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nSection: General course-related questions\nQuestion: Course - Can I still join the course after the start date?\nAnswer: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nSection: General course-related questions\nQuestion: Course - 

**Puting all together as a function**

In [None]:
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()


def build_context(documents):
    context_result = ""
    
    for doc in documents:
        doc_str = context_template.format(**doc)
        context_result += ("\n\n" + doc_str)
    
    return context_result.strip()


def build_prompt(user_question, documents):
    context = build_context(documents)
    prompt = prompt_template.format(
        user_question=user_question,
        context=context
    )
    return prompt

def ask_gemini(prompt, model="gemini-pro"):
    model = genai.GenerativeModel(model)
    response = model.generate_content(prompt)
    answer = to_markdown(response.text)
    return answer

def qa_bot(user_question):
    context_docs = retrieve_documents(user_question)
    prompt = build_prompt(user_question, context_docs)
    answer = ask_openai(prompt)
    return answer

In [61]:
context_template

'Section: {section}\nQuestion: {question}\nAnswer: {text}'