In [1]:
# !pip install groq elasticsearch
# !pip install pandas

In [4]:
import minsearch
import json
import os
from groq import Groq
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from dotenv import load_dotenv

In [5]:
# Load settings through python-dotenv before anything reads the environment.
load_dotenv()

# Point HF_HOME at a cache directory relative to the working directory.
os.environ['HF_HOME'] = 'run/cache/'

# Groq credential taken from the environment; None when the variable is unset.
api_key = os.environ.get('GROQ_API_KEY')

'<REDACTED: an API key was exposed in this saved cell output; revoke and rotate it>'

In [28]:
# Read the FAQ dump. The encoding is pinned to UTF-8 because the FAQ text
# contains non-ASCII punctuation (curly quotes); relying on the platform
# default encoding can fail or garble text on systems where it is not UTF-8.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course groups into a single list, tagging every FAQ entry
# with the course it belongs to so it can be filtered on later.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [29]:
# In-memory minsearch index over the flattened FAQ entries: three free-text
# fields plus the course name as a keyword field.
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])

# Left as the last expression so the cell displays what fit() returns.
index.fit(documents)

<minsearch.Index at 0x76a0775d3aa0>

In [9]:
# Groq API client used by llm(); it is given the key read from GROQ_API_KEY.
client = Groq(api_key=api_key)

In [10]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: Question text placed in the QUESTION slot of the template.
        search_results: Iterable of dicts, each providing the 'section',
            'question' and 'text' keys.

    Returns:
        The filled-in template as a string, with leading and trailing
        whitespace stripped.
    """
    template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
    Only use the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contain answer, output NONE. 
    Do not Quote the CONTEXT in the answer.

    QUESTION: {question}
    CONTEXT: {context}
    """.strip()

    # One text chunk per retrieved entry, each terminated by a blank line.
    entries = [
        f"section:{hit['section']}\n question:{hit['question']}\n text:{hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [11]:
def llm(prompt):
    """Send a prompt to the Groq chat-completions API and return the reply text.

    The prompt is sent as the single user message. The function depends only
    on its argument and the module-level ``client``; it does not read a
    ``query`` variable from the notebook namespace, so it works on a fresh
    kernel and always answers the question embedded in ``prompt``.

    Args:
        prompt: Complete prompt text (instructions, question and context).

    Returns:
        The message content of the first choice in the API response.
    """
    response = client.chat.completions.create(
        # The prompt already contains the question, so it is the only message.
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        # The language model which will generate the completion.
        model="llama-3.3-70b-versatile",
        # Controls randomness: lower values give less random completions.
        temperature=0.5,
        # Upper bound on the number of tokens generated for the reply.
        max_tokens=1024,
        # Nucleus-sampling threshold.
        top_p=1,
        # No custom stop sequence.
        stop=None,
        # Request the whole completion in one response, not as partial deltas.
        stream=False,
    )

    # Return the text of the completion produced by the model.
    return response.choices[0].message.content

In [14]:
# Client for the Elasticsearch node at 127.0.0.1:9200. info() is left as the
# last expression so the cell displays the server's response.
es_client = Elasticsearch('http://127.0.0.1:9200')
es_client.info()

ObjectApiResponse({'name': '9f40de865906', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'sGX3ZP1xSfu5IU60NmvODQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [15]:
# Index layout: a single shard without replicas. The FAQ text fields are
# mapped as full-text fields; `course` is a keyword so it can be used in
# exact-match term filters.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-faqs"

# Drop any index left over from a previous run so this cell can be re-executed:
# create() fails when the index already exists, and starting from an empty
# index keeps the indexing step from piling up duplicate documents.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-faqs'})

In [16]:
# Display the first flattened record to check the fields that will be indexed.
documents[0]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [17]:
# Send every FAQ record to the index, one index() call per document, with a
# tqdm progress bar.
# NOTE(review): no id is passed to index(), so re-running this cell presumably
# adds the documents again as duplicates — confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [18]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Run a course-filtered full-text search against the FAQ index.

    Args:
        query: Free-text question matched against the question, text and
            section fields; the question field carries a ^3 boost.
        course: Exact value the ``course`` keyword field must have. Defaults
            to the course that used to be hard-coded.
        size: Maximum number of hits requested. Defaults to 5.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        response lists them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [19]:
# Sample question used by the cells below.
query = "The course has already started, can I still enroll?"

def rag(query):
    """Answer a question with the retrieve-then-generate pipeline.

    The question is searched with elastic_search(), the hits are turned into
    a prompt by build_prompt(), and the prompt is passed to llm().

    Args:
        query: The question to answer.

    Returns:
        Whatever llm() returns for the built prompt.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [20]:
# Run the full pipeline on the sample question; the cell displays the answer.
rag(query)

"Yes, you're still eligible to submit the homework, but be aware of the deadlines for turning in the final projects."

In [21]:
# Same request body that elastic_search() builds, issued directly for the
# current `query` so the relevance scores of the hits can be inspected.
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields",
                }
            },
            "filter": {"term": {"course": "data-engineering-zoomcamp"}},
        }
    },
}

response = es_client.search(index=index_name, body=search_query)

# Despite its name, results_docs holds the `_score` of each hit here, not the
# source documents.
results_docs = [hit['_score'] for hit in response['hits']['hits']]



In [22]:
# Display the values collected from the `_score` field of each hit.
results_docs

[48.763668, 36.514423, 36.514423, 34.183365, 27.925764]