In [7]:
import os
from openai import OpenAI
import minsearch
import json
from tqdm.auto import tqdm
import tiktoken

In [7]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's default text encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    documents_raw = json.load(f_in)

# Flatten the per-course grouping into one flat list, tagging every document
# with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Writes the key onto the loaded dict itself, so `documents_raw`
        # entries carry the 'course' key as well.
        doc['course'] = course_name
        documents.append(doc)


# Using Elasticsearch

In [8]:
from elasticsearch import Elasticsearch


# Client for the Elasticsearch node listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

# One shard / no replicas. The free-text fields are analysed as "text";
# "course" is a "keyword" so it can be matched exactly by a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Start from a clean index so this cell can be re-run: `indices.create`
# raises when the index already exists, and documents are indexed without
# explicit ids, so indexing into a leftover index would add duplicates.
# `ignore_unavailable=True` makes the delete a no-op on the first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [24]:
# Hosted OpenAI client, kept for reference but disabled:
# client = OpenAI(
#     api_key=os.environ.get("OPEN_API_KEY"),
# )
# NOTE(review): the variable read above is "OPEN_API_KEY"; the OpenAI SDK's
# conventional name is presumably "OPENAI_API_KEY" — confirm before re-enabling.


# Active client: points at the OpenAI-compatible endpoint on localhost:11434
# (Ollama). The api_key is presumably a placeholder, since the value is the
# literal string 'ollama' — confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

def search(query):
    """Query the module-level `index` for documents matching `query`.

    The search is restricted to the 'data-engineering-zoomcamp' course,
    weights the 'question' field at 3.0 and the 'text' field at 0.5, and
    asks for 11 results. Whatever `index.search` returns is passed through.

    NOTE(review): `index` is not defined anywhere in the visible notebook;
    presumably it was built in a cell that is no longer present — confirm
    before calling this function.
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'text': 0.5},
        num_results=11,
    )

def get_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved FAQ entries.

    Each entry in `search_results` must provide the keys 'section',
    'question' and 'text'; they are rendered as a three-line record followed
    by a blank line. The records are concatenated, stripped, and substituted
    into the template together with the question.

    Note that only the outer ends of the template are stripped: its interior
    lines keep the indentation they have in the source.
    """
    # One "section / question / answer" record per retrieved document.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ table.
    Answer the QUESTION only using facts from the CONTEXT. 
    If you cannot answer the QUESTION using the CONTEXT, return NONE.
    
    QUESTION: {question}
    
    CONTEXT: {context}
    """.strip()

    filled = prompt_template.format(question=query, context=context.strip())
    return filled.strip()


def llm(prompt, model="gpt-4o"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message (role "user").
        model: Name of the chat model to request. Defaults to "gpt-4o",
            which is what this function always used before the parameter
            existed, so existing `llm(prompt)` calls behave the same.

    Returns:
        The `content` of the first choice in the completion response.

    The request goes through the module-level `client`. In this notebook
    that client is created with a localhost:11434 base URL, so the default
    model is only available if that endpoint serves it.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content




def llm_ollama(prompt):
    """Ask the "phi3" model to answer `prompt` and return the reply text.

    The prompt is sent as a single user message through the module-level
    `client`; the content of the first returned choice is the result.
    A later homework cell defines another function with this same name.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model="phi3", messages=chat_messages)
    first_choice = completion.choices[0]
    return first_choice.message.content



In [27]:

def elastic_search(query):
    """Retrieve FAQ documents for `query` from Elasticsearch.

    Runs a `best_fields` multi-match over 'question' (boosted ^3), 'text'
    and 'section', filtered to the 'data-engineering-zoomcamp' course, with
    a requested size of 5. The search goes through the module-level
    `es_client` against `index_name`.

    Returns:
        A list with the `_source` of every hit, in the order returned.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    course_filter = {"term": {"course": "data-engineering-zoomcamp"}}

    search_query = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}}
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]


def rag(query):
    """Answer `query` with retrieval-augmented generation via `llm`.

    Retrieves documents with `elastic_search`, turns them into a prompt with
    `get_prompt`, and returns whatever `llm` produces for that prompt.
    """
    retrieved_docs = elastic_search(query)
    return llm(get_prompt(query, retrieved_docs))



def rag_ollama(query):
    """Answer `query` with retrieval-augmented generation via `llm_ollama`.

    Same pipeline as `rag` (search, then prompt construction), but the final
    prompt is handed to `llm_ollama` instead of `llm`.
    """
    retrieved_docs = elastic_search(query)
    return llm_ollama(get_prompt(query, retrieved_docs))

In [28]:
# Sample question used to exercise the retrieval + generation pipeline.
query = 'I just disovered the course. Can I still join it?'


# Answer with the Ollama-backed pipeline; `rag(query)` is the variant that
# goes through `llm` instead.
output = rag_ollama(query)
print(output)

 Yes, you are still eligible to submit the course assignments even after the official start date of the Course - General Data Engineering Bootcamp has passed your way by email as there's no strict registration deadline mentioned in the provided context for receiving confirmation emails post-start date; however, be mindful that final projects have submission deadlines.


# Homework

In [13]:
# Homework client: same localhost:11434 endpoint and api_key value as the
# client created earlier in the notebook; rebinding it here lets this
# section run without that earlier cell.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama'
)


def llm_ollama(prompt):
    """Ask the "gemma:2b" model to answer `prompt` and return the reply text.

    The prompt is sent as a single user message through the module-level
    `client` with `temperature=0.0`. This definition replaces the earlier
    `llm_ollama` in the notebook once the cell has run.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gemma:2b",
        messages=chat_messages,
        temperature=0.0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Homework question. The reply is kept in `output` because later cells print
# it and count its tokens.
prompt = "What's the formula for energy?"

output = llm_ollama(prompt)

In [14]:
# Show the model reply stored in `output` by the previous cell.
print(output)

Sure, here's the formula for energy:

**E = K + U**

Where:

* **E** is the energy in joules (J)
* **K** is the kinetic energy in joules (J)
* **U** is the potential energy in joules (J)

**Kinetic energy (K)** is the energy an object possesses when it moves or is in motion. It is calculated as half the product of an object's mass (m) and its velocity (v) squared:

**K = 1/2mv^2**

**Potential energy (U)** is the energy an object possesses due to its position or configuration. It is calculated as the product of an object's mass, gravitational constant (g), and height or position above a reference point.

**U = mgh**

Where:

* **m** is the mass in kilograms (kg)
* **g** is the gravitational constant (9.8 m/s^2)
* **h** is the height or position in meters (m)

The formula shows that energy can be expressed as the sum of kinetic and potential energy. The kinetic energy is a measure of the object's ability to do work, while the potential energy is a measure of the object's ability to do w

In [15]:
from transformers import AutoTokenizer

from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub using the token in the HF_TOKEN
# environment variable (a missing variable raises KeyError here).
login(os.environ['HF_TOKEN'])

# Load the tokenizer published as "google/gemma-2b".
# NOTE(review): the login above is presumably required because this
# repository is access-gated — confirm.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

# Tokenize the model reply from the earlier cell and display the token count.
# NOTE(review): `tokenize` presumably returns token strings rather than
# integer ids, so the name `output_ids` may be a misnomer — confirm.
output_ids = tokenizer.tokenize(output)
len(output_ids)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/talldarkandhandsome/.cache/huggingface/token
Login successful


280