In [1]:
# Delete any previously downloaded copy, then fetch minsearch.py from the
# `main` branch of the minsearch GitHub repository into the working directory
# so that `import minsearch` below can find it.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-05 15:32:22--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-05 15:32:23 (27.7 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
import requests 
import minsearch

# Download the FAQ documents (grouped by course) as JSON.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait and fail loudly on a non-2xx reply, instead of hanging or
# hitting a less obvious error in .json() or in the loop below.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into a single list, tagging every FAQ entry
# with the name of the course it belongs to so it can be filtered on later.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# "question", "text" and "section" are searched as text; "course" is a keyword
# field used for filtering (see filter_dict in search()).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7ff0b66b0280>

In [3]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for ``query`` in the module-level ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must match. Defaults to
            'data-engineering-zoomcamp', the course that used to be hard-coded.
        num_results: Passed through to ``index.search`` as ``num_results``.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        Whatever ``index.search`` returns. ``build_prompt`` iterates over it
        and reads the 'section', 'question' and 'text' keys of each item.
    """
    # Matches in the question are weighted up and matches in the section
    # title are weighted down; 'text' is not listed, so it presumably keeps
    # the index's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

In [4]:
def build_prompt(query, search_results):
    """Render the RAG prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.

    Raises:
        KeyError: If an entry lacks one of the three keys above.
    """
    template_lines = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT: ",  # the trailing space is part of the template
        "{context}",
    )
    template = "\n".join(template_lines)

    # One "section / question / answer" paragraph per hit, each followed by a
    # blank line; the final strip() drops the blank line after the last one.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [5]:
from openai import OpenAI

In [6]:
# This is the one change from the OpenAI-hosted setup: the same OpenAI client
# class is used, but it is pointed at an Ollama server on localhost.
# The previous setup was simply `client = OpenAI()`.
# NOTE(review): 'ollama' is presumably a placeholder key, not a real secret.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [7]:
# Only the model name differs from the OpenAI-hosted version, because the
# request still goes through the OpenAI client.
def llm(prompt, model='phi3'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text; sent as the only message, with role "user".
        model: Name of the model to request from the server behind the
            module-level ``client``. Defaults to 'phi3', the value that used
            to be hard-coded.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [8]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Runs the three stages in order: ``search`` retrieves FAQ entries,
    ``build_prompt`` turns them into a prompt, and ``llm`` produces the
    answer, which is returned unchanged.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [13]:
# Smoke test: send a bare prompt straight to the model, without retrieval.
llm('write that this is a test')

" This is a test.\n\n\nThis sentence was composed to fulfill the given instruction, which simply requests a statement indicating it'ths a test situation without any additional or unnecessary information."

In [14]:
# `_` is IPython's "previous output" variable, so this prints whatever the
# most recently executed cell returned. It is sensitive to execution order and
# does not survive a reordered or partial re-run.
print(_)

 This is a test.


This sentence was composed to fulfill the given instruction, which simply requests a statement indicating it'ths a test situation without any additional or unnecessary information.


In [10]:
# Sample user question passed to rag(query) in the cells that follow.
query = 'I just discovered the course. can i still enroll?'

In [11]:
# Full pipeline: retrieve FAQ entries, build the prompt, ask the model.
rag(query)

' Based on the information provided in the FAQ database, yes, you are still eligible to enroll and submit homeworks after the course start date as long as there is no formal registration deadline mentioned for late registrations within this specific context. However, make sure not to leave everything until last minute due to upcoming final project submission deadlines which will be communicated in time.'

In [12]:
# `_` is IPython's "previous output" variable, so this prints whatever the
# most recently executed cell returned. It is sensitive to execution order and
# does not survive a reordered or partial re-run.
print(_)

 Based on the information provided in the FAQ database, yes, you are still eligible to enroll and submit homeworks after the course start date as long as there is no formal registration deadline mentioned for late registrations within this specific context. However, make sure not to leave everything until last minute due to upcoming final project submission deadlines which will be communicated in time.


In [15]:
# Same bare-prompt smoke test, executed again with Ollama running in Docker.
llm('write that this is a test')

' This message serves as a placeholder text to indicate where you would typically insert an automated testing script.\n\n```python\n# Simulating the inclusion of a unit test in Python using pytest framework\n\nimport pytest\n\ndef add(a, b):\n    return a + b\n\n@pytest.mark.parametrize("input_value1", [0, 5]) # Test with different first numbers\n@pytestenerate multiple inputs for testing the second number as well: [-2, 3] @pytest.mark.generate(second_nums) def test_add():     assert add(input_value1, -2) == input_value1-2 # Test if addition is correct\nassert add(input_value1,  3) == input_value1+3    # Ensure the function works for positive second number as well. \n```'

In [16]:
# Same RAG question, executed again with Ollama running in Docker.
rag(query)

" As per our FAQ database, I'm sorry but there is no specific information available regarding late registration for this course that started on January 15th, 2024. However, it seems students are still able to submit homeworks and potentially follow along with materials post-course completion or continue onto the next cohort if applicable as per our policies discussed in Section A of General Course Questions. Please confirm directly from course providers for exact late registration policy details since this information might vary depending on specific circumstances, instructor's discretion etc.\n"