### Chat completion, prompt template, and LLM API call

In [1]:
import minsearch
import json
from dotenv import load_dotenv
from groq import Groq
import os

load_dotenv()

True

In [2]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 so that parsing does
# not depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course grouping into a single list, tagging each document
# with the course it came from (note: this adds the key to the dicts in
# docs_raw as well, since the same objects are reused).
documents=[]

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

# 'course' is registered as a keyword field; search() filters on it.
Index = minsearch.Index(
    text_fields = ['question','section','text'],
    keyword_fields = ['course']
)

Index.fit(documents)

<minsearch.Index at 0x7ff53ecb1ab0>

In [3]:
def search(query, course='mlops-zoomcamp', num_results=5):
    """Query the module-level ``Index`` for FAQ entries matching ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the 'course' keyword field must match. Defaults to
            'mlops-zoomcamp', the value that was previously hard-coded.
        num_results: Maximum number of hits to request. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        Whatever ``Index.search`` returns for these arguments; callers in this
        notebook iterate over it and read the 'section', 'question' and
        'text' keys of each hit.
    """
    # Per-field weights passed to the index as boost_dict.
    boost = {'question': 3, 'section' : 0.4}

    results = Index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )
    return results

In [4]:
def build_prompt(query, search_results):
    """Render the RAG prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of mappings, each providing the 'section',
            'question' and 'text' keys.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    template = """ 
You are an expert machine learning and mlops engineering helping a junior engineer as an assitant and guide. 
Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering. DO NOT USE OTHER CONTENT OTHER THAN GIVEN CONTEXT!
if the CONTEXT does not contain the answer, Output "Not FOUND in the context given" and explain your answer with reasons.

QUESTION: {question}

CONTEXT: {context}
""".strip()

    # One text block per hit; the blank line after each block separates entries.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    # The final strip() drops the separator left after the last entry.
    return template.format(question=query, context="".join(entries)).strip()

In [5]:
def llm_call(prompt):
    """Send ``prompt`` to Groq as a single user message and return the reply.

    The API key is read from the ``GROQ_API_KEY`` environment variable.

    Args:
        prompt: Full prompt text to send as the user message.

    Returns:
        The content of the first choice's message.
    """
    client = Groq(
        api_key=os.getenv("GROQ_API_KEY"),
    )
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    # Return the text itself. The previous `return print(...)` always returned
    # None, so rag() could never hand the answer back to its caller.
    return response.choices[0].message.content

In [6]:
Query = "How to use mlflow for experiment tracking?"
def rag(query):
    """Answer ``query`` by retrieving FAQ entries and prompting the LLM.

    Chains search() -> build_prompt() -> llm_call() and returns whatever
    llm_call() returns.
    """
    hits = search(query)
    return llm_call(build_prompt(query, hits))

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [8]:
# Tokenizer and model weights are loaded from the same checkpoint name.
CHECKPOINT = "google/flan-t5-xl"

tokenizer = T5Tokenizer.from_pretrained(CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(CHECKPOINT, device_map="auto")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Smoke-test the loaded model on a short translation prompt.
input_text = "translate English to German: How old are you?"
# Move the ids to the device the model actually lives on. The model is loaded
# with device_map="auto", so assuming "cuda" fails on machines without a GPU.
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
outputs = model.generate(input_ids)
# decode() is called without skip_special_tokens, so special-token markers
# stay in `result`.
result = tokenizer.decode(outputs[0])



In [10]:
# Display the decoded text; it was decoded without skip_special_tokens, so the
# <pad> and </s> markers are part of the string.
result

'<pad> Wie alt sind Sie?</s>'

In [14]:
def llm_call(prompt):
    """Generate a reply to ``prompt`` with the local T5 model (naive version).

    Uses the module-level ``tokenizer`` and ``model`` with default generation
    settings. The prompt is not truncated or split.

    Args:
        prompt: Full prompt text to feed to the model.

    Returns:
        The decoded first output sequence, special tokens included.
    """
    # Author's note: with a full RAG prompt this call is going to produce the
    # tokenizer's max-length error.
    # Follow the model's own device rather than assuming "cuda" is available.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids)
    result = tokenizer.decode(outputs[0])
    return result

In [20]:
Query = "How to register for the course, I started late?"
def rag(query):
    """Run retrieval-augmented generation for ``query``.

    Looks up matching FAQ entries with search(), renders them into a prompt
    with build_prompt(), and returns the result of llm_call() on that prompt.
    """
    retrieved = search(query)
    rendered_prompt = build_prompt(query, retrieved)
    return llm_call(rendered_prompt)

#### Maximum sequence length: example error message (solution hint: sliding window)
- Token indices sequence length is longer than the specified maximum sequence length for this model (1472 > 512). Running this sequence through the model will result in indexing errors

In [17]:
def llm_call(prompt, max_length=512, min_length=100, temperature=0.7, top_k=50, top_p=0.95, num_beams=5):
    """Generate a reply to ``prompt`` with the local T5 model.

    Uses the module-level ``tokenizer`` and ``model``. Sampling is enabled
    together with beam search, and repeated 2-grams are disallowed.

    Args:
        prompt: Full prompt text to feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        top_p: Nucleus (top-p) sampling cutoff forwarded to ``model.generate``.
        num_beams: Number of beams forwarded to ``model.generate``.

    Returns:
        The decoded first output sequence with special tokens removed.
    """
    # Follow the model's own device: it is loaded with device_map="auto", so
    # hard-coding "cuda" breaks on hosts without a GPU.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        no_repeat_ngram_size=2,
        do_sample=True,  # Set to True to enable sampling and allow for varied responses
        num_beams=num_beams  # Number of beams for beam search
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result


In [22]:
# Run the retrieval + generation pipeline on the question stored in `Query`.
rag(Query)

"You don't need to register, registration is not mandatory. It is for gauging the level of interest and collecting data for analytics. You can also just start learning and submitting homework without registering while a cohort is “live”. Registration is just to gauge interest before the start date. section: General course questions question: Course - How do I start? answer: See what things are where by reading pins and bookmarks on the course-channel reading the repo (bookmarked in channel) and watching the video lessons (playlist bookmarked"