## Install packages

In [None]:
# Install the notebook's dependencies with IPython shell escapes (`!`).
!pip install torch
# -U upgrades transformers if an older version is already installed.
!pip install -U transformers
!pip install pyngrok

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from flask import Flask, request, jsonify
from pyngrok import ngrok
from google.colab import userdata

Add environment variables and authenticate ngrok

In [None]:
# Read both access tokens through google.colab's `userdata` helper so that
# no credential is hard-coded in the notebook.
NGROK_TOKEN = userdata.get("NGROK_TOKEN")
HUGGING_FACE_TOKEN = userdata.get("HUGGING_FACE_TOKEN")

In [None]:
# Register the ngrok auth token with pyngrok before any tunnel is opened.
ngrok.set_auth_token(NGROK_TOKEN)

In [None]:
model = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model,token=HUGGING_FACE_TOKEN)

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    token=HUGGING_FACE_TOKEN
)

### Deploy with Flask

In [None]:
# Flask application object; routes are registered on it via @app.route.
app = Flask(__name__)

In [None]:
@app.route('/predict', methods=['POST'])
def predict():
    """Generate a model reply for the prompt in the POSTed JSON body.

    Expects a JSON object such as ``{"prompt": "..."}``; a missing
    ``prompt`` key is treated as an empty string.

    Returns:
        A JSON response ``{"response": "<generated text>"}``, or
        ``{"error": "..."}`` with HTTP status 400 when the request body is
        not a JSON object.
    """
    # silent=True yields None instead of raising on a missing/malformed JSON body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    # Coerce to str so non-string JSON values are still accepted as prompts.
    prompt = str(data.get("prompt", ""))

    messages = [
        {"role": "system", "content": "Give honest answer for questions asked"},
        {"role": "user", "content": prompt},
    ]

    # Let the tokenizer render the conversation with the model's own chat
    # template (plus the marker that cues the assistant's turn) rather than a
    # hand-built "role: content" string. return_dict=True also provides the
    # attention mask that generate() expects.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,   # Maximum number of tokens to generate
        num_return_sequences=1,
        do_sample=True,       # Sample instead of greedy decoding
        top_k=50,             # Sample only from the 50 most likely tokens
        top_p=0.95,           # Nucleus sampling threshold
        temperature=0.2,      # Low temperature keeps sampling close to greedy
    )

    # generate() returns prompt + continuation; drop the prompt tokens so only
    # the newly generated reply is decoded and returned.
    prompt_length = inputs["input_ids"].shape[-1]
    response_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    print(response_text)
    return jsonify({"response": response_text})

In [None]:
# Local port shared by the ngrok tunnel and the Flask server.
port = 5000

# Open the tunnel before starting Flask: app.run() blocks until the server stops.
public_url = ngrok.connect(port)
# ngrok.connect() returns a tunnel object; its `public_url` attribute holds the
# externally reachable address. The local address is built from `port` so the
# message stays correct if the port is changed.
print(f" * Ngrok tunnel \"{public_url.public_url}\" -> \"http://127.0.0.1:{port}\"")
app.run(host='0.0.0.0', port=port)