In [None]:
!pip install -q transformers accelerate bitsandbytes sentencepiece torch requests geocoder
!pip install -q --upgrade gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m121.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from __future__ import annotations

import os
import threading
import warnings
from typing import List, Tuple, Optional

import geocoder
import gradio as gr
import requests
import torch
from google.colab import userdata
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import TextIteratorStreamer

Complete Gradio chat app for Microsoft Phi-3 Mini (Instruct) with 8-bit quantization.
- Uses bitsandbytes 8-bit loading when available.
- Falls back to FP16 on CUDA or FP32 on CPU with a clear warning.
- Applies the chat template with a system prompt: "You are an AI Farming Advisor."

Install (pick the cuda variant that matches your system):
    pip install torch --index-url https://download.pytorch.org/whl/cu121  # or cpu
    pip install transformers accelerate bitsandbytes gradio sentencepiece
    # On Windows, prefer a CUDA-specific bitsandbytes build if needed, e.g.:
    # pip install bitsandbytes-cuda121  # or cuda118/cuda117 depending on your drivers

Run:
    python phi3_mini_gradio_8bit.py
Then open the printed local URL in your browser.

In [None]:
# Checkpoint to load; set the PHI3_MODEL_ID environment variable to override it.
MODEL_ID = os.getenv("PHI3_MODEL_ID", "microsoft/Phi-3-mini-4k-instruct")

# Secrets come from the Colab user-secret store instead of being hard-coded.
hf_token = userdata.get("HF_TOKEN")
WEATHER_API_KEY = userdata.get("WEATHER_API_KEY")

### Model / Tokenizer Loading

In [None]:
def load_model_and_tokenizer():
    """Load the tokenizer and model for ``MODEL_ID``, preferring 8-bit weights.

    Loading strategy, tried in order:
      1. 8-bit quantized weights through bitsandbytes.
      2. If that raises: float16 with ``device_map="auto"`` when CUDA is available.
      3. Otherwise: float32 pinned to the CPU.

    Returns:
        tuple: ``(model, tokenizer, actual_dtype, use_8bit)`` where
        ``actual_dtype`` is a human-readable label of the precision that was
        actually loaded and ``use_8bit`` is ``True`` only when step 1 succeeded.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

    load_kwargs = {
        "device_map": "auto",
    }

    # Try 8-bit first
    use_8bit = True
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            # Passing `load_in_8bit=True` directly is deprecated; the supported
            # route is a BitsAndBytesConfig handed over as `quantization_config`.
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            **load_kwargs,
        )
        actual_dtype = "int8 (bitsandbytes)"
    except Exception as e:
        # Deliberately broad: a missing/incompatible bitsandbytes install or a
        # machine without GPU support can surface as different exception types,
        # and every one of them should trigger the same fallback.
        warnings.warn(
            "8-bit loading failed (bitsandbytes missing or no GPU support). "
            f"Falling back. Error: {e}"
        )
        use_8bit = False
        # Fallback: FP16 if CUDA available, else FP32
        if torch.cuda.is_available():
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16,
                **load_kwargs,
            )
            actual_dtype = "float16"
        else:
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},  # ensure CPU if no CUDA
            )
            actual_dtype = "float32 (CPU)"

    # generate() is later given pad_token_id, so make sure the tokenizer has
    # one; reuse EOS when the checkpoint does not define a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, actual_dtype, use_8bit


# Load once at notebook scope: the chat helpers and the UI builders below read
# `model`, `tokenizer`, `actual_dtype` and `using_8bit` as module-level globals.
model, tokenizer, actual_dtype, using_8bit = load_model_and_tokenizer()
print(f"Loaded {MODEL_ID} with dtype: {actual_dtype} | 8-bit: {using_8bit}")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded microsoft/Phi-3-mini-4k-instruct with dtype: int8 (bitsandbytes) | 8-bit: True


### ---------------------- HELPER FUNCTIONS ----------------------

In [None]:
def get_location():
    """Resolve an approximate location from an IP-based geocoder lookup.

    NOTE(review): the lookup is made for ``'me'``, which presumably means the
    machine executing this code; on a hosted notebook that would be the
    server rather than the person using the UI -- confirm before relying on it.

    Returns:
        tuple: ``(city, country, lat, lng)`` read from the geocoder result.
    """
    ip_lookup = geocoder.ip('me')
    location = (ip_lookup.city, ip_lookup.country, ip_lookup.lat, ip_lookup.lng)
    return location

def get_weather(city):
    """Fetch current weather for ``city`` from the OpenWeatherMap API.

    Args:
        city: Name of the city to look up. ``None`` or an empty string
            short-circuits to ``None`` without making a network call.

    Returns:
        dict | None: The decoded JSON payload when the API answers with
        HTTP 200; ``None`` for a missing city, a network failure, a non-200
        status, or a body that is not valid JSON. A warning is emitted for
        each failed lookup.
    """
    if not city:
        return None

    try:
        # `params` lets requests URL-encode the city name (spaces, '&', '#', ...)
        # and the timeout stops a stalled connection from hanging the notebook.
        r = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={"q": city, "appid": WEATHER_API_KEY, "units": "metric"},
            timeout=10,
        )
        if r.status_code != 200:
            warnings.warn(f"Weather lookup for {city!r} failed with HTTP {r.status_code}.")
            return None
        return r.json()
    except (requests.RequestException, ValueError) as exc:
        # Only the exception type is reported: the exception text can contain
        # the full request URL, which includes the API key.
        warnings.warn(f"Weather lookup for {city!r} failed ({type(exc).__name__}).")
        return None

# Resolve location and weather once, when this cell runs; the resulting system
# prompt is a module-level global read by the chat helpers and the UI builders.
city, country, lat, lng = get_location()
weather_data = get_weather(city)

sys_prompt = "You are an AI Farming Advisor. "
if weather_data:
    # Enrich the base prompt only when a weather payload was returned.
    weather_desc = weather_data['weather'][0]['description']
    temp = weather_data['main']['temp']
    sys_prompt = (
        f"{sys_prompt} The current location is {city}, {country}, "
        f"the weather is {weather_desc} and the temperature is {temp}°C. "
    )


reponse from weather api 
{'coord': {'lon': -121.1787, 'lat': 45.5946}, 'weather': [{'id': 803, 'main': 'Clouds', 'description': 'broken clouds', 'icon': '04n'}], 'base': 'stations', 'main': {'temp': 20.63, 'feels_like': 20.92, 'temp_min': 19.37, 'temp_max': 21.81, 'pressure': 1012, 'humidity': 83, 'sea_level': 1012, 'grnd_level': 956}, 'visibility': 10000, 'wind': {'speed': 2.68, 'deg': 306, 'gust': 4.47}, 'clouds': {'all': 68}, 'dt': 1755321261, 'sys': {'type': 2, 'id': 2012249, 'country': 'US', 'sunrise': 1755263141, 'sunset': 1755313969}, 'timezone': -25200, 'id': 5756304, 'name': 'The Dalles', 'cod': 200}


### Chat Utilities

In [None]:
def build_messages(history: List[Tuple[str, str]], user_msg: str):
    """
    Convert Gradio chat history into a message list for apply_chat_template.

    The result always starts with the system prompt (``sys_prompt``) and ends
    with ``user_msg`` as the newest user turn.

    history: the previous turns, in either of two layouts:
      - pairs ``(user, assistant)``; empty/None members are skipped;
      - dicts with ``"role"`` and ``"content"`` keys; only ``user`` and
        ``assistant`` entries with non-empty string content are kept.
      ``None`` is treated as an empty history.
    user_msg: the message the user just submitted.

    Returns a list of ``{"role": ..., "content": ...}`` dicts.
    """
    msgs = [{"role": "system", "content": sys_prompt}]
    for turn in history or []:
        if isinstance(turn, dict):
            # Role/content layout: copy only the two keys the template needs so
            # any extra UI fields on the entry never reach the tokenizer.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                msgs.append({"role": role, "content": content})
            continue

        user, assistant = turn
        if user:
            msgs.append({"role": "user", "content": user})
        if assistant:
            msgs.append({"role": "assistant", "content": assistant})
    msgs.append({"role": "user", "content": user_msg})
    return msgs


def generate_reply(user_msg: str, history: List[Tuple[str, str]]):
    """Stream the model's reply to ``user_msg`` given the prior ``history``.

    The chat template is applied to the messages produced by
    ``build_messages``, ``model.generate`` runs in a background thread, and the
    decoded text is consumed from a ``TextIteratorStreamer``.

    Args:
        user_msg: The message the user just submitted.
        history: Previous turns, passed through to ``build_messages``.

    Yields:
        str: The reply accumulated so far (whitespace-stripped), once per
        chunk of text delivered by the streamer.

    Raises:
        RuntimeError: If ``model.generate`` fails in the background thread;
            raised after any text already produced has been yielded.
    """
    messages = build_messages(history, user_msg)

    # return_dict=True returns the attention mask alongside the input ids, so
    # generate() does not have to infer it (ambiguous when pad == eos).
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # Create streamer for live output
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True
    )

    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    failures = []

    def _run_generation():
        """Run generate(); on failure record the error and release the consumer."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            # Without this the loop over `streamer` below would block forever,
            # because the end-of-stream signal is normally sent by generate().
            streamer.end()

    # Model.generate runs in a background thread
    thread = threading.Thread(target=_run_generation, daemon=True)
    thread.start()

    # Yield the accumulated text each time the streamer delivers a new chunk
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text.strip()

    thread.join()
    if failures:
        raise RuntimeError(f"Text generation failed: {failures[0]}") from failures[0]

### Gradio app

In [None]:
def predict(message: str, history: List[Tuple[str, str]]):
    """Stream a reply for ``message``, turning runtime failures into chat text.

    Args:
        message: The message the user just submitted.
        history: Previous turns, passed through to ``generate_reply``.

    Yields:
        str: Each partial reply produced by ``generate_reply``. If a
        ``RuntimeError`` is raised while generating, a final explanatory
        message with troubleshooting tips is yielded instead of propagating
        the error.
    """
    try:
        # generate_reply is a generator, so its errors only surface while it is
        # being iterated; the iteration therefore has to happen inside the try.
        yield from generate_reply(message, history)
    except RuntimeError as e:
        # Common errors: NaNs from precision mismatch, OOM, etc.
        yield (
            "I hit a runtime error while generating a reply. "
            f"Details: {e}\n\n"
            "Tips: If you're on CPU, try shorter responses (reduce max_new_tokens). "
            "If you're on GPU without 8-bit support, install a CUDA-enabled bitsandbytes build."
        )


def build_ui():
    """Build a ``gr.ChatInterface`` front-end around ``predict``.

    NOTE: a later ``build_ui`` definition in this file rebinds the name, so
    once the whole cell has run this version is not the one that
    ``build_ui()`` resolves to.

    Returns:
        gr.ChatInterface: The chat interface, titled with the loaded precision
        and described with the system prompt, model id and precision.
    """
    title = f"Phi-3 Mini Instruct Chat ({actual_dtype})"
    description = (
        f"System prompt: <i>{sys_prompt}</i><br>"
        f"Model: <code>{MODEL_ID}</code> | Precision: <b>{actual_dtype}</b> | 8-bit: <b>{using_8bit}</b>"
    )

    # ChatInterface has no `streaming` parameter (passing one raises TypeError);
    # replies are streamed automatically when `fn` is a generator function.
    chat = gr.ChatInterface(
        fn=predict,
        title=title,
        description=description,
    )

    return chat

def respond(message, chat_history):
    """Stream chatbot updates for a newly submitted ``message``.

    For every partial reply produced by ``generate_reply`` this yields a new
    history list: the existing ``chat_history`` followed by one
    ``[message, partial_reply]`` entry. ``chat_history`` itself is not mutated.
    """
    stream = generate_reply(message, chat_history)
    for text_so_far in stream:
        latest_turn = [message, text_so_far]
        yield chat_history + [latest_turn]

def build_ui():
    """Assemble the Blocks-based chat UI for the farming advisor.

    The page shows a title, a description (loaded precision, system prompt and
    model id), a chatbot pane and a textbox. Submitting the textbox streams the
    reply into the chatbot through ``respond`` and then clears the textbox.

    This definition rebinds ``build_ui``, replacing the earlier
    ``gr.ChatInterface``-based definition in this file.

    Returns:
        gr.Blocks: The assembled, not-yet-launched app.
    """
    title = "AI Farming Advisor"
    description = (
        f"Phi-3 Mini Instruct Chat ({actual_dtype})<br>"
        f"System prompt: <i>{sys_prompt}</i><br>"
        f"Model: <code>{MODEL_ID}</code> | Precision: <b>{actual_dtype}</b> | 8-bit: <b>{using_8bit}</b>"
    )

    with gr.Blocks() as demo:
        gr.Markdown(f"# {title}")
        gr.Markdown(description)

        chatbot = gr.Chatbot()
        msg = gr.Textbox(placeholder="Type your farming question here...")

        # Stream the reply into the chatbot, then empty the input box so the
        # question that was just sent does not linger there.
        msg.submit(respond, [msg, chatbot], [chatbot]).then(
            lambda: "", None, [msg]
        )

    return demo

# Build the UI and start the Gradio server when this code runs as the main
# module. `ui` stays bound at module level after launch.
if __name__ == "__main__":
    ui = build_ui()
    ui.launch()

  chatbot = gr.Chatbot()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e856e2ca6656d48f91.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
