<a href="https://colab.research.google.com/github/lukassso/python-mini-projects/blob/master/local_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers gradio

Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch
import logging
import functools
import signal
from contextlib import contextmanager
import threading

# Module-level logger used by the code below; root logging is configured at INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

class TimeoutException(Exception):
    """Raised by the SIGALRM handler installed in ``time_limit``."""

@contextmanager
def time_limit(seconds):
    """Abort the enclosed block with ``TimeoutException`` after ``seconds``.

    The limit is implemented with ``SIGALRM``, so it is only available on
    Unix and only when entered from the main thread (``signal.signal``
    raises ``ValueError`` elsewhere).

    Args:
        seconds: Whole number of seconds to allow; ``0`` schedules no alarm.

    Yields:
        None. Control returns to the ``with`` body.

    Raises:
        TimeoutException: Inside the ``with`` body once the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    if previous_handler is None:
        # None means the old handler was not installed from Python and cannot
        # be passed back to signal.signal(); fall back to the default action.
        previous_handler = signal.SIG_DFL
    try:
        # Scheduled inside the try so a bad ``seconds`` value still restores
        # the handler installed above.
        signal.alarm(seconds)
        yield
    finally:
        # Cancel any pending alarm first, then put the caller's handler back
        # so this context manager leaves no global signal state behind.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)

# Hugging Face checkpoint to load: Microsoft's Phi-3 Mini 4k-instruct model.
model_name = "microsoft/Phi-3-mini-4k-instruct"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Fall back to EOS for padding only when the tokenizer ships without a
    # pad token, so a checkpoint's own pad token is not silently replaced.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    # Keep the model config in agreement with the tokenizer's pad token.
    model.config.pad_token_id = tokenizer.pad_token_id

except Exception as e:
    # logger.exception records the traceback as well as the message.
    logger.exception("Error loading model: %s", e)
    raise

def format_prompt(message):
    """Build a single-turn Phi-3 chat prompt for ``message``.

    Phi-3 instruct checkpoints expect each turn as a role tag on its own
    line, the content, and an ``<|end|>`` terminator; the prompt finishes
    with an open ``<|assistant|>`` turn for the model to complete.

    Args:
        message: The user's message text, inserted verbatim.

    Returns:
        The prompt string containing a fixed system turn, the user turn and
        the opening of the assistant turn.
    """
    return (
        "<|system|>\nYou are a helpful AI assistant.<|end|>\n"
        f"<|user|>\n{message}<|end|>\n"
        "<|assistant|>\n"
    )

def _stop_token_ids():
    """Return the token ids that should end generation, without duplicates.

    Combines the tokenizer's EOS id with the id(s) in the model's generation
    config, because passing ``eos_token_id`` to ``generate`` replaces the
    configured value rather than extending it.
    """
    configured = model.generation_config.eos_token_id
    if not isinstance(configured, (list, tuple)):
        configured = [configured]

    stop_ids = []
    for token_id in [tokenizer.eos_token_id, *configured]:
        if token_id is not None and token_id not in stop_ids:
            stop_ids.append(token_id)
    return stop_ids


def chat(message, history):
    """Generate a single-turn reply to ``message`` for the Gradio chat UI.

    Args:
        message: The user's latest message.
        history: Earlier turns supplied by Gradio. Not used: every reply is
            generated from ``message`` alone.

    Returns:
        The decoded model reply, or a user-facing notice when the message is
        blank, the model produced no text, or an error occurred. Errors are
        logged with their traceback and reported in the returned string
        rather than raised.
    """
    # Nothing to answer; skip the model call.
    if isinstance(message, str) and not message.strip():
        return "Please enter a message."

    try:
        prompt = format_prompt(message)

        # Tokenize and move every tensor to the device the model lives on
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            return_attention_mask=True
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,  # Significantly reduced for faster responses
                temperature=0.7,
                do_sample=True,
                top_p=0.95,
                top_k=50,
                num_beams=1,  # No beam search for faster generation
                repetition_penalty=1.1,
                pad_token_id=tokenizer.pad_token_id,
                # NOTE(review): assumes the model's generation config lists its
                # chat end-of-turn token; confirm for the loaded checkpoint.
                eos_token_id=_stop_token_ids() or None,
                # Wall-clock budget in seconds. Exceeding it ends generation
                # early with the tokens produced so far; it does not raise.
                max_time=10.0
            )

        # Decode only the newly generated tokens, not the echoed prompt
        response = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        ).strip()

    except Exception as e:
        # One handler for tokenization, generation and decoding: failures here
        # are real errors, not timeouts, so report them as such.
        logger.exception("Error in chat function")
        return f"I apologize, but I encountered an error. Please try again. Error details: {str(e)}"

    if not response:
        return "I apologize, but I couldn't generate a response. Please try again."

    return response

# Create the interface
try:
    # Derive the displayed model name from model_name so the UI text always
    # matches the checkpoint that is actually being served.
    model_label = model_name.split("/")[-1]

    demo = gr.ChatInterface(
        fn=chat,
        title=f"{model_label} Chat Assistant",
        description=f"A lightweight chat interface powered by {model_name}. Ask me anything!",
        examples=[
            "What is machine learning?",
            "Write a haiku",
            "Explain what is CPU"
        ]
    )

    # share=True requests a public URL; debug=True keeps the cell running so
    # errors and logs stay visible in the notebook.
    demo.launch(
        share=True,
        debug=True,
    )

except Exception as e:
    # logger.exception records the traceback as well as the message.
    logger.exception("Error launching Gradio interface: %s", e)
    raise

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

  self.chatbot = Chatbot(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3fbda36b92ce00b62f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
