<a href="https://colab.research.google.com/github/muddasserniaz/miscfragments/blob/main/LLaMa3.1_Gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gradio transformers torch -q
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the functions defined in this file.
logger = logging.getLogger(__name__)

def get_huggingface_token():
    """
    Fetch Hugging Face token from environment variables or prompt the user.

    The ``HF_TOKEN`` environment variable is consulted first; surrounding
    whitespace is stripped so a blank value or a stray trailing newline is not
    handed on unchanged. If no usable value is found, the user is prompted via
    ``getpass`` so the secret is not echoed back (``input()`` would leave the
    token visible in the terminal / saved notebook output).

    Returns:
        str: Hugging Face token
    Raises:
        ValueError: If no valid token is provided.
    """
    hf_token = (os.getenv('HF_TOKEN') or '').strip()
    if hf_token:
        return hf_token

    logger.warning("HF_TOKEN environment variable is not set.")

    # Local import: only the interactive fallback path needs it.
    import getpass

    hf_token = getpass.getpass("Please enter your Hugging Face token: ").strip()
    if not hf_token:
        raise ValueError("A valid Hugging Face token is required to proceed.")
    return hf_token

def load_model_and_tokenizer(model_name, token):
    """
    Load the model and tokenizer using Hugging Face's Auto classes.

    The model is moved to CUDA when ``torch.cuda.is_available()`` reports a
    GPU, otherwise it stays on the CPU. If the tokenizer has no padding token,
    the EOS token is reused for padding; only when there is no EOS token
    either is a new ``[PAD]`` token added to the vocabulary.

    Args:
        model_name (str): Hugging Face model name.
        token (str): Hugging Face token.
    Returns:
        tuple: model, tokenizer, device
    Raises:
        Exception: Any error raised while loading is logged and re-raised.
    """
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {device}")

        # Load model and tokenizer using Auto classes.
        # `token=` is the supported keyword; `use_auth_token=` is deprecated.
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        model = AutoModelForCausalLM.from_pretrained(model_name, token=token).to(device)

        # Set or add padding token
        if tokenizer.pad_token is None:
            logger.info("Adding pad_token to the tokenizer.")
            if tokenizer.eos_token is not None:
                # Reusing EOS adds nothing to the vocabulary, so the embedding
                # matrix must be left untouched.
                tokenizer.pad_token = tokenizer.eos_token
            else:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                # Only ever grow the embeddings: resizing down to
                # len(tokenizer) would discard rows of a matrix that is
                # larger than the tokenizer's vocabulary.
                if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
                    model.resize_token_embeddings(len(tokenizer))

        return model, tokenizer, device
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Error loading model or tokenizer: {e}")
        raise

def _format_conversation(history, prompt):
    """Render prior turns plus the new prompt as a "User:"/"Assistant:" transcript."""
    lines = [f"{turn['role'].capitalize()}: {turn['content']}" for turn in history]
    lines.append(f"User: {prompt}")
    lines.append("Assistant:")
    return "\n".join(lines)


def chatbot_fn(prompt, chatbot_history=None):
    """
    Chatbot function to handle user prompts and generate responses.

    Args:
        prompt (str): User input prompt.
        chatbot_history (list | None): History of the conversation as a list
            of ``{"role": ..., "content": ...}`` dicts. ``None`` (the value
            Gradio supplies for an uninitialised "state" input) is treated as
            an empty history.
    Returns:
        tuple: Assistant's response, updated conversation history. The history
        is always a new list; the list passed in is never mutated.
    """
    # Work on a copy: avoids the shared-mutable-default pitfall and the
    # AttributeError that `.append` would raise on a None state.
    history = list(chatbot_history) if chatbot_history else []

    if not prompt or not prompt.strip():
        return "Please enter a valid prompt.", history

    try:
        # Earlier turns are replayed with their role labels so the model can
        # tell user text from assistant text.
        input_text = _format_conversation(history, prompt)

        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=1024,
            padding=True
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs, max_new_tokens=150, pad_token_id=tokenizer.pad_token_id
            )

        # Decode only the newly generated tokens instead of searching the
        # echoed prompt for the last "Assistant:" marker.
        prompt_length = inputs["input_ids"].shape[-1]
        response_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        # Cut the reply off if the model starts writing the next user turn.
        assistant_response = response_text.split("User:")[0].strip()
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return f"An error occurred: {e}", history

    history.append({"role": "user", "content": prompt})
    history.append({"role": "assistant", "content": assistant_response})

    return assistant_response, history

# Initialize Hugging Face model and tokenizer
try:
    hf_token = get_huggingface_token()
    # For the larger instruct-tuned variant use "meta-llama/Llama-3.1-8B-Instruct".
    model_name = "meta-llama/Llama-3.2-1B"  # Model name
    model, tokenizer, device = load_model_and_tokenizer(model_name, hf_token)
except Exception:
    logger.critical("Failed to initialize the model. Exiting.")
    raise

# Define Gradio interface
# The title and description are derived from the model and device actually
# loaded above, so the UI text cannot drift from what is really running.
running_on = "GPU-accelerated" if device.type == "cuda" else "CPU"
iface = gr.Interface(
    fn=chatbot_fn,
    # "state" carries the conversation history between calls.
    inputs=["text", "state"],
    outputs=["text", "state"],
    title=f"{model_name} Chatbot ({running_on})",
    description=f"Chat with {model_name} running on {device.type.upper()}.",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()




KeyboardInterrupt: Interrupted by user