# Chatbot Application

## Setup

### Install Dependencies

In [1]:
# Install uv package manager if not already installed
import subprocess

try:
    subprocess.run(['uv', '--version'], capture_output=True, check=True)
    print("✓ uv is already installed")
except (subprocess.CalledProcessError, FileNotFoundError):
    print("Installing uv...")
    %pip install uv
    print("✓ uv installed successfully")
    print("Initializing uv...")
    %uv init .
    print("✓ uv initialized successfully")

✓ uv is already installed


In [2]:
# Environment Detection and Setup
try:
    import google.colab # Check if running in Google Colab
    IN_COLAB = True
    print("Running in Google Colab")
    cache_dir = None  # Colab uses default HF cache
except:
    IN_COLAB = False
    print("Running locally")
    # Use local models directory
    import os
    models_cache_dir = "./models"
    os.makedirs(models_cache_dir, exist_ok=True)

# Install dependencies with uv (works in both environments)
!uv add gradio torch transformers accelerate bitsandbytes compressed-tensors --quiet

import torch
print(f"✓ PyTorch version: {torch.__version__}")
print(f"✓ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"✓ GPU: {torch.cuda.get_device_name(0)}")

Running locally
✓ PyTorch version: 2.9.1+cpu
✓ CUDA available: False


### Import Libraries

In [3]:
# Gradio Interface
import gradio as gr

# Hugging Face
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Run the Application

### Hugging Face

In [4]:
# Using the same model you tried with vLLM
model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Qwen model
# model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Deepseek model

# Set cache directory to local models folder
models_cache_dir = "./models"

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True,
    cache_dir=models_cache_dir
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=models_cache_dir
)

print("✓ Model loaded successfully!")

Loading tokenizer and model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✓ Model loaded successfully!


In [None]:
# Basic response generation function (without conversation history)
def basic_response(prompt, max_tokens=512, temperature=0.7):
    """Generate a response from the model
    
    Usage:
    prompt = "Why is the sky blue?"
    response = generate_response(prompt)
    print(f"Question: {prompt}")
    print(f"\nAnswer: {response}")
    """

    # Format as chat message
    messages = [{"role": "user", "content": prompt}]

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.9
    )

    # Decode
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the assistant's response
    response = generated_text.split("assistant")[-1].strip()

    return response

# Chat functionality with full conversation history
def chat_with_model(message, history):
    """Chat with full conversation history"""
    
    # Build messages from history
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})
    
    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        top_p=0.9
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("assistant")[-1].strip()
    
    return response

In [None]:
# Create Gradio interface
demo = gr.ChatInterface(
    fn=chat_with_model,
    title="Chatbot Application",
    description="Chat with Qwen Model"
)

demo.launch()

  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




### llama.cpp

In [None]:
# Cell 1: Install llama-cpp-python with GPU support using uv
# This compiles with CUDA support for GPU acceleration
!CMAKE_ARGS="-DGGML_CUDA=on" uv pip install llama-cpp-python

print("✓ llama-cpp-python installed with GPU support!")

In [3]:
# Verify installation
try:
    import llama_cpp
    print(f"✓ llama-cpp version: {llama_cpp.__version__}")
except ImportError:
    print("✗ Installation failed. Try running this cell again.")

✓ llama-cpp version: 0.3.16


In [None]:
# Cell 2: Download a GGUF model
# Using huggingface-cli to download models easily
!uv pip install huggingface-hub

In [None]:
from huggingface_hub import hf_hub_download
import os

# Example: Download a small quantized model
# You can change this to any GGUF model from Hugging Face
model_repo = "bartowski/Qwen2.5-0.5B-Instruct-GGUF"
model_file = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"  # Q4 quantization = good balance

# Set cache directory to local models folder
models_cache_dir = "./models"

print(f"Downloading {model_file} from {model_repo}...")
model_path = hf_hub_download(
    repo_id=model_repo,
    filename=model_file,
    cache_dir=models_cache_dir
)

print(f"✓ Model downloaded to: {model_path}")

  from .autonotebook import tqdm as notebook_tqdm


Downloading Qwen2.5-0.5B-Instruct-Q4_K_M.gguf from bartowski/Qwen2.5-0.5B-Instruct-GGUF...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✓ Model downloaded to: ./models\models--bartowski--Qwen2.5-0.5B-Instruct-GGUF\snapshots\41ba88dbac95fed2528c92514c131d73eb5a174b\Qwen2.5-0.5B-Instruct-Q4_K_M.gguf


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
# Cell 3: Initialize llama.cpp and test
from llama_cpp import Llama

print("Loading model into llama.cpp...")

llm = Llama(
    model_path=model_path,
    n_ctx=4096,           # Context window size
    n_gpu_layers=-1,      # -1 = offload all layers to GPU, 0 = CPU only
    n_batch=512,          # Batch size for prompt processing
    n_threads=4,          # Number of CPU threads
    verbose=False         # Set to True for debugging
)

print("✓ Model loaded successfully!")

# Test generation
response = llm(
    "Why is the sky blue?",
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
    stop=["User:", "\n\n\n"]
)

print("\n=== Test Response ===")
print(response['choices'][0]['text'])

Loading model into llama.cpp...


llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


✓ Model loaded successfully!

=== Test Response ===
 This is a very simple question, but one that can be quite challenging to answer. The sky is blue because of the scattering of sunlight by tiny particles in the air, such as water droplets and pollen. These particles absorb some of the light that enters the atmosphere, but some of it is reflected back into space. The remaining light is scattered in all directions, which we see as the sky. The blue color of the sky is due to the fact that the blue color is the color of the particles that absorb the most sunlight. The rest of the light is reflected back into space, which is why the sky appears blue. The color of the sky is also affected by the angle of the sun and the time of day. The sun's position in the sky and the time of day can affect the scattering of light and the color of the sky. Overall, the blue color of the sky is due to the scattering of sunlight by tiny particles in the air, which is caused by the fact that water droplets