## Gradio

[Gradio](https://www.gradio.app) lets you build simple web interfaces for your software. In this example, we use Gradio to create a simple chat interface for a large language model.

In [1]:
# Import necessary libraries
import getpass
import os
import random

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

In [2]:
# Use a random TCP port for the web server:
port = random.randint(10000, 50000)
# Get the username. Prefer $USER (as before); if it is unset or empty, fall back
# to the OS account lookup instead of failing with a KeyError / building a
# broken '/user//proxy/...' URL:
username = os.environ.get('USER') or getpass.getuser()
# Construct the proxied URL for that port:
relative_url = f'/user/{username}/proxy/absolute/{port}/'  # Needs to start with '/'
absolute_url = f'https://jupyterhub.vsc.ac.at{relative_url}'

In [3]:
# Model location: a local directory is used here; the commented-out line below
# is an alternative identifier that can be swapped in.
model_name = '/gpfs/data/fs70824/LLMs_models_datasets/models/microsoft--phi-3.5-mini-instruct'
# model_name = 'microsoft/Phi-3.5-mini-instruct'

# Load the tokenizer that belongs to the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quantization settings: 4-bit NF4 weights, bfloat16 as compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model onto the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='cuda',
    torch_dtype=torch.bfloat16,
    trust_remote_code=False,
    quantization_config=quantization_config,
)

# Combine model and tokenizer into a pipeline that can be used for inference.
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [4]:
# Prepare a function that takes chatbot questions and returns the answer from the LLM:
def get_answer(question, history=None):
    """Return the LLM's answer to ``question``, given the previous chat ``history``.

    Args:
        question: The new user message.
        history: Earlier chat messages as a list of
            ``{'role': ..., 'content': ...}`` dicts, or ``None`` (the default)
            for a fresh conversation. The list is not modified.

    Returns:
        The text generated by the pipeline, with surrounding whitespace stripped.
    """
    # Build a new message list instead of appending to `history`: a mutable
    # default argument would be shared between calls (leaking one conversation
    # into the next), and mutating the caller's list is an unexpected side effect.
    messages = [*(history or []), {'role': 'user', 'content': question}]
    # return_full_text=False: only the newly generated reply is returned.
    result = pipe(messages, max_new_tokens=500, return_full_text=False)
    return result[0]['generated_text'].strip()

In [5]:
# Wrap get_answer in a Gradio chat UI that exchanges message dicts.
chat_interface = gr.ChatInterface(get_answer, type='messages')

# Serve on localhost only, on the chosen port; root_path is the same path as
# the proxied URL, without the trailing slash.
launch_options = dict(
    share=False,
    inline=False,
    server_name='127.0.0.1',
    server_port=port,
    root_path=f'/user/{username}/proxy/absolute/{port}',
)
chat_interface.launch(**launch_options)

# Tell the user where to find the interface.
print(f'\nOpen the following URL in your webbrowser:\n{absolute_url}')

which: no node in (/opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-llm-training-v4/bin:/opt/sw/conda/miniconda3-24.1.2/condabin:/opt/sw/cuda-zen/spack-0.19.0/bin:/home/fs71550/mpfister/.local/bin:/home/fs71550/mpfister/bin:/usr/share/Modules/bin:/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/lpp/mmfs/bin:/opt/sw/slurm/x86_64/alma8.8/22-05-2-1/bin:/opt/sw/slurm/x86_64/alma8.8/22-05-2-1/sbin:/opt/sw/vsc_modules/modules-4.2.2/bin:/opt/sw/vsc4/VSC/x86_64/generic/bin:/opt/sw/tools:/opt/sw/conda/miniconda3/condabin:/opt/sw/conda/miniconda3/bin)


* Running on local URL:  http://127.0.0.1:39953

To create a public link, set `share=True` in `launch()`.

Open the following URL in your webbrowser:
https://jupyterhub.vsc.ac.at/user/mpfister/proxy/absolute/39953/
