<a href="https://colab.research.google.com/github/peremartra/LLMOptCost/blob/main/11/11_GradioInterface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gradio
!pip install -q transformers
!pip install -q bitsandbytes
!pip install -q accelerate
!pip install -q scipy
!pip install -q sentence-transformers


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m835.8 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.7/318.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.6/94.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import time
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

In [3]:
# Quantization goal handed to return_bnb() further down:
#   "memory"    -> 4-bit quantization config
#   "inference" -> 8-bit quantization config
#   anything else (e.g. "none") -> no quantization config
OPT2 = "none"

## Load Models

In [4]:
# Hugging Face model ids used in this notebook.
# Reference (base) model.
base_model_name = "bigscience/bloomz-560m"

# 4-bit quantized model. Use with OPT2="memory".
# custom_model_name = "oopere/bloomz-560m-quantized_"+OPT2

# Pruned 30%.
# Use with OPT2="none", or with OPT2="inference" to quantize the pruned model.
pruned_model_name = "oopere/bloomz-560m-pruned"

# Pruned 30% + KD with agnews.
# Use with OPT2="none", or with OPT2="inference" to quantize the pruned model.
# custom_model_name = "oopere/bloomz-560m-pruned-kdi-agnews"

# Pruned 30% + KD with a proprietary dataset:
# https://huggingface.co/datasets/oopere/knowledge_transfer_1500_base
# Use with OPT2="none", or with OPT2="inference" to quantize the pruned model.
# custom_model_name = "oopere/bloomz-560m-pruned-kdi-kt1"

# Double KD: agnews + custom dataset.
# Use with OPT2="none", or with OPT2="inference" to quantize the pruned model.
custom_model_name = "oopere/bloomz-560m-pruned-kdi-both"

In [5]:
# Load the tokenizer once from the base checkpoint; the same tokenizer
# instance is used for every model compared in this notebook.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [6]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# On a non-CPU device map the "" key to that device; on CPU use an empty map.
device_map = {} if device == "cpu" else {"": device}

In [7]:
# Reference model: loaded from base_model_name and placed on `device`.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map=device,
)

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [8]:
def return_bnb(goal="memory"):
    """Build the bitsandbytes quantization config for the requested goal.

    Args:
        goal: "memory" selects a 4-bit NF4 config with double quantization;
            "inference" selects an 8-bit config. Any other value disables
            quantization.

    Returns:
        A BitsAndBytesConfig for "memory" or "inference", otherwise None.
    """
    if goal == "memory":
        # 4-bit: optimize for memory. Double quantization saves more memory
        # but is slower at inference time.
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,  # bfloat16 is an option depending on the GPU
        )

    if goal == "inference":
        # 8-bit: optimize for inference.
        # NOTE(review): the compute dtype is passed through a bnb_4bit_* option
        # while loading in 8-bit; confirm it has the intended effect.
        return BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )

    # Unknown goal: no quantization.
    return None

In [9]:
# Quantization config selected by OPT2; None when OPT2 is neither "memory"
# nor "inference".
bnb_config = return_bnb(OPT2)

In [10]:
# Custom model (custom_model_name). bnb_config may be None, in which case no
# quantization config is passed.
# NOTE(review): the base model is loaded without use_cache=False; confirm the
# timing comparison between the models is meant to be asymmetric.
quantized_model = AutoModelForCausalLM.from_pretrained(
    custom_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    use_cache=False,
)

config.json:   0%|          | 0.00/821 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [11]:
# Pruned model (pruned_model_name), loaded with the same optional quantization
# config and device map as the custom model.
pruned_model = AutoModelForCausalLM.from_pretrained(
    pruned_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    use_cache=False,
)

config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [12]:
# Memory footprint reported for the base model.
base_model.get_memory_footprint()

2236858368

In [13]:
# Memory footprint reported for the custom model (held in `quantized_model`).
quantized_model.get_memory_footprint()

2236858368

## Comparing Embeddings to obtain cosine distance.

In [14]:
# Load the sentence-embedding model used by calculate_cosine_distance() to
# compare generated responses.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
def calculate_cosine_distance(text1, text2):
    """Return the cosine distance between the embeddings of two texts.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        The value of scipy's `cosine` for the two embeddings produced by
        `embedding_model.encode`.
    """
    # Encode the texts one at a time, first text1 and then text2.
    vector_a, vector_b = [embedding_model.encode(text) for text in (text1, text2)]
    return cosine(vector_a, vector_b)

## Calling the models

In [16]:
# Define the function that will take input and pass it to all three models
def _timed_generate(model, inputs, **generate_kwargs):
    """Generate with `model`, decode the first returned sequence, and time both steps.

    Returns:
        A (decoded_text, elapsed_seconds) tuple.
    """
    # perf_counter is monotonic, so wall-clock adjustments cannot skew the
    # measured interval the way they can with time.time().
    started = time.perf_counter()
    output_ids = model.generate(**inputs, **generate_kwargs)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return text, time.perf_counter() - started


def compare_models(prompt, **generate_kwargs):
    """Run `prompt` through the base, pruned and custom models and compare them.

    Args:
        prompt: Text prompt; tokenized once and reused for all three models.
        **generate_kwargs: Optional extra keyword arguments forwarded unchanged
            to every `generate` call (for example `max_new_tokens`). With none
            given, each model generates with its own defaults.

    Returns:
        A tuple of eight strings:
        (base_response, pruned_response, custom_response,
         base_time, pruned_time, custom_time,
         pruned_cosine_distance, custom_cosine_distance),
        where times are seconds and every number is formatted with three
        decimals. Distances are measured against the base response.
    """
    # The same tensors are fed to all three models, placed on the base model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)

    # Order matters only for readability: base, then pruned, then custom.
    base_response, base_time = _timed_generate(base_model, inputs, **generate_kwargs)
    pruned_response, pruned_time = _timed_generate(pruned_model, inputs, **generate_kwargs)
    # `quantized_model` holds the custom model (custom_model_name).
    quantized_response, quantized_time = _timed_generate(quantized_model, inputs, **generate_kwargs)

    # Calculate cosine distances against the base model's response
    cosine_distance_pruned = calculate_cosine_distance(base_response, pruned_response)
    cosine_distance_quantized = calculate_cosine_distance(base_response, quantized_response)

    # Return responses and metrics
    return (
        base_response,
        pruned_response,
        quantized_response,
        f"{base_time:.3f}",
        f"{pruned_time:.3f}",
        f"{quantized_time:.3f}",
        f"{cosine_distance_pruned:.3f}",
        f"{cosine_distance_quantized:.3f}"
    )

## Gradio Interface

In [17]:
# Rows shown in the responses table; each row is [response, cosine_distance].
custom_responses_data = []


def update_custom_responses_table(custom_response, cosine_distance):
    """Append one row to the shared table data and return that same list.

    Args:
        custom_response: Response text to store in the row.
        cosine_distance: Distance value to store next to the response.

    Returns:
        The module-level `custom_responses_data` list, including the new row.
    """
    new_row = [custom_response, cosine_distance]
    custom_responses_data.append(new_row)
    return custom_responses_data

In [18]:
# Create Gradio Interface with custom layout
with gr.Blocks() as interface:
    with gr.Row():
        # Left column: prompt, buttons and the three model responses.
        with gr.Column(scale=1):
            input_text = gr.Textbox(label="Input Prompt")
            clear_btn = gr.Button("Clear")
            submit_btn = gr.Button("Submit")
            base_response_box = gr.Textbox(label="Base Model Response", interactive=False)
            custom_response_box = gr.Textbox(label="Pruned Model Response", interactive=False)
            dynamic_response_box = gr.Textbox(label="Custom Input Response", interactive=False)

        # Right column: timings and distances for the same three models.
        with gr.Column(scale=1):
            base_time_box = gr.Textbox(label="Base Model Response Time", interactive=False)
            custom_time_box = gr.Textbox(label="Pruned Model Response Time", interactive=False)
            dynamic_time_box = gr.Textbox(label="Custom Input Response Time", interactive=False)
            # These boxes show the cosine *distance* returned by compare_models,
            # so they are labelled as distance rather than similarity.
            custom_embedding_distance_box = gr.Textbox(label="Pruned (Cosine Distance)", interactive=False)
            dynamic_embedding_distance_box = gr.Textbox(label="Custom (Cosine Distance)", interactive=False)

    # Add a table to display the logged responses and cosine distances
    custom_responses_table = gr.Dataframe(
        headers=["Custom Model Response", "Cosine Distance (Base vs Custom)"],
        value=custom_responses_data,  # Initialize with the custom responses data
        interactive=False,  # Users cannot modify the table manually
        label="Custom Responses and Cosine Distances"
    )

    # Components written by both buttons, in the order the handlers return values.
    result_outputs = [
        base_response_box, custom_response_box, dynamic_response_box,
        base_time_box, custom_time_box, dynamic_time_box,
        custom_embedding_distance_box, dynamic_embedding_distance_box,
        custom_responses_table,
    ]

    # Define the button actions
    def handle_submit(prompt):
        """Run the comparison for `prompt` and return values for `result_outputs`."""
        # compare_models returns, in order: base, pruned and custom responses,
        # then their times, then the pruned and custom cosine distances.
        (base_response, pruned_response, custom_response,
         base_time, pruned_time, custom_time,
         pruned_distance, custom_distance) = compare_models(prompt)

        # NOTE(review): the table headers say "Custom", but the row logged here
        # is the pruned model's response and distance. Confirm which model the
        # table is meant to track.
        updated_table = update_custom_responses_table(pruned_response, pruned_distance)

        # Return the updated outputs and table
        return (base_response, pruned_response, custom_response,
                base_time, pruned_time, custom_time,
                pruned_distance, custom_distance, updated_table)

    def handle_clear():
        """Blank every output and drop the accumulated table rows."""
        # Empty the backing list as well; otherwise the rows cleared from the
        # table would reappear on the next submit.
        custom_responses_data.clear()
        return ("", "", "", "", "", "", "", "", [])

    submit_btn.click(handle_submit, inputs=[input_text], outputs=result_outputs)

    clear_btn.click(handle_clear, outputs=result_outputs)  # Clear table and outputs

In [19]:
# Launch the Gradio interface. In the recorded Colab run this also created a
# temporary public share link (see the cell output).
interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6a440f3c5773c6db0c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


