In [1]:
import torch
import psutil
import time
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model setup
device = torch.device("mps")  # or "cpu" or "cuda"
name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name).to(device)
proc = psutil.Process()

# Gradio function
def generate_text(prompt, max_len=60, temp=1.0):
    t0, m0 = time.time(), proc.memory_info().rss / 1e6
    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    output = model.generate(ids, max_length=max_len, do_sample=True, temperature=temp)
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    t1, m1 = time.time(), proc.memory_info().rss / 1e6
    stats = f"⏱ {t1-t0:.2f}s |{m1-m0:.1f} MB (peak ~{m1:.1f} MB)"
    return text, stats

# Gradio app
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=3, label="Prompt"),
        gr.Slider(20, 200, value=60, step=10, label="Max length"),
    ],
    outputs=[
        gr.Textbox(label="Generated text"),
        gr.Markdown(),
    ],
    title="DistilGPT-2 Playground",
)

demo.launch(inline=True, share=False)

  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
