In [1]:
!pip install -q fastapi uvicorn transformers torch pyngrok nest-asyncio

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/95.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m92.2/95.2 kB[0m [31m88.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m92.2/95.2 kB[0m [31m88.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━

In [None]:
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import uvicorn
from pyngrok import ngrok
import nest_asyncio
import threading
from typing import Optional
import time
from functools import lru_cache

nest_asyncio.apply()

# Model caching decorator
def cache_model(maxsize=1):
    return lru_cache(maxsize=maxsize)

@cache_model()
def load_model(model_name: str):
    print(f"Loading model {model_name}...")
    start_time = time.time()
    model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
    print(f"Model loaded in {time.time() - start_time:.2f} seconds")
    return model

@cache_model()
def load_tokenizer(model_name: str):
    print(f"Loading tokenizer {model_name}...")
    start_time = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded in {time.time() - start_time:.2f} seconds")
    return tokenizer

app = FastAPI(title="Optimized Text Generation API", docs_url="/docs", redoc_url=None)

class PromptRequest(BaseModel):
    prompt: str
    max_length: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    top_k: Optional[int] = 50
    repetition_penalty: Optional[float] = 1.0

MODEL_NAME = "gpt2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Pre-load model and tokenizer at startup
print("Pre-loading model and tokenizer...")
start_load_time = time.time()
tokenizer = load_tokenizer(MODEL_NAME)
model = load_model(MODEL_NAME)
print(f"Total pre-load time: {time.time() - start_load_time:.2f} seconds")

@app.post("/generate", response_model=dict)
async def generate_text(request: PromptRequest, fastapi_request: Request):
    start_time = time.time()

    # Measure tokenization time
    tokenize_start = time.time()
    inputs = tokenizer(request.prompt, return_tensors="pt").to(DEVICE)
    tokenize_time = time.time() - tokenize_start

    # Measure generation time
    generate_start = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_length,
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            repetition_penalty=request.repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    generate_time = time.time() - generate_start

    # Measure decoding time
    decode_start = time.time()
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    decode_time = time.time() - decode_start

    generated_text = full_text[len(request.prompt):].strip()
    total_time = time.time() - start_time

    # Print latency metrics
    print(f"\nRequest latency breakdown:")
    print(f"Tokenization: {tokenize_time:.4f}s")
    print(f"Generation: {generate_time:.4f}s")
    print(f"Decoding: {decode_time:.4f}s")
    print(f"Total API time: {total_time:.4f}s")

    return {
        "generated_text": generated_text,
        "original_prompt": request.prompt,
        "parameters": request.dict(exclude={"prompt"}),
        "metrics": {
            "total_time": total_time,
            "tokenization_time": tokenize_time,
            "generation_time": generate_time,
            "decoding_time": decode_time
        }
    }

@app.get("/test")
async def test_endpoint():
    """Manual test endpoint with sample prompts"""
    test_prompts = [
        "The weather is beautiful today",
        "Artificial intelligence will in the future",
        "The food at this restaurant"
    ]

    results = []
    for prompt in test_prompts:
        start_time = time.time()
        request = PromptRequest(prompt=prompt, max_length=50)
        inputs = tokenizer(request.prompt, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_length,
                temperature=request.temperature,
                top_p=request.top_p,
                top_k=request.top_k,
                repetition_penalty=request.repetition_penalty,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_text = full_text[len(request.prompt):].strip()
        elapsed = time.time() - start_time

        results.append({
            "prompt": prompt,
            "generated_text": generated_text,
            "response_time": elapsed
        })

    return {"test_results": results}

def start_server():
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info", access_log=False)

def setup_ngrok():
    NGROK_AUTH_TOKEN = "YOUR_TOKEN"
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
    tunnel = ngrok.connect(8000, bind_tls=True)
    print(f"\nNgrok tunnel: {tunnel.public_url}")
    print(f"API docs: {tunnel.public_url}/docs")
    print(f"Test endpoint: {tunnel.public_url}/test\n")
    return tunnel

if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()

    server_thread = threading.Thread(target=start_server, daemon=True)
    server_thread.start()

    setup_ngrok()

    server_thread.join()

Pre-loading model and tokenizer...
Loading tokenizer gpt2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer loaded in 3.44 seconds
Loading model gpt2...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded in 11.30 seconds
Total pre-load time: 14.74 seconds


INFO:     Started server process [769]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)



Ngrok tunnel: https://3b6a-34-90-90-201.ngrok-free.app
API docs: https://3b6a-34-90-90-201.ngrok-free.app/docs
Test endpoint: https://3b6a-34-90-90-201.ngrok-free.app/test


Request latency breakdown:
Tokenization: 0.0007s
Generation: 1.0075s
Decoding: 0.0006s
Total API time: 1.0088s


<ipython-input-2-3e628b720724>:97: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  "parameters": request.dict(exclude={"prompt"}),



Request latency breakdown:
Tokenization: 0.0010s
Generation: 1.6071s
Decoding: 0.0003s
Total API time: 1.6085s

Request latency breakdown:
Tokenization: 0.0009s
Generation: 0.3786s
Decoding: 0.0002s
Total API time: 0.3798s

Request latency breakdown:
Tokenization: 0.0008s
Generation: 1.1255s
Decoding: 0.0005s
Total API time: 1.1268s

Request latency breakdown:
Tokenization: 0.0009s
Generation: 1.5553s
Decoding: 0.0009s
Total API time: 1.5571s

Request latency breakdown:
Tokenization: 0.0009s
Generation: 0.4924s
Decoding: 0.0002s
Total API time: 0.4935s

Request latency breakdown:
Tokenization: 0.0009s
Generation: 0.9154s
Decoding: 0.0006s
Total API time: 0.9169s

Request latency breakdown:
Tokenization: 0.0008s
Generation: 1.1506s
Decoding: 0.0006s
Total API time: 1.1520s

Request latency breakdown:
Tokenization: 0.0006s
Generation: 0.0129s
Decoding: 0.0001s
Total API time: 0.0136s

Request latency breakdown:
Tokenization: 0.0006s
Generation: 0.6527s
Decoding: 0.0003s
Total API time: 