In [1]:
!pip install transformers datasets peft torch evaluate accelerate bitsandbytes flash-attn --no-build-isolation
!pip install gradio huggingface_hub


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-

In [2]:
# Reinstall flash-attn specifically to ensure it's correctly configured
!pip install flash-attn --no-build-isolation




In [3]:
import torch
import numpy as np
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig
)
from torch.nn.functional import softmax
import gradio as gr
from google.colab import userdata

In [4]:
# Configuración de dispositivo y memoria
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [5]:
# Limpiar caché de GPU
if torch.cuda.is_available():
    torch.cuda.empty_cache()

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [6]:
# Autenticación
token = userdata.get("HF_Token")
from huggingface_hub import login
login(token=token)


In [7]:
# ========== OPTIMIZACIÓN 1: CUANTIZACIÓN 4-BIT ==========
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)


In [8]:
# ========== OPTIMIZACIÓN 2: CARGA OPTIMIZADA DEL MODELO ==========
print("Cargando modelo con cuantización 4-bit...")
tokenizer = AutoTokenizer.from_pretrained(
    "fdtn-ai/Foundation-Sec-8B",
    padding_side="left",
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    "fdtn-ai/Foundation-Sec-8B",
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2"  # Si está disponible
)

# Configurar pad_token si no existe
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Cargando modelo con cuantización 4-bit...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [9]:
# ========== OPTIMIZACIÓN 3: CONFIGURACIÓN DE GENERACIÓN OPTIMIZADA ==========
generation_config = GenerationConfig(
    max_new_tokens=100,
    do_sample=True,
    top_p=0.9,
    temperature=0.1,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Optimizaciones adicionales
    use_cache=True,
    repetition_penalty=1.1,
)

In [10]:
# ========== OPTIMIZACIÓN 4: FUNCIÓN DE GENERACIÓN OPTIMIZADA ==========
@torch.inference_mode()  # Desactiva gradientes para inferencia
def generate_response_optimized(prompt, max_tokens=100):
    # Tokenización optimizada
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,  # Limitar longitud de entrada
        padding=False
    ).to(model.device)

    # Generar con configuración optimizada
    with torch.cuda.amp.autocast():  # Precisión mixta
        outputs = model.generate(
            **inputs,
            generation_config=generation_config,
            max_new_tokens=max_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.1,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decodificación optimizada
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    ).strip()

    return response

In [11]:
# ========== OPTIMIZACIÓN 5: INTERFAZ GRADIO OPTIMIZADA ==========
def generate_response_gradio(prompt, history, max_tokens=80):
    """Función optimizada para Gradio con tokens limitados"""
    try:
        response = generate_response_optimized(prompt, max_tokens)
        return response
    except Exception as e:
        return f"Error: {str(e)}"


In [13]:
# ========== PRUEBA RÁPIDA ==========
print("Realizando prueba de velocidad...")
import time

test_prompt = "¿What is CVE y CWE?"
start_time = time.time()
test_response = generate_response_optimized(test_prompt, max_tokens=50)
end_time = time.time()

print(f"\n=== RESULTADO DE PRUEBA ===")
print(f"Tiempo de respuesta: {end_time - start_time:.2f} segundos")
print(f"Respuesta: {test_response}")

Realizando prueba de velocidad...


  with torch.cuda.amp.autocast():  # Precisión mixta



=== RESULTADO DE PRUEBA ===
Tiempo de respuesta: 1.71 segundos
Respuesta: - Common Vulnerabilities and Exposures (CVE)
- Common Weakness Enumeration (CWE)


In [14]:
# ========== INTERFAZ GRADIO OPTIMIZADA ==========
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# Foundation-Sec-8B Demo (OPTIMIZADO)")
    gr.Markdown("Modelo optimizado para respuestas rápidas (cuantización 4-bit)")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=400, show_copy_button=True)

        with gr.Column(scale=1):
            gr.Markdown("### Configuración")
            max_tokens_slider = gr.Slider(
                minimum=20,
                maximum=200,
                value=80,
                step=10,
                label="Máximo tokens"
            )
            clear_btn = gr.Button("🗑️ Limpiar", variant="secondary")

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Escribe tu pregunta de seguridad aquí...",
            label="Prompt",
            scale=4
        )
        submit_btn = gr.Button("📤 Enviar", variant="primary", scale=1)

    def user(message, history):
        return "", history + [[message, None]]

    def bot(history, max_tokens):
        if history and history[-1][1] is None:
            prompt = history[-1][0]
            response = generate_response_gradio(prompt, history, max_tokens)
            history[-1][1] = response
        return history

    # Eventos
    submit_btn.click(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, max_tokens_slider],
        chatbot
    )

    msg.submit(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, max_tokens_slider],
        chatbot
    )

    clear_btn.click(lambda: None, None, chatbot, queue=False)

  chatbot = gr.Chatbot(height=400, show_copy_button=True)


In [15]:
# Lanzar con configuración optimizada
print("\n🚀 Lanzando interfaz optimizada...")
demo.launch(
    debug=False,  # Desactivar debug en producción
    share=True,
    server_name="0.0.0.0",
    inbrowser=True,
    quiet=True
)


🚀 Lanzando interfaz optimizada...
* Running on public URL: https://4a7128471c1c9e5cc2.gradio.live




In [15]:

# ========== EJEMPLO DE INFERENCIA BATCH (OPCIONAL) ==========
def batch_inference_example():
    """Ejemplo de inferencia por lotes para múltiples prompts"""
    prompts = [
        "What is SQL injection?",
        "Explain network segmentation",
        "Best practices for API security"
    ]

    print("\n=== INFERENCIA POR LOTES ===")
    start_time = time.time()

    for i, prompt in enumerate(prompts):
        response = generate_response_optimized(prompt, max_tokens=30)
        print(f"\nPrompt {i+1}: {prompt}")
        print(f"Respuesta: {response}")

    total_time = time.time() - start_time
    print(f"\nTiempo total para {len(prompts)} prompts: {total_time:.2f}s")
    print(f"Tiempo promedio por prompt: {total_time/len(prompts):.2f}s")

# Ejecutar ejemplo de batch (opcional)
# batch_inference_example()