# 02 - Pipeline Deployment - RAG OpenShift AI

## 🎯 Objetivo
Crear, compilar y desplegar el pipeline RAG en OpenShift AI Data Science Pipelines.
Este notebook toma los components desarrollados en el notebook anterior y los prepara para un deployment real.

## 📋 Lo que Haremos
1. **Crear Components como archivos Python separados**
2. **Definir Pipeline real con components importados**
3. **Compilar Pipeline a YAML**
4. **Deploy en OpenShift AI**
5. **Testing del Pipeline desplegado**
6. **Configurar Webhook Handler**

## 🔧 Setup Inicial

In [1]:
import os
import sys
import json
import yaml
from datetime import datetime
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Make sure KubeFlow Pipelines (KFP v2) is importable, installing it on demand.
try:
    import kfp
except ImportError:
    print("❌ Instalando KubeFlow Pipelines...")
    import importlib
    import subprocess

    # Install through the kernel's own interpreter using an argument list.
    # The previous `!pip install kfp>=2.0.0` went through the shell unquoted,
    # where `>=2.0.0` is an output redirection: pip received a bare `kfp`
    # (no version constraint) and its output landed in a file named `=2.0.0`.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "kfp>=2.0.0"]
    )
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import kfp

# Imported outside the try block so these names exist whether or not an
# install was needed (previously they stayed undefined after an install).
from kfp import dsl, compiler
from kfp.client import Client
from kfp.dsl import component, pipeline, Input, Output, Dataset

print(f"✅ KubeFlow Pipelines disponible: {kfp.__version__}")

# Resolve the pipeline API endpoint from the environment. This only reads
# configuration; no connection to the pipeline service is attempted here.
PIPELINE_HOST = os.getenv('KFP_HOST', 'http://ml-pipeline:8888')
print(f"🔗 Pipeline Host configurado: {PIPELINE_HOST}")

print("🚀 Notebook de deployment iniciado")

✅ KubeFlow Pipelines disponible: 2.12.1
🔗 Pipeline Host configurado: http://ml-pipeline:8888
🚀 Notebook de deployment iniciado


## 📁 Crear Estructura de Archivos para Components

Creamos los archivos Python separados para cada component del pipeline

In [2]:
def create_project_structure():
    """Create the project's directory tree and Python package markers.

    Every directory is created relative to the current working directory
    (missing parents included, existing directories tolerated). An empty
    ``__init__.py`` is then touched inside each package directory. Progress
    and a short summary are printed along the way.

    Returns:
        list[str]: The directory paths that were created, in creation order.
    """
    # Directories that must also be importable Python packages.
    package_dirs = ("components", "pipelines", "webhook")

    directories = [
        *package_dirs,
        "config",
        "deploy/minio",
        "deploy/elasticsearch",
        "deploy/webhook",
        "tests",
    ]

    print("📁 Creando estructura de directorios...")

    for folder in directories:
        Path(folder).mkdir(parents=True, exist_ok=True)
        print(f"  ✅ {folder}/")

    # Package marker files so the generated modules can be imported.
    for package in package_dirs:
        marker = f"{package}/__init__.py"
        Path(marker).touch(exist_ok=True)
        print(f"  ✅ {marker}")

    summary_lines = (
        "\n📋 Estructura creada:",
        "  components/          # Pipeline components",
        "  pipelines/           # Pipeline definitions",
        "  webhook/             # Webhook handler",
        "  config/              # Configuration files",
        "  deploy/              # Deployment manifests",
        "  tests/               # Integration tests",
    )
    for line in summary_lines:
        print(line)

    return directories

# Build the directory tree and keep the returned list of directory paths.
created_dirs = create_project_structure()

📁 Creando estructura de directorios...
  ✅ components/
  ✅ pipelines/
  ✅ webhook/
  ✅ config/
  ✅ deploy/minio/
  ✅ deploy/elasticsearch/
  ✅ deploy/webhook/
  ✅ tests/
  ✅ components/__init__.py
  ✅ pipelines/__init__.py
  ✅ webhook/__init__.py

📋 Estructura creada:
  components/          # Pipeline components
  pipelines/           # Pipeline definitions
  webhook/             # Webhook handler
  config/              # Configuration files
  deploy/              # Deployment manifests
  tests/               # Integration tests


## 🔧 Component 1: Text Processing Components

Creamos el archivo con los components de procesamiento de texto

In [3]:
def create_text_processing_components():
    """Write ``components/text_processing.py`` with the text-stage KFP components.

    The generated module defines two ``@component`` functions:

    * ``extract_text_component``: downloads one object from MinIO, extracts
      its text (PDF, DOCX, TXT/MD) and writes the text plus a JSON metadata
      file as output artifacts.
    * ``chunk_text_component``: splits the extracted text into overlapping
      chunks and writes them as a JSON list.

    The module source is held in a string literal; escape sequences that must
    reach the generated file (newline separators) are written with a doubled
    backslash. The ``components`` directory is created if it does not exist.
    A short confirmation is printed after the file is written.
    """

    text_processing_content = '''"""
Text Processing Components para RAG Pipeline
Incluye: extract_text_component y chunk_text_component
"""

from kfp.dsl import component, Input, Output, Dataset

@component(
    base_image="pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel",
    packages_to_install=[
        "PyPDF2==3.0.1",
        "python-docx==0.8.11", 
        "minio==7.1.17",
        "chardet==5.2.0"
    ]
)
def extract_text_component(
    bucket_name: str,
    object_key: str,
    minio_endpoint: str,
    minio_access_key: str,
    minio_secret_key: str,
    extracted_text: Output[Dataset],
    metadata: Output[Dataset]
):
    """
    Extrae texto de documentos almacenados en MinIO.
    
    Args:
        bucket_name: Nombre del bucket en MinIO
        object_key: Path del archivo en el bucket
        minio_endpoint: Endpoint de MinIO
        minio_access_key: Access key de MinIO
        minio_secret_key: Secret key de MinIO
        extracted_text: Output dataset con el texto extraído
        metadata: Output dataset con metadata del documento
    """
    import os
    import json
    import tempfile
    from pathlib import Path
    from datetime import datetime
    from minio import Minio
    import PyPDF2
    from docx import Document
    import chardet
    
    # Conectar a MinIO
    minio_client = Minio(
        minio_endpoint,
        access_key=minio_access_key,
        secret_key=minio_secret_key,
        secure=False
    )
    
    # Crear directorio temporal
    with tempfile.TemporaryDirectory() as temp_dir:
        local_file_path = os.path.join(temp_dir, object_key.split('/')[-1])
        
        # Descargar archivo desde MinIO
        try:
            minio_client.fget_object(bucket_name, object_key, local_file_path)
            print(f"✅ Archivo descargado: {local_file_path}")
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback
            raise Exception(f"Error descargando archivo: {str(e)}") from e
        
        # Detectar tipo de archivo
        file_extension = Path(local_file_path).suffix.lower()
        file_size = os.path.getsize(local_file_path)
        
        # Extraer texto según el tipo de archivo
        extracted_content = ""
        
        if file_extension == '.pdf':
            try:
                with open(local_file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    # Read the page count while the file is still open
                    page_count = len(pdf_reader.pages)
                    extracted_content = "".join(
                        page.extract_text() + "\\n" for page in pdf_reader.pages
                    )
                print(f"✅ PDF procesado: {page_count} páginas")
            except Exception as e:
                raise Exception(f"Error procesando PDF: {str(e)}") from e
                
        elif file_extension == '.docx':
            try:
                doc = Document(local_file_path)
                extracted_content = "".join(
                    paragraph.text + "\\n" for paragraph in doc.paragraphs
                )
                print(f"✅ DOCX procesado: {len(doc.paragraphs)} párrafos")
            except Exception as e:
                raise Exception(f"Error procesando DOCX: {str(e)}") from e
                
        elif file_extension in ['.txt', '.md']:
            try:
                # Detectar encoding
                with open(local_file_path, 'rb') as file:
                    raw_data = file.read()
                    # chardet reports None when it cannot decide; fall back to
                    # UTF-8 instead of the platform-dependent default encoding
                    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
                
                # Leer con encoding detectado
                with open(local_file_path, 'r', encoding=encoding) as file:
                    extracted_content = file.read()
                print(f"✅ TXT procesado con encoding: {encoding}")
            except Exception as e:
                raise Exception(f"Error procesando TXT: {str(e)}") from e
                
        else:
            raise Exception(f"Tipo de archivo no soportado: {file_extension}")
        
        # Validar que se extrajo contenido
        if not extracted_content.strip():
            raise Exception("No se pudo extraer texto del documento")
        
        # Preparar metadata
        document_metadata = {
            "source_file": object_key,
            "file_type": file_extension,
            "file_size": file_size,
            "processed_at": datetime.now().isoformat(),
            "char_count": len(extracted_content),
            "word_count": len(extracted_content.split()),
            "bucket_name": bucket_name
        }
        
        # Guardar outputs
        with open(extracted_text.path, 'w', encoding='utf-8') as f:
            f.write(extracted_content)
            
        with open(metadata.path, 'w', encoding='utf-8') as f:
            json.dump(document_metadata, f, indent=2)
        
        print(f"✅ Texto extraído: {len(extracted_content)} caracteres")


@component(
    base_image="pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel",
    packages_to_install=[
        "tiktoken==0.5.1",
        "langchain==0.0.350"
    ]
)
def chunk_text_component(
    extracted_text: Input[Dataset],
    metadata: Input[Dataset],
    chunk_size: int,
    chunk_overlap: int,
    chunks: Output[Dataset]
):
    """
    Divide el texto en chunks con overlap para processing óptimo.
    
    Args:
        extracted_text: Input dataset con texto extraído
        metadata: Input dataset con metadata del documento
        chunk_size: Tamaño máximo de cada chunk (en tokens)
        chunk_overlap: Overlap entre chunks (en tokens)
        chunks: Output dataset con chunks procesados
    """
    import json
    import tiktoken
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    
    # Leer input data
    with open(extracted_text.path, 'r', encoding='utf-8') as f:
        text_content = f.read()
    
    with open(metadata.path, 'r', encoding='utf-8') as f:
        doc_metadata = json.load(f)
    
    # Configurar tokenizer
    encoding = tiktoken.get_encoding("cl100k_base")
    
    def count_tokens(text: str) -> int:
        return len(encoding.encode(text))
    
    # Configurar text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size * 4,  # Aproximación: 1 token ≈ 4 caracteres
        chunk_overlap=chunk_overlap * 4,
        length_function=len,
        separators=["\\n\\n", "\\n", ". ", " ", ""]
    )
    
    # Dividir texto en chunks
    text_chunks = text_splitter.split_text(text_content)
    print(f"✅ Texto dividido en {len(text_chunks)} chunks")
    
    # Filtrar chunks muy pequeños (< 10 tokens).
    # This happens BEFORE numbering so that chunk_index and total_chunks
    # describe the chunks that are actually written out.
    sized_chunks = [(chunk_text, count_tokens(chunk_text)) for chunk_text in text_chunks]
    kept_chunks = [(chunk_text, token_count) for chunk_text, token_count in sized_chunks if token_count >= 10]
    
    # Procesar cada chunk
    processed_chunks = []
    
    for i, (chunk_text, token_count) in enumerate(kept_chunks):
        chunk_metadata = {
            "chunk_id": f"{doc_metadata['source_file']}_chunk_{i:04d}",
            "chunk_index": i,
            "total_chunks": len(kept_chunks),
            "text": chunk_text.strip(),
            "token_count": token_count,
            "char_count": len(chunk_text),
            "word_count": len(chunk_text.split()),
            "source_document": doc_metadata['source_file'],
            "file_type": doc_metadata['file_type'],
            "processed_at": doc_metadata['processed_at']
        }
        
        processed_chunks.append(chunk_metadata)
    
    print(f"✅ Chunks procesados: {len(processed_chunks)}")
    
    # Guardar chunks
    with open(chunks.path, 'w', encoding='utf-8') as f:
        json.dump(processed_chunks, f, indent=2, ensure_ascii=False)
'''

    # Write the module, creating the package directory when this cell is run
    # before (or without) the project-structure cell.
    target = Path('components/text_processing.py')
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as f:
        f.write(text_processing_content)

    print("✅ Archivo creado: components/text_processing.py")
    print("📋 Components incluidos:")
    print("  - extract_text_component")
    print("  - chunk_text_component")

# Generate components/text_processing.py
create_text_processing_components()

✅ Archivo creado: components/text_processing.py
📋 Components incluidos:
  - extract_text_component
  - chunk_text_component


## 🎯 Component 2: Vector Processing Components

Creamos el archivo con los components de embeddings e indexación

In [4]:
def create_vector_processing_components():
    """Write ``components/vector_processing.py`` with the vector-stage KFP components.

    The generated module defines two ``@component`` functions:

    * ``generate_embeddings_component``: encodes every chunk's text with a
      SentenceTransformer model and writes the chunks, enriched with their
      embedding vectors, as a JSON list.
    * ``index_elasticsearch_component``: creates the target index when it is
      missing, bulk-indexes the enriched chunks and writes a JSON status
      report.

    The module source is held in a string literal. The ``components``
    directory is created if it does not exist. A short confirmation is
    printed after the file is written.
    """

    vector_processing_content = '''"""
Vector Processing Components para RAG Pipeline
Incluye: generate_embeddings_component y index_elasticsearch_component
"""

from kfp.dsl import component, Input, Output, Dataset

@component(
    base_image="pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel",
    packages_to_install=[
        "sentence-transformers==2.2.2",
        "numpy==1.24.3"
    ]
)
def generate_embeddings_component(
    chunks: Input[Dataset],
    model_name: str,
    embeddings: Output[Dataset]
):
    """
    Genera embeddings vectoriales para los chunks de texto.
    
    Args:
        chunks: Input dataset con chunks de texto
        model_name: Nombre del modelo de embeddings
        embeddings: Output dataset con embeddings generados
    """
    import json
    import numpy as np
    from sentence_transformers import SentenceTransformer
    import torch
    
    # Verificar si hay GPU disponible
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"🖥️ Usando device: {device}")
    
    # Cargar modelo de embeddings
    print(f"📥 Cargando modelo: {model_name}")
    model = SentenceTransformer(model_name, device=device)
    
    # Leer chunks
    with open(chunks.path, 'r', encoding='utf-8') as f:
        chunk_data = json.load(f)
    
    print(f"📝 Procesando {len(chunk_data)} chunks")
    
    # Extraer textos para embedding
    texts = [chunk['text'] for chunk in chunk_data]
    
    # Generar embeddings en batches para eficiencia
    batch_size = 32
    all_embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = model.encode(
            batch_texts,
            convert_to_numpy=True,
            show_progress_bar=True if i == 0 else False,
            normalize_embeddings=True
        )
        all_embeddings.extend(batch_embeddings)
        
        if i % (batch_size * 5) == 0:
            print(f"  Procesado: {min(i + batch_size, len(texts))}/{len(texts)} chunks")
    
    # An empty chunk list is valid input (every chunk may have been filtered
    # out upstream); do not index into an empty list in that case.
    embedding_dim = len(all_embeddings[0]) if all_embeddings else 0
    print(f"✅ Embeddings generados: {len(all_embeddings)} vectores de {embedding_dim} dimensiones")
    
    # Combinar chunks con sus embeddings
    enriched_chunks = []
    for chunk, embedding in zip(chunk_data, all_embeddings):
        enriched_chunk = chunk.copy()
        enriched_chunk['embedding'] = embedding.tolist()
        enriched_chunk['embedding_dim'] = len(embedding)
        enriched_chunk['embedding_model'] = model_name
        enriched_chunks.append(enriched_chunk)
    
    # Guardar chunks enriquecidos con embeddings
    with open(embeddings.path, 'w', encoding='utf-8') as f:
        json.dump(enriched_chunks, f, indent=2, ensure_ascii=False)
    
    print(f"✅ Chunks enriquecidos guardados")


@component(
    base_image="pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel",
    packages_to_install=[
        "elasticsearch==8.11.0"
    ]
)
def index_elasticsearch_component(
    enriched_chunks: Input[Dataset],
    es_endpoint: str,
    es_index: str,
    index_status: Output[Dataset]
):
    """
    Indexa chunks enriquecidos en ElasticSearch.
    
    Args:
        enriched_chunks: Input dataset con chunks y embeddings
        es_endpoint: Endpoint de ElasticSearch
        es_index: Nombre del índice
        index_status: Output dataset con status de indexación
    """
    import json
    from datetime import datetime
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk
    
    # The 8.x client rejects host strings without a scheme (for example
    # "elasticsearch:9200"); default to plain HTTP when none is given.
    if "://" not in es_endpoint:
        es_endpoint = f"http://{es_endpoint}"
    
    # Conectar a ElasticSearch
    try:
        es = Elasticsearch([es_endpoint], verify_certs=False)
        
        if not es.ping():
            raise Exception("No se puede conectar a ElasticSearch")
        
        print(f"✅ Conectado a ElasticSearch: {es_endpoint}")
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback
        raise Exception(f"Error conectando a ElasticSearch: {str(e)}") from e
    
    # Leer chunks enriquecidos
    with open(enriched_chunks.path, 'r', encoding='utf-8') as f:
        chunks_data = json.load(f)
    
    print(f"📝 Indexando {len(chunks_data)} chunks en índice: {es_index}")
    
    # Definir mapping del índice
    index_mapping = {
        "mappings": {
            "properties": {
                "chunk_id": {"type": "keyword"},
                "text": {
                    "type": "text",
                    "analyzer": "standard"
                },
                "embedding": {
                    "type": "dense_vector",
                    "dims": chunks_data[0]['embedding_dim'] if chunks_data else 384,
                    "index": True,
                    "similarity": "cosine"
                },
                "source_document": {"type": "keyword"},
                "file_type": {"type": "keyword"},
                "chunk_index": {"type": "integer"},
                "total_chunks": {"type": "integer"},
                "token_count": {"type": "integer"},
                "char_count": {"type": "integer"},
                "word_count": {"type": "integer"},
                "processed_at": {"type": "date"},
                "indexed_at": {"type": "date"}
            }
        },
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        }
    }
    
    # Crear índice si no existe
    if not es.indices.exists(index=es_index):
        es.indices.create(index=es_index, body=index_mapping)
        print(f"✅ Índice creado: {es_index}")
    else:
        print(f"ℹ️ Índice ya existe: {es_index}")
    
    # Preparar documentos para bulk indexing
    documents = [
        {
            "_index": es_index,
            "_id": chunk['chunk_id'],
            "_source": {
                **chunk,
                "indexed_at": datetime.now().isoformat()
            }
        }
        for chunk in chunks_data
    ]
    
    # Indexar en batches
    try:
        success_count, failed_items = bulk(
            es,
            documents,
            chunk_size=100,
            request_timeout=300
        )
        failed_count = len(failed_items) if failed_items else 0
        
        print(f"✅ Indexación completada:")
        print(f"  Documentos exitosos: {success_count}")
        print(f"  Documentos fallidos: {failed_count}")
        
    except Exception as e:
        raise Exception(f"Error en bulk indexing: {str(e)}") from e
    
    # Refresh del índice
    es.indices.refresh(index=es_index)
    
    # Verificar indexación
    doc_count = es.count(index=es_index)['count']
    print(f"✅ Total documentos en índice: {doc_count}")
    
    # Preparar status de indexación
    indexing_status = {
        "index_name": es_index,
        "total_chunks": len(chunks_data),
        "indexed_chunks": success_count,
        "failed_chunks": failed_count,
        "total_documents_in_index": doc_count,
        "indexed_at": datetime.now().isoformat(),
        "success": failed_count == 0
    }
    
    # Guardar status
    with open(index_status.path, 'w', encoding='utf-8') as f:
        json.dump(indexing_status, f, indent=2)
    
    print(f"✅ Status de indexación guardado")
'''

    # Write the module, creating the package directory when this cell is run
    # before (or without) the project-structure cell.
    target = Path('components/vector_processing.py')
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as f:
        f.write(vector_processing_content)

    print("✅ Archivo creado: components/vector_processing.py")
    print("📋 Components incluidos:")
    print("  - generate_embeddings_component")
    print("  - index_elasticsearch_component")

# Generate components/vector_processing.py
create_vector_processing_components()

✅ Archivo creado: components/vector_processing.py
📋 Components incluidos:
  - generate_embeddings_component
  - index_elasticsearch_component


## 🔗 Pipeline Definition: RAG Pipeline Principal

Creamos el pipeline principal que orquesta todos los components

In [5]:
def create_rag_pipeline():
    """Write ``pipelines/rag_pipeline.py`` with the RAG pipeline definition.

    The generated module defines:

    * ``rag_document_pipeline``: a KFP pipeline chaining text extraction,
      chunking, embedding generation and Elasticsearch indexing, returning
      the indexing status artifact.
    * ``rag_batch_pipeline``: an undecorated placeholder for future batch
      processing.

    The module source is held in a string literal. The ``pipelines``
    directory is created if it does not exist. A short confirmation is
    printed after the file is written.
    """

    rag_pipeline_content = '''"""
RAG Pipeline Principal - OpenShift AI Data Science Pipeline
Orquesta todos los components para procesamiento completo de documentos
"""

from kfp import dsl
from kfp.dsl import pipeline, Dataset

# Importar components
from components.text_processing import extract_text_component, chunk_text_component
from components.vector_processing import generate_embeddings_component, index_elasticsearch_component

# NOTE(security): the MinIO credential defaults below end up in clear text in
# the compiled pipeline YAML. Prefer passing them at run time or mounting them
# from a Kubernetes Secret.
@pipeline(
    name="rag-document-processing-v1",
    description="Pipeline completo de procesamiento de documentos RAG para OpenShift AI"
)
def rag_document_pipeline(
    bucket_name: str = "raw-documents",
    object_key: str = "",
    minio_endpoint: str = "minio-rag:9000",
    minio_access_key: str = "ragadmin",
    minio_secret_key: str = "RAGSecurePass123!",
    es_endpoint: str = "elasticsearch:9200",
    es_index: str = "rag-documents",
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Dataset:
    """
    Pipeline completo de procesamiento de documentos RAG.
    
    Args:
        bucket_name: Nombre del bucket en MinIO
        object_key: Path del archivo a procesar
        minio_endpoint: Endpoint de MinIO
        minio_access_key: Access key de MinIO
        minio_secret_key: Secret key de MinIO
        es_endpoint: Endpoint de ElasticSearch
        es_index: Nombre del índice en ElasticSearch
        chunk_size: Tamaño de chunks en tokens
        chunk_overlap: Overlap entre chunks en tokens
        embedding_model: Modelo para generar embeddings
    
    Returns:
        Status de indexación final
    """
    
    # Step 1: Extract text from document
    extract_task = extract_text_component(
        bucket_name=bucket_name,
        object_key=object_key,
        minio_endpoint=minio_endpoint,
        minio_access_key=minio_access_key,
        minio_secret_key=minio_secret_key
    )
    extract_task.set_display_name("📄 Extract Text")
    extract_task.set_cpu_limit("500m")
    extract_task.set_memory_limit("1Gi")
    extract_task.set_retry(3)
    
    # Step 2: Chunk the extracted text
    chunk_task = chunk_text_component(
        extracted_text=extract_task.outputs['extracted_text'],
        metadata=extract_task.outputs['metadata'],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    ).after(extract_task)
    chunk_task.set_display_name("🧩 Chunk Text")
    chunk_task.set_cpu_limit("500m")
    chunk_task.set_memory_limit("1Gi")
    chunk_task.set_retry(3)
    
    # Step 3: Generate embeddings
    embedding_task = generate_embeddings_component(
        chunks=chunk_task.outputs['chunks'],
        model_name=embedding_model
    ).after(chunk_task)
    embedding_task.set_display_name("🎯 Generate Embeddings")
    embedding_task.set_cpu_limit("1000m")
    embedding_task.set_memory_limit("4Gi")
    embedding_task.set_retry(2)
    # embedding_task.set_gpu_limit("1")  # Uncomment si hay GPUs disponibles
    
    # Step 4: Index in ElasticSearch
    index_task = index_elasticsearch_component(
        enriched_chunks=embedding_task.outputs['embeddings'],
        es_endpoint=es_endpoint,
        es_index=es_index
    ).after(embedding_task)
    index_task.set_display_name("🔍 Index ElasticSearch")
    index_task.set_cpu_limit("500m")
    index_task.set_memory_limit("2Gi")
    index_task.set_retry(3)
    
    # Return final status (the "-> Dataset" annotation declares this artifact
    # as the pipeline output; KFP v2 rejects an undeclared returned output)
    return index_task.outputs['index_status']


# Pipeline alternativo para batch processing
#
# NOTE: intentionally NOT decorated with @pipeline yet. KFP v2 runs the
# function body at decoration time and raises "Task is missing from pipeline."
# when no task is created, which would make this whole module fail on import.
# Once the body creates at least one task, decorate it with
# name="rag-batch-processing-v1" and
# description="Pipeline para procesamiento batch de múltiples documentos".
def rag_batch_pipeline(
    bucket_name: str = "raw-documents",
    file_pattern: str = "*.pdf",
    es_endpoint: str = "elasticsearch:9200",
    es_index: str = "rag-documents",
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
    batch_size: int = 10
):
    """
    Pipeline para procesamiento batch de múltiples documentos.
    Útil para procesamiento inicial de grandes volúmenes.
    """
    # TODO: Implementar en futuras versiones
    # - Listar archivos en bucket por patrón
    # - Procesar en batches paralelos
    # - Consolidar resultados
    raise NotImplementedError("rag_batch_pipeline: procesamiento batch pendiente de implementar")


if __name__ == "__main__":
    # Para testing local del pipeline definition
    print("✅ RAG Pipeline definido correctamente")
    print("📋 Pipeline functions disponibles:")
    print("  - rag_document_pipeline: Procesamiento individual")
    print("  - rag_batch_pipeline: Procesamiento batch (TODO)")
'''

    # Write the module, creating the package directory when this cell is run
    # before (or without) the project-structure cell.
    target = Path('pipelines/rag_pipeline.py')
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as f:
        f.write(rag_pipeline_content)

    print("✅ Archivo creado: pipelines/rag_pipeline.py")
    print("📋 Pipeline functions incluidas:")
    print("  - rag_document_pipeline: Pipeline principal")
    print("  - rag_batch_pipeline: Pipeline batch (placeholder)")

# Generate pipelines/rag_pipeline.py
create_rag_pipeline()

✅ Archivo creado: pipelines/rag_pipeline.py
📋 Pipeline functions incluidas:
  - rag_document_pipeline: Pipeline principal
  - rag_batch_pipeline: Pipeline batch (placeholder)


## ⚙️ Configuration Files

Creamos los archivos de configuración necesarios para el deployment

In [7]:
def create_configuration_files():
    """Write the project's configuration files.

    Four files are created relative to the current working directory:

    * ``config/pipeline_config.yaml``: pipeline metadata, per-component
      resource limits, default parameters and storage endpoints.
    * ``config/secrets.yaml``: a Kubernetes ``Secret`` template.
    * ``requirements.txt``: pinned Python dependencies.
    * ``config/env.template``: environment-variable template.

    Values that appear in more than one file (endpoints, index name, chunking
    defaults, placeholder credentials) are defined once below so the files
    cannot drift apart. The ``config`` directory must already exist.
    """

    # Shared values, single source of truth for every file written below.
    namespace = "rag-openshift-ai"
    minio_endpoint = "minio-rag:9000"
    es_endpoint = "elasticsearch:9200"
    es_index = "rag-documents"
    chunk_size = 512
    chunk_overlap = 50
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    # NOTE(security): placeholder credentials for the templates only; they
    # must be replaced before deploying and never committed with real values.
    minio_access_key = "minio"
    minio_secret_key = "minio123"
    webhook_token = "rag-pipeline-token-change-me"

    # 1. Pipeline Configuration
    pipeline_config = {
        "pipeline": {
            "name": "rag-document-processing-v1",
            "version": "1.0.0",
            "description": "RAG Document Processing Pipeline para OpenShift AI",
            "author": "RAG Team",
            "created": datetime.now().isoformat()
        },
        "components": {
            "base_image": "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel",
            "extract_text": {
                "cpu_limit": "500m",
                "memory_limit": "1Gi",
                "retry_limit": 3
            },
            "chunk_text": {
                "cpu_limit": "500m",
                "memory_limit": "1Gi",
                "retry_limit": 3
            },
            "generate_embeddings": {
                "cpu_limit": "1000m",
                "memory_limit": "4Gi",
                "retry_limit": 2,
                "gpu_limit": "0"  # Set to "1" when a GPU is available
            },
            "index_elasticsearch": {
                "cpu_limit": "500m",
                "memory_limit": "2Gi",
                "retry_limit": 3
            }
        },
        "default_parameters": {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "embedding_model": embedding_model,
            "es_index": es_index
        },
        "storage": {
            "minio": {
                "endpoint": minio_endpoint,
                "bucket_raw": "raw-documents",
                "bucket_processed": "processed-documents",
                "bucket_failed": "failed-documents"
            },
            "elasticsearch": {
                "endpoint": es_endpoint,
                "index_prefix": "rag-",
                "replicas": 0,
                "shards": 1
            }
        }
    }

    with open('config/pipeline_config.yaml', 'w', encoding='utf-8') as f:
        yaml.dump(pipeline_config, f, indent=2, default_flow_style=False)

    print("✅ Creado: config/pipeline_config.yaml")

    # 2. Secrets Template
    secrets_template = {
        "apiVersion": "v1",
        "kind": "Secret",
        "metadata": {
            "name": "rag-pipeline-secrets",
            "namespace": namespace
        },
        "type": "Opaque",
        "stringData": {
            "minio-access-key": minio_access_key,
            "minio-secret-key": minio_secret_key,
            "elasticsearch-username": "",
            "elasticsearch-password": "",
            "pipeline-webhook-token": webhook_token
        }
    }

    with open('config/secrets.yaml', 'w', encoding='utf-8') as f:
        yaml.dump(secrets_template, f, indent=2, default_flow_style=False)

    print("✅ Creado: config/secrets.yaml")

    # 3. Requirements file
    requirements_content = '''# RAG Pipeline Requirements
# Core KFP
kfp>=2.0.0

# Document Processing
PyPDF2==3.0.1
python-docx==0.8.11
chardet==5.2.0

# Text Processing
tiktoken==0.5.1
langchain==0.0.350

# ML/Embeddings
sentence-transformers==2.2.2
torch>=2.0.1
numpy==1.24.3

# Storage/DB
minio==7.1.17
elasticsearch==8.11.0

# Web/API (for webhook)
flask==2.3.3
requests==2.31.0

# Utilities
pyyaml==6.0.1
python-dateutil==2.8.2
'''

    with open('requirements.txt', 'w', encoding='utf-8') as f:
        f.write(requirements_content)

    print("✅ Creado: requirements.txt")

    # 4. Environment variables template
    # MINIO_ENDPOINT previously read "minio:9000", contradicting the
    # "minio-rag:9000" endpoint written to pipeline_config.yaml above; both
    # now come from the same variable.
    env_template = f'''# RAG Pipeline Environment Variables
# Copy to .env and modify as needed

# MinIO Configuration
MINIO_ENDPOINT={minio_endpoint}
MINIO_ACCESS_KEY={minio_access_key}
MINIO_SECRET_KEY={minio_secret_key}
MINIO_SECURE=false

# ElasticSearch Configuration  
ES_ENDPOINT={es_endpoint}
ES_INDEX={es_index}
ES_USERNAME=
ES_PASSWORD=

# Pipeline Configuration
CHUNK_SIZE={chunk_size}
CHUNK_OVERLAP={chunk_overlap}
EMBEDDING_MODEL={embedding_model}

# OpenShift AI Pipeline
KFP_HOST=http://ml-pipeline:8888
PIPELINE_NAMESPACE={namespace}

# Webhook Configuration
WEBHOOK_PORT=8080
WEBHOOK_TOKEN={webhook_token}
'''

    with open('config/env.template', 'w', encoding='utf-8') as f:
        f.write(env_template)

    print("✅ Creado: config/env.template")

    print("\n📋 Archivos de configuración creados:")
    print("  config/pipeline_config.yaml - Configuración del pipeline")
    print("  config/secrets.yaml - Template de secrets K8s")
    print("  requirements.txt - Dependencias Python")
    print("  config/env.template - Variables de ambiente")

# Generate the configuration files (YAML config, secrets template, requirements, env template)
create_configuration_files()

✅ Creado: config/pipeline_config.yaml
✅ Creado: config/secrets.yaml
✅ Creado: requirements.txt
✅ Creado: config/env.template

📋 Archivos de configuración creados:
  config/pipeline_config.yaml - Configuración del pipeline
  config/secrets.yaml - Template de secrets K8s
  requirements.txt - Dependencias Python
  config/env.template - Variables de ambiente
