# ARB-BOT - INGESTA V2.0 (Word)

**Antes de ejecutar:**
1. Sube: `MANUAL DE CONVIVENCIA ESCOLAR ROLDANISTA 2023.docx`
2. Ejecuta celda por celda

In [None]:
# 1) Install dependencies
# Notebook shell escape (not plain Python): quietly (-q) installs the Supabase
# client, sentence-transformers, the psycopg2 PostgreSQL driver and python-docx.
!pip install -q supabase sentence-transformers psycopg2-binary python-docx
print('Dependencias instaladas')

In [None]:
# 2) Imports
import os
import json
import re
import psycopg2
from psycopg2.extras import execute_values
from sentence_transformers import SentenceTransformer
from docx import Document
from datetime import datetime, timezone

print('Imports listos')

In [None]:
# 3) Configuration

# Connection settings for the Supabase Postgres pooler.
DB_HOST = "aws-1-us-east-1.pooler.supabase.com"
DB_NAME = "postgres"
DB_USER = "postgres.ympekltzqzlsbdgbzbpz"
# SECURITY: the password must never be written into the notebook. It is read
# from the ARBOT_DB_PASS environment variable; when that is unset,
# get_connection() asks for it interactively on first use. Any password that
# was ever committed in clear text should be rotated.
DB_PASS = os.environ.get("ARBOT_DB_PASS")
DB_PORT = "6543"

# Target schema and table of the vector store.
SCHEMA = "vecs"
TABLE = "arbot_documents"

# Word document to ingest; it must be uploaded next to the notebook.
WORD_FILE = "MANUAL DE CONVIVENCIA ESCOLAR ROLDANISTA 2023.docx"


def get_connection():
    """Open and return a new psycopg2 connection built from the DB_* settings.

    If no password is configured yet (ARBOT_DB_PASS unset or empty), the user
    is prompted once without echo and the answer is cached in the module-level
    ``DB_PASS`` so later calls do not prompt again.

    Returns:
        A new, open psycopg2 connection. The caller is responsible for
        closing it.

    Raises:
        Whatever ``psycopg2.connect`` raises when the connection fails.
    """
    global DB_PASS
    if not DB_PASS:
        # Local import: only needed on the interactive fallback path.
        from getpass import getpass
        DB_PASS = getpass("Contrasena de la base de datos (Supabase): ")
    return psycopg2.connect(
        host=DB_HOST,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASS,
        port=DB_PORT,
    )

# Smoke test: open one connection and close it immediately, so that wrong
# settings are reported here rather than in the middle of the ingestion.
print('Probando conexion...')
try:
    conn_test = get_connection()
    conn_test.close()
    print('Conexion OK')
except Exception as e:
    # Best-effort check: print the failure and let the notebook keep running.
    print(f'Error: {e}')

In [None]:
# 4) Function to read the Word file (with estimated pages)

def read_word_extract_text(docx_path):
    """Read a .docx file and return its non-empty paragraphs and joined text.

    The "page" stored for each paragraph is only an estimate: the paragraph's
    relative position in the document scaled by the number of document
    sections, plus one.

    Args:
        docx_path: Path of the Word document to open.

    Returns:
        A dict with:
          - "text": every non-empty paragraph followed by a blank line,
          - "paragraphs": list of {"index", "text", "page"} dicts,
          - "total_pages": the number of sections in the document.
    """
    document = Document(docx_path)

    print(f"Leyendo: {docx_path}")

    # The section count is used as a stand-in for the page count.
    section_count = len(document.sections)
    all_paragraphs = document.paragraphs
    paragraph_count = len(all_paragraphs)

    paragraphs = []
    for position, para in enumerate(all_paragraphs):
        stripped = para.text.strip()
        if not stripped:
            continue
        paragraphs.append({
            "index": position,
            "text": stripped,
            # Estimated page from the paragraph's relative position.
            "page": int((position / paragraph_count) * section_count) + 1,
        })

    full_text = "".join(entry["text"] + "\n\n" for entry in paragraphs)

    print(f'Leidos: {len(paragraphs)} parrafos')
    print(f'Secciones/Paginas: {section_count}')
    print("\nPreview:")
    separator = "-"*40
    print(separator)
    print(full_text[:500])
    print(separator)

    return {"text": full_text, "paragraphs": paragraphs, "total_pages": section_count}

print('Funcion Word lista (con paginas)')

In [None]:
# 5) Full hierarchical chunking (with paragraphs and keywords)

# Spanish function words that must never be reported as keywords. Entries are
# stored WITHOUT accents; candidate words are accent-folded before the lookup.
_STOPWORDS = frozenset({
    'el', 'la', 'los', 'las', 'de', 'del', 'en', 'con', 'por', 'para', 'que',
    'se', 'su', 'sus', 'un', 'una', 'al', 'es', 'son', 'como', 'este', 'esta',
    'estos', 'estas', 'lo', 'le', 'les', 'ser', 'hacer', 'puede', 'debe',
    'cada', 'todo', 'toda', 'todos', 'todas', 'sin', 'sobre', 'entre', 'desde',
    'hasta', 'cuando', 'donde', 'porque', 'esto', 'eso', 'asi', 'mas', 'menos',
    'muy', 'bien', 'mal', 'solo', 'mismo', 'misma', 'otros', 'otras', 'otro',
    'otra', 'hay', 'han', 'sido', 'estan', 'tiene', 'tienen', 'cual', 'cuales',
    'segun', 'mediante', 'dentro', 'fuera', 'antes', 'despues', 'durante',
    'siempre', 'nunca', 'tambien', 'pero', 'sino', 'aunque', 'mientras',
    'siendo', 'sera', 'seran', 'fueron', 'fue',
})

# Lower-case Spanish words of four or more letters. "ü" is part of the class
# so that words such as "vergüenza" are not dropped.
_WORD_RE = re.compile(r'\b[a-záéíóúüñ]{4,}\b')

# Folds accented vowels so that e.g. "también" hits the stopword "tambien".
_ACCENT_FOLD = str.maketrans('áéíóúü', 'aeiouu')


def extract_keywords(text, max_kw=5):
    """Return the most frequent non-stopword words of ``text``.

    Words are lower-cased, must be at least four letters long, and are dropped
    when their accent-folded form is a stopword (so "también" and "tambien"
    are both filtered). Returned keywords keep their original accents.

    Args:
        text: Text to analyse.
        max_kw: Maximum number of keywords to return.

    Returns:
        A list of keywords ordered by descending frequency; ties keep the
        order of first appearance in ``text``.
    """
    from collections import Counter

    counts = Counter(
        word
        for word in _WORD_RE.findall(text.lower())
        if word.translate(_ACCENT_FOLD) not in _STOPWORDS
    )
    # sorted() is stable, so equally frequent words stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _ in ranked[:max_kw]]

def chunk_hierarchical_legal(full_text):
    """Split a legal-style text into chunks at its structural headings.

    A new chunk starts at every line recognised as a TITULO, CAPITULO,
    ARTICULO or PARAGRAFO heading (accented or not, any letter case). Each
    chunk records the headings it lives under; starting a heading clears all
    deeper levels so no stale article/paragraph leaks into the next chunk.

    Args:
        full_text: Whole document text, one logical line per ``\\n``.

    Returns:
        A list of ``{'text': str, 'meta': dict}`` dicts in document order.
        ``meta`` holds ``title``, ``chapter``, ``article``, ``paragraph``
        (heading line or None), ``keywords``, ``chunk_tokens`` (whitespace
        separated word count), ``chunk_index`` and ``ingestion_date`` (one
        UTC ISO timestamp shared by every chunk of the call).
    """
    # With (?i) the roman-numeral class also matches ordinary lower-case
    # letters, so a sentence such as "Titulo de bachiller" or "Capitulo
    # dedicado" would look like a heading. Requiring that the numeral (or the
    # word PARAGRAFO) is not glued to another letter avoids those false hits
    # while still accepting "TITULO IV.", "Capitulo 2:" or "Paragrafo 1".
    not_a_letter = r'(?![a-záéíóúüñ])'
    # Ordered from the outermost to the innermost level; first match wins.
    heading_patterns = (
        ('title', re.compile(r'(?i)^(?:TITULO|TÍTULO)\s*[IVXLCDM\d]+' + not_a_letter)),
        ('chapter', re.compile(r'(?i)^(?:CAPITULO|CAPÍTULO)\s*[IVXLCDM\d]+' + not_a_letter)),
        ('article', re.compile(r'(?i)^(?:ARTICULO|ARTÍCULO)\s*\d+')),
        ('paragraph', re.compile(r'(?i)^(?:PARAGRAFO|PARÁGRAFO)' + not_a_letter)),
    )
    levels = [name for name, _ in heading_patterns]

    current = dict.fromkeys(levels)  # heading line per level, None when unset
    text_lines = []                  # lines of the chunk being accumulated
    chunks = []

    def save():
        """Store the accumulated lines as a chunk unless they are blank."""
        txt = '\n'.join(text_lines).strip()
        if not txt:
            return
        meta = dict(current)
        meta['keywords'] = extract_keywords(txt)
        meta['chunk_tokens'] = len(txt.split())
        chunks.append({'text': txt, 'meta': meta})

    for line in full_text.split('\n'):
        s = line.strip()
        depth = next(
            (i for i, (_, pat) in enumerate(heading_patterns) if pat.match(s)),
            None,
        )
        if depth is None:
            # Body text (or a blank line, kept to preserve paragraph breaks).
            text_lines.append(s)
            continue

        # A heading closes the running chunk and opens a new one.
        save()
        current[levels[depth]] = s
        for deeper in levels[depth + 1:]:
            current[deeper] = None
        text_lines[:] = [s]

    save()

    # One timestamp for the whole run so every chunk of an ingestion matches.
    ingested_at = datetime.now(timezone.utc).isoformat()
    for i, c in enumerate(chunks):
        c['meta']['chunk_index'] = i
        c['meta']['ingestion_date'] = ingested_at

    print(f'Chunking: {len(chunks)} chunks')
    print(f'  - Con paragrafos detectados')
    print(f'  - Con keywords extraidas')
    print(f'  - Con chunk_tokens calculados')
    return chunks

print('Funcion chunking COMPLETO lista')

In [None]:
# 6) Embedding model

# Load the sentence-transformers model once and keep it in the module-level
# name `model`, which make_embeddings reads when encoding text.
print('Cargando modelo...')
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Report the embedding dimension of the loaded model.
print(f'Modelo cargado - dimension: {model.get_sentence_embedding_dimension()}')

def make_embeddings(texts, batch_size=32):
    """Encode ``texts`` with the global ``model`` and return plain lists.

    The texts are sent to ``model.encode`` in consecutive slices of
    ``batch_size`` items, and each resulting vector is converted with
    ``.tolist()``.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts passed to ``model.encode`` per call.

    Returns:
        A list with one embedding (itself a list) per input text, in order.
    """
    vectors = []
    for start in range(0, len(texts), batch_size):
        encoded = model.encode(
            texts[start:start + batch_size],
            show_progress_bar=True,
            convert_to_numpy=True,
        )
        vectors += [row.tolist() for row in encoded]
    return vectors

print('Embeddings listos')

In [None]:
# 7) Upload to Supabase

def upload_chunks(chunks, conn):
    """Embed the chunks and upsert them into ``{SCHEMA}.{TABLE}``.

    Chunks whose text is blank are skipped. Each row id is
    ``"<meta.file or 'doc'>_<meta.chunk_index or position>"``; rows with an
    existing id are overwritten (ON CONFLICT ... DO UPDATE).

    Args:
        chunks: Iterable of ``{'text': str, 'meta': dict}`` dicts.
        conn: Open psycopg2 connection. The insert is committed on success
            and rolled back on failure; the connection itself is left open.

    Raises:
        ValueError: if the number of embeddings returned does not match the
            number of records to insert.
        Exception: any database error is re-raised after the rollback.
    """
    records = []
    for i, c in enumerate(chunks):
        text = c['text'].strip()
        if not text:
            continue
        meta = c.get('meta', {})
        chunk_id = f"{meta.get('file', 'doc')}_{meta.get('chunk_index', i)}"
        records.append((chunk_id, text, json.dumps(meta)))

    print(f'Generando embeddings para {len(records)} chunks...')
    embeddings = make_embeddings([text for _, text, _ in records])
    if len(embeddings) != len(records):
        # zip() would silently drop rows; fail loudly instead.
        raise ValueError(
            f'Expected {len(records)} embeddings, got {len(embeddings)}'
        )

    print('Subiendo a Supabase...')
    insert_sql = f"""
        INSERT INTO {SCHEMA}.{TABLE} (id, vec, text, metadata) 
        VALUES %s 
        ON CONFLICT (id) DO UPDATE SET vec=EXCLUDED.vec, text=EXCLUDED.text, metadata=EXCLUDED.metadata
    """

    # The vector is sent as its text literal "[v1,v2,...]" and cast server-side.
    to_insert = [
        (rid, '[' + ','.join(map(str, emb)) + ']', txt, meta)
        for (rid, txt, meta), emb in zip(records, embeddings)
    ]

    try:
        # The cursor is only opened once the slow embedding step is done and
        # is closed by the context manager even when the insert fails.
        with conn.cursor() as cur:
            execute_values(cur, insert_sql, to_insert, template="(%s, %s::vector, %s, %s::jsonb)")
        conn.commit()
    except Exception:
        # Leave the connection usable for the caller instead of stuck in an
        # aborted transaction.
        conn.rollback()
        raise
    print(f'Subidos {len(to_insert)} chunks')

print('Funcion subida lista')

In [None]:
# 8) RUN INGESTION
#
# Reads the Word file, chunks it, wipes the target table and uploads the new
# chunks. Two safety rules apply:
#   * if chunking produced nothing, the existing table is left untouched;
#   * the database connection is always closed, even when a step raises.

if not os.path.exists(WORD_FILE):
    print(f"No se encuentra: {WORD_FILE}")
    print("Sube el archivo Word antes de ejecutar.")
else:
    print('='*50)
    print('INGESTA DESDE WORD')
    print('='*50)

    print('\nPaso 1: Leyendo Word...')
    data = read_word_extract_text(WORD_FILE)
    total_pages = data.get('total_pages', 192)

    print('\nPaso 2: Generando chunks...')
    chunks = chunk_hierarchical_legal(data['text'])

    total_chunks = len(chunks)
    if total_chunks == 0:
        # Nothing to upload: do NOT delete the data that is already stored.
        print('   No se generaron chunks; se conservan los datos existentes.')
    else:
        # Assign an estimated page to every chunk from its relative position.
        for i, c in enumerate(chunks):
            c['meta']['file'] = WORD_FILE
            c['meta']['page'] = int((i / total_chunks) * total_pages) + 1

        print(f'   Paginas asignadas (1 a {total_pages})')

        print('\nPaso 3: Conectando...')
        conn = get_connection()
        try:
            print('\nPaso 4: Borrando datos anteriores...')
            with conn.cursor() as cur:
                cur.execute(f"SELECT count(*) FROM {SCHEMA}.{TABLE}")
                datos_anteriores = cur.fetchone()[0]
                print(f'   Anteriores: {datos_anteriores} chunks')
                cur.execute(f"DELETE FROM {SCHEMA}.{TABLE};")
            # NOTE: the DELETE is committed before the upload starts, so a
            # failure in step 5 leaves the table empty until the cell is
            # run again.
            conn.commit()
            print('   Tabla limpiada')

            print('\nPaso 5: Subiendo chunks...')
            upload_chunks(chunks, conn)

            print('\nPaso 6: Verificando...')
            with conn.cursor() as cur:
                cur.execute(f"SELECT count(*) FROM {SCHEMA}.{TABLE}")
                total = cur.fetchone()[0]
        finally:
            conn.close()

        print('\n' + '='*50)
        print(f'INGESTA COMPLETADA - {total} chunks')
        print(f'Con metadatos: title, chapter, article, paragraph,')
        print(f'               page, keywords, chunk_tokens')
        print('='*50)

In [None]:
# 9) TEST: check that article 52 was ingested
#
# Looks for any stored chunk whose text mentions "articulo 52" (with or
# without accent, case-insensitive) and prints its first 600 characters.

query = f"""
    SELECT LEFT(text, 600) 
    FROM {SCHEMA}.{TABLE} 
    WHERE text ILIKE '%articulo 52%' OR text ILIKE '%artículo 52%'
    LIMIT 1
"""

conn = get_connection()
try:
    # The cursor and connection are released even if the query fails.
    with conn.cursor() as cur:
        cur.execute(query)
        result = cur.fetchone()
finally:
    conn.close()

if result:
    print('Articulo 52 encontrado:')
    print('-'*50)
    print(result[0])
    print('-'*50)
else:
    print('No se encontro el articulo 52')