In [None]:
import sys, pathlib
# Install into the SAME interpreter that runs this notebook's kernel
# (`sys.executable`), rather than whatever `pip` happens to be first on PATH.
!{sys.executable} -m pip install -q spacy sentence-transformers scikit-learn unidecode pandas
# Download the medium Spanish spaCy model (the analysis cell tries to load it first).
!{sys.executable} -m spacy download es_core_news_md
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
pathlib.Path("resumen de analisis").mkdir(exist_ok=True)
print("✅ Setup listo.")

In [None]:
# ================== PARAMETERS ==================
FILE_PATH = "../data/archivo.txt"  # 👈 CHANGE THIS: path of the text file to analyse

# Terms describing the thematic axis; used as the default `axis_terms` of
# run_analysis, where sentences are scored by similarity to them.
AXIS_TERMS = [
    "interdisciplina","interdisciplinar","interdisciplinario",
    "trabajo interdisciplinario","cruce de disciplinas","colaboración entre disciplinas"
]

TOP_SENTENCES = 120           # candidate sentences kept before trimming them down to snippets
SIM_THRESHOLD_PERCENTILE = 55 # deliberately lenient so that material is not lost
# Word-count bounds for a snippet (defaults of run_analysis' min_snip / max_snip).
MIN_SNIPPET_WORDS = 2
MAX_SNIPPET_WORDS = 8
# Word-count bounds for a concept (defaults of run_analysis' min_conc / max_conc).
MIN_CONCEPT_WORDS = 2
MAX_CONCEPT_WORDS = 3

# Anchor vocabulary: _contains_anchor checks phrase tokens against this set,
# and phrases without any anchor word are discarded.
DESIGN_LEXICON = {
    "diseño","proyecto","prototipo","investigación","experiencia","narrativa","interfaz",
    "materialidad","tipografía","espacio","interacción","usuario","visual","sonoro",
    "exhibición","instalación","especulativo","metodología","iteración","proceso",
    "tecnología","multimedia","analogico","digital","sensorial","arquitectónico","inmersivo",
    "co-creación","cocreación","codiseño","co-diseño","usabilidad","prototipado"
}
# Phrases that _trim_phrase rejects outright when a trimmed phrase equals one of them.
STOP_SNIPPETS = {
    "gracias","muchas gracias","ok","vale","sí","si","hola","buenas","buenos dias","buenas tardes","buenas noches",
    "este video","el video","video"
}

# ================== CÓDIGO BASE (no necesitas tocar) ==================
import re, json
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np
import pandas as pd

# NLP
import spacy
# Prefer the medium Spanish model; if loading it fails for any reason,
# fall back to the small one (which must then be installed).
try:
    nlp = spacy.load("es_core_news_md")
except Exception:
    nlp = spacy.load("es_core_news_sm")

# Embeddings (optional)
# `use_st` records whether Sentence-Transformers is usable. The functions below
# branch on it and use spaCy's own similarity when it is False.
use_st = True
try:
    from sentence_transformers import SentenceTransformer, util
    embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
except Exception:
    # Any failure (missing package, model cannot be loaded) disables the
    # embedding path instead of aborting the notebook.
    print("⚠️ Sin Sentence-Transformers: usaré similitud de spaCy como respaldo.")
    use_st = False

def _normalize_token(t):
    from unidecode import unidecode
    t = re.sub(r"[^\wáéíóúñüÁÉÍÓÚÑÜ-]+", "", t.lower())
    return unidecode(t)

def _contains_anchor(phrase_norm, axis_terms, lexicon):
    toks = phrase_norm.split()
    return any((tok in lexicon) or re.search(r"^interdisciplin", tok) for tok in toks)

def _trim_phrase(tokens, min_w, max_w, axis_terms, lexicon, stop_set):
    """Turn raw tokens into a normalised candidate phrase, or "" if rejected.

    Tokens are normalised with `_normalize_token`; those that normalise to
    an empty string are dropped. The phrase is rejected ("" is returned) when:
      * fewer than `min_w` tokens remain,
      * the phrase (cut to at most `max_w` tokens) is in `stop_set`,
      * it has three tokens or fewer and they are all the same word, or
      * `_contains_anchor` finds no anchor word in it.
    """
    words = []
    for raw in tokens:
        norm = _normalize_token(raw)
        if norm:
            words.append(norm)

    if len(words) < min_w:
        return ""

    words = words[:max_w]
    phrase = " ".join(words)

    rejected = (
        phrase in stop_set
        or (len(words) <= 3 and len(set(words)) == 1)
        or not _contains_anchor(phrase, axis_terms, lexicon)
    )
    return "" if rejected else phrase

def _extract_snippets(sentences, axis_terms, design_lexicon,
                      top_sents=120, sim_pct=55,
                      min_w=2, max_w=8,
                      stop_snippets=STOP_SNIPPETS):
    """Select the sentences closest to the axis and cut them into short snippets.

    Steps:
      1. Score every sentence against `axis_terms`. With Sentence-Transformers
         (`use_st`) this is cosine similarity to the mean axis embedding;
         otherwise spaCy's `Doc.similarity`.
      2. Keep as candidates the sentences at or above the `sim_pct` percentile
         (at most `top_sents`) plus those matching the axis keyword regex,
         visited in descending similarity order and capped at `top_sents`.
      3. From each candidate, propose phrases from (a) a token window around
         each keyword hit, (b) spaCy noun chunks, and (c) runs of consecutive
         non-stopword ADJ/NOUN/PROPN tokens. Each proposal goes through
         `_trim_phrase` and is kept once, in first-seen order.
      4. If fewer than 20 snippets result, also try the leading tokens of the
         first 60 candidate sentences.

    Args:
        sentences: Non-empty list of sentence strings.
        axis_terms: Terms describing the axis.
        design_lexicon: Anchor vocabulary forwarded to `_trim_phrase`.
        top_sents: Maximum number of candidate sentences.
        sim_pct: Percentile of the similarity scores used as threshold.
        min_w, max_w: Word-count bounds forwarded to `_trim_phrase`.
        stop_snippets: Phrases to reject, forwarded to `_trim_phrase`.

    Returns:
        (snippets, axis_vec): the list of unique snippets, and the mean axis
        embedding (None when the spaCy fallback is used).
    """
    axis_regex = re.compile(r"\binterdisciplin\w*|interdisciplina\w*\b", flags=re.IGNORECASE)

    # Similarity of every sentence to the axis.
    if use_st:
        sent_emb = embedder.encode(sentences, convert_to_tensor=True, normalize_embeddings=True)
        axis_emb = embedder.encode(axis_terms, convert_to_tensor=True, normalize_embeddings=True)
        axis_vec = axis_emb.mean(dim=0)
        sims = util.cos_sim(sent_emb, axis_vec).cpu().numpy().reshape(-1)
    else:
        axis_doc = nlp(" ".join(axis_terms))
        sims = np.array([nlp(s).similarity(axis_doc) for s in sentences], dtype="float32")
        axis_vec = None

    threshold = np.percentile(sims, sim_pct)
    idx_sorted = np.argsort(-sims)

    # Sets give O(1) membership tests; the previous lists made this selection
    # quadratic in the number of sentences.
    sem_idx = set([int(i) for i in idx_sorted if sims[i] >= threshold][:top_sents])
    kw_idx = {i for i, s in enumerate(sentences) if axis_regex.search(s)}
    eligible = sem_idx | kw_idx

    # NOTE(review): candidates are visited in similarity order and capped at
    # top_sents, so a keyword sentence with low similarity is dropped once the
    # cap is reached. Confirm whether keyword hits should always be kept.
    candidate_idx = []
    for i in idx_sorted:  # argsort indices are unique, so no extra dedupe is needed
        i = int(i)
        if i in eligible:
            candidate_idx.append(i)
        if len(candidate_idx) >= top_sents:
            break
    candidate_sents = [sentences[i] for i in candidate_idx]

    snippets, seen_snips = [], set()

    def _keep(tokens):
        # Trim/validate one proposal and record it the first time it appears.
        phrase = _trim_phrase(tokens, min_w, max_w, axis_terms, design_lexicon, stop_snippets)
        if phrase and phrase not in seen_snips:
            snippets.append(phrase)
            seen_snips.add(phrase)

    # Half-width of the keyword window; the full window never exceeds max_w tokens.
    radius = max(1, (max_w - 1)//2)

    for s in candidate_sents:
        d = nlp(s)
        toks = s.split()

        # Windows centred on each keyword occurrence
        for pos, t in enumerate(toks):
            if axis_regex.search(t):
                left = max(0, pos - radius)
                right = min(len(toks), pos + radius + 1)
                _keep(toks[left:right])

        # Noun chunks
        for nc in d.noun_chunks:
            _keep([t.text for t in nc])

        # Runs of consecutive ADJ/NOUN/PROPN tokens
        buf = []
        for t in d:
            if t.pos_ in ("ADJ","NOUN","PROPN") and not t.is_stop:
                buf.append(t.text)
            else:
                _keep(buf)
                buf = []
        _keep(buf)  # flush the run that reaches the end of the sentence

    # Top-up when too few snippets were found
    if len(snippets) < 20:
        for s in candidate_sents[:60]:
            # Two spare tokens compensate for tokens that normalise to nothing.
            _keep(s.split()[:max_w+2])

    return snippets, axis_vec

def _extract_concepts(snips, min_w=2, max_w=3, axis_terms=None, design_lexicon=None):
    """Count lemma-based concepts derived from snippets.

    For each snippet, the lemmas of its non-stopword ADJ/NOUN/PROPN tokens
    (lemma longer than one character) are lowercased and normalised. If at
    least `min_w` of them exist, the first `max_w` are joined with spaces to
    form a concept, which is counted only when `_contains_anchor` accepts it.
    With `design_lexicon` left as None the lexicon passed on is empty.

    Returns:
        collections.Counter mapping concept string -> number of occurrences.
    """
    content_pos = ("ADJ", "NOUN", "PROPN")
    terms = axis_terms or []
    lexicon = design_lexicon or set()

    tally = Counter()
    for snippet in snips:
        lemmas = [
            _normalize_token(tok.lemma_.lower())
            for tok in nlp(snippet)
            if tok.pos_ in content_pos and not tok.is_stop and len(tok.lemma_) > 1
        ]
        if len(lemmas) < min_w:
            continue
        concept = " ".join(lemmas[:max_w])
        if _contains_anchor(concept, terms, lexicon):
            tally[concept] += 1
    return tally

def run_analysis(file_path, axis_terms=AXIS_TERMS,
                 top_sentences=TOP_SENTENCES, sim_pct=SIM_THRESHOLD_PERCENTILE,
                 min_snip=MIN_SNIPPET_WORDS, max_snip=MAX_SNIPPET_WORDS,
                 min_conc=MIN_CONCEPT_WORDS, max_conc=MAX_CONCEPT_WORDS,
                 design_lexicon=DESIGN_LEXICON):
    """Analyse one text file and write snippets, concepts and clusters to disk.

    The file is read as UTF-8 (undecodable bytes are ignored), whitespace is
    collapsed, and the text is split into sentences with spaCy. Snippets and
    concepts are extracted, and the snippets are optionally clustered with
    KMeans when Sentence-Transformers is available.

    Output goes to "resumen de analisis/<stem>/":
      * <stem>_snippets.csv   - every snippet
      * <stem>_conceptos.csv  - the 50 most frequent concepts
      * <stem>_resumen.json   - axis terms, file, snippets (first 120),
                                concepts and clusters
    A readable summary is also printed.

    Args:
        file_path: Path of the text file to analyse.
        axis_terms: Terms describing the thematic axis.
        top_sentences: Maximum number of candidate sentences.
        sim_pct: Similarity percentile used as threshold for candidates.
        min_snip, max_snip: Word-count bounds for snippets.
        min_conc, max_conc: Word-count bounds for concepts.
        design_lexicon: Anchor vocabulary for snippets and concepts.

    Raises:
        ValueError: If no sentences are detected in the file.
    """
    # Read the text
    text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
    text = re.sub(r"\s+", " ", text).strip()
    doc = nlp(text)
    sentences = [s.text.strip() for s in doc.sents if s.text.strip()]
    if not sentences:
        raise ValueError("No se detectaron oraciones.")

    # Build snippets and concepts
    snippets, axis_vec = _extract_snippets(
        sentences, axis_terms, design_lexicon,
        top_sents=top_sentences, sim_pct=sim_pct,
        min_w=min_snip, max_w=max_snip, stop_snippets=STOP_SNIPPETS
    )
    # Computed once with the caller's settings and reused for every output.
    # Previously each output re-ran _extract_concepts(snippets) with its
    # defaults, which dropped the lexicon and the min/max word bounds.
    concept_freq = _extract_concepts(snippets, min_w=min_conc, max_w=max_conc,
                                     axis_terms=axis_terms, design_lexicon=design_lexicon)
    top_concepts = concept_freq.most_common(50)

    # Optional clusters (over the snippets)
    clusters_out = []
    if use_st and snippets:
        from sklearn.cluster import KMeans
        X = embedder.encode(snippets, normalize_embeddings=True)
        k = 2 if len(snippets) < 30 else (3 if len(snippets) < 60 else 4)
        # KMeans rejects n_clusters > n_samples (e.g. a single snippet).
        k = min(k, len(snippets))
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        labels = kmeans.fit_predict(X)
        buckets = defaultdict(list)
        for lab, sn in zip(labels, snippets):
            buckets[int(lab)].append(sn)
        for c, items in buckets.items():
            if axis_vec is not None:
                # Representative = the cluster member most similar to the axis.
                sims_sn = util.cos_sim(embedder.encode(items, convert_to_tensor=True, normalize_embeddings=True),
                                       axis_vec).cpu().numpy().reshape(-1)
                rep = items[int(np.argmax(sims_sn))]
            else:
                rep = items[0]
            clusters_out.append({
                "cluster": int(c),
                "n_snippets": len(items),
                "representative": rep,
                "snippets": items[:10]
            })

    # ====== OUTPUT INTO THE SUMMARY FOLDER ======
    stem = Path(file_path).stem
    out_dir = Path("resumen de analisis") / stem
    out_dir.mkdir(parents=True, exist_ok=True)

    # CSVs
    pd.DataFrame({"snippet": snippets}).to_csv(out_dir / f"{stem}_snippets.csv", index=False)
    # Explicit columns keep the header row even when no concept was found.
    pd.DataFrame([{"concept": w, "freq": f} for w, f in top_concepts],
                 columns=["concept", "freq"]
                 ).to_csv(out_dir / f"{stem}_conceptos.csv", index=False)

    # JSON
    summary = {
        "axis_terms": axis_terms,
        "file": str(file_path),
        "snippets": snippets[:120],
        "concepts": [{"concept": w, "freq": int(f)} for w, f in top_concepts],
        "clusters": clusters_out
    }
    (out_dir / f"{stem}_resumen.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")

    # Human-readable output
    print(f"=== ARCHIVO: {stem} ===")
    print(f"Resultados guardados en: {out_dir}")
    print("\n— Fragmentos (muestra) —")
    for s in snippets[:20]: print("-", s)
    print("\n— Conceptos (top 15) —")
    conc_top = concept_freq.most_common(15)
    for w, f in conc_top: print(f"- {w} ({f})")
    if clusters_out:
        print("\n— Subtemas (clusters) —")
        for cl in sorted(clusters_out, key=lambda d: d["cluster"]):
            print(f"\n[Cluster {cl['cluster']}] n={cl['n_snippets']}")
            print("Representativa:", cl["representative"])
            for s in cl["snippets"]: print("-", s)

# Run the analysis (uses the parameters declared above)
run_analysis(FILE_PATH, axis_terms=AXIS_TERMS)
