In [1]:
# ===============================================================
# 00_setup.ipynb - Configuración Inicial del Proyecto
# ===============================================================
#
# **Propósito**:
# - Validar estructura del proyecto
# - Verificar dependencias instaladas
# - Configurar paths usando utils_shared.py
# - Preparar entorno para ejecutar notebooks 01_ y 02_
#
# **Cuándo ejecutar**:
# - Primera vez que clonas el repositorio
# - Después de cambios en la estructura de carpetas
# - Para verificar que todo está correctamente instalado
#
# **Qué NO hace**:
# - No procesa datos (eso es 01_eda_understanding.ipynb)
# - No crea splits (eso es 02_create_splits.ipynb)
# - No entrena modelos (eso es 02_baseline_*.ipynb)
#
# ===============================================================

import importlib
import os, sys, pathlib, yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------------
# 1. Environment detection (local vs. Google Colab)
# ---------------------------------------------------------------
# Whether `google.colab` can be imported is used as the Colab indicator;
# the resulting IN_COLAB flag is read again when paths are configured.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# Report the detected environment followed by a short hint.
if IN_COLAB:
    print("üì¶ Entorno: Google Colab")
    print("   üí° En Colab puedes montar Google Drive para persistir datos")
else:
    print("üì¶ Entorno: Local (Jupyter/VSCode)")
    print("   üí° Ejecutando en entorno local")

üì¶ Entorno: Local (Jupyter/VSCode)
   üí° Ejecutando en entorno local


In [2]:
# ===============================================================
# 2. Path configuration via utils_shared.py
# ===============================================================
#
# Prefer the centralized configuration exposed by utils_shared.py.
# When that module cannot be imported, derive the paths manually from
# the current working directory.
#
# ===============================================================

try:
    from utils_shared import setup_paths
    print("‚úÖ Cargando paths desde utils_shared.py")

    paths = setup_paths()
    BASE_PATH = paths['BASE_PATH']
    DATA_PATH = paths['DATA_PATH']
    FORK_PATH = paths['FORK_PATH']
    SPLITS_PATH = paths['SPLITS_PATH']
    FIGS_PATH = paths['FIGS_PATH']

    print("   Usando configuraci√≥n centralizada")

except ImportError:
    print("‚ö†Ô∏è utils_shared.py no encontrado, usando configuraci√≥n manual")

    # When launched from inside notebooks/, the project root is one level up.
    _cwd = pathlib.Path.cwd()
    BASE_PATH = _cwd.parent if _cwd.name == "notebooks" else _cwd

    DATA_PATH = BASE_PATH / "data"
    FORK_PATH = BASE_PATH / "Spanish_Psych_Phenotyping_PY"
    SPLITS_PATH = DATA_PATH / "splits"
    FIGS_PATH = DATA_PATH / "figs"

    # Create the data directories if they do not exist yet
    # (DATA_PATH first, since the other two live inside it).
    for _directory in (DATA_PATH, SPLITS_PATH, FIGS_PATH):
        _directory.mkdir(exist_ok=True)

# In Colab, try to mount Google Drive and prefer its data folder if present.
if IN_COLAB:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=False)
        DRIVE_DATA_PATH = pathlib.Path("/content/drive/MyDrive/psych-data")

        if DRIVE_DATA_PATH.exists():
            # Re-point the data paths (and their sub-folders) at Drive.
            DATA_PATH = DRIVE_DATA_PATH
            SPLITS_PATH, FIGS_PATH = DATA_PATH / "splits", DATA_PATH / "figs"
            print("üìÅ Usando datos desde Google Drive")
    except Exception as mount_error:
        # Best effort: a failed mount keeps the previously configured paths.
        print(f"‚ö†Ô∏è No se pudo montar Google Drive: {mount_error}")

# Summary of the resolved paths; labels are padded to a common width.
print("\nüìÅ Paths configurados:")
for _label, _value in (
    ("BASE_PATH", BASE_PATH),
    ("DATA_PATH", DATA_PATH),
    ("FORK_PATH", FORK_PATH),
    ("SPLITS_PATH", SPLITS_PATH),
    ("FIGS_PATH", FIGS_PATH),
):
    print(f"   {_label + ':':<13}{_value}")

‚úÖ Cargando paths desde utils_shared.py
   Usando configuraci√≥n centralizada

üìÅ Paths configurados:
   BASE_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay
   DATA_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data
   FORK_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/Spanish_Psych_Phenotyping_PY
   SPLITS_PATH: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/splits
   FIGS_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/figs


In [3]:
# ===============================================================
# 3. Project structure check
# ===============================================================
#
# Checks that the key project folders exist.
# Missing folders are created automatically (except FORK_PATH).
#
# ===============================================================

print("\nüîç Verificando estructura del proyecto...\n")

# Directories that must exist (created when missing)
auto_create_dirs = {
    'data/': DATA_PATH,
    'data/splits/': SPLITS_PATH,
    'data/figs/': FIGS_PATH,
    'notebooks/': BASE_PATH / "notebooks",
    'configs/': BASE_PATH / "configs"
}

# Directories that are never auto-created (the user has to provide them)
required_dirs = {
    'Spanish_Psych_Phenotyping_PY/': FORK_PATH
}

# Report existing directories; create (and flag) the missing ones.
for label, directory in auto_create_dirs.items():
    if not directory.exists():
        directory.mkdir(parents=True, exist_ok=True)
        print(f"üÜï {label:30} {directory} (creado)")
        continue
    print(f"‚úÖ {label:30} {directory}")

# Validate directories that need manual action; only report, never create.
for label, directory in required_dirs.items():
    if directory.exists():
        print(f"‚úÖ {label:30} {directory}")
        continue
    print(f"‚ùå {label:30} {directory} (NO ENCONTRADO)")
    print("   ‚ö†Ô∏è El fork es necesario para baseline rule-based")
    print("   üí° Clona desde: https://github.com/[owner]/Spanish_Psych_Phenotyping_PY")

print("\n" + "=" * 60)


üîç Verificando estructura del proyecto...

‚úÖ data/                          /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data
‚úÖ data/splits/                   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/splits
‚úÖ data/figs/                     /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/figs
‚úÖ notebooks/                     /Users/manuelnunez/Projects/psych-phenotyping-paraguay/notebooks
‚úÖ configs/                       /Users/manuelnunez/Projects/psych-phenotyping-paraguay/configs
‚úÖ Spanish_Psych_Phenotyping_PY/  /Users/manuelnunez/Projects/psych-phenotyping-paraguay/Spanish_Psych_Phenotyping_PY



In [4]:
# ===============================================================
# 4. Python dependency check
# ===============================================================
#
# Checks that the required libraries can be imported.
# Libraries are grouped by category so the report is easier to read.
#
# ===============================================================


def import_status(lib):
    """Try to import ``lib`` and describe the outcome.

    Parameters
    ----------
    lib : str
        Importable module name (e.g. ``"sklearn"``).

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the import succeeds; otherwise ``(False, detail)``
        where ``detail`` is ``"NO INSTALADO"`` for an ``ImportError`` or an
        ``"ERROR AL IMPORTAR: ..."`` message for any other exception raised
        while the module is being imported.
    """
    try:
        importlib.import_module(lib)
    except ImportError:
        return False, "NO INSTALADO"
    except Exception as exc:
        # An installed-but-broken package can raise something other than
        # ImportError at import time; report it instead of aborting the
        # whole check.
        return False, f"ERROR AL IMPORTAR: {type(exc).__name__}: {exc}"
    return True, ""


print("\nüîç Verificando dependencias de Python...\n")

dependencies = {
    'Core Data Science': ['pandas', 'numpy', 'scipy'],
    'Machine Learning': ['sklearn', 'transformers', 'torch'],
    'Visualizaci√≥n': ['matplotlib', 'seaborn', 'plotly'],
    'NLP/Texto': ['nltk', 'spacy'],
    'Utilities': ['yaml', 'tqdm']
}

# Flips to False as soon as any library fails to import.
all_ok = True

for category, libs in dependencies.items():
    print(f"üì¶ {category}:")
    for lib in libs:
        ok, detail = import_status(lib)
        if ok:
            print(f"   ‚úÖ {lib}")
        else:
            print(f"   ‚ùå {lib} ({detail})")
            all_ok = False
    print()

if all_ok:
    print("‚úÖ Todas las dependencias est√°n instaladas correctamente")
else:
    print("‚ö†Ô∏è Faltan dependencias. Instalar con:")
    print("   pip install -r requirements.txt")

print("="*60)


üîç Verificando dependencias de Python...

üì¶ Core Data Science:
   ‚úÖ pandas
   ‚úÖ numpy
   ‚úÖ scipy

üì¶ Machine Learning:
   ‚úÖ sklearn
   ‚úÖ transformers
   ‚úÖ torch

üì¶ Visualizaci√≥n:
   ‚úÖ matplotlib
   ‚úÖ seaborn
   ‚úÖ plotly

üì¶ NLP/Texto:
   ‚úÖ nltk
   ‚úÖ spacy

üì¶ Utilities:
   ‚úÖ yaml
   ‚úÖ tqdm

‚úÖ Todas las dependencias est√°n instaladas correctamente


In [5]:
# ===============================================================
# 5. Data file check
# ===============================================================
#
# Checks that the key files of the workflow are available:
# - ips_raw.csv (original input)
# - ips_clean.csv (output of 01_eda)
# - splits/ (output of 02_create_splits)
#
# ===============================================================

print("\nüîç Verificando archivos de datos...\n")

# Key files and their purpose. Only entries flagged 'required' count as
# critical when missing; the others are reported as pending.
data_files = {
    'ips_raw.csv': {
        'path': DATA_PATH / 'ips_raw.csv',
        'required': True,
        'source': 'Archivo original del dataset',
        'used_by': '01_eda_understanding.ipynb'
    },
    'ips_clean.csv': {
        'path': DATA_PATH / 'ips_clean.csv',
        'required': False,
        'source': 'Generado por 01_eda_understanding.ipynb',
        'used_by': '02_create_splits.ipynb'
    },
    'splits/dataset_base.csv': {
        'path': SPLITS_PATH / 'dataset_base.csv',
        'required': False,
        'source': 'Generado por 02_create_splits.ipynb',
        'used_by': 'Todos los 02_baseline_*.ipynb'
    },
    'splits/train_indices.csv': {
        'path': SPLITS_PATH / 'train_indices.csv',
        'required': False,
        'source': 'Generado por 02_create_splits.ipynb',
        'used_by': 'Todos los 02_baseline_*.ipynb'
    },
    'splits/val_indices.csv': {
        'path': SPLITS_PATH / 'val_indices.csv',
        'required': False,
        'source': 'Generado por 02_create_splits.ipynb',
        'used_by': 'Todos los 02_baseline_*.ipynb'
    }
}

# Names of required files that were not found.
missing_critical = []

for name, info in data_files.items():
    exists = info['path'].exists()
    required = info['required']

    if exists:
        print(f"‚úÖ {name:30}")
        print(f"   üìÑ {info['path']}")

        # For CSV files, also show the column names (reads a single row only).
        if info['path'].suffix == '.csv':
            try:
                df_info = pd.read_csv(info['path'], nrows=1)
                print(f"   üìã Columnas: {list(df_info.columns)}")
            except Exception as exc:
                # Still best effort, but surface the problem instead of
                # silently hiding an unreadable or corrupt file. Unlike a
                # bare `except`, this lets KeyboardInterrupt through.
                print(f"   ‚ö†Ô∏è No se pudieron leer las columnas: {exc}")
    else:
        status = "‚ùå CR√çTICO" if required else "‚ö†Ô∏è Pendiente"
        print(f"{status} {name:30} (NO ENCONTRADO)")
        print(f"   üí° {info['source']}")
        print(f"   üîó Usado por: {info['used_by']}")

        if required:
            missing_critical.append(name)

    print()

# Final summary
print("="*60)
if missing_critical:
    print(f"‚ùå Faltan {len(missing_critical)} archivo(s) cr√≠tico(s):")
    for missing_name in missing_critical:
        print(f"   - {missing_name}")
    print("\n‚ö†Ô∏è No podr√°s ejecutar los notebooks hasta tener estos archivos")
else:
    print("‚úÖ Archivos cr√≠ticos disponibles")
    print("üí° Puedes ejecutar 01_eda_understanding.ipynb para comenzar")


üîç Verificando archivos de datos...

‚úÖ ips_raw.csv                   
   üìÑ /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/ips_raw.csv
   üìã Columnas: ['Archivo', 'Prontuario', 'Nombre Paciente', 'Sexo', 'Fecha Nacimiento', 'N¬∞ Consulta', 'Id', 'Fecha Consulta', 'Motivo Consulta', 'Tipo']

‚úÖ ips_clean.csv                 
   üìÑ /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/ips_clean.csv
   üìã Columnas: ['id_paciente', 'fecha', 'etiqueta', 'texto']

‚úÖ splits/dataset_base.csv       
   üìÑ /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/splits/dataset_base.csv
   üìã Columnas: ['row_id', 'texto', 'etiqueta']

‚úÖ splits/train_indices.csv      
   üìÑ /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/splits/train_indices.csv
   üìã Columnas: ['row_id']

‚úÖ splits/val_indices.csv        
   üìÑ /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/splits/val_indices.csv
   üìã Columnas: ['row_id']

‚úÖ Archivos