In [23]:
# ===============================================================
# 00_setup.ipynb - Configuración Inicial del Proyecto
# ===============================================================
#
# **Propósito**: 
# - Validar estructura del proyecto
# - Verificar dependencias instaladas
# - Configurar paths usando utils_shared.py
# - Preparar entorno para ejecutar notebooks 01_ y 02_
#
# **Cuándo ejecutar**:
# - Primera vez que clonas el repositorio
# - Después de cambios en la estructura de carpetas
# - Para verificar que todo está correctamente instalado
#
# **Qué NO hace**:
# - No procesa datos (eso es 01_eda_understanding.ipynb)
# - No crea splits (eso es 02_create_splits.ipynb)
# - No entrena modelos (eso es 02_baseline_*.ipynb)
#
# ===============================================================

import os, sys, pathlib, yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------------
# 1. Environment detection (local vs. Google Colab)
# ---------------------------------------------------------------
# A successful import of google.colab is treated as the signal that
# the notebook is running inside Colab; an ImportError means local.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# Report the detected environment, followed by a one-line hint.
if IN_COLAB:
    print("[INFO] Entorno: Google Colab")
    print("  En Colab puedes montar Google Drive para persistir datos")
else:
    print("[INFO] Entorno: Local (Jupyter/VSCode)")
    print("  Ejecutando en entorno local")

[INFO] Entorno: Local (Jupyter/VSCode)
  Ejecutando en entorno local


In [24]:
# ===============================================================
# 2. Path configuration via utils_shared.py
# ===============================================================
#
# utils_shared.setup_paths() is the preferred, centralized source of
# the project paths. When that module cannot be imported, the paths
# are derived manually from the current working directory.
#
# ===============================================================

try:
    from utils_shared import setup_paths
    print("[INFO] Cargando paths desde utils_shared.py")

    # setup_paths() is indexed like a mapping with one entry per path.
    paths = setup_paths()
    BASE_PATH, DATA_PATH, FORK_PATH, SPLITS_PATH, FIGS_PATH = (
        paths['BASE_PATH'],
        paths['DATA_PATH'],
        paths['FORK_PATH'],
        paths['SPLITS_PATH'],
        paths['FIGS_PATH'],
    )

    print("  Usando configuración centralizada")

except ImportError:
    print("[WARNING] utils_shared.py no encontrado, usando configuración manual")

    # The project root is the working directory, or its parent when the
    # working directory is the notebooks/ folder.
    BASE_PATH = pathlib.Path.cwd()
    BASE_PATH = BASE_PATH.parent if BASE_PATH.name == "notebooks" else BASE_PATH

    DATA_PATH = BASE_PATH / "data"
    SPLITS_PATH, FIGS_PATH = DATA_PATH / "splits", DATA_PATH / "figs"
    FORK_PATH = BASE_PATH / "Spanish_Psych_Phenotyping_PY"

    # Create the data folders when missing (data/ first, then its children).
    for _folder in (DATA_PATH, SPLITS_PATH, FIGS_PATH):
        _folder.mkdir(exist_ok=True)

# In Colab, try to mount Google Drive and prefer the data stored there.
if IN_COLAB:
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=False)
        DRIVE_DATA_PATH = pathlib.Path("/content/drive/MyDrive/psych-data")

        # Only switch over when the Drive folder actually exists.
        if DRIVE_DATA_PATH.exists():
            DATA_PATH = DRIVE_DATA_PATH
            SPLITS_PATH, FIGS_PATH = DATA_PATH / "splits", DATA_PATH / "figs"
            print("[INFO] Usando datos desde Google Drive")
    except Exception as mount_error:
        # Best effort: a failed mount keeps the previously configured paths.
        print(f"[WARNING] No se pudo montar Google Drive: {mount_error}")

# Summary of the resolved paths, with labels padded to a common width.
print("\n[INFO] Paths configurados:")
for _label, _value in (
    ("BASE_PATH", BASE_PATH),
    ("DATA_PATH", DATA_PATH),
    ("FORK_PATH", FORK_PATH),
    ("SPLITS_PATH", SPLITS_PATH),
    ("FIGS_PATH", FIGS_PATH),
):
    print(f"  {_label + ':':<12} {_value}")

[INFO] Cargando paths desde utils_shared.py
  Usando configuración centralizada

[INFO] Paths configurados:
  BASE_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay
  DATA_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data
  FORK_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/Spanish_Psych_Phenotyping_PY
  SPLITS_PATH: /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/splits
  FIGS_PATH:   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/figs


In [25]:
# Project structure check
print("\n[INFO] Verificando estructura del proyecto...")

# Folders that are created on the fly when they are missing.
auto_create_dirs = {
    'data/': DATA_PATH,
    'data/splits/': SPLITS_PATH,
    'data/figs/': FIGS_PATH,
    'notebooks/': BASE_PATH / "notebooks",
    'configs/': BASE_PATH / "configs",
}

# Folders that must already exist; they are only reported, never created.
required_dirs = {'Spanish_Psych_Phenotyping_PY/': FORK_PATH}

for name, path in auto_create_dirs.items():
    already_there = path.exists()
    if not already_there:
        path.mkdir(parents=True, exist_ok=True)
    tag = "[OK]" if already_there else "[NEW]"
    print(f"{tag} {name:30} {path}")

for name, path in required_dirs.items():
    if not path.exists():
        print(f"[ERROR] {name:30} {path} (NO ENCONTRADO)")
        print("        El fork es necesario para baseline rule-based")
        continue
    print(f"[OK] {name:30} {path}")

print("=" * 60)


[INFO] Verificando estructura del proyecto...
[OK] data/                          /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data
[OK] data/splits/                   /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/splits
[OK] data/figs/                     /Users/manuelnunez/Projects/psych-phenotyping-paraguay/data/figs
[OK] notebooks/                     /Users/manuelnunez/Projects/psych-phenotyping-paraguay/notebooks
[OK] configs/                       /Users/manuelnunez/Projects/psych-phenotyping-paraguay/configs
[OK] Spanish_Psych_Phenotyping_PY/  /Users/manuelnunez/Projects/psych-phenotyping-paraguay/Spanish_Psych_Phenotyping_PY


In [26]:
# Dependency check
#
# Each library is imported by name and the result is reported per
# category. A package that is installed but fails while importing
# (any exception other than ImportError) is reported as an error
# instead of aborting the whole cell with a traceback.
import importlib

print("\n[INFO] Verificando dependencias de Python...")

# Import names (not pip package names) grouped by purpose.
dependencies = {
    'Core Data Science': ['pandas', 'numpy', 'scipy'],
    'Machine Learning': ['sklearn', 'transformers', 'torch'],
    'Visualización': ['matplotlib', 'seaborn', 'plotly'],
    'NLP/Texto': ['nltk', 'spacy'],
    'Utilities': ['yaml', 'tqdm']
}


def check_dependency(lib: str) -> tuple:
    """Try to import ``lib`` and describe the outcome.

    Args:
        lib: Importable module name, e.g. ``"sklearn"``.

    Returns:
        A ``(ok, detail)`` pair. ``ok`` is True when the import succeeded,
        in which case ``detail`` is an empty string. Otherwise ``detail``
        is the text shown in parentheses after the library name.
    """
    try:
        importlib.import_module(lib)
    except ImportError:
        return False, "NO INSTALADO"
    except Exception as exc:
        # Installed but broken: the import itself raised something else.
        return False, f"ERROR AL IMPORTAR: {type(exc).__name__}: {exc}"
    return True, ""


all_ok = True

for category, libs in dependencies.items():
    print(f"\n{category}:")
    for lib in libs:
        ok, detail = check_dependency(lib)
        if ok:
            print(f"  [OK] {lib}")
        else:
            print(f"  [ERROR] {lib} ({detail})")
            all_ok = False

if all_ok:
    print("\n[INFO] Todas las dependencias instaladas correctamente")
else:
    print("\n[WARNING] Faltan dependencias. Instalar con: pip install -r requirements.txt")

print("="*60)


[INFO] Verificando dependencias de Python...

Core Data Science:
  [OK] pandas
  [OK] numpy
  [OK] scipy

Machine Learning:
  [OK] sklearn
  [OK] transformers
  [OK] torch

Visualización:
  [OK] matplotlib
  [OK] seaborn
  [OK] plotly

NLP/Texto:
  [OK] nltk
  [OK] spacy

Utilities:
  [OK] yaml
  [OK] tqdm

[INFO] Todas las dependencias instaladas correctamente


In [27]:
# Data file check
#
# Reports which expected data files are present. Files flagged as
# 'required' are collected in missing_critical when absent; the others
# only produce a warning, together with the notebook named as their source.
print("\n[INFO] Verificando archivos de datos...")

# For every file: where it lives, whether it is mandatory, where it
# comes from and which notebook consumes it.
data_files = {
    'ips_raw.csv': {
        'path': DATA_PATH / 'ips_raw.csv',
        'required': True,
        'source': 'Archivo original del dataset',
        'used_by': '01_eda_understanding.ipynb'
    },
    'ips_clean.csv': {
        'path': DATA_PATH / 'ips_clean.csv',
        'required': False,
        'source': 'Generado por 01_eda_understanding.ipynb',
        'used_by': '02_create_splits.ipynb'
    },
    'splits/dataset_base.csv': {
        'path': SPLITS_PATH / 'dataset_base.csv',
        'required': False,
        'source': 'Generado por 02_create_splits.ipynb',
        'used_by': 'Todos los 02_baseline_*.ipynb'
    },
    'splits/train_indices.csv': {
        'path': SPLITS_PATH / 'train_indices.csv',
        'required': False,
        'source': 'Generado por 02_create_splits.ipynb',
        'used_by': 'Todos los 02_baseline_*.ipynb'
    },
    'splits/val_indices.csv': {
        'path': SPLITS_PATH / 'val_indices.csv',
        'required': False,
        'source': 'Generado por 02_create_splits.ipynb',
        'used_by': 'Todos los 02_baseline_*.ipynb'
    }
}

missing_critical = []

for name, info in data_files.items():
    exists = info['path'].exists()
    required = info['required']

    if exists:
        print(f"[OK] {name:30}")
        if info['path'].suffix == '.csv':
            # Read a single row just to show the column names. A file that
            # exists but cannot be parsed is reported instead of being
            # silently ignored; the check itself keeps going.
            try:
                df_info = pd.read_csv(info['path'], nrows=1)
                print(f"     Columnas: {list(df_info.columns)}")
            except Exception as exc:
                print(f"     [WARNING] No se pudieron leer las columnas: {type(exc).__name__}: {exc}")
    else:
        status = "[ERROR]" if required else "[WARNING]"
        print(f"{status} {name:30} (NO ENCONTRADO)")
        print(f"         {info['source']}")
        print(f"         Usado por: {info['used_by']}")

        if required:
            missing_critical.append(name)

    print()

print("="*60)
if missing_critical:
    print(f"[ERROR] Faltan {len(missing_critical)} archivo(s) crítico(s): {', '.join(missing_critical)}")
else:
    print("[INFO] Todos los archivos críticos disponibles")


[INFO] Verificando archivos de datos...
[OK] ips_raw.csv                   
     Columnas: ['Archivo', 'Prontuario', 'Nombre Paciente', 'Sexo', 'Fecha Nacimiento', 'N° Consulta', 'Id', 'Fecha Consulta', 'Motivo Consulta', 'Tipo']

         Generado por 01_eda_understanding.ipynb
         Usado por: 02_create_splits.ipynb

         Generado por 02_create_splits.ipynb
         Usado por: Todos los 02_baseline_*.ipynb

         Generado por 02_create_splits.ipynb
         Usado por: Todos los 02_baseline_*.ipynb

         Generado por 02_create_splits.ipynb
         Usado por: Todos los 02_baseline_*.ipynb

[INFO] Todos los archivos críticos disponibles
