# Cargar y ver dataset

In [1]:
# Carga de Dataset - Genérico

## Objetivo:
### ✔ Cargar datos usando configuración
### ✔ Inspección inicial automática
### ✔ Validación de estructura

import sys
sys.path.append('../src')

from data_loader import load_raw, get_feature_target_columns
from config_loader import load_config

# Load the project configuration from a path relative to the notebook.
config = load_config("../config.yaml")

# The dataset section holds the file path and the datetime/target column names;
# bind it once instead of repeating the nested lookup for every line printed.
dataset_cfg = config['dataset']
print("📋 Configuración cargada:")
print(f"   Dataset: {dataset_cfg['path']}")
print(f"   Columna temporal: {dataset_cfg['datetime_col']}")
print(f"   Columna target: {dataset_cfg['target_col']}")

# Load the raw dataset; load_raw receives the configuration file path.
df = load_raw("../config.yaml")

# Basic information: shape and memory footprint.
# deep=True also counts the contents of object-dtype columns (e.g. strings),
# which the default shallow estimate leaves out.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print("\n📊 INFORMACIÓN DEL DATASET:")
print(f"   Forma: {df.shape}")
print(f"   Memoria: {memory_mb:.1f} MB")

# First rows, rendered through the notebook's rich display.
print("\n🔍 PRIMERAS 5 FILAS:")
display(df.head())

# Per-column dtypes and non-null counts.
print("\n📈 INFORMACIÓN DETALLADA:")
df.info()

# Feature and target column names as returned by the configuration helper.
feature_cols, target_col = get_feature_target_columns("../config.yaml")
print("\n🎯 COLUMNAS SEGÚN CONFIGURACIÓN:")
print(f"   Features: {feature_cols}")
print(f"   Target: {target_col}")

# Check that every configured column exists in the loaded frame.
# Unpacking into a set works for any iterable of names (list, tuple, ...),
# whereas `feature_cols + [target_col]` only works when feature_cols is a list.
expected_cols = {*feature_cols, target_col}
missing_cols = expected_cols.difference(df.columns)
if missing_cols:
    # Sorted so the warning reads the same on every run (set order is arbitrary).
    print(f"   ⚠️  Columnas faltantes: {sorted(missing_cols, key=str)}")
else:
    print("   ✅ Todas las columnas configuradas están presentes")

# Time span covered by the dataset, read from the configured datetime column.
datetime_col = config['dataset']['datetime_col']

# Compute each bound once and reuse it for the duration, instead of
# scanning the column again for every printed line.
range_start = df[datetime_col].min()
range_end = df[datetime_col].max()

print("\n📅 RANGO TEMPORAL:")
print(f"   Inicio: {range_start}")
print(f"   Final: {range_end}")
print(f"   Duración: {range_end - range_start}")

# Missing values: count nulls per column and list only the affected columns.
print("\n🔍 VALORES NULOS:")
null_counts = df.isnull().sum()

# Counts are non-negative, so "no column above zero" is the same condition
# as "the total is zero"; the filtered Series is reused for the report.
cols_with_nulls = null_counts[null_counts > 0]
if cols_with_nulls.empty:
    print("   ✅ No hay valores nulos")
else:
    print(cols_with_nulls)

📋 Configuración cargada:
   Dataset: data/raw/industrial_timeseries.csv
   Columna temporal: timestamp
   Columna target: value
✅ Dataset raw cargado desde: ../data/raw/industrial_timeseries.csv
   Shape: (17520, 10)
   Columnas: ['timestamp', 'value', 'temperature', 'demand_factor', 'operational_efficiency', 'energy_price', 'hour', 'day_of_week', 'month', 'is_weekend']
   Rango temporal: 2022-01-01 00:00:00 a 2023-12-31 23:00:00

📊 INFORMACIÓN DEL DATASET:
   Forma: (17520, 10)
   Memoria: 1.3 MB

🔍 PRIMERAS 5 FILAS:


Unnamed: 0,timestamp,value,temperature,demand_factor,operational_efficiency,energy_price,hour,day_of_week,month,is_weekend
0,2022-01-01 00:00:00,1188.15,20.99,0.506,0.8424,81.83,0,5,1,1
1,2022-01-01 01:00:00,1211.43,21.8,0.5478,0.8637,83.15,1,5,1,1
2,2022-01-01 02:00:00,1165.22,25.32,0.4807,0.8237,78.85,2,5,1,1
3,2022-01-01 03:00:00,1221.48,28.74,0.6202,0.8268,93.94,3,5,1,1
4,2022-01-01 04:00:00,1186.6,26.5,0.4447,0.8438,85.12,4,5,1,1



📈 INFORMACIÓN DETALLADA:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   timestamp               17520 non-null  datetime64[ns]
 1   value                   17520 non-null  float64       
 2   temperature             17520 non-null  float64       
 3   demand_factor           17520 non-null  float64       
 4   operational_efficiency  17520 non-null  float64       
 5   energy_price            17520 non-null  float64       
 6   hour                    17520 non-null  int64         
 7   day_of_week             17520 non-null  int64         
 8   month                   17520 non-null  int64         
 9   is_weekend              17520 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(4)
memory usage: 1.3 MB

🎯 COLUMNAS SEGÚN CONFIGURACIÓN:
   Features: ['temperature', 'dema