# Configuración Inicial
Este bloque configura las rutas, variables y estructuras necesarias para procesar los datos.


In [39]:
import os
import pandas as pd
import re

# Configuración general
YEAR = 2022
RAW_FOLDER = "../data/raw"
PROCESSED_FOLDER = "../data/processed"
FILE_PATH_XLSX = f"{RAW_FOLDER}/data_{YEAR}.xlsx"
FILE_PATH_XLSB = f"{RAW_FOLDER}/data_{YEAR}.xlsb"
OUTPUT_PATH = f"{PROCESSED_FOLDER}/pluviometros-eda-{YEAR}.csv"

# Mapeo de columnas
COLUMN_MAPPING = {
    'Fecha': ['Fecha', 'Date', 'Día', 'Dia'],
    'Obrador Kiteni': ['Base Operativa Kiteni', 'Kiteni', 'Obrador Kiteni'],
    # Más columnas...
}


# Funciones Auxiliares
Este bloque contiene las funciones necesarias para procesar los datos:
1. Detección de encabezados.
2. Identificación de columnas relevantes.
3. Manejo de valores faltantes (`NaNs`).
4. Agregar información sobre rupturas.
5. Identificar outliers.


In [40]:
def detect_header_row(df, max_rows=15):
    """Detecta la fila que contiene los encabezados reales."""
    for row in range(max_rows):
        possible_header = df.iloc[row].values
        if any(isinstance(val, str) and len(val.strip()) > 0 for val in possible_header):
            return row
    return 0  # Si no se detecta, usar la primera fila

def detect_relevant_columns(df, column_mapping):
    """Detecta columnas relevantes basadas en nombres similares."""
    detected_columns = {}
    for expected_col, possible_names in column_mapping.items():
        for col in df.columns:
            if any(re.search(rf"\b{re.escape(name)}\b", str(col), re.IGNORECASE) for name in possible_names):
                detected_columns[expected_col] = col
                break
    return detected_columns

def handle_missing_values(df, method='global_mean', monthly_averages=None):
    """Reemplaza NaNs con el promedio por columna (global o mensual)."""
    for col in df.columns:
        if col != 'Fecha':
            if method == 'global_mean':
                # Promedio global por columna
                mean_value = df[col].replace(0, pd.NA).mean(skipna=True)
                df[col] = df[col].fillna(mean_value)
            elif method == 'monthly_mean' and monthly_averages is not None:
                # Promedio mensual basado en datos disponibles
                df['Mes'] = pd.to_datetime(df['Fecha']).dt.month
                df[col] = df.apply(
                    lambda row: monthly_averages.loc[row['Mes'], col]
                    if pd.isnull(row[col]) else row[col], axis=1
                )
    return df

def add_rupture_column(df, rupture_dates):
    """Agrega una columna con información sobre rupturas basada en fechas específicas."""
    df['Ruptura'] = df['Fecha'].apply(lambda x: 1 if x in rupture_dates else 0)
    return df

def detect_outliers(df):
    """Detecta días con valores extremos de lluvia basados en el percentil 95."""
    outlier_thresholds = {}
    for col in df.columns:
        if col not in ['Fecha', 'Ruptura', 'Mes']:
            threshold = df[col].quantile(0.95)
            outlier_thresholds[col] = threshold
            print(f"Outlier umbral para {col}: {threshold}")
    return outlier_thresholds


# Carga de Datos
Este bloque carga los datos desde el archivo Excel y consolida todas las hojas.


In [41]:
if os.path.exists(FILE_PATH_XLSX):
    file_path, engine = FILE_PATH_XLSX, None
elif os.path.exists(FILE_PATH_XLSB):
    file_path, engine = FILE_PATH_XLSB, 'pyxlsb'
else:
    raise FileNotFoundError(f"No se encontró archivo para el año {YEAR}")

excel_data = pd.ExcelFile(file_path, engine=engine)
sheet_names = excel_data.sheet_names
print(f"Hojas disponibles: {sheet_names}")

all_processed_data = pd.DataFrame()

Hojas disponibles: ['DATOS', 'OCT', 'NOV', 'DIC']


In [42]:
for sheet_name in sheet_names:
    # Filtrar hojas irrelevantes
    if sheet_name.upper() == "DATOS":
        print(f"Hoja {sheet_name} no es relevante. Omitiendo.")
        continue

    print(f"\nProcesando hoja: {sheet_name}")

    try:
        # Cargar una muestra de la hoja para detectar encabezados
        sheet_preview = excel_data.parse(sheet_name, nrows=15)
        header_row = detect_header_row(sheet_preview)
        print(f"Encabezado detectado en la fila: {header_row}")

        # Carga completa de datos
        sheet_data = excel_data.parse(sheet_name, skiprows=header_row)
        print(f"Columnas disponibles antes del mapeo: {sheet_data.columns.tolist()}")
        display(sheet_data.head())

        # Eliminar columnas 'Unnamed'
        sheet_data = sheet_data.loc[:, ~sheet_data.columns.str.contains('^Unnamed')]
        print(f"Columnas después de eliminar 'Unnamed': {sheet_data.columns.tolist()}")

        # Validar si la hoja tiene datos significativos
        if sheet_data.empty or all(sheet_data.columns.str.contains('^Unnamed')):
            print(f"Hoja {sheet_name} no contiene datos relevantes. Omitiendo.")
            continue

        # Renombrar columnas según el mapeo directo
        new_column_names = {}
        for expected_col, possible_names in COLUMN_MAPPING.items():
            for col in sheet_data.columns:
                if any(re.search(rf"\b{re.escape(name)}\b", str(col), re.IGNORECASE) for name in possible_names):
                    new_column_names[col] = expected_col

        # Aplicar el renombramiento
        sheet_data = sheet_data.rename(columns=new_column_names)
        print(f"Columnas después del mapeo: {sheet_data.columns.tolist()}")

        # Generar columna Fecha
        if 'Fecha' not in sheet_data.columns and 'Dia' in sheet_data.columns:
            sheet_data['Fecha'] = sheet_data['Dia'].apply(
                lambda x: f"{YEAR}-{sheet_name}-{int(x):02d}" if pd.notnull(x) and isinstance(x, (int, float)) else None
            )
            sheet_data.drop(columns=['Dia'], inplace=True)

        # Concatenar datos procesados
        all_processed_data = pd.concat([all_processed_data, sheet_data], ignore_index=True)

    except Exception as e:
        print(f"Error procesando la hoja {sheet_name}: {e}")

print("\nDatos consolidados:")
display(all_processed_data.head())


Hoja DATOS no es relevante. Omitiendo.

Procesando hoja: OCT
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007', 'Unnamed: 22', 'Unnamed: 23']


Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 126+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007,Unnamed: 22,Unnamed: 23
0,44835,1.0,0.2,0.0,0.0,0.0,0.0,0,5.0,4.0,...,0.8,3.0,1.1,1.1,0.0,0,0.0,0,0,0
1,44836,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0,6.0,0,0,0
2,44837,0.0,0.0,0.0,0.0,0.0,1.2,0,0.0,0.0,...,1.2,0.0,0.2,0.0,0.0,0,6.0,0,0,0
3,44838,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
4,44839,0.0,0.0,0.0,0.0,0.0,2.0,0,2.0,14.0,...,0.0,0.0,2.5,5.4,0.0,0,0.0,0,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: NOV
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 126+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007,Unnamed: 22,Unnamed: 23
0,44866,0.0,0.2,4.3,14.0,8.2,2.2,0,4.5,13.0,...,2.7,1.5,4.0,2.0,0,0,4.0,0,0,0
1,44867,7.0,0.5,14.2,10.1,12.2,28.0,0,35.0,27.5,...,44.4,25.1,15.0,3.5,0,0,13.0,2,0,0
2,44868,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
3,44869,0.2,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
4,44870,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: DIC
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 126+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007,Unnamed: 22,Unnamed: 23
0,44896,0.0,0.0,0.0,0.0,4.2,17.0,0,9.0,22.5,...,5.2,0.0,0.0,0.0,0,0,0.0,0,0,0
1,44897,0.0,0.0,0.0,6.4,33.0,34.0,0,54.0,65.0,...,9.2,11.0,14.5,4.2,0,0,0.0,0,0,0
2,44898,4.0,12.5,80.0,50.4,14.4,31.0,0,52.0,22.0,...,6.2,7.0,25.0,5.2,0,0,0.0,0,0,0
3,44899,0.0,14.6,0.0,6.2,3.0,7.2,0,1.0,9.0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
4,44900,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Datos consolidados:


Unnamed: 0,Fecha,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 126+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007
0,44835,1.0,0.2,0.0,0.0,0.0,0.0,0,5.0,4.0,...,0.0,0.0,0.8,3.0,1.1,1.1,0.0,0,0.0,0
1,44836,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6.0,0
2,44837,0.0,0.0,0.0,0.0,0.0,1.2,0,0.0,0.0,...,0.0,6.2,1.2,0.0,0.2,0.0,0.0,0,6.0,0
3,44838,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
4,44839,0.0,0.0,0.0,0.0,0.0,2.0,0,2.0,14.0,...,0.0,0.0,0.0,0.0,2.5,5.4,0.0,0,0.0,0


# Limpieza de Datos y Análisis
Este bloque limpia los datos (manejo de `NaNs`), agrega información de rupturas, detecta outliers y calcula promedios mensuales.


In [43]:
# Calcular promedios mensuales para manejar NaNs
monthly_averages = all_processed_data.groupby(pd.to_datetime(all_processed_data['Fecha']).dt.month).mean(numeric_only=True)

# Manejo de valores faltantes
all_processed_data = handle_missing_values(all_processed_data, method='monthly_mean', monthly_averages=monthly_averages)

# Agregar columna de rupturas
rupture_dates = ['2020-01-15', '2020-05-20']
all_processed_data = add_rupture_column(all_processed_data, rupture_dates)

# Detectar outliers
outlier_thresholds = detect_outliers(all_processed_data)


Outlier umbral para Obrador Kiteni: 21.95999999999998
Outlier umbral para PS#1: 38.43999999999998
Outlier umbral para KP 007+400: 53.51999999999998
Outlier umbral para KP 018+000: 42.29999999999996
Outlier umbral para KP 030+000: 54.49999999999999
Outlier umbral para KP 040+000: 39.19999999999993
Outlier umbral para KP 043+600: 0.0
Outlier umbral para KP 046+100: 53.039999999999985
Outlier umbral para KP 050+000: 50.79999999999994
Outlier umbral para KP 057+000: 68.39999999999995
Outlier umbral para KP 064+000: 28.51999999999999
Outlier umbral para KP 071+000: 38.63999999999995
Outlier umbral para KP 082+000: 27.439999999999944
Outlier umbral para KP 090+000: 43.079999999999984
Outlier umbral para KP 095+000: 35.399999999999906
Outlier umbral para PS#2: 26.819999999999997
Outlier umbral para KP 126+000: 37.599999999999966
Outlier umbral para KP 181+900: 12.169999999999984
Outlier umbral para KP 184+039: 0.0
Outlier umbral para PS#3: 17.959999999999997
Outlier umbral para PS#4/XV10007: 

# Exportar Resultados
Guarda los datos procesados en un archivo CSV consolidado.


In [44]:
all_processed_data.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')
print(f"Datos consolidados guardados en: {OUTPUT_PATH}")


Datos consolidados guardados en: ../data/processed/pluviometros-eda-2022.csv
