# Configuración Inicial
Este bloque configura las rutas, variables y estructuras necesarias para procesar los datos.


In [1]:
import os
import pandas as pd
import re

# Configuración general
YEAR = 2023
RAW_FOLDER = "../data/raw"
PROCESSED_FOLDER = "../data/processed"
FILE_PATH_XLSX = f"{RAW_FOLDER}/data_{YEAR}.xlsx"
FILE_PATH_XLSB = f"{RAW_FOLDER}/data_{YEAR}.xlsb"
OUTPUT_PATH = f"{PROCESSED_FOLDER}/pluviometros-eda-{YEAR}.csv"

# Mapeo de columnas
COLUMN_MAPPING = {
    'Fecha': ['Fecha', 'Date', 'Día', 'Dia'],
    'Obrador Kiteni': ['Base Operativa Kiteni', 'Kiteni', 'Obrador Kiteni'],
    # Más columnas...
}


# Funciones Auxiliares
Este bloque contiene las funciones necesarias para procesar los datos:
1. Detección de encabezados.
2. Identificación de columnas relevantes.
3. Manejo de valores faltantes (`NaNs`).
4. Agregar información sobre rupturas.
5. Identificar outliers.


In [2]:
def detect_header_row(df, max_rows=15):
    """Detecta la fila que contiene los encabezados reales."""
    for row in range(max_rows):
        possible_header = df.iloc[row].values
        if any(isinstance(val, str) and len(val.strip()) > 0 for val in possible_header):
            return row
    return 0  # Si no se detecta, usar la primera fila

def detect_relevant_columns(df, column_mapping):
    """Detecta columnas relevantes basadas en nombres similares."""
    detected_columns = {}
    for expected_col, possible_names in column_mapping.items():
        for col in df.columns:
            if any(re.search(rf"\b{re.escape(name)}\b", str(col), re.IGNORECASE) for name in possible_names):
                detected_columns[expected_col] = col
                break
    return detected_columns

def handle_missing_values(df, method='global_mean', monthly_averages=None):
    """Reemplaza NaNs con el promedio por columna (global o mensual)."""
    for col in df.columns:
        if col != 'Fecha':
            if method == 'global_mean':
                # Promedio global por columna
                mean_value = df[col].replace(0, pd.NA).mean(skipna=True)
                df[col] = df[col].fillna(mean_value)
            elif method == 'monthly_mean' and monthly_averages is not None:
                # Promedio mensual basado en datos disponibles
                df['Mes'] = pd.to_datetime(df['Fecha']).dt.month
                df[col] = df.apply(
                    lambda row: monthly_averages.loc[row['Mes'], col]
                    if pd.isnull(row[col]) else row[col], axis=1
                )
    return df

def add_rupture_column(df, rupture_dates):
    """Agrega una columna con información sobre rupturas basada en fechas específicas."""
    df['Ruptura'] = df['Fecha'].apply(lambda x: 1 if x in rupture_dates else 0)
    return df

def detect_outliers(df):
    """Detecta días con valores extremos de lluvia basados en el percentil 95."""
    outlier_thresholds = {}
    for col in df.columns:
        if col not in ['Fecha', 'Ruptura', 'Mes']:
            threshold = df[col].quantile(0.95)
            outlier_thresholds[col] = threshold
            print(f"Outlier umbral para {col}: {threshold}")
    return outlier_thresholds


# Carga de Datos
Este bloque carga los datos desde el archivo Excel y consolida todas las hojas.


In [3]:
if os.path.exists(FILE_PATH_XLSX):
    file_path, engine = FILE_PATH_XLSX, None
elif os.path.exists(FILE_PATH_XLSB):
    file_path, engine = FILE_PATH_XLSB, 'pyxlsb'
else:
    raise FileNotFoundError(f"No se encontró archivo para el año {YEAR}")

excel_data = pd.ExcelFile(file_path, engine=engine)
sheet_names = excel_data.sheet_names
print(f"Hojas disponibles: {sheet_names}")

all_processed_data = pd.DataFrame()

Hojas disponibles: ['DATOS', 'ENE', 'FEB', 'MAR', 'ABR', 'MAY', 'JUN', 'JUL', 'AGO', 'SET', 'OCT', 'NOV', 'DIC', 'ENE´24', 'ENE´24 - WEBGIS']


In [4]:
for sheet_name in sheet_names:
    # Filtrar hojas irrelevantes
    if sheet_name.upper() == "DATOS":
        print(f"Hoja {sheet_name} no es relevante. Omitiendo.")
        continue

    print(f"\nProcesando hoja: {sheet_name}")

    try:
        # Cargar una muestra de la hoja para detectar encabezados
        sheet_preview = excel_data.parse(sheet_name, nrows=15)
        header_row = detect_header_row(sheet_preview)
        print(f"Encabezado detectado en la fila: {header_row}")

        # Carga completa de datos
        sheet_data = excel_data.parse(sheet_name, skiprows=header_row)
        print(f"Columnas disponibles antes del mapeo: {sheet_data.columns.tolist()}")
        display(sheet_data.head())

        # Eliminar columnas 'Unnamed'
        sheet_data = sheet_data.loc[:, ~sheet_data.columns.str.contains('^Unnamed')]
        print(f"Columnas después de eliminar 'Unnamed': {sheet_data.columns.tolist()}")

        # Validar si la hoja tiene datos significativos
        if sheet_data.empty or all(sheet_data.columns.str.contains('^Unnamed')):
            print(f"Hoja {sheet_name} no contiene datos relevantes. Omitiendo.")
            continue

        # Renombrar columnas según el mapeo directo
        new_column_names = {}
        for expected_col, possible_names in COLUMN_MAPPING.items():
            for col in sheet_data.columns:
                if any(re.search(rf"\b{re.escape(name)}\b", str(col), re.IGNORECASE) for name in possible_names):
                    new_column_names[col] = expected_col

        # Aplicar el renombramiento
        sheet_data = sheet_data.rename(columns=new_column_names)
        print(f"Columnas después del mapeo: {sheet_data.columns.tolist()}")

        # Generar columna Fecha
        if 'Fecha' not in sheet_data.columns and 'Dia' in sheet_data.columns:
            sheet_data['Fecha'] = sheet_data['Dia'].apply(
                lambda x: f"{YEAR}-{sheet_name}-{int(x):02d}" if pd.notnull(x) and isinstance(x, (int, float)) else None
            )
            sheet_data.drop(columns=['Dia'], inplace=True)

        # Concatenar datos procesados
        all_processed_data = pd.concat([all_processed_data, sheet_data], ignore_index=True)

    except Exception as e:
        print(f"Error procesando la hoja {sheet_name}: {e}")

print("\nDatos consolidados:")
display(all_processed_data.head())


Hoja DATOS no es relevante. Omitiendo.

Procesando hoja: ENE
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']


Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 126+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007
0,44927,2.54,21.7,2.4,0.4,0.0,1.6,0,44.5,39.0,...,0.0,1.8,4.6,3.8,1.6,3.4,0,0,3.8,4.0
1,44928,1.0,2.7,0.0,7.0,8.2,6.3,0,20.0,40.0,...,0.0,1.2,0.0,0.0,0.6,14.4,0,0,1.0,1.0
2,44929,7.62,0.0,0.0,11.3,5.0,24.5,0,58.0,23.0,...,2.0,4.6,1.0,0.0,0.6,8.2,0,0,1.6,2.0
3,44930,10.16,0.9,14.4,30.4,65.5,40.2,0,53.0,45.0,...,31.0,33.0,56.6,24.7,18.6,12.2,0,0,3.4,3.0
4,44931,35.56,6.3,9.0,13.4,18.2,17.6,0,21.6,28.0,...,37.2,27.6,55.2,43.3,17.8,20.4,0,0,25.4,24.0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: FEB
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007,Unnamed: 22,Unnamed: 23
0,44958,22.86,1.0,3.1,5.0,15.5,53.0,0,192.0,24.0,...,0,0,24.5,0,0,0,19.5,21.0,0,0
1,44959,17.78,2.0,0.0,6.0,18.0,28.0,0,142.0,65.0,...,0,0,15.5,0,0,0,24.1,26.0,0,0
2,44960,18.0,25.2,42.2,36.5,110.0,144.0,0,352.0,70.0,...,0,0,15.8,0,0,0,31.0,30.0,0,0
3,44961,7.62,15.8,13.2,6.0,9.5,12.0,0,30.0,30.0,...,0,0,4.5,0,0,0,6.6,6.0,0,0
4,44962,0.0,34.0,33.0,6.0,24.0,15.0,0,60.7,4.0,...,0,0,36.0,0,0,0,9.4,10.0,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: MAR
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007
0,44986,0.2,5.8,42.0,31.5,3.0,6.0,0,18.0,10.0,...,4.0,0.0,0,0,0.6,0,0,0,8.0,8.0
1,44987,1.0,0.8,0.0,0.0,0.0,10.1,0,14.0,22.0,...,0.0,0.0,0,0,3.6,0,0,0,0.0,0.0
2,44988,5.08,0.4,0.0,0.0,0.0,0.0,0,0.0,5.0,...,8.0,3.2,0,0,20.0,0,0,0,0.0,0.0
3,44989,0.0,23.7,59.0,7.5,31.0,0.0,0,11.0,10.0,...,5.0,1.8,0,0,6.4,0,0,0,0.0,0.0
4,44990,2.54,0.0,7.3,0.0,0.0,0.0,0,4.0,20.0,...,7.0,3.2,0,0,37.4,0,0,0,8.6,10.0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: ABR
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007
0,45017,1.7,0.0,0.0,0.0,0.0,0.0,0,0.0,5.0,...,0.0,0.0,0,0,0.6,0,0,0,0.0,0.0
1,45018,0.0,0.0,2.0,0.0,0.0,10.0,0,3.4,0.0,...,0.0,0.0,0,0,2.0,0,0,0,0.0,0.0
2,45019,2.3,0.0,0.0,0.0,0.0,2.0,0,4.4,0.0,...,0.0,0.0,0,0,0.2,0,0,0,2.5,0.8
3,45020,2.0,72.3,38.2,50.0,77.4,31.0,0,37.0,34.0,...,2.0,0.0,0,0,17.2,0,0,0,0.0,0.0
4,45021,3.1,92.6,27.7,21.0,58.0,96.0,0,86.0,55.8,...,17.2,27.0,0,0,32.0,0,0,0,9.1,12.2


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: MAY
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 126+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007
0,45047,0.0,35.1,12.0,28.0,4.0,19.4,0,62.5,87.4,...,5.0,4.2,0,0,0.0,0,0,0,0.0,0.0
1,45048,0.0,18.1,22.0,29.0,31.0,42.2,0,59.5,32.8,...,12.0,1.4,0,0,0.0,0,0,0,0.0,0.0
2,45049,1.0,40.4,62.0,30.0,49.2,22.0,0,24.5,29.0,...,19.0,13.4,0,0,0.3,0,0,0,3.56,0.11
3,45050,5.08,60.4,15.0,36.0,58.4,52.0,0,75.5,101.0,...,13.0,0.0,0,0,10.5,0,0,0,0.0,0.0
4,45051,10.16,16.5,4.0,20.0,52.4,64.0,0,44.0,32.5,...,36.0,20.0,0,0,43.9,0,0,0,0.0,0.0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 126+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: JUN
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007,Unnamed: 22,Unnamed: 23
0,45078,0.0,0.0,0,0.0,0.0,0.0,0,0.0,4.2,...,0,0,0.2,0,0,0,0.0,0,0,0
1,45079,0.0,0.0,0,0.0,0.0,1.0,0,2.5,6.2,...,0,0,1.1,0,0,0,0.0,0,0,0
2,45080,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0,0,0
3,45081,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0,0,0
4,45082,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: JUL
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007
0,45108,0.3,0.3,0,0.0,0.0,3.0,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0
1,45109,0.0,0.4,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,2.0,0,0,3.0,0,0,0,0.0,0.0
2,45110,0.0,0.2,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0
3,45111,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0
4,45112,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: AGO
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007,Unnamed: 22,Unnamed: 23
0,45139,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
1,45140,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
2,45141,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
3,45142,0.0,0.0,0.0,0.0,16.0,2.1,0,0.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
4,45143,0.0,0.0,2.0,0.0,8.0,31.0,0,32.0,16.0,...,0,0,0.0,0,0,0,2.54,0.9,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 181+900', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: SET
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 142+000,KP 184+039,PS#3,PS#4/XV10007
0,45170,22.86,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,9.2,26.4,0,0,18.4,0,0.0,0,0.0,0.0
1,45171,2.54,0.0,0.0,0.0,0.0,3.0,0,0.0,4.0,...,28.0,16.2,0,0,7.4,0,0.0,0,1.27,2.0
2,45172,0.0,0.0,0.0,12.0,13.0,8.0,0,4.5,2.0,...,0.0,0.0,0,0,1.0,0,0.0,0,0.0,0.0
3,45173,5.08,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,3.8,0,0.0,0,0.0,0.0
4,45174,12.7,13.1,0.0,0.0,0.0,10.0,0,16.0,22.0,...,2.0,0.0,0,0,1.6,0,0.0,0,1.27,0.4


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: OCT
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 142+000,KP 184+039,PS#3,PS#4/XV10007
0,45200,0.0,26.4,6.0,0.0,0.0,0.0,0,0,40.0,...,2.0,0.0,0,0,0.0,0,0.0,0,0.0,0.0
1,45201,10.16,0.0,2.0,0.0,2.0,5.0,0,0,2.0,...,2.0,1.2,0,0,10.0,0,104.0,0,1.3,2.0
2,45202,5.08,1.5,2.0,9.8,7.0,6.0,0,4,0.0,...,2.0,6.4,0,0,2.2,0,2.3,0,0.0,0.0
3,45203,18.0,0.3,3.0,2.3,5.0,2.0,0,0,22.0,...,8.0,4.1,0,0,16.0,0,3.0,0,31.0,32.0
4,45204,0.0,0.0,4.0,4.6,33.2,32.0,0,36,35.0,...,0.0,4.2,0,0,0.2,0,3.8,0,0.0,4.0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: NOV
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 071+000,KP 082+000,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 142+000,KP 184+039,PS#3,PS#4/XV10007
0,45231,10.1,20.8,20,18.0,11.0,10.3,0,9.0,27.0,...,13.4,21.2,0,0,1.0,1.2,18.0,0,0.0,0.0
1,45232,0.0,48.5,72,48.0,19.0,8.2,0,10.0,23.0,...,0.0,0.0,0,0,0.0,0.0,12.0,0,0.0,0.0
2,45233,0.0,0.0,0,0.0,0.0,6.4,0,1.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,0.0
3,45234,3.0,0.0,0,0.0,2.1,1.3,0,0.0,0.0,...,0.0,4.1,0,0,2.2,3.1,10.0,0,7.0,7.0
4,45235,10.0,0.0,0,0.0,4.1,10.5,0,3.7,16.0,...,4.5,6.0,0,0,19.6,17.13,26.0,0,13.4,34.0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: DIC
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP

Unnamed: 0,Día,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 127+000,KP 142+000,KP 184+039,PS#3,PS#4/XV10007,Unnamed: 22,Unnamed: 23
0,45261,35.0,0.0,0.0,0.0,5.0,33.3,0,23.0,11.0,...,0,0,57.2,31.6,0,0,18.0,20.0,0,0
1,45262,20.1,1.4,7.5,0.0,7.2,2.3,0,7.0,7.0,...,0,0,42.8,16.2,0,0,8.0,8.0,0,0
2,45263,0.5,0.0,0.0,0.0,13.0,8.8,0,20.0,27.0,...,0,0,0.2,1.5,0,0,2.5,2.0,0,0
3,45264,11.0,5.1,14.0,7.5,7.0,9.5,0,8.0,8.0,...,0,0,5.2,11.6,0,0,0.0,0.0,0,0
4,45265,8.4,64.5,63.0,63.0,42.3,23.4,0,40.0,93.0,...,0,0,11.1,4.4,0,0,12.5,12.0,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS#1', 'KP 007+400', 'KP 018+000', 'KP 030+000', 'KP 040+000', 'KP 043+600', 'KP 046+100', 'KP 050+000', 'KP 057+000', 'KP 064+000', 'KP 071+000', 'KP 082+000', 'KP 090+000', 'KP 095+000', 'PS#2', 'KP 127+000', 'KP 142+000', 'KP 184+039', 'PS#3', 'PS#4/XV10007']

Procesando hoja: ENE´24
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnam

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56
0,,,,,,,,,,,...,,Código: ST26008.950.225000,,,,,,,,
1,,,,,,,,,,,...,,Versión: 1.0,Vigente desde: 17/03/2021,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,FORMATO PARA PRECIPITACIÓN DE LLUVIAS,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


Columnas después de eliminar 'Unnamed': []
Hoja ENE´24 no contiene datos relevantes. Omitiendo.

Procesando hoja: ENE´24 - WEBGIS
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['CODE', 'READINGDATE', 'DAILYPRECIPITATION', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 45631.40948599537]


Unnamed: 0,CODE,READINGDATE,DAILYPRECIPITATION,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,45631.409486
0,,,,,,,,,,,,,,,,,día,5
1,,,,,,,,,,,,,,,,,mes,12
2,,,,,,,,,,,,,,,,,año,24
3,,,,,,,,,,,,,,,,,hora,9
4,,,,,,,,,,,,,,,,,min,49


Error procesando la hoja ENE´24 - WEBGIS: bad operand type for unary ~: 'float'

Datos consolidados:


Unnamed: 0,Fecha,Obrador Kiteni,PS#1,KP 007+400,KP 018+000,KP 030+000,KP 040+000,KP 043+600,KP 046+100,KP 050+000,...,KP 090+000,KP 095+000,PS#2,KP 126+000,KP 181+900,KP 184+039,PS#3,PS#4/XV10007,KP 127+000,KP 142+000
0,44927,2.54,21.7,2.4,0.4,0.0,1.6,0,44.5,39.0,...,4.6,3.8,1.6,3.4,0.0,0,3.8,4.0,,
1,44928,1.0,2.7,0.0,7.0,8.2,6.3,0,20.0,40.0,...,0.0,0.0,0.6,14.4,0.0,0,1.0,1.0,,
2,44929,7.62,0.0,0.0,11.3,5.0,24.5,0,58.0,23.0,...,1.0,0.0,0.6,8.2,0.0,0,1.6,2.0,,
3,44930,10.16,0.9,14.4,30.4,65.5,40.2,0,53.0,45.0,...,56.6,24.7,18.6,12.2,0.0,0,3.4,3.0,,
4,44931,35.56,6.3,9.0,13.4,18.2,17.6,0,21.6,28.0,...,55.2,43.3,17.8,20.4,0.0,0,25.4,24.0,,


# Limpieza de Datos y Análisis
Este bloque limpia los datos (manejo de `NaNs`), agrega información de rupturas, detecta outliers y calcula promedios mensuales.


In [5]:
# Calcular promedios mensuales para manejar NaNs
monthly_averages = all_processed_data.groupby(pd.to_datetime(all_processed_data['Fecha']).dt.month).mean(numeric_only=True)

# Manejo de valores faltantes
all_processed_data = handle_missing_values(all_processed_data, method='monthly_mean', monthly_averages=monthly_averages)

# Agregar columna de rupturas
rupture_dates = ['2020-01-15', '2020-05-20']
all_processed_data = add_rupture_column(all_processed_data, rupture_dates)

# Detectar outliers
outlier_thresholds = detect_outliers(all_processed_data)


Outlier umbral para Obrador Kiteni: 25.0
Outlier umbral para PS#1: 55.65999999999999
Outlier umbral para KP 007+400: 49.97999999999995
Outlier umbral para KP 018+000: 46.89999999999998
Outlier umbral para KP 030+000: 60.189999999999976
Outlier umbral para KP 040+000: 59.799999999999955
Outlier umbral para KP 043+600: 0.0
Outlier umbral para KP 046+100: 64.44999999999999
Outlier umbral para KP 050+000: 66.34999999999997
Outlier umbral para KP 057+000: 55.0
Outlier umbral para KP 064+000: 39.44999999999999
Outlier umbral para KP 071+000: 39.64999999999999
Outlier umbral para KP 082+000: 24.889999999999997
Outlier umbral para KP 090+000: 0.0
Outlier umbral para KP 095+000: 0.0
Outlier umbral para PS#2: 30.179999999999996
Outlier umbral para KP 126+000: 2.061290322580645
Outlier umbral para KP 181+900: 0.0
Outlier umbral para KP 184+039: 0.0
Outlier umbral para PS#3: 22.00199999999998
Outlier umbral para PS#4/XV10007: 23.44999999999999
Outlier umbral para KP 127+000: 17.251499999999997
Out

# Exportar Resultados
Guarda los datos procesados en un archivo CSV consolidado.


In [6]:
all_processed_data.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')
print(f"Datos consolidados guardados en: {OUTPUT_PATH}")


Datos consolidados guardados en: ../data/processed/pluviometros-eda-2023.csv
