# Configuración Inicial
Este bloque configura las rutas, variables y estructuras necesarias para procesar los datos.


In [7]:
import os
import pandas as pd
import re

# Configuración general
YEAR = 2021
RAW_FOLDER = "../data/raw"
PROCESSED_FOLDER = "../data/processed"
FILE_PATH_XLSX = f"{RAW_FOLDER}/data_{YEAR}.xlsx"
FILE_PATH_XLSB = f"{RAW_FOLDER}/data_{YEAR}.xlsb"
OUTPUT_PATH = f"{PROCESSED_FOLDER}/pluviometros-eda-{YEAR}.csv"

# Mapeo de columnas
COLUMN_MAPPING = {
    'Fecha': ['Fecha', 'Date', 'Día', 'Dia'],
    'Obrador Kiteni': ['Base Operativa Kiteni', 'Kiteni', 'Obrador Kiteni'],
    # Más columnas...
}


# Funciones Auxiliares
Este bloque contiene las funciones necesarias para procesar los datos:
1. Detección de encabezados.
2. Identificación de columnas relevantes.
3. Manejo de valores faltantes (`NaNs`).
4. Agregar información sobre rupturas.
5. Identificar outliers.


In [8]:
def detect_header_row(df, max_rows=15):
    """Detecta la fila que contiene los encabezados reales."""
    for row in range(max_rows):
        possible_header = df.iloc[row].values
        if any(isinstance(val, str) and len(val.strip()) > 0 for val in possible_header):
            return row
    return 0  # Si no se detecta, usar la primera fila

def detect_relevant_columns(df, column_mapping):
    """Detecta columnas relevantes basadas en nombres similares."""
    detected_columns = {}
    for expected_col, possible_names in column_mapping.items():
        for col in df.columns:
            if any(re.search(rf"\b{re.escape(name)}\b", str(col), re.IGNORECASE) for name in possible_names):
                detected_columns[expected_col] = col
                break
    return detected_columns

def handle_missing_values(df, method='global_mean', monthly_averages=None):
    """Reemplaza NaNs con el promedio por columna (global o mensual)."""
    for col in df.columns:
        if col != 'Fecha':
            if method == 'global_mean':
                # Promedio global por columna
                mean_value = df[col].replace(0, pd.NA).mean(skipna=True)
                df[col] = df[col].fillna(mean_value)
            elif method == 'monthly_mean' and monthly_averages is not None:
                # Promedio mensual basado en datos disponibles
                df['Mes'] = pd.to_datetime(df['Fecha']).dt.month
                df[col] = df.apply(
                    lambda row: monthly_averages.loc[row['Mes'], col]
                    if pd.isnull(row[col]) else row[col], axis=1
                )
    return df

def add_rupture_column(df, rupture_dates):
    """Agrega una columna con información sobre rupturas basada en fechas específicas."""
    df['Ruptura'] = df['Fecha'].apply(lambda x: 1 if x in rupture_dates else 0)
    return df

def detect_outliers(df):
    """Detecta días con valores extremos de lluvia basados en el percentil 95."""
    outlier_thresholds = {}
    for col in df.columns:
        if col not in ['Fecha', 'Ruptura', 'Mes']:
            threshold = df[col].quantile(0.95)
            outlier_thresholds[col] = threshold
            print(f"Outlier umbral para {col}: {threshold}")
    return outlier_thresholds


# Carga de Datos
Este bloque carga los datos desde el archivo Excel y consolida todas las hojas.


In [9]:
if os.path.exists(FILE_PATH_XLSX):
    file_path, engine = FILE_PATH_XLSX, None
elif os.path.exists(FILE_PATH_XLSB):
    file_path, engine = FILE_PATH_XLSB, 'pyxlsb'
else:
    raise FileNotFoundError(f"No se encontró archivo para el año {YEAR}")

excel_data = pd.ExcelFile(file_path, engine=engine)
sheet_names = excel_data.sheet_names
print(f"Hojas disponibles: {sheet_names}")

all_processed_data = pd.DataFrame()

Hojas disponibles: ['ENE', 'FEB', 'MAR', 'ABR', 'MAY', 'JUN', 'JUL', 'AGO', 'SEP', 'OCT', 'NOV', 'DIC']


In [10]:
for sheet_name in sheet_names:
    print(f"\nProcesando hoja: {sheet_name}")

    try:
        # Cargar una muestra de la hoja
        sheet_preview = excel_data.parse(sheet_name, nrows=15)
        header_row = detect_header_row(sheet_preview)
        print(f"Encabezado detectado en la fila: {header_row}")

        # Carga completa de datos
        sheet_data = excel_data.parse(sheet_name, skiprows=header_row)
        print(f"Columnas disponibles antes del mapeo: {sheet_data.columns.tolist()}")
        display(sheet_data.head())

        # Eliminar columnas 'Unnamed'
        sheet_data = sheet_data.loc[:, ~sheet_data.columns.str.contains('^Unnamed')]
        print(f"Columnas después de eliminar 'Unnamed': {sheet_data.columns.tolist()}")

        # Validar si la hoja tiene datos significativos
        if sheet_data.empty or all(sheet_data.columns.str.contains('^Unnamed')):
            print(f"Hoja {sheet_name} no contiene datos relevantes. Omitiendo.")
            continue

        # Renombrar columnas según el mapeo directo
        new_column_names = {}
        for expected_col, possible_names in COLUMN_MAPPING.items():
            for col in sheet_data.columns:
                if any(re.search(rf"\b{re.escape(name)}\b", str(col), re.IGNORECASE) for name in possible_names):
                    new_column_names[col] = expected_col

        # Aplicar el renombramiento
        sheet_data = sheet_data.rename(columns=new_column_names)
        print(f"Columnas después del mapeo: {sheet_data.columns.tolist()}")

        # Generar columna Fecha
        if 'Fecha' not in sheet_data.columns and 'Dia' in sheet_data.columns:
            sheet_data['Fecha'] = sheet_data['Dia'].apply(
                lambda x: f"{YEAR}-{sheet_name}-{int(x):02d}" if pd.notnull(x) and isinstance(x, (int, float)) else None
            )
            sheet_data.drop(columns=['Dia'], inplace=True)

        # Concatenar datos procesados
        all_processed_data = pd.concat([all_processed_data, sheet_data], ignore_index=True)

    except Exception as e:
        print(f"Error procesando la hoja {sheet_name}: {e}")

print("\nDatos consolidados:")
display(all_processed_data.head())



Procesando hoja: ENE
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2020-01-01,6.4,57.6,40.0,25.0,54.2,56.2,55.6,48.4,30.6,...,13.2,9.0,24.0,12.6,4.4,8.4,14,2,0.0,0
1,2020-01-02,1.8,16.6,36.5,14.5,4.1,8.1,27.4,26.0,30.2,...,1.4,3.2,0.0,2.0,101.6,38.4,0,0,0.0,0
2,2020-01-03,21.9,20.4,11.4,6.0,34.0,74.3,13.6,15.3,32.8,...,16.2,14.4,37.0,27.0,37.4,34.2,0,0,0.0,0
3,2020-01-04,1.8,1.8,7.6,16.0,6.2,0.0,5.6,10.3,8.6,...,5.0,17.0,1.8,3.2,1.8,2.0,0,0,10.5,0
4,2020-01-05,48.5,34.6,47.2,0.9,4.2,22.0,27.4,67.0,69.6,...,9.2,26.2,9.8,41.0,35.0,24.0,0,0,0.0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: FEB
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-02-01,0,0.0,0.0,0.2,0.0,0.2,1.0,0.0,0.0,...,0.0,0.0,0.0,0.1,0.0,0,0.0,2.0,0,0
1,2021-02-02,0,0.0,1.6,20.0,8.4,24.4,46.2,0.0,0.0,...,0.0,0.0,0.0,1.2,0.0,0,6.2,0.0,0,0
2,2021-02-03,0,0.0,0.0,0.3,18.4,2.4,0.0,0.0,0.0,...,0.0,4.2,0.0,1.1,0.0,0,18.6,9.2,0,0
3,2021-02-04,0,0.0,22.0,33.6,0.0,70.2,39.0,45.9,0.0,...,0.0,36.5,54.8,1.4,0.0,0,16.4,2.6,0,0
4,2021-02-05,0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.1,1.0,0.0,0,16.0,18.6,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: MAR
Encabezado detectado en la fila: 3
Columnas disponibles antes del mapeo: ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'U

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43
0,FORMATO PARA PRECIPITACIÓN DE LLUVIAS,,,,,,,,,,...,,,,,,,,,NaT,
1,,,,,,,,,,,...,,,,,,,,,NaT,
2,1. INFORMACIÓN GENERAL,,,,,,,,,,...,,,,,,,,,NaT,
3,PROCESOS:,,,,,,,,,,...,,,,,,,,,NaT,
4,DEPENDENCIA O PROYECTO:,,,,,,,,,,...,,,,,,,,MES/AÑO,NaT,MARZO 2021


Columnas después de eliminar 'Unnamed': []
Hoja MAR no contiene datos relevantes. Omitiendo.

Procesando hoja: ABR
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-04-01,0,11.6,0.0,0.0,0.3,1.0,0.4,0.0,3.4,...,4.5,4.2,8.0,19.2,18.6,0,18.2,12.0,0,0
1,2021-04-02,0,0.0,3.2,0.0,33.6,14.0,6.7,36.0,3.5,...,5.0,0.0,0.2,0.0,18.2,0,12.2,2.6,0,0
2,2021-04-03,0,10.4,3.4,8.0,11.8,30.0,41.5,47.4,28.0,...,11.0,2.2,17.2,22.2,14.2,0,4.2,2.0,0,0
3,2021-04-04,0,8.8,4.3,4.0,4.6,4.0,4.8,8.2,6.0,...,7.0,6.2,18.6,6.2,38.8,0,46.0,18.6,0,0
4,2021-04-05,0,0.0,3.0,6.0,4.8,1.0,0.4,10.0,13.3,...,1.2,0.0,1.2,6.0,2.0,0,6.8,4.6,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: MAY
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-05-01,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,2.0,2.0,0,0
1,2021-05-02,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0
2,2021-05-03,0,0.0,0.0,0.0,6.0,8.3,6.2,12.2,4,...,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0,0
3,2021-05-04,0,0.0,0.3,9.6,6.8,28.9,33.3,37.7,58,...,5.1,18.0,39.4,1.3,0.0,0,0.0,0.0,0,0
4,2021-05-05,0,0.0,0.9,22.2,10.2,28.5,16.8,34.7,20,...,0.0,0.0,0.2,0.0,0.0,0,0.0,0.0,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: JUN
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-06-01,0,4.2,16.5,24.0,5.2,12.0,36.5,34.4,30.2,...,0.0,0.0,8.8,0.0,2.6,0,2.0,2.0,0,0
1,2021-06-02,0,0.0,1.3,0.0,0.4,6.2,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,2.6,0,0.0,2.6,0,0
2,2021-06-03,0,5.4,3.3,2.0,2.4,2.8,3.5,3.0,7.0,...,2.0,0.4,1.5,0.1,0.0,0,0.0,0.0,0,0
3,2021-06-04,0,4.8,16.5,6.0,10.4,20.2,44.5,28.6,2.4,...,5.8,15.4,1.6,0.4,6.2,0,4.0,2.6,0,0
4,2021-06-05,0,0.4,0.0,0.0,0.4,16.8,4.3,4.4,0.0,...,1.0,4.1,7.5,0.3,24.2,0,6.2,7.4,0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: JUL
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-82,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244
0,2021-07-01,0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,...,0.2,0.0,0.0,0.0,0.0,1.0,0,1.0,2.0,0
1,2021-07-02,0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,2021-07-03,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
3,2021-07-04,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.3,0.0,0,0.0,0.0,0
4,2021-07-05,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: AGO
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-82,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244
0,2021-08-01,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,2.0,2.0,0
1,2021-08-02,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,13.0,2.0,0,8.0,6.0,0
2,2021-08-03,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0,1.0,1.5,0
3,2021-08-04,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0,3.0,3.0,0
4,2021-08-05,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: SEP
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-09-01,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,8.2,8.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
1,2021-09-02,0,0.0,0.0,0.0,2.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0
2,2021-09-03,0,0.0,0.0,0.0,0.0,0.0,0,3.0,16.0,...,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0
3,2021-09-04,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0,8.0,8.0,0.0,0
4,2021-09-05,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.8,0.8,1.1,2.1,8.0,0,9.0,9.0,1.8,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: OCT
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-10-01,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0
1,2021-10-02,0,0.0,0.0,1.4,4.4,4.0,0,18.5,32.2,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0
2,2021-10-03,0,0.0,4.8,4.0,1.4,4.1,0,5.3,6.0,...,0.0,11.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2021-10-04,0,0.0,0.4,3.0,4.4,0.0,0,0.0,0.0,...,0.0,22.0,15.7,8.0,0.0,0.0,2.0,2.0,8.2,0
4,2021-10-05,0,17.3,10.6,7.0,23.2,11.0,0,26.7,56.2,...,18.0,20.4,27.0,16.0,0.0,0.0,12.0,10.0,16.8,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: NOV
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-11-01,0,0.0,9.7,0.0,0.0,1.0,0.0,0.0,8.2,...,0.0,0.0,0.0,23.8,2.0,10.0,1.0,0.0,6.0,0
1,2021-11-02,0,2.0,2.1,1.2,0.0,1.6,0.0,0.0,0.4,...,1.8,0.4,34.8,30.4,1.0,23.0,1.0,0.0,24.0,0
2,2021-11-03,0,70.4,54.4,25.2,70.6,90.0,85.0,75.0,97.8,...,48.6,25.4,41.0,68.0,3.5,16.0,2.5,0.5,70.0,0
3,2021-11-04,0,0.0,3.1,0.2,1.2,0.0,0.0,0.0,0.0,...,0.0,0.0,1.1,7.6,0.0,0.0,0.0,0.0,0.0,0
4,2021-11-05,0,0.0,0.0,0.0,0.4,3.0,37.0,25.0,31.0,...,0.0,19.0,5.0,2.3,5.0,7.0,3.0,0.0,2.0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Procesando hoja: DIC
Encabezado detectado en la fila: 0
Columnas disponibles antes del mapeo: ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244', 'Unnamed: 23']


Unnamed: 0,Día,Base Operativa Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244,Unnamed: 23
0,2021-12-01,0,1.8,16.6,26.0,19.2,18.7,15.0,5.3,6.6,...,8.0,5.8,11.9,4.3,4,8.7,4,2,4.2,0
1,2021-12-02,0,64.7,17.2,20.0,24.0,18.4,26.0,18.2,13.0,...,5.2,5.8,9.1,5.8,0,22.5,4,2,6.0,0
2,2021-12-03,0,53.1,26.6,0.2,25.2,61.3,45.0,37.3,28.4,...,12.0,20.3,15.2,9.8,4,4.2,6,6,16.3,0
3,2021-12-04,0,7.2,15.0,0.0,25.2,41.4,75.0,45.0,59.0,...,24.0,21.5,26.0,29.5,6,38.2,6,4,36.0,0
4,2021-12-05,0,27.4,9.2,24.0,34.0,38.3,56.0,60.3,70.3,...,6.0,6.8,12.4,9.5,0,8.2,0,0,7.0,0


Columnas después de eliminar 'Unnamed': ['Día', 'Base Operativa Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']
Columnas después del mapeo: ['Fecha', 'Obrador Kiteni', 'PS-1', 'KP-7+400', 'KP-18', 'KP-30', 'KP-40', 'KP-43+600', 'KP-46+100', 'KP-50', 'KP-57', 'KP-64', 'KP-71', 'KP-82', 'KP-90', 'KP-95', 'PS-2', 'KP-126', 'KP-181+900', 'KP-184', 'PS-3', 'PS-4', 'KP-244']

Datos consolidados:


Unnamed: 0,Fecha,Obrador Kiteni,PS-1,KP-7+400,KP-18,KP-30,KP-40,KP-43+600,KP-46+100,KP-50,...,KP-82,KP-90,KP-95,PS-2,KP-126,KP-181+900,KP-184,PS-3,PS-4,KP-244
0,2020-01-01,6.4,57.6,40.0,25.0,54.2,56.2,55.6,48.4,30.6,...,13.4,13.2,9.0,24.0,12.6,4.4,8.4,14.0,2.0,0.0
1,2020-01-02,1.8,16.6,36.5,14.5,4.1,8.1,27.4,26.0,30.2,...,3.0,1.4,3.2,0.0,2.0,101.6,38.4,0.0,0.0,0.0
2,2020-01-03,21.9,20.4,11.4,6.0,34.0,74.3,13.6,15.3,32.8,...,12.0,16.2,14.4,37.0,27.0,37.4,34.2,0.0,0.0,0.0
3,2020-01-04,1.8,1.8,7.6,16.0,6.2,0.0,5.6,10.3,8.6,...,13.3,5.0,17.0,1.8,3.2,1.8,2.0,0.0,0.0,10.5
4,2020-01-05,48.5,34.6,47.2,0.9,4.2,22.0,27.4,67.0,69.6,...,33.7,9.2,26.2,9.8,41.0,35.0,24.0,0.0,0.0,0.0


# Limpieza de Datos y Análisis
Este bloque limpia los datos (manejo de `NaNs`), agrega información de rupturas, detecta outliers y calcula promedios mensuales.


In [11]:
# Calcular promedios mensuales para manejar NaNs
monthly_averages = all_processed_data.groupby(pd.to_datetime(all_processed_data['Fecha']).dt.month).mean(numeric_only=True)

# Manejo de valores faltantes
all_processed_data = handle_missing_values(all_processed_data, method='monthly_mean', monthly_averages=monthly_averages)

# Agregar columna de rupturas
rupture_dates = ['2020-01-15', '2020-05-20']
all_processed_data = add_rupture_column(all_processed_data, rupture_dates)

# Detectar outliers
outlier_thresholds = detect_outliers(all_processed_data)


Outlier umbral para Obrador Kiteni: 4.4
Outlier umbral para PS-1: 53.025
Outlier umbral para KP-7+400: 57.575
Outlier umbral para KP-18: 50.55
Outlier umbral para KP-30: 51.125
Outlier umbral para KP-40: 70.0
Outlier umbral para KP-43+600: 79.7
Outlier umbral para KP-46+100: 69.0
Outlier umbral para KP-50: 79.05000000000001
Outlier umbral para KP-57: 80.8
Outlier umbral para KP-64: 50.849999999999994
Outlier umbral para KP-71: 40.25
Outlier umbral para KP-82: 35.625
Outlier umbral para KP-90: 26.95
Outlier umbral para KP-95: 30.05
Outlier umbral para PS-2: 29.5
Outlier umbral para KP-126: 27.25
Outlier umbral para KP-181+900: 24.6
Outlier umbral para KP-184: 24.325000000000003
Outlier umbral para PS-3: 18.6
Outlier umbral para PS-4: 16.0
Outlier umbral para KP-244: 18.5


# Exportar Resultados
Guarda los datos procesados en un archivo CSV consolidado.


In [12]:
all_processed_data.to_csv(OUTPUT_PATH, index=False, encoding='utf-8')
print(f"Datos consolidados guardados en: {OUTPUT_PATH}")


Datos consolidados guardados en: ../data/processed/pluviometros-eda-2021.csv
