# **Default Cell**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importación de Librerías y Datos**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Health records: pipe-separated text file parsed with pandas.
df_salud = pd.read_csv(
    '/kaggle/input/data-sura/1.Informacion Salud 2019-2024-001.txt',
    sep='|',
)

# Provider network: Excel workbook.
df_prest = pd.read_excel("/kaggle/input/data-sura/2.Red Prestadores.xlsx")

# **Análisis y Preparación de los Datos**

In [None]:
# Print a banner followed by the column/dtype/null summary of each frame,
# health records first and providers second.
for titulo, frame in (
    ("-===Informacion de Salud===-", df_salud),
    ("-===Informacion de Prestadores===-", df_prest),
):
    print(titulo)
    frame.info()

In [None]:
# Section banner, then the title of the health-data preview (one per line).
print("-===ENCABEZADOS===-", "=====Datos salud========", sep="\n")

# Last expression of the cell, so the notebook renders the first rows.
df_salud.head()

In [None]:
# Title line, then the first five rows of the providers frame
# (five is also the default of head()).
print("=====Datos Prestadores=====")
df_prest.head(n=5)

**Limpieza de Datos**

In [None]:
# --- Missing values ---
# Drop the health records that lack either of the two key columns
# (only a few rows are affected).
df_salud.dropna(subset=["Numero_Uen_Arp", "TIPO_CITA"], inplace=True)

# --- Type conversion ---
# Parse the date columns; values that cannot be parsed become NaT instead of
# raising, because of errors="coerce".
df_salud["FECHA_ATENCION"] = pd.to_datetime(df_salud["FECHA_ATENCION"], errors="coerce")
df_salud["FECHA_PROCESO"] = pd.to_datetime(df_salud["FECHA_PROCESO"], errors="coerce")

# --- Text normalisation ---
# Remove literal "?" characters from municipality names, trim surrounding
# whitespace and upper-case the result.
# regex=False is required: "?" is a regex metacharacter and the default value of
# `regex` differs between pandas versions, so relying on it is not portable.
df_salud["Nombre_Municipio_IPS"] = (
    df_salud["Nombre_Municipio_IPS"]
    .str.replace("?", "", regex=False)
    .str.strip()
    .str.upper()
)

# --- Imputation (providers) ---
# Fill missing 'max_cantidad' values with the median of their municipality.
# fillna() only touches the missing cells, so rows whose Geogra_Municipio_Id is
# null keep their original value (assigning the groupby-transform result
# directly would overwrite them with NaN). Municipalities with no known value
# at all remain NaN.
median_by_municipio = df_prest.groupby("Geogra_Municipio_Id")["max_cantidad"].transform("median")
df_prest["max_cantidad"] = df_prest["max_cantidad"].fillna(median_by_municipio)

**Integración** (NO PUEDE SER EJECUTADA EN KAGGLE)

In [None]:
# NOTE: nothing from this point on can be executed on Kaggle.
# Join both datasets on Geogra_Municipio_Id.
df_merged = pd.merge(
    df_salud,
    df_prest,
    on="Geogra_Municipio_Id",
    how="left",  # keep every health record, matched or not
    suffixes=("_salud", "_prestadores")
)

# Validate matches: count the health records whose municipality id does not
# appear in the providers table. In a left join each of those records yields
# exactly one unmatched row. Checking the join key directly avoids counting
# providers whose 'HOMOLOGACION NIT' is itself null, and does not depend on
# how that column is named after the suffixes are applied.
sin_coincidencia = ~df_salud["Geogra_Municipio_Id"].isin(df_prest["Geogra_Municipio_Id"])
print(f"Registros sin coincidencia en Prestadores: {sin_coincidencia.sum()}")

**Automatización y Reproducibilidad**

In [None]:
# Columns routed to each branch of the preprocessor.
numeric_features = ["Cantidad", "max_cantidad"]
categorical_features = ["CODIGO_ESTADO", "Nombre_Tipo_Atencion_Arp"]

# Numeric branch: a single StandardScaler step.
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: a single OneHotEncoder step configured with
# handle_unknown="ignore".
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own list of columns.
branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=branches)

# Full pipeline: currently only the preprocessing stage.
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Fit the pipeline on the merged frame and transform it in one call.
X_processed = pipeline.fit_transform(df_merged)

In [None]:
# A bare expression is rendered only when it is the last statement of a cell,
# so both previews are shown explicitly with display() (provided by the
# IPython/Jupyter kernel) to make the health-data preview appear as well.
display(df_salud.head())
print("==============")
display(df_prest.head())