## Data importation: importing data, handling missing data and customizing columns and values

Importing libraries

In [1]:
import pandas as pd
import numpy as np

Defining a global function, with local helper functions inside it, to clean each dataframe. This saves time and standardizes the cleaning process across all the dataframes. Mainly, the columns of interest are selected, and some data types and values are changed to improve understanding.

In [4]:
def limpieza(y, data_dir=None):

    """
    Load and clean the hospital-discharge CSV of a single year (one of 20) before concatenation.

    The file ``Egresos_Hospitalarios_{y}.csv`` is read from ``data_dir``, reduced to the
    columns of interest, stripped of rows with missing values in the required columns,
    tagged with the year and recoded from numeric codes to readable labels.
    Local helper functions perform each specific cleaning step.

    Parameters
    ----------
    y : str or int
        Year of the file to load. It is interpolated into the file name and stored
        in the ``Año`` column.
    data_dir : str or path-like, optional
        Directory that contains the raw CSV files. When omitted, the directory
        hard-coded below is used; edit it to match where you stored the data.

    Returns
    -------
    pandas.DataFrame
        Cleaned rows for the year. Rows whose SEXO is not 1/2 or whose PREVISION is
        not 1/2 are removed. The original row index is kept (it is not reset).

    Raises
    ------
    FileNotFoundError
        If the CSV for the requested year does not exist in ``data_dir``.
    KeyError
        If the file lacks one of the expected columns.
    """

    # Edit this default (or pass data_dir) to point at the directory where you stored the data
    if data_dir is None:
        data_dir = "C:/Users/mirko/Desktop/EK/JyJ/artroplastia rodilla PAD/Egresos Hospitalarios/raw_data"
    año_df = f"{data_dir}/Egresos_Hospitalarios_{y}.csv"
    df_limpio = pd.read_csv(año_df, encoding='ISO-8859-1', on_bad_lines='skip', sep = ";", low_memory=False)

    # Columns kept in the cleaned frame
    columnas = ["ID_PACIENTE","SEXO","EDAD_A_OS","PREVISION", "PERTENENCIA_ESTABLECIMIENTO_SALUD",
                "GLOSA_ESTABLECIMIENTO_SALUD","DIAS_ESTADA","CONDICION_EGRESO","DIAG1","GLOSA_DIAG1"]
    # Columns that must be present in a row; ID_PACIENTE and GLOSA_ESTABLECIMIENTO_SALUD may be missing
    requeridas = ["SEXO","EDAD_A_OS","PREVISION", "PERTENENCIA_ESTABLECIMIENTO_SALUD",
                  "DIAS_ESTADA","CONDICION_EGRESO","DIAG1","GLOSA_DIAG1"]
    # Code columns compared as strings by the recoding helpers below
    columnas_texto = ["SEXO", "ID_PACIENTE", "PREVISION", "CONDICION_EGRESO"]

    # Codes arrive as "1"/"2" when pandas parsed the column as integers and as
    # "1.0"/"2.0" when it parsed it as floats, so both spellings are mapped.
    etiquetas_sexo = {"1": "Hombre", "1.0": "Hombre", "2": "Mujer", "2.0": "Mujer"}
    etiquetas_prevision = {"1": "Fonasa", "1.0": "Fonasa", "2": "Isapre", "2.0": "Isapre"}
    etiquetas_condicion = {"1": "Vivo", "1.0": "Vivo", "2": "Muerto", "2.0": "Muerto"}
    etiquetas_pertenencia = {
        "Pertenecientes al Sistema Nacional de Servicios de Salud, SNSS": "Publico",
        "No Pertenecientes al Sistema Nacional de Servicios de Salud, SNSS": "Privado",
    }

    def importacion(df):
        """Select the columns of interest, drop incomplete rows, add the year and cast code columns to str."""
        df.columns = df.columns.str.strip()
        df = df[columnas].dropna(subset=requeridas).copy()
        df["Año"] = f"{y}"
        df["Año"] = pd.to_datetime(df["Año"]).dt.year
        for columna in columnas_texto:
            df[columna] = df[columna].astype(str)
        return df

    def sexo(x):
        """Recode sex: 1 -> "Hombre", 2 -> "Mujer"; anything else -> "Otro" (removed later)."""
        return etiquetas_sexo.get(x, "Otro")

    def prevision(x):
        """Recode insurance: 1 -> "Fonasa", 2 -> "Isapre"; anything else -> "Eliminar" (removed later)."""
        return etiquetas_prevision.get(x, "Eliminar")

    def condicion(x):
        """Recode discharge condition: 1 -> "Vivo", 2 -> "Muerto"; other values are left untouched."""
        return etiquetas_condicion.get(x, x)

    def pubpriv(x):
        """Shorten the establishment ownership label to "Publico"/"Privado"; other values are left untouched."""
        return etiquetas_pertenencia.get(x, x)

    def col_transformation(data):
        """Apply the recoding helpers and drop rows flagged as "Eliminar" or "Otro"."""
        data["SEXO"] = data["SEXO"].apply(sexo)
        data["PREVISION"] = data["PREVISION"].apply(prevision)
        data["CONDICION_EGRESO"] = data["CONDICION_EGRESO"].apply(condicion)
        data["PERTENENCIA_ESTABLECIMIENTO_SALUD"] = data["PERTENENCIA_ESTABLECIMIENTO_SALUD"].apply(pubpriv)
        conservar = (data['PREVISION'] != 'Eliminar') & (data['SEXO'] != 'Otro')

        return data[conservar]

    df_limpio = importacion(df_limpio)
    df_limpio = col_transformation(df_limpio)

    return df_limpio
    

Looping over the 20 dataframes to import, clean and store them dynamically. Again, this is more efficient than applying the function to each dataframe separately.

In [7]:
# Years covered by the raw files: 2001 through 2020 (20 files)
años = range(2001, 2021)

# List of file paths, kept for reference. Edit the formatting including the directory where you stored the data.
# Note: limpieza() builds its own path from the year, so the directory configured there is the one actually read.
archivos = [fr"C:\Users\mirko\Desktop\EK\JyJ\artroplastia rodilla PAD\Egresos Hospitalarios\raw_data\Egresos_Hospitalarios_{año}.csv" for año in años]

# Cleaned DataFrames keyed by "df_<year>", i.e. "df_2001" ... "df_2020"
dataframes = {}

# Load and clean one year at a time; the year is passed as a string, as limpieza() interpolates it into the file name
for año in años:
    dataframes[f"df_{año}"] = limpieza(f"{año}")

# Access DataFrames by their names
# print(dataframes["df_2005"].head())  # View the first few rows of the 2005 DataFrame
# print(dataframes["df_2007"].info())  # View information about the 2007 DataFrame

## Concatenate data

With the raw dataframes imported and cleaned, it is time to concatenate them. Rows with missing values in the required columns were already dropped during the cleaning step.

In [9]:
# Concatenate: stack the per-year frames into one DataFrame with a fresh 0..n-1 index
frames_por_año = list(dataframes.values())
combined_df = pd.concat(frames_por_año, ignore_index=True)
print(combined_df.shape)

# Shapes noted by the author:
#   without dropping nulls: (26340664, 10)
#   dropping nulls:         (26340655, 10)

(26340655, 11)


In [11]:
# On the concatenated dataframe, some columns are cast to lighter data types to reduce memory usage.
# Casting column by column (rather than the whole frame at once) avoids copying the full dataset.
columnas_categoricas = (
    "SEXO",
    "ID_PACIENTE",
    "PREVISION",
    "PERTENENCIA_ESTABLECIMIENTO_SALUD",
    "CONDICION_EGRESO",
)
columnas_enteras = ("DIAS_ESTADA", "EDAD_A_OS")

# Repeated labels are stored once as categories
for columna in columnas_categoricas:
    combined_df[columna] = combined_df[columna].astype("category")

# Counts of days and years of age are stored as 32-bit integers
for columna in columnas_enteras:
    combined_df[columna] = combined_df[columna].astype(np.int32)

# TODO: rename columns

In [13]:
# Print the column dtypes and memory usage to confirm the casts applied in the previous cell
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26340655 entries, 0 to 26340654
Data columns (total 11 columns):
 #   Column                             Dtype   
---  ------                             -----   
 0   ID_PACIENTE                        category
 1   SEXO                               category
 2   EDAD_A_OS                          int32   
 3   PREVISION                          category
 4   PERTENENCIA_ESTABLECIMIENTO_SALUD  category
 5   GLOSA_ESTABLECIMIENTO_SALUD        object  
 6   DIAS_ESTADA                        int32   
 7   CONDICION_EGRESO                   category
 8   DIAG1                              object  
 9   GLOSA_DIAG1                        object  
 10  Año                                int32   
dtypes: category(5), int32(3), object(3)
memory usage: 1.4+ GB


In [15]:
# Preview the first rows of the combined frame to check the recoded labels
combined_df.head()

Unnamed: 0,ID_PACIENTE,SEXO,EDAD_A_OS,PREVISION,PERTENENCIA_ESTABLECIMIENTO_SALUD,GLOSA_ESTABLECIMIENTO_SALUD,DIAS_ESTADA,CONDICION_EGRESO,DIAG1,GLOSA_DIAG1,Año
0,,Hombre,60,Fonasa,Privado,Clínica Familia,2,Muerto,C780,TUMOR MALIGNO SECUNDARIO DEL PULMÓN,2001
1,,Hombre,74,Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",58,Muerto,E145,"DIABETES MELLITUS NO ESPECIFICADA, CON COMPLIC...",2001
2,,Hombre,71,Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",12,Muerto,J189,"NEUMONIA, NO ESPECIFICADA",2001
3,,Hombre,2,Fonasa,Publico,Instituto de Neurocirugía Dr. Alfonso Asenjo,1,Muerto,S065,HEMORRAGIA SUBDURAL TRAUMÁTICA,2001
4,,Hombre,81,Fonasa,Publico,Hospital Dr. Leonardo Guzmán (Antofagasta),7,Vivo,K830,COLANGITIS,2001


In [17]:
# Final (rows, columns) of the combined frame before exporting
combined_df.shape

(26340655, 11)

# Export data to CSV and Parquet

In [21]:
# Write the combined frame to a CSV file (relative path), leaving out the row index
combined_df.to_csv("Egresos_2001-2020.csv", index = False)

The dataframe is also saved as a Parquet file, which is lighter and easier to import into other notebooks or to upload to Kaggle for sharing.

In [19]:
# Write the combined frame to a gzip-compressed Parquet file (relative path), leaving out the row index
combined_df.to_parquet('Egresos_2001-2020.parquet', index=False, compression="gzip")  # Save as Parquet