## Data importation: 2021 to 2024 

In [110]:
# Import libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [112]:
# Import 4 dataframes
df_2021 = pd.read_csv(r"C:\Users\mirko\Desktop\EK\JyJ\artroplastia rodilla PAD\Egresos Hospitalarios\raw_data\Egresos_Hospitalarios_2021.csv",low_memory=False, encoding='latin1', sep=';')
df_2022 = pd.read_csv(r"C:\Users\mirko\Desktop\EK\JyJ\artroplastia rodilla PAD\Egresos Hospitalarios\raw_data\Egresos_Hospitalarios_2022.csv",low_memory=False, encoding='latin1', sep=';')
df_2023 = pd.read_csv(r"C:\Users\mirko\Desktop\EK\JyJ\artroplastia rodilla PAD\Egresos Hospitalarios\raw_data\Egresos_Hospitalarios_2023.csv",low_memory=False, encoding='latin1', sep=';')
df_2024 = pd.read_csv(r"C:\Users\mirko\Desktop\EK\JyJ\artroplastia rodilla PAD\Egresos Hospitalarios\raw_data\Egresos_Hospitalarios_2024.csv",low_memory=False, encoding='latin1', sep=';')

# Concatenate data

In [114]:
# Create a list of dataframes to concatenate
df_list = [df_2021, df_2022, df_2023, df_2024]

In [115]:
# loop over the dataframes to create a column "Year" whose value is the year of the discharges of each dataframe
year = 2021
for df in df_list:
    df["Year"] = year
    year += 1

In [116]:
# Concatenate the dataframes
df_concat = pd.concat(df_list)

# Matching columns between year 2001-2020 and 2021-2024.

In [122]:
# Drop columns that are irrelevant and/or are not the same as in the discharges from 2001 to 2020 dataframe
df_concat = df_concat.drop(columns = ["GLOSA_PAIS_ORIGEN", "COMUNA_RESIDENCIA", "GLOSA_COMUNA_RESIDENCIA", "REGION_RESIDENCIA",
                          "GLOSA_REGION_RESIDENCIA","PREVISION", "DIAG2", "ETNIA",
                          "GLOSA_INTERV_Q_PPAL", "GLOSA_PROCED_PPAL", "PERTENENCIA_ESTABLECIMIENTO_SALU"])

In [123]:
# Rename columns with the same names as in the discharges from 2001 to 2020 dataframe to merge in the future
df_concat = df_concat.rename(columns={"PERTENENCIA_ESTABLECIMIENTO_SALUD": "Healthcare facility type",
                          "SEXO": "Sex", "GRUPO_EDAD" : "Age", "GLOSA_PREVISION" : "Health insurance", "ANO_EGRESO" : "Year_with_nans",
                         "DIAG1" : "Primary diagnosis code", "DIAS_ESTADA" : "Length of stay", "CONDICION_EGRESO" : "Discharge condition"})

In [125]:
# A general view of the dataframe
df_concat

Unnamed: 0,Healthcare facility type,Sex,Age,Health insurance,Year_with_nans,Primary diagnosis code,Length of stay,Discharge condition,Year
0,No Pertenecientes al Sistema Nacional de Servi...,1,1 A 4 AÑOS,ISAPRE,2021,K590,1,1,2021
1,No Pertenecientes al Sistema Nacional de Servi...,1,1 A 4 AÑOS,NINGUNA,2021,S015,1,1,2021
2,No Pertenecientes al Sistema Nacional de Servi...,1,1 A 4 AÑOS,ISAPRE,2021,T181,1,1,2021
3,No Pertenecientes al Sistema Nacional de Servi...,1,1 A 4 AÑOS,FONASA,2021,Q381,1,1,2021
4,No Pertenecientes al Sistema Nacional de Servi...,1,1 A 4 AÑOS,FONASA,2021,Q531,1,1,2021
...,...,...,...,...,...,...,...,...,...
1667344,Pertenecientes al Sistema Nacional de Servicio...,*,7 A 27 DIAS,*,2024,P073,4,1,2024
1667345,Pertenecientes al Sistema Nacional de Servicio...,*,7 A 27 DIAS,*,2024,P590,2,1,2024
1667346,Pertenecientes al Sistema Nacional de Servicio...,*,7 A 27 DIAS,*,2024,P599,2,1,2024
1667347,Pertenecientes al Sistema Nacional de Servicio...,*,menor a 7 días,*,2024,E250,52,1,2024


In [128]:
# See column data types to see if it is possible to reduce memorey usage by changing them
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6343796 entries, 0 to 1667348
Data columns (total 9 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   Healthcare facility type  object
 1   Sex                       object
 2   Age                       object
 3   Health insurance          object
 4   Year_with_nans            object
 5   Primary diagnosis code    object
 6   Length of stay            int64 
 7   Discharge condition       int64 
 8   Year                      int64 
dtypes: int64(3), object(6)
memory usage: 484.0+ MB


In [130]:
# The original data uses "*" as a missing value, so it is important to count it on each column
(df_concat == "*").sum()

Healthcare facility type     39569
Sex                         173892
Age                         167677
Health insurance            173892
Year_with_nans              167677
Primary diagnosis code           0
Length of stay                   0
Discharge condition              0
Year                             0
dtype: int64

In [132]:
# Possibly also there are missing values (as nan) and there are 1.612.267 in the "Healthcare facility type" apart of the 39.569 "*" in the column
df_concat.isna().sum()

Healthcare facility type    1612267
Sex                               0
Age                               0
Health insurance                  0
Year_with_nans                    0
Primary diagnosis code            0
Length of stay                    0
Discharge condition               0
Year                              0
dtype: int64

In [69]:
# It is important to replace "*" by nan values to be able to change data types, so that it is possible to export the dataframe.
# Also, doing this allow to handle missing data properly
df_concat.replace(to_replace = "*", value = np.nan, inplace = True)

In [98]:
# Change data types on each column for a lighter one
df_concat["Healthcare facility type"] = df_concat["Healthcare facility type"].astype("category")
df_concat["Sex"] = df_concat["Sex"].astype("category")
df_concat["Age"] = df_concat["Age"].astype("category")
df_concat["Health insurance"] = df_concat["Health insurance"].astype("category")
df_concat["Year_with_nans"] = df_concat["Year_with_nans"].astype("category")
df_concat["Primary diagnosis code"] = df_concat["Primary diagnosis code"].astype("category")
df_concat["Length of stay"] = df_concat["Length of stay"].astype("int32")
df_concat["Discharge condition"] = df_concat["Discharge condition"].astype("category")
df_concat["Year"] = df_concat["Year"].astype("int16")

In [100]:
# Checking if the changes were made, noticing a reduction of memory usage from 484 to 133 mb.
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6343796 entries, 0 to 1667348
Data columns (total 9 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   Healthcare facility type  category
 1   Sex                       category
 2   Age                       category
 3   Health insurance          category
 4   Year_with_nans            category
 5   Primary diagnosis code    category
 6   Length of stay            int32   
 7   Discharge condition       category
 8   Year                      int16   
dtypes: category(7), int16(1), int32(1)
memory usage: 133.4 MB


In [102]:
# Force conversion with error checking
categorical_columns = [
    "Healthcare facility type", "Sex", "Age",
    "Health insurance", "Year_with_nans",
    "Primary diagnosis code", "Discharge condition"
]

for col in categorical_columns:
    df_concat[col] = df_concat[col].astype("string").astype("category")

In [106]:
# export as a compressed parquet to make the file even lighter. This helps to merge easier and faster with other dataframes
df_concat.to_parquet("Discharges_2021-2024_messy.parquet", engine="pyarrow", compression="snappy")