# Libraries and Data importation 2001-2020

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df_20 = pd.read_parquet(r"C:\Users\mirko\Desktop\Curso Data Science DL\EDAHospDischarges_Chile2001-2024\data\processed\General_analysis_data.parquet")

In [4]:
df_20.head()

Unnamed: 0,Sex,Age,Health insurance,Healthcare facility type,Healthcare facility name,Length of stay,Discharge condition,Primary diagnosis code,Primary diagnosis name,Year
0,Hombre,60,Fonasa,Privado,Clínica Familia,2,Muerto,C780,TUMOR MALIGNO SECUNDARIO DEL PULMÓN,2001
1,Hombre,74,Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",58,Muerto,E145,"DIABETES MELLITUS NO ESPECIFICADA, CON COMPLIC...",2001
2,Hombre,71,Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",12,Muerto,J189,"NEUMONIA, NO ESPECIFICADA",2001
3,Hombre,2,Fonasa,Publico,Instituto de Neurocirugía Dr. Alfonso Asenjo,1,Muerto,S065,HEMORRAGIA SUBDURAL TRAUMÁTICA,2001
4,Hombre,81,Fonasa,Publico,Hospital Dr. Leonardo Guzmán (Antofagasta),7,Vivo,K830,COLANGITIS,2001


In [5]:
df_20.shape

(25823381, 10)

In [6]:
df_20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25823381 entries, 0 to 25823380
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   Sex                       object  
 1   Age                       int32   
 2   Health insurance          category
 3   Healthcare facility type  category
 4   Healthcare facility name  object  
 5   Length of stay            int32   
 6   Discharge condition       category
 7   Primary diagnosis code    object  
 8   Primary diagnosis name    object  
 9   Year                      int64   
dtypes: category(3), int32(2), int64(1), object(4)
memory usage: 1.2+ GB


In [7]:
# Deleting births
mask_births = df_20["Primary diagnosis code"].str.startswith("O")

In [8]:
df_20 = df_20[~mask_births]

# Turning Age into age ranges to match them with discharges of years 2021-2024

In [10]:
# Define a function to turn age into numpy array that represent age range
def age_range(n):
    age_ranges = [[30, 39],[60, 69],[20, 29],[50, 59],[40, 49],[70, 79],[80, 120],[10, 19],[1, 9],[0, 1]]
    rango_n = None
    for rango in age_ranges:
        if n >= int(rango[0]) and n <= int(rango[1]):
            rango_n = np.array(rango)
    return(rango_n)

In [11]:
df_20["Age"] = df_20["Age"].apply(age_range)

# Importing data from 2021 to 2024

In [13]:
df_24 = pd.read_parquet(r"C:\Users\mirko\Desktop\Curso Data Science DL\EDAHospDischarges_Chile2001-2024\data\processed\Egresos_2021-2024.parquet")

In [14]:
df_24

Unnamed: 0,Healthcare facility type,Sex,Health insurance,Year_with_nans,Primary diagnosis code,Length of stay,Discharge condition,Year,Age
0,No Pertenecientes al Sistema Nacional de Servi...,Hombre,Isapre,2021,K590,1,1,2021,"[1, 9]"
1,No Pertenecientes al Sistema Nacional de Servi...,Hombre,Isapre,2021,T181,1,1,2021,"[1, 9]"
2,No Pertenecientes al Sistema Nacional de Servi...,Hombre,Fonasa,2021,Q381,1,1,2021,"[1, 9]"
3,No Pertenecientes al Sistema Nacional de Servi...,Hombre,Fonasa,2021,Q531,1,1,2021,"[1, 9]"
4,No Pertenecientes al Sistema Nacional de Servi...,Hombre,Fonasa,2021,Q539,1,1,2021,"[1, 9]"
...,...,...,...,...,...,...,...,...,...
4970759,Pertenecientes al Sistema Nacional de Servicio...,Mujer,Isapre,2024,S525,5,1,2024,"[70, 79]"
4970760,Pertenecientes al Sistema Nacional de Servicio...,Mujer,Isapre,2024,K805,9,1,2024,"[80, 120]"
4970761,Pertenecientes al Sistema Nacional de Servicio...,Mujer,Isapre,2024,M169,7,1,2024,"[80, 120]"
4970762,Pertenecientes al Sistema Nacional de Servicio...,Mujer,Isapre,2024,I489,9,2,2024,"[80, 120]"


In [15]:
df_24.shape

(4970764, 9)

In [16]:
df_24.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970764 entries, 0 to 4970763
Data columns (total 9 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   Healthcare facility type  category
 1   Sex                       object  
 2   Health insurance          object  
 3   Year_with_nans            category
 4   Primary diagnosis code    category
 5   Length of stay            int32   
 6   Discharge condition       category
 7   Year                      int16   
 8   Age                       object  
dtypes: category(4), int16(1), int32(1), object(3)
memory usage: 166.2+ MB


# Joining dataframes

Preparing dataframes before joining them

In [19]:
# Both columns have the same data, but the column from the original data will be kept and the column name will be changed to match in both dataframes
df_24 = df_24.drop(columns = "Year").rename(columns = {"Year_with_nans" : "Year"})

In [20]:
# This column is created to have the same structure between both dataframes
df_24["Primary diagnosis name"] = pd.NA
df_24["healthcare facility name"] = pd.NA

In [21]:
# Making sure column names do not have unexpected spaces or characters
df_20.columns = df_20.columns.str.strip().str.lower()
df_24.columns = df_24.columns.str.strip().str.lower()

In [22]:
df_24.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970764 entries, 0 to 4970763
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   healthcare facility type  category
 1   sex                       object  
 2   health insurance          object  
 3   year                      category
 4   primary diagnosis code    category
 5   length of stay            int32   
 6   discharge condition       category
 7   age                       object  
 8   primary diagnosis name    object  
 9   healthcare facility name  object  
dtypes: category(4), int32(1), object(5)
memory usage: 232.6+ MB


In [23]:
# Convert dtypes of both dataframes to be able to concatenate
df_24.loc[:, "primary diagnosis name"] = df_24["primary diagnosis name"].astype(object)
df_24.loc[:, "healthcare facility name"] = df_24["healthcare facility name"].astype(object)
df_24.loc[:, "healthcare facility type"] = df_24["healthcare facility type"].astype(str)
df_24.loc[:, "discharge condition"] = df_24["discharge condition"].astype(str)
df_24["year"] = pd.to_numeric(df_24["year"].astype(str), errors="coerce").astype("Int64")
df_24.loc[:, "length of stay"] = pd.to_numeric(df_24["length of stay"], errors="coerce")
df_24.loc[:, "primary diagnosis code"] = df_24["primary diagnosis code"].astype(object)
df_24.loc[:, "health insurance"] = df_24["health insurance"].astype(object)

In [24]:
df_20.loc[:, "health insurance"] = df_20["health insurance"].astype(object)
df_20.loc[:, "healthcare facility type"] = df_20["healthcare facility type"].astype(str)
df_20.loc[:, "discharge condition"] = df_20["discharge condition"].astype(str)
df_20.loc[:, "year"] = pd.to_numeric(df_20["year"], errors="coerce")
df_20 = df_20.copy()  # ensures it's not a slice
df_20["age"] = df_20["age"].astype(object)
df_20.loc[:, "length of stay"] = pd.to_numeric(df_20["length of stay"], errors="coerce")
df_20.loc[:, "primary diagnosis name"] = df_20["primary diagnosis name"].astype(object)

In [25]:
df_24.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970764 entries, 0 to 4970763
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   healthcare facility type  category
 1   sex                       object  
 2   health insurance          object  
 3   year                      Int64   
 4   primary diagnosis code    category
 5   length of stay            int32   
 6   discharge condition       category
 7   age                       object  
 8   primary diagnosis name    object  
 9   healthcare facility name  object  
dtypes: Int64(1), category(3), int32(1), object(5)
memory usage: 270.5+ MB


In [26]:
df_20.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20788765 entries, 0 to 25823380
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   sex                       object  
 1   age                       object  
 2   health insurance          category
 3   healthcare facility type  category
 4   healthcare facility name  object  
 5   length of stay            int32   
 6   discharge condition       category
 7   primary diagnosis code    object  
 8   primary diagnosis name    object  
 9   year                      int64   
dtypes: category(3), int32(1), int64(1), object(5)
memory usage: 1.2+ GB


In [27]:
# reset index to concatenate
df_24 = df_24.reset_index(drop=True)

In [28]:
df_20 = df_20.reset_index(drop=True)

In [29]:
# concatenate data from year 2001 to 2020 and 2021 to 2024
df = pd.concat([df_20, df_24], axis=0, ignore_index=True)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25759529 entries, 0 to 25759528
Data columns (total 10 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   sex                       object
 1   age                       object
 2   health insurance          object
 3   healthcare facility type  object
 4   healthcare facility name  object
 5   length of stay            int32 
 6   discharge condition       object
 7   primary diagnosis code    object
 8   primary diagnosis name    object
 9   year                      Int64 
dtypes: Int64(1), int32(1), object(8)
memory usage: 1.8+ GB


In [31]:
df

Unnamed: 0,sex,age,health insurance,healthcare facility type,healthcare facility name,length of stay,discharge condition,primary diagnosis code,primary diagnosis name,year
0,Hombre,"[60, 69]",Fonasa,Privado,Clínica Familia,2,Muerto,C780,TUMOR MALIGNO SECUNDARIO DEL PULMÓN,2001
1,Hombre,"[70, 79]",Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",58,Muerto,E145,"DIABETES MELLITUS NO ESPECIFICADA, CON COMPLIC...",2001
2,Hombre,"[70, 79]",Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",12,Muerto,J189,"NEUMONIA, NO ESPECIFICADA",2001
3,Hombre,"[1, 9]",Fonasa,Publico,Instituto de Neurocirugía Dr. Alfonso Asenjo,1,Muerto,S065,HEMORRAGIA SUBDURAL TRAUMÁTICA,2001
4,Hombre,"[80, 120]",Fonasa,Publico,Hospital Dr. Leonardo Guzmán (Antofagasta),7,Vivo,K830,COLANGITIS,2001
...,...,...,...,...,...,...,...,...,...,...
25759524,Mujer,"[70, 79]",Isapre,Pertenecientes al Sistema Nacional de Servicio...,,5,1,S525,,2024
25759525,Mujer,"[80, 120]",Isapre,Pertenecientes al Sistema Nacional de Servicio...,,9,1,K805,,2024
25759526,Mujer,"[80, 120]",Isapre,Pertenecientes al Sistema Nacional de Servicio...,,7,1,M169,,2024
25759527,Mujer,"[80, 120]",Isapre,Pertenecientes al Sistema Nacional de Servicio...,,9,2,I489,,2024


In [32]:
# Functions to fix categories
def pubpriv(x):
    if x == "Pertenecientes al Sistema Nacional de Servicios de Salud, SNSS":
        x = "Publico"
    elif x == "No Pertenecientes al Sistema Nacional de Servicios de Salud, SNSS":
        x = "Privado"
    return x

def discharge_condition(x):
    if x == "2":
        x = "Muerto"
    elif x == "1":
        x = "Vivo"
    return x

In [33]:
df["discharge condition"] = df["discharge condition"].apply(discharge_condition)

In [34]:
df["healthcare facility type"] = df["healthcare facility type"].apply(pubpriv)

In [35]:
# Convert dtypes to reduce even more dataframe size
cols_to_convert = ["sex", "health insurance", "discharge condition", "healthcare facility type"]
df[cols_to_convert] = df[cols_to_convert].astype("category")

In [36]:
df["year"] = df["year"].astype("int32")

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25759529 entries, 0 to 25759528
Data columns (total 10 columns):
 #   Column                    Dtype   
---  ------                    -----   
 0   sex                       category
 1   age                       object  
 2   health insurance          category
 3   healthcare facility type  category
 4   healthcare facility name  object  
 5   length of stay            int32   
 6   discharge condition       category
 7   primary diagnosis code    object  
 8   primary diagnosis name    object  
 9   year                      int32   
dtypes: category(4), int32(2), object(4)
memory usage: 1.1+ GB


In [38]:
df.head()

Unnamed: 0,sex,age,health insurance,healthcare facility type,healthcare facility name,length of stay,discharge condition,primary diagnosis code,primary diagnosis name,year
0,Hombre,"[60, 69]",Fonasa,Privado,Clínica Familia,2,Muerto,C780,TUMOR MALIGNO SECUNDARIO DEL PULMÓN,2001
1,Hombre,"[70, 79]",Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",58,Muerto,E145,"DIABETES MELLITUS NO ESPECIFICADA, CON COMPLIC...",2001
2,Hombre,"[70, 79]",Fonasa,Publico,"Hospital Del Salvador (Santiago, Providencia)",12,Muerto,J189,"NEUMONIA, NO ESPECIFICADA",2001
3,Hombre,"[1, 9]",Fonasa,Publico,Instituto de Neurocirugía Dr. Alfonso Asenjo,1,Muerto,S065,HEMORRAGIA SUBDURAL TRAUMÁTICA,2001
4,Hombre,"[80, 120]",Fonasa,Publico,Hospital Dr. Leonardo Guzmán (Antofagasta),7,Vivo,K830,COLANGITIS,2001


# Export joint dataframe with discharges from 2001 to 2024

In [78]:
df.to_parquet('Discharges_2001-2024_messy_codes.parquet', index=False, compression="snappy")