# Libraries and Data importation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from unidecode import unidecode

# General view of the data

In [4]:
# Importing data from 2001 to 2024 but diagnosis names are missing for data from 2021 to 2024
df = pd.read_parquet(r"C:\Users\mirko\Desktop\Curso Data Science DL\ProyectoEDA_Egresos_Hospitalarios_2001-2020\data\processed\Discharges_2001-2024_analysis.parquet")

In [5]:
df.head()

Unnamed: 0,sex,age,health insurance,healthcare facility type,healthcare facility name,length of stay,discharge condition,primary diagnosis code,primary diagnosis name,year
0,True,"[60, 69]",True,True,Clínica Familia,2,True,C780,TUMOR MALIGNO SECUNDARIO DEL PULMÓN,1970-01-01 00:00:00.000002001
1,True,"[70, 79]",True,True,"Hospital Del Salvador (Santiago, Providencia)",58,True,E145,"DIABETES MELLITUS NO ESPECIFICADA, CON COMPLIC...",1970-01-01 00:00:00.000002001
2,True,"[70, 79]",True,True,"Hospital Del Salvador (Santiago, Providencia)",12,True,J189,"NEUMONIA, NO ESPECIFICADA",1970-01-01 00:00:00.000002001
3,True,"[1, 9]",True,True,Instituto de Neurocirugía Dr. Alfonso Asenjo,1,True,S065,HEMORRAGIA SUBDURAL TRAUMÁTICA,1970-01-01 00:00:00.000002001
4,True,"[80, 120]",True,True,Hospital Dr. Leonardo Guzmán (Antofagasta),7,True,K830,COLANGITIS,1970-01-01 00:00:00.000002001


In [6]:
# Diagnosis names are missing for data from 2021 to 2024
# Healthcare facility name is missing in data from 2021 to 2024, so it will be replaced for "Unknown"
df.isna().sum()

sex                               0
age                               2
health insurance                  0
healthcare facility type          0
healthcare facility name    4970764
length of stay                    0
discharge condition               0
primary diagnosis code            0
primary diagnosis name      4970764
year                              0
dtype: int64

# Primary diagnosis names and codes cleaning

In [7]:
# To normalize strings they are turned to lower case and accents and unexpected whitespaces are deleted
df["primary diagnosis name"] = df["primary diagnosis name"].map(
    lambda x: unidecode(x).lower().strip() if isinstance(x, str) else x
)


In [8]:
# Step 1: Save in an object the 2 columns associated with diagnosis
diagnosis = df[["primary diagnosis code", "primary diagnosis name"]].copy()

In [9]:
# There are not nan codes, so it is possible to assign a name to the nan values in "primary diagnosis name"
diagnosis.isna().sum()

primary diagnosis code          0
primary diagnosis name    4970764
dtype: int64

In [10]:
# Step 2: Filter out rows where the name is None or NaN
diagnosis = diagnosis[diagnosis["primary diagnosis name"].notna()]

In [11]:
# A new column is created with only valid diangosis names
diagnosis["name_clean"] = diagnosis["primary diagnosis name"]

In [12]:
# Step 3: Group by code and take the first normalized name
first_valid = (
    diagnosis.groupby("primary diagnosis code")["name_clean"]
    .first()          # first value after grouping
    .reset_index()
)

# Step 4: Build dictionary
clean_dictionary = dict(
    zip(first_valid["primary diagnosis code"], first_valid["name_clean"])
)

In [13]:
# Step 5: vectorization to map names with the corresponding code in the dictionary and create a series with them to replace NaN names
replacement_series = df["primary diagnosis code"].map(clean_dictionary)

In [14]:
# Fill NaN names with the corresponding one depending on its diagnosis code for each row
df["primary diagnosis name"] = df["primary diagnosis name"].fillna(replacement_series)

In [16]:
# Still there are codes without a name in the dictionary. That possibly means that the diagnosis was not present until 2021
# Other option is the glossary (which is considering until year 2020) was not considering new diangosis codes yet (which is the case for U109 and U099)
# Only B04X will be replaced manually due to its high frequency adnt he rest deleted from the dataframe
df[df["primary diagnosis name"].isna()]["primary diagnosis code"].value_counts()

primary diagnosis code
B04X    407
U109    353
U099    235
A971     41
U129     37
A979     24
A970     19
L987     17
P917      9
e115      5
A972      3
G233      3
z518      3
C61x      2
G835      2
c498      2
A309      2
E502      2
o998      2
A803      1
M493      1
B550      1
L652      1
A798      1
T674      1
B600      1
B050      1
k819      1
A288      1
i660      1
k409      1
A304      1
R452      1
C20x      1
j90x      1
c795      1
Q962      1
d62x      1
j81X      1
k709      1
Q987      1
u071      1
B355      1
R894      1
Name: count, dtype: int64

In [17]:
# Proof that this diagnosis is not present with a name in the entire dataframe
df[(df["primary diagnosis code"] == "B04X") & (df["primary diagnosis name"].notna())]

Unnamed: 0,sex,age,health insurance,healthcare facility type,healthcare facility name,length of stay,discharge condition,primary diagnosis code,primary diagnosis name,year


In [18]:
# Replacing NaN name with "viruela de los monos", according to the glossary
condition = (df["primary diagnosis code"] == "B04X") & (df["primary diagnosis name"].isna())
df.loc[condition, "primary diagnosis name"] = "viruela de los monos"

In [19]:
# Dropping NaNs from the dataframe
df = df[df["primary diagnosis name"].notna()]

In [29]:
# Group by code and get the unique names per code
conflict_check = (
    df.groupby("primary diagnosis code")["primary diagnosis name"]
    .unique()
    .reset_index()
)

# Keep only codes with more than one unique name
conflict_check = conflict_check[
    conflict_check["primary diagnosis name"].apply(lambda x: len(x) > 1)
]

# Show the result
print(f"Number of codes with multiple names: {len(conflict_check)}")
print(conflict_check)



Number of codes with multiple names: 9
     primary diagnosis code                             primary diagnosis name
158                    A402  [septicemia debida a estreptococo, grupo d, se...
191                    A491  [infeccion estreptococica, sin otra especifica...
366                    B170  [infeccion (superinfeccion) aguda por agente d...
1395                   D460  [anemia refractaria sin anillos de sideroblast...
3390                   J09X  [influenza aviar (gripe aviar), influenza a (h...
3391                   J100  [influenza con neumonia, debida a virus de la ...
3392                   J101  [influenza con otras manifestaciones respirato...
3393                   J108  [influenza, con otras manifestaciones, debida ...
3411                   J156  [neumonia debida a otras bacterias aerobicas g...


In [30]:
# Still after the normalization there are some codes with multiple diagnosis names, to simplify, the first to appear in the data will be kept
first_name_per_code = df.groupby("primary diagnosis code")["primary diagnosis name"].first()

In [31]:
# map and keep just the first diangosis name associated with the codes
df["primary diagnosis name"] = df["primary diagnosis code"].map(first_name_per_code)

In [32]:
# Group by code and collect unique names
check = (
    df.groupby("primary diagnosis code")["primary diagnosis name"]
    .nunique()  # count how many unique names per code
    .reset_index(name="unique_count")
)

# Look for any codes with more than one name
multiple_names = check[check["unique_count"] > 1]

# Show results
print(f"Number of codes with multiple names: {len(multiple_names)}")
print(multiple_names)


Number of codes with multiple names: 0
Empty DataFrame
Columns: [primary diagnosis code, unique_count]
Index: []


# Healthcare facility name cleaning

In [33]:
# NaN healthcare facility names are filled by the string "Unknown" to avoid losing valuable data in the other columns
df["healthcare facility name"] = df["healthcare facility name"].fillna("Unknown")

In [34]:
df.isna().sum()

sex                         0
age                         2
health insurance            0
healthcare facility type    0
healthcare facility name    0
length of stay              0
discharge condition         0
primary diagnosis code      0
primary diagnosis name      0
year                        0
dtype: int64

In [35]:
# Deleting remaining NaNs
df = df.dropna()

In [40]:
# Checking memory usage and column dtypes before exporting
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25758741 entries, 0 to 25759528
Data columns (total 10 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   sex                       bool          
 1   age                       object        
 2   health insurance          bool          
 3   healthcare facility type  bool          
 4   healthcare facility name  object        
 5   length of stay            int32         
 6   discharge condition       bool          
 7   primary diagnosis code    object        
 8   primary diagnosis name    object        
 9   year                      datetime64[ns]
dtypes: bool(4), datetime64[ns](1), int32(1), object(4)
memory usage: 1.3+ GB


# Export data

In [45]:
# This dataframe is the one that will be used to perform the general analysis from 2001 to 2024
df.to_parquet('Discharges_2001-2024_analysis_finalversion.parquet', index=False, compression="snappy")