# 1. Load and inspect data

In [37]:
import pandas as pd
import os

# Folder holding the raw CSV exports, relative to this notebook
RAW_DATA_PATH = "../backend/data/raw/csv/"

# Tables expected as <name>.csv inside RAW_DATA_PATH
datasets = ["patients", "conditions", "encounters", "medications", "claims", "payers"]

# Read each table that is present into dfs[<name>].
# Every column is read as text (dtype=str); type conversion happens in later cells.
dfs = {}
for dataset in datasets:
    file_path = os.path.join(RAW_DATA_PATH, f"{dataset}.csv")
    if not os.path.exists(file_path):
        # A missing file is only reported; dfs gets no entry for that table
        print(f"File {dataset}.csv not found in the raw data folder.")
        continue
    dfs[dataset] = pd.read_csv(file_path, dtype=str)
    print(f"Loaded {dataset}.csv with {dfs[dataset].shape[0]} rows and {dfs[dataset].shape[1]} columns.")


Loaded patients.csv with 2287 rows and 28 columns.
Loaded conditions.csv with 93114 rows and 7 columns.
Loaded encounters.csv with 167146 rows and 15 columns.
Loaded medications.csv with 163608 rows and 13 columns.
Loaded claims.csv with 330754 rows and 31 columns.
Loaded payers.csv with 10 rows and 22 columns.


# 2.1 Patients dataset

In [38]:
# Pick the patients table out of the loaded datasets
df_patients = dfs["patients"]

# Let pandas show every column of a wide frame instead of eliding the middle ones
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# List the column names
print("Columns in patients dataset:")
print(list(df_patients.columns))

# Preview the first 5 rows
print("\nFirst 5 rows of patients dataset:")
display(df_patients.head(5))

Columns in patients dataset:
['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX', 'FIRST', 'MIDDLE', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE', 'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'FIPS', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE', 'INCOME']

First 5 rows of patients dataset:


Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,a02a3f43-b6f4-d209-4062-af0cec3b22c1,2015-08-02,,999-99-5386,,,,Jacobo456,Patricio639,Malave728,,,,white,hispanic,M,Gaudalajara Jalisco MX,530 Zboncak Landing Suite 63,Montebello,California,Los Angeles County,6037.0,90022,34.012467415112646,-118.14746462284438,33339.29,4690.13,65295
1,d92132ce-06ac-3ab4-217f-97257a290b22,2011-09-25,,999-65-7810,,,,Dannette613,Darcie474,Bauch723,,,,white,nonhispanic,F,Berkeley California US,593 Doyle Ranch,Lemon Hill,California,Sacramento County,,0,38.55092979196755,-121.40955183225064,2798.49,141417.37,31770
2,abc59f62-dc5a-5095-1141-80b4ee8be73b,1996-06-13,,999-37-1058,S99918022,X63553287X,Mrs.,Jacque955,Jin479,Satterfield305,,Will178,M,white,nonhispanic,F,Santa Cruz California US,492 Keebler Estate,Fairfield,California,Solano County,6095.0,94585,38.18324005715623,-121.9588555053258,348990.79,186000.37,41915
3,54f1059e-6250-3949-6dd0-1dda9b85d22a,2003-02-12,,999-28-3364,S99936929,X15345756X,Ms.,Fredricka415,Matha641,Crist667,,,,white,nonhispanic,F,San Jose California US,931 Bartell Ville Apt 84,Irvine,California,Orange County,6059.0,92676,33.74517092481692,-117.74997027981271,56198.94,133887.3,330702
4,239ae86a-96db-6211-9042-d3f2850aabb8,1970-06-17,,999-74-7366,S99962894,X13153521X,Mr.,Darrell400,Harry448,Muller251,,,D,white,hispanic,M,Dublin California US,541 Stracke Plaza,San Jose,California,Santa Clara County,6085.0,95140,37.30226576037261,-121.96683622886896,10675.97,194000.08,3928


# 2.2 Drop Unnecessary Columns

In [39]:
# Columns removed from the patients table in this step
columns_to_drop = [
    'PREFIX', 'MIDDLE', 'SSN', 'DRIVERS', 'PASSPORT', 'SUFFIX',
    'MAIDEN', 'BIRTHPLACE', 'FIPS', 'LAT', 'LON',
]

# drop() returns a new frame, so df_patients itself keeps all of its columns
df_patients_cleaned = df_patients.drop(columns_to_drop, axis=1)

# Confirm which columns remain
print("Columns after dropping unnecessary ones:")
print(list(df_patients_cleaned.columns))

# Preview the first 5 rows of the trimmed frame
display(df_patients_cleaned.head(5))

Columns after dropping unnecessary ones:
['Id', 'BIRTHDATE', 'DEATHDATE', 'FIRST', 'LAST', 'MARITAL', 'RACE', 'ETHNICITY', 'GENDER', 'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'ZIP', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE', 'INCOME']


Unnamed: 0,Id,BIRTHDATE,DEATHDATE,FIRST,LAST,MARITAL,RACE,ETHNICITY,GENDER,ADDRESS,CITY,STATE,COUNTY,ZIP,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,a02a3f43-b6f4-d209-4062-af0cec3b22c1,2015-08-02,,Jacobo456,Malave728,,white,hispanic,M,530 Zboncak Landing Suite 63,Montebello,California,Los Angeles County,90022,33339.29,4690.13,65295
1,d92132ce-06ac-3ab4-217f-97257a290b22,2011-09-25,,Dannette613,Bauch723,,white,nonhispanic,F,593 Doyle Ranch,Lemon Hill,California,Sacramento County,0,2798.49,141417.37,31770
2,abc59f62-dc5a-5095-1141-80b4ee8be73b,1996-06-13,,Jacque955,Satterfield305,M,white,nonhispanic,F,492 Keebler Estate,Fairfield,California,Solano County,94585,348990.79,186000.37,41915
3,54f1059e-6250-3949-6dd0-1dda9b85d22a,2003-02-12,,Fredricka415,Crist667,,white,nonhispanic,F,931 Bartell Ville Apt 84,Irvine,California,Orange County,92676,56198.94,133887.3,330702
4,239ae86a-96db-6211-9042-d3f2850aabb8,1970-06-17,,Darrell400,Muller251,D,white,hispanic,M,541 Stracke Plaza,San Jose,California,Santa Clara County,95140,10675.97,194000.08,3928


# 2.3 Missing values

In [40]:
# Null count per column, narrowed to the columns that have at least one null
missing_values = df_patients_cleaned.isna().sum()
missing_values = missing_values.loc[missing_values.gt(0)]

if not missing_values.empty:
    print("Missing values per column:")
    print(missing_values)
else:
    print("No missing values found.")

Missing values per column:
DEATHDATE    2000
MARITAL       811
dtype: int64


In [41]:
# Rows with no MARITAL value get the explicit label 'Unknown'.
# The result is assigned back to the column rather than filled in place.
marital_filled = df_patients_cleaned["MARITAL"].fillna("Unknown")
df_patients_cleaned["MARITAL"] = marital_filled

# DEATHDATE is intentionally left null (no replacement)

# Re-check the null counts, then preview the first 5 rows
print("Missing values after cleaning:")
print(df_patients_cleaned.isna().sum())
display(df_patients_cleaned.head(5))

Missing values after cleaning:
Id                        0
BIRTHDATE                 0
DEATHDATE              2000
FIRST                     0
LAST                      0
MARITAL                   0
RACE                      0
ETHNICITY                 0
GENDER                    0
ADDRESS                   0
CITY                      0
STATE                     0
COUNTY                    0
ZIP                       0
HEALTHCARE_EXPENSES       0
HEALTHCARE_COVERAGE       0
INCOME                    0
dtype: int64


Unnamed: 0,Id,BIRTHDATE,DEATHDATE,FIRST,LAST,MARITAL,RACE,ETHNICITY,GENDER,ADDRESS,CITY,STATE,COUNTY,ZIP,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,a02a3f43-b6f4-d209-4062-af0cec3b22c1,2015-08-02,,Jacobo456,Malave728,Unknown,white,hispanic,M,530 Zboncak Landing Suite 63,Montebello,California,Los Angeles County,90022,33339.29,4690.13,65295
1,d92132ce-06ac-3ab4-217f-97257a290b22,2011-09-25,,Dannette613,Bauch723,Unknown,white,nonhispanic,F,593 Doyle Ranch,Lemon Hill,California,Sacramento County,0,2798.49,141417.37,31770
2,abc59f62-dc5a-5095-1141-80b4ee8be73b,1996-06-13,,Jacque955,Satterfield305,M,white,nonhispanic,F,492 Keebler Estate,Fairfield,California,Solano County,94585,348990.79,186000.37,41915
3,54f1059e-6250-3949-6dd0-1dda9b85d22a,2003-02-12,,Fredricka415,Crist667,Unknown,white,nonhispanic,F,931 Bartell Ville Apt 84,Irvine,California,Orange County,92676,56198.94,133887.3,330702
4,239ae86a-96db-6211-9042-d3f2850aabb8,1970-06-17,,Darrell400,Muller251,D,white,hispanic,M,541 Stracke Plaza,San Jose,California,Santa Clara County,95140,10675.97,194000.08,3928


In [42]:
# Column dtypes before any conversion
patient_column_types = df_patients_cleaned.dtypes
print(patient_column_types)


Id                     object
BIRTHDATE              object
DEATHDATE              object
FIRST                  object
LAST                   object
MARITAL                object
RACE                   object
ETHNICITY              object
GENDER                 object
ADDRESS                object
CITY                   object
STATE                  object
COUNTY                 object
ZIP                    object
HEALTHCARE_EXPENSES    object
HEALTHCARE_COVERAGE    object
INCOME                 object
dtype: object


In [43]:
import pandas as pd
import re  # no longer used by this cell; kept in case later cells rely on it being imported

# Type conversion and derived columns for the patients table:
#   - Id is renamed to PATIENTID
#   - BIRTHDATE / DEATHDATE become datetimes
#   - AGE is derived from BIRTHDATE
#   - the financial columns become numeric
#   - trailing digits are stripped from FIRST / LAST

# Rename Id column to PATIENTID (rename ignores the key if the column is already renamed)
df_patients_cleaned = df_patients_cleaned.rename(columns={"Id": "PATIENTID"})

# Convert BIRTHDATE and DEATHDATE to datetime; unparseable or missing values become NaT
for date_col in ["BIRTHDATE", "DEATHDATE"]:
    df_patients_cleaned[date_col] = pd.to_datetime(df_patients_cleaned[date_col], errors="coerce")

# AGE = completed years between BIRTHDATE and today.
# Subtracting calendar years alone overstates the age by one for everyone whose
# birthday has not happened yet this year, so one year is taken off in that case.
# Rows with a missing BIRTHDATE end up with a missing AGE.
# NOTE(review): AGE is measured against today even when DEATHDATE is set, so
# deceased patients keep ageing - confirm whether age at death is wanted instead.
today = pd.Timestamp.today()
birth = df_patients_cleaned["BIRTHDATE"]
birthday_passed = (birth.dt.month < today.month) | (
    (birth.dt.month == today.month) & (birth.dt.day <= today.day)
)
df_patients_cleaned["AGE"] = today.year - birth.dt.year - (~birthday_passed).astype(int)

# Convert financial columns to numeric; values that cannot be parsed become NaN
financial_columns = ["HEALTHCARE_EXPENSES", "HEALTHCARE_COVERAGE", "INCOME"]
for col in financial_columns:
    df_patients_cleaned[col] = pd.to_numeric(df_patients_cleaned[col], errors="coerce")

# Round financial values to 2 decimal places
df_patients_cleaned[financial_columns] = df_patients_cleaned[financial_columns].round(2)

# Strip the digits appended to FIRST and LAST names (e.g. "Jacobo456" -> "Jacobo").
# Only a trailing run of digits is removed; missing names stay missing.
for name_col in ["FIRST", "LAST"]:
    df_patients_cleaned[name_col] = df_patients_cleaned[name_col].str.replace(r"\d+$", "", regex=True)

# Display the updated data types and first few rows
print(df_patients_cleaned.dtypes)
display(df_patients_cleaned.head())

PATIENTID                      object
BIRTHDATE              datetime64[ns]
DEATHDATE              datetime64[ns]
FIRST                          object
LAST                           object
MARITAL                        object
RACE                           object
ETHNICITY                      object
GENDER                         object
ADDRESS                        object
CITY                           object
STATE                          object
COUNTY                         object
ZIP                            object
HEALTHCARE_EXPENSES           float64
HEALTHCARE_COVERAGE           float64
INCOME                          int64
AGE                             int64
dtype: object


Unnamed: 0,PATIENTID,BIRTHDATE,DEATHDATE,FIRST,LAST,MARITAL,RACE,ETHNICITY,GENDER,ADDRESS,CITY,STATE,COUNTY,ZIP,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME,AGE
0,a02a3f43-b6f4-d209-4062-af0cec3b22c1,2015-08-02,NaT,Jacobo,Malave,Unknown,white,hispanic,M,530 Zboncak Landing Suite 63,Montebello,California,Los Angeles County,90022,33339.29,4690.13,65295,10
1,d92132ce-06ac-3ab4-217f-97257a290b22,2011-09-25,NaT,Dannette,Bauch,Unknown,white,nonhispanic,F,593 Doyle Ranch,Lemon Hill,California,Sacramento County,0,2798.49,141417.37,31770,14
2,abc59f62-dc5a-5095-1141-80b4ee8be73b,1996-06-13,NaT,Jacque,Satterfield,M,white,nonhispanic,F,492 Keebler Estate,Fairfield,California,Solano County,94585,348990.79,186000.37,41915,29
3,54f1059e-6250-3949-6dd0-1dda9b85d22a,2003-02-12,NaT,Fredricka,Crist,Unknown,white,nonhispanic,F,931 Bartell Ville Apt 84,Irvine,California,Orange County,92676,56198.94,133887.3,330702,22
4,239ae86a-96db-6211-9042-d3f2850aabb8,1970-06-17,NaT,Darrell,Muller,D,white,hispanic,M,541 Stracke Plaza,San Jose,California,Santa Clara County,95140,10675.97,194000.08,3928,55


In [44]:
# Count patient rows that have no PATIENTID
print("Missing PATIENTID values:", df_patients_cleaned["PATIENTID"].isna().sum())


Missing PATIENTID values: 0


In [45]:
# Peek at the first 10 distinct ZIP values
zip_sample = df_patients_cleaned["ZIP"].unique()[:10]
print(zip_sample)


['90022' '00000' '94585' '92676' '95140' '90066' '91210' '92870' '93117'
 '92703']


In [46]:
# Keep ZIP as text, but leave missing values missing: a plain astype(str) can
# turn NaN into the literal string "nan", which would then be written to the CSV.
zip_codes = df_patients_cleaned["ZIP"]
df_patients_cleaned["ZIP"] = zip_codes.astype(str).where(zip_codes.notna())


In [47]:
import os

# Output folder for the cleaned CSV files; created if it does not exist yet
PROCESSED_DATA_PATH = "../backend/data/processed/"
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# Write the cleaned patients table without the row index
patients_out_path = os.path.join(PROCESSED_DATA_PATH, "patients_cleaned.csv")
df_patients_cleaned.to_csv(patients_out_path, index=False)

print("✅ Cleaned patients dataset saved successfully!")


✅ Cleaned patients dataset saved successfully!


# 3.1 Conditions Dataset

In [59]:
# Pick the conditions table out of the loaded datasets
df_conditions = dfs["conditions"]

# List the column names
print("Columns in conditions dataset:")
print(list(df_conditions.columns))

# Preview the first 5 rows
display(df_conditions.head(5))

Columns in conditions dataset:
['START', 'STOP', 'PATIENT', 'ENCOUNTER', 'SYSTEM', 'CODE', 'DESCRIPTION']


Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION
0,2015-03-01,2016-09-04,d92132ce-06ac-3ab4-217f-97257a290b22,e0819fcd-cd3c-371c-efa4-d923e3788730,http://snomed.info/sct,314529007,Medication review due (situation)
1,2015-08-02,2016-04-10,a02a3f43-b6f4-d209-4062-af0cec3b22c1,b06e3a3b-7d43-d28d-06c9-db649f132502,http://snomed.info/sct,314529007,Medication review due (situation)
2,2016-09-23,2016-10-23,d92132ce-06ac-3ab4-217f-97257a290b22,c138ba5e-eea4-b4a1-d431-30ae17bedfad,http://snomed.info/sct,32911000,Homeless (finding)
3,2015-09-13,2015-11-08,a02a3f43-b6f4-d209-4062-af0cec3b22c1,722dacab-9a59-66ca-16f9-16703dcd2543,http://snomed.info/sct,65363002,Otitis media (disorder)
4,2017-08-12,,d92132ce-06ac-3ab4-217f-97257a290b22,364c86f0-1d1f-6fcb-bd8c-5f55d8bc0d47,http://snomed.info/sct,367498001,Seasonal allergic rhinitis (disorder)


# 3.2 Drop unnecessary columns and parse dates

In [60]:
import pandas as pd

# Start again from the raw conditions table
df_conditions = dfs["conditions"]

# Remove SYSTEM and CODE; drop() returns a new frame, so df_conditions is unchanged
df_conditions_cleaned = df_conditions.drop(["SYSTEM", "CODE"], axis=1)

# Parse START and STOP as datetimes; unparseable or missing values become NaT
for date_col in ("START", "STOP"):
    df_conditions_cleaned[date_col] = pd.to_datetime(df_conditions_cleaned[date_col], errors="coerce")

# Show the resulting dtypes and the first 5 rows
print(df_conditions_cleaned.dtypes)
display(df_conditions_cleaned.head(5))

START          datetime64[ns]
STOP           datetime64[ns]
PATIENT                object
ENCOUNTER              object
DESCRIPTION            object
dtype: object


Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,DESCRIPTION
0,2015-03-01,2016-09-04,d92132ce-06ac-3ab4-217f-97257a290b22,e0819fcd-cd3c-371c-efa4-d923e3788730,Medication review due (situation)
1,2015-08-02,2016-04-10,a02a3f43-b6f4-d209-4062-af0cec3b22c1,b06e3a3b-7d43-d28d-06c9-db649f132502,Medication review due (situation)
2,2016-09-23,2016-10-23,d92132ce-06ac-3ab4-217f-97257a290b22,c138ba5e-eea4-b4a1-d431-30ae17bedfad,Homeless (finding)
3,2015-09-13,2015-11-08,a02a3f43-b6f4-d209-4062-af0cec3b22c1,722dacab-9a59-66ca-16f9-16703dcd2543,Otitis media (disorder)
4,2017-08-12,NaT,d92132ce-06ac-3ab4-217f-97257a290b22,364c86f0-1d1f-6fcb-bd8c-5f55d8bc0d47,Seasonal allergic rhinitis (disorder)


# 3.3 Creating new useful columns

In [61]:
import re  # no longer used by this cell; kept in case later cells rely on it being imported

# Derived columns for the conditions table:
#   DURATION_DAYS  - whole days between START and STOP (missing while STOP is missing)
#   CONDITION_TYPE - the parenthesised tag that ends the description, e.g. "disorder"
#   DESCRIPTION    - the description with that trailing tag removed

# Calculate DURATION_DAYS (difference between STOP and START)
df_conditions_cleaned["DURATION_DAYS"] = (df_conditions_cleaned["STOP"] - df_conditions_cleaned["START"]).dt.days

# One pattern drives both the extraction and the removal, so the two columns always agree.
# It matches only a parenthesised tag at the very end of the text; parentheses
# earlier in a description are left alone.
SEMANTIC_TAG_PATTERN = r"\s*\(([^()]*)\)\s*$"

# Always start from the untouched raw descriptions. Reading the already-stripped
# DESCRIPTION column back would turn every CONDITION_TYPE into "Unknown" when
# this cell is run a second time.
raw_description = df_conditions["DESCRIPTION"]

# Extract condition type; descriptions that are missing or carry no trailing tag get "Unknown"
df_conditions_cleaned["CONDITION_TYPE"] = (
    raw_description.str.extract(SEMANTIC_TAG_PATTERN, expand=False).fillna("Unknown")
)

# Remove condition type from DESCRIPTION column (missing descriptions stay missing)
df_conditions_cleaned["DESCRIPTION"] = raw_description.str.replace(SEMANTIC_TAG_PATTERN, "", regex=True)

# Display cleaned data
print(df_conditions_cleaned.dtypes)
display(df_conditions_cleaned.head())


START             datetime64[ns]
STOP              datetime64[ns]
PATIENT                   object
ENCOUNTER                 object
DESCRIPTION               object
DURATION_DAYS            float64
CONDITION_TYPE            object
dtype: object


Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,DESCRIPTION,DURATION_DAYS,CONDITION_TYPE
0,2015-03-01,2016-09-04,d92132ce-06ac-3ab4-217f-97257a290b22,e0819fcd-cd3c-371c-efa4-d923e3788730,Medication review due,553.0,situation
1,2015-08-02,2016-04-10,a02a3f43-b6f4-d209-4062-af0cec3b22c1,b06e3a3b-7d43-d28d-06c9-db649f132502,Medication review due,252.0,situation
2,2016-09-23,2016-10-23,d92132ce-06ac-3ab4-217f-97257a290b22,c138ba5e-eea4-b4a1-d431-30ae17bedfad,Homeless,30.0,finding
3,2015-09-13,2015-11-08,a02a3f43-b6f4-d209-4062-af0cec3b22c1,722dacab-9a59-66ca-16f9-16703dcd2543,Otitis media,56.0,disorder
4,2017-08-12,NaT,d92132ce-06ac-3ab4-217f-97257a290b22,364c86f0-1d1f-6fcb-bd8c-5f55d8bc0d47,Seasonal allergic rhinitis,,disorder


# 3.4 Missing values

In [62]:
# Null count for every column of the cleaned conditions table
print("Missing values in conditions dataset:")
print(df_conditions_cleaned.isna().sum())


Missing values in conditions dataset:
START                 0
STOP              21524
PATIENT               0
ENCOUNTER             0
DESCRIPTION           0
DURATION_DAYS     21524
CONDITION_TYPE        0
dtype: int64


# 3.5 Duplicates

In [63]:
# Flag rows that repeat an earlier row in every column, then count them
duplicate_mask = df_conditions_cleaned.duplicated()
duplicates = duplicate_mask.sum()
print(f"Duplicate rows: {duplicates}")


Duplicate rows: 0


# 3.6 Saving the file

In [64]:
# Output folder for the cleaned CSV files; created if it does not exist yet
PROCESSED_DATA_PATH = "../backend/data/processed/"
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# Write the cleaned conditions table without the row index
conditions_out_path = os.path.join(PROCESSED_DATA_PATH, "conditions_cleaned.csv")
df_conditions_cleaned.to_csv(conditions_out_path, index=False)

print("✅ Cleaned conditions dataset saved successfully!")

✅ Cleaned conditions dataset saved successfully!
