In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the raw tables from the working directory.
# bloodgas.csv is the only file read with low_memory=False; presumably it has
# mixed-type columns that need whole-file type inference — TODO confirm.
bloodgas = pd.read_csv("bloodgas.csv", low_memory=False)

# The remaining tables are read with pandas' default options, in this order.
devices, encounter, patient, pulseox, spectrophotometer = (
    pd.read_csv(csv_name)
    for csv_name in (
        "devices.csv",
        "encounter.csv",
        "patient.csv",
        "pulseoximeter.csv",
        "spectrophotometer.csv",
    )
)

Filter patient.csv and only include the columns `patient_id`, `assigned_sex`, and `race`. Export as new csv to `Cleaned_datasets/`.

In [None]:
# Remove the site and ethnicity fields from the patient table and write the
# remainder to the cleaned-data folder (row index is not written).
# NOTE(review): this is a deny-list, so any other column present in
# patient.csv is kept as well — confirm the file holds nothing beyond
# patient_id, assigned_sex and race once these two are removed.
patient_filtered = patient.drop(columns=['site_id', 'ethnicity'])
patient_filtered.to_csv('Cleaned_datasets/patient_filtered.csv', index=False)

Filter encounter.csv, retaining only the identifier columns, the age at encounter, the skin-tone ratings (Monk fingernail, Monk palmar, Fitzpatrick), and the measurements for the right-hand pointer finger (r2). Export as a new CSV to `Cleaned_datasets/`.

In [None]:
# Columns kept from the encounter table: identifiers, age, skin-tone ratings,
# and the two right-hand pointer-finger (r2) fields.
retain_columns = [
    'patient_id',
    'encounter_id',
    'age_at_encounter',
    'monk_fingernail',
    'monk_palmar',
    'fitzpatrick',
    'finger_r2_device',
    'finger_r2_diameter',
]

# Select every row but only the columns listed above, then export without
# the row index.
encounter_filtered = encounter.loc[:, retain_columns]
encounter_filtered.to_csv('Cleaned_datasets/encounter_filtered.csv', index=False)

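Filter pulseox.csv by dropping `probe_location`, keeping only saturation readings between 30 and 100, and collecting the `saturation` and `pi` values for each (`encounter_id`, `sample_number`) pair into lists. A filter on `pi` may be added later, because readings with pi < 0.4 can be considered unreliable; `pi` could also help distinguish healthy from unhealthy subjects, since it is essentially a measure of circulation. Export as a new CSV to `Cleaned_datasets/`.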

In [20]:
# Work on a copy of the pulse-oximeter table without the probe_location column.
pulseox_filtered = pulseox.drop(columns=['probe_location'])

def safe_float(x):
    """Convert ``x`` to a float, returning NaN when it cannot be converted.

    Parameters
    ----------
    x : any
        Value to convert, e.g. a number or a numeric string.

    Returns
    -------
    float
        ``float(x)`` when the conversion succeeds; ``np.nan`` when ``float``
        raises ``ValueError`` or ``TypeError`` (for example on non-numeric
        text or ``None``).
    """
    try:
        converted = float(x)
    except (TypeError, ValueError):
        # Invalid entries are marked as missing rather than aborting the run.
        converted = np.nan
    return converted

# Coerce every saturation entry to a float; entries that cannot be parsed
# become NaN (see safe_float).
pulseox_filtered['saturation'] = pulseox_filtered['saturation'].apply(safe_float)

# Keep readings in the inclusive range 30–100. NaN fails the range test, so
# rows whose saturation could not be parsed are removed here as well.
saturation_in_range = pulseox_filtered['saturation'].between(30, 100)
pulseox_filtered = pulseox_filtered[saturation_in_range]

# Collapse to one row per (encounter_id, sample_number); the saturation and
# pi values belonging to each pair are gathered into Python lists.
pulseox_grouped = pulseox_filtered.groupby(['encounter_id', 'sample_number'])
pulseox_filtered = pulseox_grouped.agg({'saturation': list, 'pi': list}).reset_index()

pulseox_filtered.to_csv('Cleaned_datasets/pulseox_filtered.csv', index=False)

Filter spectrophotometer.csv by retaining only the quantitative skin-tone values and the wavelengths relevant to pulse oximetry (660 nm and 700 nm). Drop any groups that are not fingernail- or hand-related, and drop rows with missing values. Export as a new CSV to `Cleaned_datasets/`.

In [None]:
# Identifier columns plus the skin-tone / absorbance measurements to keep.
retain_columns = [
    'patient_id',
    'encounter_id',
    'group',
    'melanin_index',
    'hb_index',
    'hb_so2_index',
    'lab_l',
    'km660',
    'km700',
]

# Restrict to the palmar and fingernail measurement groups.
is_hand_or_nail = spectrophotometer['group'].isin(['Palmar (C)', 'Fingernail (A)'])
spectrophotometer_filtered = spectrophotometer.loc[is_hand_or_nail, retain_columns]

# Discard any row that lacks one of the measurement values, then renumber the
# remaining rows from zero.
spectrophotometer_filtered = spectrophotometer_filtered.dropna(
    subset=['melanin_index', 'hb_index', 'hb_so2_index', 'lab_l', 'km660', 'km700']
).reset_index(drop=True)

spectrophotometer_filtered.to_csv('Cleaned_datasets/spectrophotometer_filtered.csv', index=False)

Filter bloodgas.csv. Eventually this should be based on parameters that identify healthy individuals (recording their IDs), but for now only the oxygen saturation values are retained so that they can be compared with the pulse-oximeter readings and, later, across skin tones. Export as a new CSV to `Cleaned_datasets/`.

In [21]:

# Keep the identifiers and the two oxygen-saturation columns, renaming
# `sample` to `sample_number` (the grouping key name used for the
# pulse-oximeter table).
retain_columns=['patient_id','encounter_id','sample','so2','ScalcO2']
bloodgas_filtered = bloodgas[retain_columns].rename(columns={'sample':'sample_number'})

# Convert both saturation columns to float. Entries that cannot be parsed
# become NaN instead of raising, mirroring how the pulse-oximeter saturation
# column is coerced.
bloodgas_filtered = bloodgas_filtered.assign(
    so2=pd.to_numeric(bloodgas_filtered['so2'], errors='coerce').astype(float),
    ScalcO2=pd.to_numeric(bloodgas_filtered['ScalcO2'], errors='coerce').astype(float),
)

# Range-check (30–100 inclusive) every value that is present, but tolerate a
# missing value in one of the two columns. Comparing NaN against the bounds is
# always False, so a plain range filter on `so2` would discard exactly the
# rows whose `so2` is meant to be back-filled from `ScalcO2` afterwards.
so2_acceptable = bloodgas_filtered['so2'].isna() | bloodgas_filtered['so2'].between(30, 100)
scalc_acceptable = bloodgas_filtered['ScalcO2'].isna() | bloodgas_filtered['ScalcO2'].between(30, 100)
# A row with neither value carries no usable saturation and is dropped.
has_saturation = bloodgas_filtered['so2'].notna() | bloodgas_filtered['ScalcO2'].notna()
bloodgas_filtered = bloodgas_filtered[so2_acceptable & scalc_acceptable & has_saturation]

# One row per (encounter_id, sample_number); the so2 / ScalcO2 values of each
# pair are gathered into position-aligned lists (a missing value stays NaN).
bloodgas_filtered = (
    bloodgas_filtered.groupby(['encounter_id', 'sample_number'])
      .agg({'so2': list, 'ScalcO2': list})
      .reset_index()
)

def fill_so2(so2_list, ScalcO2_list):
    """Return the so2 values with NaN entries replaced by the ScalcO2 value.

    The two sequences are walked in parallel; iteration stops at the end of
    the shorter one.

    Parameters
    ----------
    so2_list : sequence of float
        so2 values; NaN marks a missing entry.
    ScalcO2_list : sequence of float
        ScalcO2 values, position-aligned with ``so2_list``.

    Returns
    -------
    list
        For each position, the so2 value unless it is NaN, in which case the
        ScalcO2 value at that position is used instead.
    """
    filled = []
    for measured, calculated in zip(so2_list, ScalcO2_list):
        if np.isnan(measured):
            filled.append(calculated)
        else:
            filled.append(measured)
    return filled

# Build so2_filled row by row: each row's so2 list with its NaN entries
# replaced by the ScalcO2 value at the same position (see fill_so2).
bloodgas_filtered['so2_filled'] = list(
    map(fill_so2, bloodgas_filtered['so2'], bloodgas_filtered['ScalcO2'])
)

# Write the grouped table, without the row index, to the cleaned-data folder.
bloodgas_filtered.to_csv('Cleaned_datasets/bloodgas_filtered.csv', index=False)