In [1]:
# Declaration

import pandas as pd
import numpy as np
from enum import Enum
from enum import IntEnum

class Disease(IntEnum):
    """Integer condition codes used to pick records belonging to a disease.

    In this notebook ``Disease.DIABETES`` is compared with the ``REASONCODE``
    column of the medications table to select diabetes medications.

    NOTE(review): the values look like SNOMED CT concept ids -- confirm
    against the data dictionary of the source data set.
    """
    CHD = 53741008            # presumably coronary heart disease -- TODO confirm
    DIABETES = 44054006
    HYPERTENSION = 38341003
    MI = 22298006             # presumably myocardial infarction -- TODO confirm
    STROKE = 230690007

class DrugDiabetes(IntEnum):
    """Integer medication codes of the diabetes drugs tracked in this notebook.

    Each value is matched against the ``CODE_<value>`` indicator columns built
    from the medications ``CODE`` column; a matching column is renamed to the
    member name (e.g. ``CODE_860975`` becomes ``METFORMIN``).

    NOTE(review): the values look like RxNorm codes -- confirm.
    """
    METFORMIN = 860975
    INSULIN = 106892
    GLP1 = 897122
    SGLT2 = 1373463

class Observation(Enum):
    """String observation codes kept when cleaning the observations table.

    The ``.value`` of each member is compared with the observations ``CODE``
    column.  A plain ``Enum`` is used (not ``IntEnum``) because the codes are
    strings containing a dash.

    NOTE(review): the values look like LOINC codes -- confirm.
    """
    BH = "8302-2"         # presumably body height -- TODO confirm
    BW = "29463-7"        # presumably body weight -- TODO confirm
    BMI = "39156-5"
    DIASTOLIC = "8462-4"
    GLUCOSE = "2339-0"
    HBA1C = "4548-4"
    SYSTOLIC = "8480-6"


## LOAD & CLEAN DATA PER SET

source : set(setnumber)\_raw/
dest   : set(setnumber)/

In [2]:
# Name of the data set being cleaned; it is also the directory the cleaned
# CSV files are read from and written to.
cleaning_set = "set1"


def load_data(type):
    """Read one cleaned table of the current set into a DataFrame.

    Parameters
    ----------
    type : str
        Base name of the CSV file (without the ``.csv`` extension) inside the
        ``cleaning_set`` directory, e.g. ``"patients"``.

    Returns
    -------
    pandas.DataFrame
        The content of ``<cleaning_set>/<type>.csv``.
    """
    csv_path = "/".join([cleaning_set, type + ".csv"])
    return pd.read_csv(csv_path)

In [None]:
# cleaning - patients
def cleaning_patients():
    """Clean the raw patients table of the current set and save the result.

    Reads ``<cleaning_set>_raw/patients.csv`` (semicolon separated, first 17
    columns only), drops the identifying / unused columns, removes rows whose
    RACE or GENDER is not one of the accepted values, blanks out unknown
    ETHNICITY values, and writes ``<cleaning_set>/patients.csv``.

    Both paths are derived from the module-level ``cleaning_set``.
    Returns nothing; the result is the written CSV file.
    """
    pat = pd.read_csv(cleaning_set + "_raw/patients.csv", sep=";", usecols=range(17))
    pat = pat.drop(
        columns=[
            "SSN", "DRIVERS", "PASSPORT", "PREFIX", "FIRST", "LAST", "SUFFIX",
            "MARITAL", "MAIDEN", "BIRTHPLACE", "ADDRESS",
        ]
    )

    # The raw CSV is malformed in places, which shifts values into the wrong
    # columns.  Such rows are detected by checking the categorical columns
    # against their known vocabularies.  Boolean masks are used instead of a
    # chained in-place ``Series.replace`` so that the frame itself is updated
    # regardless of pandas' copy semantics.
    race_list = ["asian", "white", "black", "hispanic"]
    pat = pat[pat["RACE"].isin(race_list)].copy()

    # An unknown ethnicity only blanks the value; the row itself is kept.
    ethnicity_list = [
        "african", "american", "american_indian", "asian_indian",
        "central_american", "chinese", "dominican", "english", "french",
        "french_canadian", "german", "irish", "italian", "mexican", "polish",
        "portuguese", "puerto_rican", "russian", "scottish", "swedish",
        "west_indian",
    ]
    pat["ETHNICITY"] = pat["ETHNICITY"].where(pat["ETHNICITY"].isin(ethnicity_list))

    gender_list = ["F", "M"]
    pat = pat[pat["GENDER"].isin(gender_list)]

    # save
    pat.to_csv(cleaning_set + "/patients.csv", index=False)

#cleaning_patients()

In [None]:
# cleaning - medications

def cleaning_medications():
    """Clean the raw medications table of the current set and save the result.

    Keeps only the rows of ``<cleaning_set>_raw/medications.csv`` whose
    PATIENT appears in the cleaned patients table and whose REASONCODE is
    present, removes exact duplicate rows, and writes
    ``<cleaning_set>/medications.csv``.  Returns nothing.
    """
    known_patient_ids = load_data("patients")["ID"]

    raw = pd.read_csv(cleaning_set + "_raw/medications.csv")
    belongs_to_known_patient = raw["PATIENT"].isin(known_patient_ids)
    has_reason = raw["REASONCODE"].notna()
    kept = raw[belongs_to_known_patient & has_reason].drop_duplicates()

    kept.to_csv(cleaning_set + "/medications.csv", index=False)

#cleaning_medications()

In [None]:
# cleaning - encounters

def cleaning_encounters():
    """Clean the raw encounters table of the current set and save the result.

    Reads ``<cleaning_set>_raw/encounters.csv``, drops the two free-text
    description columns, keeps only encounters whose PATIENT appears in the
    cleaned patients table, and writes ``<cleaning_set>/encounters.csv``.

    Both paths are derived from the module-level ``cleaning_set``.
    Returns nothing; the result is the written CSV file.
    """
    pat = load_data("patients")

    enc = pd.read_csv(cleaning_set + "_raw/encounters.csv")
    # the description columns are mostly null, so they carry no information
    enc = enc.drop(columns=["DESCRIPTION", "REASONDESCRIPTION"])
    # remove all encounters without patient data
    enc = enc[enc["PATIENT"].isin(pat["ID"])]

    # save
    enc.to_csv(cleaning_set + "/encounters.csv", index=False)
    
#cleaning_encounters()

In [None]:
# cleaning - observations

def cleaning_observations():
    """Clean the raw observations table of the current set and save the result.

    Keeps only the rows of ``<cleaning_set>_raw/observations.csv`` that
    belong to a patient in the cleaned patients table, have a VALUE, and whose
    CODE is one of the seven codes listed below; removes exact duplicate rows
    and writes ``<cleaning_set>/observations.csv``.  Returns nothing.
    """
    known_patient_ids = load_data("patients")["ID"]
    raw = pd.read_csv(cleaning_set + "_raw/observations.csv")

    # remove all observations without patient data or without a value
    with_patient = raw[raw["PATIENT"].isin(known_patient_ids)]
    with_value = with_patient[with_patient["VALUE"].notna()]

    # keep only the observation kinds used by the analysis
    wanted_codes = [
        Observation.BH.value,
        Observation.BW.value,
        Observation.BMI.value,
        Observation.DIASTOLIC.value,
        Observation.GLUCOSE.value,
        Observation.HBA1C.value,
        Observation.SYSTOLIC.value,
    ]
    selected = with_value.loc[with_value["CODE"].isin(wanted_codes)]

    # save
    selected.drop_duplicates().to_csv(cleaning_set + "/observations.csv", index=False)

#cleaning_observations()

## CONCATENATE THE DATASETS FROM ALL SETS, PER TYPE

source : set1/, set2/, set3/, ...

target : set_full/

files :
- patients.csv : ID, BIRTHDATE, DEATHDATE, RACE, ETHNICITY, GENDER
- medications.csv  : START, STOP, PATIENT, ENCOUNTER, CODE, DESCRIPTION, REASONCODE, REASONDESCRIPTION
- encounters.csv   : ID, DATE, PATIENT, CODE, REASONCODE
- observations.csv : DATE, PATIENT, ENCOUNTER, CODE, DESCRIPTION, VALUE, UNITS

In [None]:
def concat_set(type, max_set):
    """Concatenate one table type across the cleaned sets and save the result.

    Reads ``set1/<type>.csv``, ``set2/<type>.csv``, ... up to
    ``set<max_set>/<type>.csv``, stacks them in that order with a fresh
    0..n-1 index, writes the result to ``set_full/<type>.csv`` and returns it.

    Parameters
    ----------
    type : str
        Base name of the CSV file (without extension), e.g. ``"observations"``.
    max_set : int
        Number of the last set to include.  ``set1`` is always read, so values
        below 1 behave like 1.

    Returns
    -------
    pandas.DataFrame
        The concatenated table.
    """
    # Every set number gets its own directory; build each path from the loop
    # variable so that set3, set4, ... are actually picked up.
    set_numbers = range(1, max(1, max_set) + 1)
    frames = [pd.read_csv("set" + str(i) + "/" + type + ".csv") for i in set_numbers]

    # A single concat replaces repeated DataFrame.append calls (append was
    # removed in pandas 2.0 and copied the growing frame on every iteration).
    result = pd.concat(frames, ignore_index=True)
    result.to_csv("set_full/" + type + ".csv", index=False)
    return result
        
#data = concat_set("observations", 2)
#data

## DIABETES


### 1. Medications
- file : "diabetes/m.csv"
- fields : PATIENT, ENCOUNTER, MEDSTART, INSULIN, METFORMIN, GLP1, SGLT2

In [41]:
# Load the medications prescribed for diabetes and turn medications given in
# parallel (same patient, encounter and start date) into one column per drug.

medications = pd.read_csv('set_full/medications.csv')
medications = medications[medications['REASONCODE'] == Disease.DIABETES].reset_index(drop=True)
medications = medications.drop(['DESCRIPTION', 'REASONCODE', 'REASONDESCRIPTION'], axis=1)
# STOP is not part of the output (PATIENT, ENCOUNTER, MEDSTART + drug columns).
# It must go before the group-wise sum below, otherwise its text values are
# summed too (which raises or concatenates strings, depending on the pandas
# version).
medications = medications.drop(['STOP'], axis=1, errors='ignore')

# One 0/1 indicator column per medication code, named CODE_<code>.
medications = medications.set_index(['PATIENT', 'ENCOUNTER', 'START'])
medications = pd.get_dummies(data=medications, columns=['CODE'], dtype=int)

# Collapse the rows of one (patient, encounter, start) group into a single row
# holding, per drug, how many times it was recorded in that group.
medications = medications.groupby(level=['PATIENT', 'ENCOUNTER', 'START']).sum()
medications = medications.reset_index()

# Give the known drug columns readable names; codes that are not listed in
# DrugDiabetes keep their CODE_<code> name.
medications = medications.rename(
    columns={'CODE_' + str(d.value): d.name for d in DrugDiabetes}
)
medications = medications.rename(columns={'START': 'MEDSTART'})

medications.to_csv('diabetes/m.csv', index=False)

In [82]:
medications.head(1)

Unnamed: 0,PATIENT,ENCOUNTER,MEDSTART,INSULIN,METFORMIN,GLP1,SGLT2
0,000177c6-f76b-432b-9493-5a88bc9fb6bd,351e39d2-52a7-4ce7-928b-5920d9761787,1957-02-27,0,1,1,0


### 2. Observations
- file : "diabetes/o.csv"
- fields : DATE, PATIENT, ENCOUNTER, HBA1C (%)

In [89]:
# Build the HbA1c observation table and save it to diabetes/o.csv.
observations = pd.read_csv('set_full/observations.csv')
observations = observations[observations['CODE'] == Observation.HBA1C.value]
# Re-assign instead of mutating in place: the frame above is a filtered slice,
# and in-place edits on a slice are not guaranteed to reach the data.
observations = observations.drop(['DESCRIPTION', 'UNITS'], axis=1, errors='ignore')

# The minimum valid observation value is 0: keep only non-negative values.
# Missing values fail the comparison and are dropped as well.
observations = observations[observations['VALUE'] >= 0].reset_index(drop=True)
observations.to_csv('diabetes/o.csv', index=False)

In [90]:
observations.head(1)

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,VALUE
0,2013-04-24,7fcadc56-5964-418c-9e15-889c62f5df0b,e445e0f4-4b57-48c3-8263-4923c3571a17,4548-4,6.2


### 3. Patient - Medication

- file : "diabetes/p_m.csv"
- fields :
    - [RACE, ETHNICITY, GENDER, PATIENT, ENCOUNTER] 
    - [MEDSTART, INSULIN, METFORMIN, GLP1, SGLT2, DEAD]
    - [AGEDEAD, AGEMEDICATION, AGEFIRSTMEDICATION, DAYSLIVEFIRSTMED]


In [66]:
# load the patient table from the concatenated set_full/ directory
patients = pd.read_csv('set_full/patients.csv')

In [67]:
# reload the diabetes medication table stored in diabetes/m.csv
medications = pd.read_csv('diabetes/m.csv')

In [68]:
# MERGE : PATIENT + MEDICATION

# A patient counts as diabetic when at least one row of the diabetes
# medication table refers to them.
diabetes = medications
diabetes_patient_ids = diabetes['PATIENT'].unique()

# demographic rows of those patients only
diabetes_patients = patients.loc[patients['ID'].isin(diabetes_patient_ids)]

# one row per patient and medication record (patient columns repeated)
merged = pd.merge(
    diabetes_patients,
    diabetes,
    how='left',
    left_on='ID',
    right_on='PATIENT',
)

#merged.head(1)

In [69]:
# Parse the date columns: patient dates are day/month/two-digit-year,
# medication start dates are year-month-day.
date_formats = {
    'BIRTHDATE': '%d/%m/%y',
    'DEATHDATE': '%d/%m/%y',
    'MEDSTART': '%Y-%m-%d',
}
for date_column, date_format in date_formats.items():
    merged[date_column] = pd.to_datetime(merged[date_column], format=date_format)

# Two-digit years can be parsed into the wrong century.  A birth date later
# than the medication start, or a death date after 2020-12-31, is treated as
# such a case and moved back by 100 years.
one_century = pd.offsets.DateOffset(years=100)

born_after_medication = merged['BIRTHDATE'] > merged['MEDSTART']
merged.loc[born_after_medication, 'BIRTHDATE'] = merged['BIRTHDATE'] - one_century

died_after_cutoff = merged['DEATHDATE'] > pd.to_datetime('2020-12-31')
merged.loc[died_after_cutoff, 'DEATHDATE'] = merged['DEATHDATE'] - one_century


In [70]:
# Derived columns: DEAD, AGEDEAD, AGEMEDICATION, AGEFIRSTMEDICATION, DAYSLIVEFIRSTMED

def _years_between(later, earlier):
    """Year difference of two datetime Series, minus one where the month of
    `later` is before the month of `earlier` (the day of month is ignored)."""
    return later.dt.year - earlier.dt.year - (later.dt.month < earlier.dt.month)

# dead status: 1 when a death date is recorded, 0 otherwise
merged['DEAD'] = np.where(merged['DEATHDATE'].notna(), 1, 0)

# age at death and age at each medication (accuracy up to month level)
merged['AGEDEAD'] = _years_between(merged['DEATHDATE'], merged['BIRTHDATE'])
merged['AGEMEDICATION'] = _years_between(merged['MEDSTART'], merged['BIRTHDATE'])

# age at the first medication: smallest medication age per patient, attached
# to every row of that patient (this also adds a PATIENTID key column)
first_medication_age = (
    merged.groupby(['PATIENT'])['AGEMEDICATION']
    .min()
    .rename('AGEFIRSTMEDICATION')
    .rename_axis('PATIENTID')
    .reset_index()
)
merged = merged.merge(first_medication_age, left_on='ID', right_on='PATIENTID', how='left')

# days between this row's medication start and death
# NOTE(review): uses each row's own MEDSTART, not the patient's first one --
# confirm that this is what DAYSLIVEFIRSTMED is meant to hold.
merged['DAYSLIVEFIRSTMED'] = (merged['DEATHDATE'] - merged['MEDSTART']).dt.days

In [71]:
# Remove the key and raw date columns that are no longer needed; columns that
# are already absent are skipped.
merged = merged.drop(columns=['ID', 'BIRTHDATE', 'DEATHDATE', 'PATIENTID'], errors='ignore')

In [72]:
# remove rows that are exact duplicates across all remaining columns
merged = merged.drop_duplicates()

In [73]:
# order the patient-medication table by patient, encounter and medication start
merged = merged.sort_values(by=['PATIENT', 'ENCOUNTER', 'MEDSTART'])

In [75]:
# save the patient-medication table (row index is not written)
merged.to_csv('diabetes/p_m.csv', index=False)

### 4. Medication - Observation

- file : "diabetes/m_o.csv"
- fields : PATIENT, ENCOUNTER, MEDSTART, INSULIN, METFORMIN, GLP1, SGLT2, OBSDATE, CODE, HBA1C

In [131]:
# load the diabetes medication table stored in diabetes/m.csv and preview it

medications = pd.read_csv('diabetes/m.csv')
medications.head(2)

Unnamed: 0,PATIENT,ENCOUNTER,MEDSTART,INSULIN,METFORMIN,GLP1,SGLT2
0,000177c6-f76b-432b-9493-5a88bc9fb6bd,351e39d2-52a7-4ce7-928b-5920d9761787,1957-02-27,0,1,1,0
1,000177c6-f76b-432b-9493-5a88bc9fb6bd,351e39d2-52a7-4ce7-928b-5920d9761787,1995-03-13,0,0,1,0


In [138]:
# load the HbA1c observations stored in diabetes/o.csv and rename
# DATE -> OBSDATE and VALUE -> HBA1C (the names listed for diabetes/m_o.csv)

observations = pd.read_csv('diabetes/o.csv')
observations = observations.rename(columns={'DATE':'OBSDATE', 'VALUE':'HBA1C'})
observations.head(2)

Unnamed: 0,OBSDATE,PATIENT,ENCOUNTER,CODE,HBA1C
142506,2011-05-20,00005a31-23bc-46dc-9dcd-13337680b90e,b539805a-20a7-4975-b16b-19629b23ba78,4548-4,5.8
142507,2013-09-27,00005a31-23bc-46dc-9dcd-13337680b90e,00dc3ed1-9382-43df-9c77-c6f3a8376f18,4548-4,6.3


In [160]:
# MERGE : MEDICATION + OBSERVATIONS (HBA1C)

# Attach to each medication row the HbA1c observations of the same patient and
# encounter taken on the medication start date ...
merged = pd.merge(
    medications,
    observations,
    how='left',
    left_on=['PATIENT', 'ENCOUNTER', 'MEDSTART'],
    right_on=['PATIENT', 'ENCOUNTER', 'OBSDATE'],
)
# ... and keep only the rows that actually have an HbA1c value.
merged = merged.dropna(subset=['HBA1C']).reset_index(drop=True)

In [162]:
# save the medication-observation table (row index is not written)
merged.to_csv('diabetes/m_o.csv', index=False)

In [164]:
#merged

## Goal

1. Lifespan after the first medication with a specific drug
2. Under what condition is the medication given? (HbA1c level)
3. 

- How many months pass from the first diagnosis until death?
- How many years until another medication, or another encounter?
- How long from diagnosis to the first medication, and then to the second medication?
- Proportion of patients who get a particular medication (e.g. metformin)

- Likelihood that a new patient is given a particular drug
- Diabetes combined with another disease


In [None]:
# scratch example: shift(1, fill_value=0) moves column 'a' down by one row, so
# x1 holds the previous row's 'a' and the first row gets 0
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['a', 'b', 'c'])
df['x1'] = df['a'].shift(1, fill_value=0)
df