In [1]:
# Declaration

import pandas as pd
import numpy as np
from enum import IntEnum

class Disease(IntEnum):
    """Numeric condition codes used to select records by disease.

    The values are compared against the REASONCODE column of the medication
    records; only DIABETES is used by the cells visible in this notebook.
    NOTE(review): the numbers look like SNOMED CT concept ids - confirm
    against the data dictionary of the source export.
    """

    CHD = 53741008
    # Appears with REASONDESCRIPTION "Diabetes" in the rendered output below.
    DIABETES = 44054006
    HYPERTENSION = 38341003
    MI = 22298006
    STROKE = 230690007

class DrugDiabetes(IntEnum):
    """Medication codes that are later re-encoded to small integers.

    Each member is a value of the medication CODE column; a later cell maps
    METFORMIN, INSULIN, GLP1 and SGLT2 to 0, 1, 2 and 3 respectively.
    NOTE(review): the numbers look like RxNorm codes - confirm against the
    data dictionary of the source export.
    """

    METFORMIN = 860975
    INSULIN = 106892
    GLP1 = 897122
    SGLT2 = 1373463
    
# Folder the cleaning cells read the raw CSV files from.
src_folder = "set1_raw"
# Folder the cleaning cells write the cleaned CSV files to.
dest_folder = "set1"



    --- CLEANING ---
    
    

In [None]:
# cleaning - patients

# Read the first 17 columns of the raw export, then discard the identifier and
# free-text columns that the later cells never use.
pat = pd.read_csv(src_folder + "/patients.csv", sep=";", usecols=range(17))
pat = pat.drop(
    columns=["SSN", "DRIVERS", "PASSPORT", "PREFIX", "FIRST", "LAST", "SUFFIX", "MARITAL", "MAIDEN", "BIRTHPLACE", "ADDRESS"]
)

# clean data due to malformed csv
# Values outside the allowed lists are treated as invalid. The checks use
# boolean masks and plain reassignment instead of `pat[col].replace(...,
# inplace=True)`: that form updates a temporary column object, is deprecated,
# and leaves `pat` unchanged when pandas Copy-on-Write is enabled.

# rows with a missing or unrecognised race are removed
race_list = ["white", "black", "hispanic"]
pat = pat.loc[pat["RACE"].isin(race_list)]

# an unrecognised ethnicity is blanked to NaN, but the row is kept
ethnicity_list = ["african", "american", "american_indian", "asian", "asian_indian", "central_american", "chinese", "dominican", "english", "french", "french_canadian", "german", "irish", "italian", "mexican", "polish", "portuguese", "puerto_rican", "russian", "scottish", "swedish", "west_indian"]
pat = pat.assign(ETHNICITY=pat["ETHNICITY"].where(pat["ETHNICITY"].isin(ethnicity_list)))

# rows with a missing or unrecognised gender are removed
gender_list = ["F", "M"]
pat = pat.loc[pat["GENDER"].isin(gender_list)]

# save
pat.to_csv(dest_folder + "/patients.csv", index=False)

#pat

In [None]:
# cleaning - medications

# Load the raw medication records and keep only the rows that belong to a
# patient retained by the patient cleaning step and that carry a reason code.
med = pd.read_csv(src_folder + "/medications.csv")
belongs_to_kept_patient = med["PATIENT"].isin(pat["ID"])
has_reason_code = med["REASONCODE"].notna()
med = med.loc[belongs_to_kept_patient & has_reason_code]

# save
med.to_csv(dest_folder + "/medications.csv", index=False)
#med

In [None]:
# cleaning - encounters

# Load the raw encounter records.
enc = pd.read_csv(src_folder + "/encounters.csv")

# Discard the two description columns (noted by the author as mostly null).
enc = enc.drop(columns=["DESCRIPTION", "REASONDESCRIPTION"])

# Keep only the encounters of patients retained by the patient cleaning step.
patient_was_kept = enc["PATIENT"].isin(pat["ID"])
enc = enc.loc[patient_was_kept]

# save
enc.to_csv(dest_folder + "/encounters.csv", index=False)

    
    
    --- DIABETES ---
    
    

In [12]:
# load data

# Read the cleaned files from the folder the cleaning cells wrote them to.
# Using dest_folder (instead of repeating the literal "set1") keeps this cell
# in step with the configuration in the declaration cell.
patients = pd.read_csv(dest_folder + "/patients.csv")

medications = pd.read_csv(dest_folder + "/medications.csv")
# store the reason codes with pandas' nullable integer dtype
medications["REASONCODE"] = medications["REASONCODE"].astype("Int64")

#encounters = pd.read_csv(dest_folder + "/encounters.csv")

In [13]:
# DIABETES

# Builds `merged`: the diabetes medication records with the columns of the
# corresponding patient attached to every row.

# medication rows whose reason code is the diabetes code
is_diabetes_reason = medications["REASONCODE"] == Disease.DIABETES
diabetes = medications[is_diabetes_reason]
diabetes_patient_ids = diabetes["PATIENT"].unique()

# patients that occur in at least one of those medication rows
has_diabetes_record = patients["ID"].isin(diabetes_patient_ids)
diabetes_patients = patients[has_diabetes_record]

# patient columns on the left, medication columns on the right
merged = pd.merge(
    diabetes_patients,
    diabetes,
    how="left",
    left_on="ID",
    right_on="PATIENT",
)
#merged.head(10)

In [14]:
# CONVERT DATA TYPES

# parse every date column into datetime64
for date_column in ("BIRTHDATE", "DEATHDATE", "START", "STOP"):
    merged[date_column] = pd.to_datetime(merged[date_column])

# Dates that cannot be right are moved back by 100 years: a birth date later
# than the medication start, and a death date later than 2020-12-31.
# NOTE(review): presumably the raw files carry two-digit years that were
# parsed into the wrong century - confirm against the raw export.
one_century = pd.offsets.DateOffset(years=100)

born_after_start = merged["BIRTHDATE"] > merged["START"]
merged.loc[born_after_start, "BIRTHDATE"] = merged["BIRTHDATE"] - one_century

died_after_cutoff = merged["DEATHDATE"] > pd.to_datetime("2020-12-31")
merged.loc[died_after_cutoff, "DEATHDATE"] = merged["DEATHDATE"] - one_century

# CONVERT VALUES

# drug codes -> small integers (codes that are not listed stay unchanged)
drug_code_to_index = {
    DrugDiabetes.METFORMIN: 0,
    DrugDiabetes.INSULIN: 1,
    DrugDiabetes.GLP1: 2,
    DrugDiabetes.SGLT2: 3,
}
merged["CODE"] = merged["CODE"].replace(drug_code_to_index)

# gender -> 0 / 1
gender_to_index = {"M": 0, "F": 1}
merged["GENDER"] = merged["GENDER"].replace(gender_to_index)

# race -> 0 / 1 / 2
race_to_index = {"white": 0, "black": 1, "hispanic": 2}
merged["RACE"] = merged["RACE"].replace(race_to_index)

merged.head(2)

Unnamed: 0,ID,BIRTHDATE,DEATHDATE,RACE,ETHNICITY,GENDER,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,000177c6-f76b-432b-9493-5a88bc9fb6bd,1920-09-01,2001-01-06,0,irish,0,1957-02-27,NaT,000177c6-f76b-432b-9493-5a88bc9fb6bd,351e39d2-52a7-4ce7-928b-5920d9761787,0,24 HR Metformin hydrochloride 500 MG Extended ...,44054006,Diabetes
1,000177c6-f76b-432b-9493-5a88bc9fb6bd,1920-09-01,2001-01-06,0,irish,0,1957-02-27,1994-02-21,000177c6-f76b-432b-9493-5a88bc9fb6bd,351e39d2-52a7-4ce7-928b-5920d9761787,2,3 ML liraglutide 6 MG/ML Pen Injector,44054006,Diabetes


In [15]:
# ADDITIONAL COLUMNS

def _age_in_years(later, earlier):
    """Return the number of completed years from `earlier` to `later`.

    Both arguments are datetime Series aligned on the same index. One year is
    subtracted when `later` lies before the anniversary of `earlier` within
    its calendar year; the comparison uses the month first and then the day,
    so a date inside the birth month but before the birthday is not counted
    as a full year. Rows where the year difference cannot be computed because
    a date is missing yield NaN.
    """
    before_anniversary = (later.dt.month < earlier.dt.month) | (
        (later.dt.month == earlier.dt.month) & (later.dt.day < earlier.dt.day)
    )
    return later.dt.year - earlier.dt.year - before_anniversary


# add dead status (1 when a death date is present, otherwise 0)
merged["DEAD"] = np.where(merged["DEATHDATE"].isnull(), 0, 1)

# add age when dead (NaN for patients without a death date)
merged["AGEDEAD"] = _age_in_years(merged["DEATHDATE"], merged["BIRTHDATE"])

# age when the medication happened
merged["AGEMEDICATION"] = _age_in_years(merged["START"], merged["BIRTHDATE"])

# add age first medication: the smallest AGEMEDICATION of each patient
group = merged.groupby(["PATIENT"])
group_age_medication = group["AGEMEDICATION"].min()

# The per-patient minimum is looked up by ID and assigned as a column. Unlike
# merging a temporary frame back into `merged`, plain column assignment gives
# the same result when the cell is run again (a second merge would produce
# suffixed duplicate columns). PATIENTID is still created because the DROP
# cell removes it by name.
merged["PATIENTID"] = merged["ID"]
merged["AGEFIRSTMEDICATION"] = merged["ID"].map(group_age_medication)

In [9]:
# DROP
# The reduced table is stored under its own name instead of dropping the
# columns from `merged` in place: the preview cell that follows still reads
# BIRTHDATE and DEATHDATE from `merged`, and an in-place drop also fails with
# a KeyError when this cell is run a second time.
merged_reduced = merged.drop(
    columns=["ID", "BIRTHDATE", "DEATHDATE", "REASONCODE", "REASONDESCRIPTION", "PATIENTID"]
)

In [17]:
#merged.sort_values(by=["ENCOUNTER", "START"]).head(20)
# Show the date columns side by side with the ages derived from them.
age_check_columns = [
    "BIRTHDATE",
    "DEATHDATE",
    "START",
    "STOP",
    "AGEDEAD",
    "AGEMEDICATION",
    "AGEFIRSTMEDICATION",
]
merged[age_check_columns].head(20)

Unnamed: 0,BIRTHDATE,DEATHDATE,START,STOP,AGEDEAD,AGEMEDICATION,AGEFIRSTMEDICATION
0,1920-09-01,2001-01-06,1957-02-27,NaT,80.0,36,36
1,1920-09-01,2001-01-06,1957-02-27,1994-02-21,80.0,36,36
2,1920-09-01,2001-01-06,1995-03-13,1996-05-07,80.0,74,36
3,1920-09-01,2001-01-06,1997-03-28,NaT,80.0,76,36
4,1920-09-01,2001-01-06,1997-03-28,NaT,80.0,76,36
5,1945-08-27,2006-09-01,2000-01-09,NaT,61.0,54,54
6,1972-02-20,NaT,2007-12-19,NaT,,35,35
7,1954-03-13,2012-01-26,2000-07-13,NaT,57.0,46,46
8,1942-03-02,NaT,1984-03-31,NaT,,42,42
9,1942-03-02,NaT,2016-04-29,NaT,,74,42


## Information to provide in the data

* Dead status
* Dead age
* First treatment age
* Current treatment age
* Medication used

## To look for
* HbA1c observation result?
* 

## Goal

1. Years lived after the first medication with a specific drug? (What about changes of drug?)
2. Years until the next medication, or until the next encounter?
3. 

- Months elapsed from the first diagnosis until death
- Time from diagnosis to the first medication, and then to the second medication
- Proportion of patients who receive a particular medication (e.g. metformin)
- Likelihood that a new patient is given a particular drug

- looking at diabetes + another disease


Under what conditions is each medication given?

In [None]:
# dead carrying diabetes
# The filter on dead patients below is disabled, so the result covers every
# patient in `merged`: the smallest AGEMEDICATION of each patient.

#dead = merged[merged.DEAD==1]
group = merged.groupby(by=["PATIENT"])
age = group["AGEMEDICATION"].agg("min")
age

In [None]:
#age[age["PATIENT"]=="000177c6-f76b-432b-9493-5a88bc9fb6bd"]
# `age` is a Series indexed by patient id, so one patient is looked up by
# label. Calling `age.min()` first would collapse the Series to a single
# number, and a number cannot be indexed by a patient id.
age.loc["00097ab2-2de3-44a2-926c-f92f4f39efa8"]

#merged["AGEFIRSTMEDICATION"] = group_age_medication[merged["PATIENT"]]
