In [17]:
import os
import pandas as pd
import numpy as np
from datetime import date
from dateutil.relativedelta import relativedelta
import pytz


In [18]:
# Locate and confirm the folder where the downloaded CSV files reside.

# "date" suffix shared by the report folder name and the CSV filenames
suffix = "20200603"
# report folder, written relative to the current user's home directory
path = f"~/Documents/lstm/Protocols/META/reports/{suffix}/"
# expand "~" so that `folder` is an absolute name
folder = os.path.expanduser(path)
print(folder)
# Fail early, with a readable message, unless the path is an existing
# *directory*. os.path.exists() would also accept a regular file, in which
# case os.listdir() below would fail with a less obvious error.
assert os.path.isdir(folder), f"report folder not found: {folder}"
# list folder contents
os.listdir(folder)

/Users/erikvw/Documents/lstm/Protocols/META/reports/20200603/


['meta_subject_followupvitals_20200603.csv',
 'meta_subject_subjectvisit_20200603.csv',
 'meta_subject_bloodresultsglu_20200603.csv',
 'meta_subject_followup_20200603.csv',
 'edc_registration_registeredsubject_20200603.csv',
 'glucose.xlsx',
 'edc_appointment_appointment_20200603.csv',
 'meta_subject_glucose_20200603.csv',
 'glucose.csv',
 'meta_screening_subjectscreening_20200603.csv']

In [19]:
# import each CSV file into a pandas dataframe

# csv import options
# NOTE: with a boolean, parse_dates only asks pandas to try parsing the
# *index*; the date columns themselves are converted explicitly in the
# date-conversion cell further down.
opts = dict(delimiter="|", parse_dates=True)


def read_export(model_name, required=True, columns=None):
    """Read one exported CSV from `folder` into a dataframe.

    The filename is built as `{model_name}_{suffix}.csv`; the full path is
    printed before reading.

    Parameters:
        model_name: filename prefix of the export, e.g.
            "edc_appointment_appointment".
        required: when True the file is read unconditionally, so a missing
            file raises the error from `pd.read_csv`. When False a missing
            file is reported with "DOES NOT EXIST!" and an empty dataframe
            is returned instead.
        columns: column names of the empty dataframe returned for a missing
            optional file.

    Returns:
        pandas.DataFrame with the file contents, or an empty frame.
    """
    path = os.path.join(folder, f"{model_name}_{suffix}.csv")
    if required or os.path.exists(path):
        print(path)
        return pd.read_csv(path, **opts)
    print(path, "DOES NOT EXIST!")
    return pd.DataFrame(columns=columns)


# import RegisteredSubject
df_registered_subject = read_export("edc_registration_registeredsubject")

# import SubjectScreening
df_subject_screening = read_export("meta_screening_subjectscreening")

# import Appointment
df_appointment = read_export("edc_appointment_appointment")

# import SubjectVisit
df_subject_visit = read_export("meta_subject_subjectvisit")

# import Glucose CRF
df_crf_glucose = read_export("meta_subject_glucose")

# import HBa1c CRF (optional: an empty frame with the columns used later
# in the notebook stands in for a missing file)
# NOTE(review): the other exports in the folder have no underscores inside
# the model part of the name (e.g. "meta_subject_bloodresultsglu"); confirm
# that "meta_subject_blood_results_hba1c" is the real export name.
df_crf_hba1c = read_export(
    "meta_subject_blood_results_hba1c",
    required=False,
    columns=["subject_visit_id", "report_datetime", "hba1c", "hba1c_units", "hba1c_assay_datetime", "site_id"],
)

/Users/erikvw/Documents/lstm/Protocols/META/reports/20200603/edc_registration_registeredsubject_20200603.csv
/Users/erikvw/Documents/lstm/Protocols/META/reports/20200603/meta_screening_subjectscreening_20200603.csv
/Users/erikvw/Documents/lstm/Protocols/META/reports/20200603/edc_appointment_appointment_20200603.csv
/Users/erikvw/Documents/lstm/Protocols/META/reports/20200603/meta_subject_subjectvisit_20200603.csv
/Users/erikvw/Documents/lstm/Protocols/META/reports/20200603/meta_subject_glucose_20200603.csv
/Users/erikvw/Documents/lstm/Protocols/META/reports/20200603/meta_subject_blood_results_hba1c_20200603.csv DOES NOT EXIST!


In [20]:
# Convert imported dates and datetimes to proper dataframe datetime objects.

# needed for filtering and duration calculations
# notice also that the datetimes are timezone aware, UTC.
# all dates in the EDC's database are converted to UTC as they are committed to the DB.

dataframes = [
    df_registered_subject,
    df_subject_screening,
    df_subject_visit,
    df_appointment,
    df_crf_glucose,
    df_crf_hba1c,
]

# The EDC names date and datetime fields with a _date or _datetime suffix.
# Match on that suffix rather than on the substring "date", which would also
# pick up unrelated column names such as "validated" or "updated".
date_suffixes = ("_date", "_datetime")
for df in dataframes:
    date_cols = [col for col in df.columns if str(col).endswith(date_suffixes)]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], utc=True)

# this date does not follow the EDC's field naming convention of _date or _datetime suffixes for dates and datetimes
df_registered_subject["dob"] = pd.to_datetime(df_registered_subject["dob"], utc=True)

In [21]:
# Cast site_id to a 64-bit integer in every imported dataframe
# (the column is converted "from float", i.e. it can be imported as e.g. 20.0).
# NOTE: astype("int64") raises if a site_id value is missing.

dataframes = dict(
    df_registered_subject=df_registered_subject,
    df_subject_screening=df_subject_screening,
    df_subject_visit=df_subject_visit,
    df_appointment=df_appointment,
    df_crf_glucose=df_crf_glucose,
    df_crf_hba1c=df_crf_hba1c,
)
for df in dataframes.values():
    # one cast per dataframe is enough; there is no need to repeat it for
    # every column of the frame
    df["site_id"] = df["site_id"].astype("int64")

In [22]:
# Build the working dataframes used by the rest of the notebook.

# --- working "screening" DF: consented subjects only, analysis column names
screening_src_cols = [
    "subject_identifier",
    "gender",
    "age_in_years",
    "calculated_bmi",
    "converted_fasting_glucose",
    "converted_ogtt_two_hr",
    "hba1c",
]
screening_renames = {
    "age_in_years": "age",
    "calculated_bmi": "baseline_bmi",
    "converted_fasting_glucose": "baseline_fg",
    "converted_ogtt_two_hr": "baseline_ogtt",
    "hba1c": "baseline_hba1c",
}
consented = df_subject_screening[df_subject_screening["consented"]]
df_screening = consented[screening_src_cols].rename(columns=screening_renames)

# --- add the consent datetime from RegisteredSubject (matched on
# subject_identifier) and expose it as "enrolled"
df_screening = df_screening.merge(
    df_registered_subject[["subject_identifier", "consent_datetime"]],
    on="subject_identifier",
    how="left",
).rename(columns={"consent_datetime": "enrolled"})
screening_out_cols = [
    "subject_identifier",
    "gender",
    "age",
    "enrolled",
    "baseline_bmi",
    "baseline_fg",
    "baseline_ogtt",
    "baseline_hba1c",
]
df_screening = df_screening[screening_out_cols].sort_values(by="enrolled")

# --- working Appointment DF: Appointment plus some Registered Subject columns
rs_cols = ["subject_identifier", "dob", "gender", "screening_datetime", "consent_datetime"]
df_appt = df_appointment.merge(
    df_registered_subject[rs_cols],
    on="subject_identifier",
    how="left",
    suffixes=["", "_rs"],
)

# --- working Subject Visit DF: one row per appointment, with the matching
# subject visit (if any) joined on appointment id
appt_cols = [
    "id",
    "appt_datetime",
    "timepoint",
    "visit_code",
    "visit_code_sequence",
    "appt_status",
    "subject_identifier",
    "gender",
    "dob",
    "screening_datetime",
    "consent_datetime",
]
visit_cols = ["id", "appointment_id", "report_datetime", "reason", "site_id"]
df_visits = df_appt[appt_cols].merge(
    df_subject_visit[visit_cols],
    left_on="id",
    right_on="appointment_id",
    how="left",
    suffixes=["_appt", ""],
)

# show df_visits
df_visits[
    [
        "subject_identifier",
        "gender",
        "dob",
        "screening_datetime",
        "consent_datetime",
        "report_datetime",
        "visit_code",
        "timepoint",
    ]
]

Unnamed: 0,subject_identifier,gender,dob,screening_datetime,consent_datetime,report_datetime,visit_code,timepoint
0,101-20-0158-0,F,1973-07-30 00:00:00+00:00,2020-04-08 08:19:22+00:00,2020-04-15 08:38:11+00:00,2020-04-15 08:39:53+00:00,1000,0.0
1,101-20-0130-9,F,1974-06-16 00:00:00+00:00,2020-03-31 06:26:08+00:00,2020-04-01 07:59:19+00:00,2020-04-01 08:03:51+00:00,1000,0.0
2,101-10-0010-4,M,1967-10-13 00:00:00+00:00,2020-01-31 08:10:30+00:00,2020-02-10 07:35:12+00:00,2020-02-10 07:39:18+00:00,1000,0.0
3,101-30-0076-3,F,1975-07-15 00:00:00+00:00,2020-05-28 05:36:52+00:00,2020-05-28 09:53:55+00:00,2020-05-28 10:00:26+00:00,1000,0.0
4,101-30-0018-5,F,1970-07-15 00:00:00+00:00,2020-03-26 08:34:00+00:00,2020-04-02 09:26:54+00:00,2020-04-02 09:44:16+00:00,1000,0.0
...,...,...,...,...,...,...,...,...
2100,101-40-0014-3,F,1987-07-15 00:00:00+00:00,2020-05-28 07:14:16+00:00,2020-06-01 09:16:38+00:00,NaT,1120,6.0
2101,101-20-0102-8,M,1949-02-15 00:00:00+00:00,2020-03-18 07:31:48+00:00,2020-03-20 08:53:20+00:00,NaT,1120,6.0
2102,101-20-0100-2,F,1972-07-15 00:00:00+00:00,2020-03-19 06:37:39+00:00,2020-03-19 11:05:08+00:00,NaT,1120,6.0
2103,101-10-0031-0,F,1972-07-12 00:00:00+00:00,2020-03-10 06:03:50+00:00,2020-03-10 08:56:25+00:00,NaT,1120,6.0


In [23]:
# Drop appointments / visits whose appointment datetime is still in the future.

# columns carried forward into the filtered frame
visit_cols = [
    "id",
    "subject_identifier",
    "site_id",
    "gender",
    "dob",
    "screening_datetime",
    "consent_datetime",
    "timepoint",
    "visit_code",
    "appt_datetime",
    "appt_status",
    "report_datetime",
    "reason",
]

# boolean mask: True where the appointment lies before "now" (UTC)
value_to_check = pd.Timestamp.now(tz=pytz.utc)
filter_mask = df_visits["appt_datetime"] < value_to_check
filtered_df = df_visits[filter_mask]

# keep the masked rows, restricted to the columns listed above
df_visits = filtered_df[visit_cols]

# show the earliest and latest appointment datetimes left after filtering
appt_datetimes = df_visits["appt_datetime"]
print(min(appt_datetimes))
print(max(appt_datetimes))

2019-11-12 12:50:10+00:00
2020-06-13 10:39:14+00:00


In [24]:
# merge to CRF

# merge glucose CRF (6m) with demographics and timepoint values (appt/visit/registered subject)
# columns present in both frames keep their df_visits name; the CRF copy gets "_crf"
df_glucose_6m = pd.merge(
    df_visits,
    df_crf_glucose,
    left_on="id",
    right_on="subject_visit_id",
    how="left",
    suffixes=["", "_crf"],
)
cols = ["id", "subject_identifier", "gender", "fasting_glucose", "ogtt_two_hr", "site_id", "visit_code", "report_datetime"]
# keep only visits for which a fasting glucose value was recorded
df_glucose_6m = df_glucose_6m[df_glucose_6m["fasting_glucose"].notnull()][cols]

# merge in baseline values from screening; the left join keeps every row of
# df_screening, with empty 6m values where no glucose CRF row matched
df = pd.merge(df_screening, df_glucose_6m, on="subject_identifier", how="left", suffixes=["", "_crf"])
df = df.rename(columns={"fasting_glucose": "6m_fg", "ogtt_two_hr": "6m_ogtt"})

# merge 6m hba1c, matched on the subject visit id carried over from the glucose merge
df = pd.merge(df, df_crf_hba1c, left_on="id", right_on="subject_visit_id", how="left", suffixes=["", "_crf"])
df = df.rename(columns={"hba1c": "6m_hba1c"})

In [25]:
# Select and order the columns of the final dataframe for export.

export_cols = [
    "subject_identifier",
    "gender",
    "age",
    "enrolled",
    "baseline_bmi",
    "baseline_fg",
    "baseline_ogtt",
    "baseline_hba1c",
    "6m_fg",
    "6m_ogtt",
    "6m_hba1c",
    "report_datetime",
    "site_id",
]
df = df[export_cols]

# display floats with one decimal place and a thousands separator
# (affects how frames are shown, not the stored values)
pd.set_option("display.float_format", "{:,.1f}".format)
df

Unnamed: 0,subject_identifier,gender,age,enrolled,baseline_bmi,baseline_fg,baseline_ogtt,baseline_hba1c,6m_fg,6m_ogtt,6m_hba1c,report_datetime,site_id
0,101-20-0001-2,F,32,2019-11-12 14:39:05+00:00,34.1,5.2,7.7,6.0,,,,NaT,
1,101-20-0002-0,F,50,2019-11-12 15:14:10+00:00,36.4,4.6,10.0,5.6,5.4,9.0,,2020-05-12 05:20:24+00:00,20.0
2,101-20-0003-8,F,61,2019-11-18 08:59:14+00:00,46.3,4.9,7.3,5.2,,,,NaT,
3,101-20-0004-6,M,60,2019-11-20 07:55:09+00:00,21.7,4.5,9.7,5.2,5.5,12.3,,2020-05-20 05:15:31+00:00,20.0
4,101-20-0005-3,F,55,2019-11-20 09:29:15+00:00,33.4,4.6,9.3,6.1,,,,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,101-30-0087-0,F,50,2020-06-03 08:44:28+00:00,38.7,7.4,10.6,6.1,,,,NaT,
307,101-40-0023-4,F,39,2020-06-03 08:56:40+00:00,36.6,6.7,6.0,5.7,,,,NaT,
308,101-30-0088-8,M,58,2020-06-03 09:09:37+00:00,34.1,7.4,8.0,6.3,,,,NaT,
309,101-30-0089-6,F,42,2020-06-03 09:19:03+00:00,19.1,6.5,7.7,5.8,,,,NaT,


In [None]:
# Export the final dataframe as CSV, floats written with one decimal place.

# Derive the destination from `folder` (set in the folder-setup cell) rather
# than from a hard-coded absolute path, so the export always lands in the
# report folder of the current `suffix`.
export_path = os.path.join(folder, "glucose.csv")
df.to_csv(export_path, float_format="%.1f")

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the final dataframe.
# NOTE(review): re-run this cell after the cells above so the statistics
# reflect the current `df` — the saved output looks older than the rest.
df.describe()

Unnamed: 0,age,baseline_bmi,baseline_fg,baseline_ogtt,baseline_hba1c,6m_fg,6m_ogtt,site_id
count,311.0,311.0,311.0,311.0,310.0,9.0,9.0,9.0
mean,47.3,31.8,6.3,8.4,5.7,6.8,8.7,20.0
std,8.4,6.6,0.8,1.2,0.8,1.3,1.7,0.0
min,25.0,18.8,4.2,6.0,4.0,5.4,6.4,20.0
25%,42.5,27.7,5.8,7.4,5.2,5.9,8.2,20.0
50%,47.0,31.4,6.3,8.1,5.6,6.6,8.6,20.0
75%,52.0,35.3,6.7,9.3,6.0,7.1,9.0,20.0
max,74.0,54.8,13.1,13.0,10.4,9.1,12.3,20.0
