In [1]:
# PHUSE CSS Hackathon 2020
# Nicolas Dupuis - Geoffrey Low
# Purpose: create an ADSL dataset

In [2]:
import python.podr_connections as podr
import pandas as pd
import subprocess
import yaml, json

# first time
#subprocess.run(["pip3", "install", 'xlrd'])

## Connect to PODR

In [3]:
# Open the PODR connection for this account.
# NOTE(review): a plaintext password used to be recorded here in a comment; it has been
# removed and should be rotated. Credentials must never be stored in the notebook --
# presumably podr_connection asks for the password at run time (see the
# "PODR Password" prompt in the cell output); confirm against python/podr_connections.
connection = podr.podr_connection(username='phuse_3i3892wcjd')

PODR Password: ········


## Fetch SDTM datasets

In [4]:
# Registry of the SDTM datasets fetched from PODR, keyed by dataset name
sdtm = dict()

# Fetch the 'dm' dataset from the CDISC pilot SDTM library and preview it
sdtm.update(dm=connection.read(libname='cdisc_pilot_sdtm', dataset='dm'))
sdtm['dm'].head()

Unnamed: 0,STUDYID,DOMAIN,USUBJID,SUBJID,RFSTDTC,RFENDTC,RFXSTDTC,RFXENDTC,RFICDTC,RFPENDTC,...,SEX,RACE,ETHNIC,ARMCD,ARM,ACTARMCD,ACTARM,COUNTRY,DMDTC,DMDY
0,CDISCPILOT01,DM,01-701-1015,1015,2014-01-02,2014-07-02,2014-01-02,2014-07-02,,2014-07-02 11:45:00,...,F,WHITE,HISPANIC OR LATINO,Pbo,Placebo,Pbo,Placebo,USA,2013-12-26,-7.0
1,CDISCPILOT01,DM,01-701-1023,1023,2012-08-05,2012-09-02,2012-08-05,2012-09-01,,2013-02-18 00:00:00,...,M,WHITE,HISPANIC OR LATINO,Pbo,Placebo,Pbo,Placebo,USA,2012-07-22,-14.0
2,CDISCPILOT01,DM,01-701-1028,1028,2013-07-19,2014-01-14,2013-07-19,2014-01-14,,2014-01-14 11:10:00,...,M,WHITE,NOT HISPANIC OR LATINO,Xan_Hi,Xanomeline High Dose,Xan_Hi,Xanomeline High Dose,USA,2013-07-11,-8.0
3,CDISCPILOT01,DM,01-701-1033,1033,2014-03-18,2014-04-14,2014-03-18,2014-03-31,,2014-09-15 00:00:00,...,M,WHITE,NOT HISPANIC OR LATINO,Xan_Lo,Xanomeline Low Dose,Xan_Lo,Xanomeline Low Dose,USA,2014-03-10,-8.0
4,CDISCPILOT01,DM,01-701-1034,1034,2014-07-01,2014-12-30,2014-07-01,2014-12-30,,2014-12-30 09:50:00,...,F,WHITE,NOT HISPANIC OR LATINO,Xan_Hi,Xanomeline High Dose,Xan_Hi,Xanomeline High Dose,USA,2014-06-24,-7.0


In [5]:
# Fetch the 'ds' dataset from the same library and preview it
sdtm.update(ds=connection.read(libname='cdisc_pilot_sdtm', dataset='ds'))
sdtm['ds'].head()

Unnamed: 0,STUDYID,DOMAIN,USUBJID,DSSEQ,DSSPID,DSTERM,DSDECOD,DSCAT,VISITNUM,VISIT,EPOCH,DSDTC,DSSTDTC,DSDY,DSSTDY
0,CDISCPILOT01,DS,01-701-1015,1.0,,PROTOCOL COMPLETED,COMPLETED,DISPOSITION EVENT,13.0,WEEK 26,FOLLOW-UP,2014-07-02 00:00:00,2014-07-02,182.0,182.0
1,CDISCPILOT01,DS,01-701-1015,2.0,,FINAL LAB VISIT,FINAL LAB VISIT,OTHER EVENT,13.0,WEEK 26,FOLLOW-UP,2014-07-02 11:45:00,2014-07-02,182.0,182.0
2,CDISCPILOT01,DS,01-701-1023,1.0,24.0,ADVERSE EVENT,ADVERSE EVENT,DISPOSITION EVENT,5.0,WEEK 4,TREATMENT,2012-09-02 00:00:00,2012-09-02,29.0,29.0
3,CDISCPILOT01,DS,01-701-1023,2.0,,FINAL LAB VISIT,FINAL LAB VISIT,OTHER EVENT,5.0,WEEK 4,TREATMENT,2012-09-02 10:15:00,2012-09-02,29.0,29.0
4,CDISCPILOT01,DS,01-701-1023,3.0,,FINAL RETRIEVAL VISIT,FINAL RETRIEVAL VISIT,OTHER EVENT,201.0,RETRIEVAL,FOLLOW-UP,2013-02-18 00:00:00,2013-02-18,198.0,198.0


## Fetch ADaM Metadata

In [6]:
# Read the 'ADSL' sheet of the hackathon specification workbook
adam_metadata = pd.read_excel(
    'PHUSE CSS_2020_hackathon_AD usecase.xlsx',
    sheet_name='ADSL',
)

# Keep only the rows whose 'Dataset Name' is ADSL
adsl_metadata = adam_metadata.loc[adam_metadata['Dataset Name'].eq('ADSL')]
adsl_metadata.head()

Unnamed: 0,Dataset Name,Variable Name,Variable Label,Variable Role,Variable Type,Parameter Identifier,Variable Order,Length,Dec Digits,Display Format,Codelist/Controlled Terms,Core,Source/Derivation,Derived or Copied from Source Variable
0,ADSL,STUDYID,,,,,,,,,,Required,DM.STUDYID,Copied from Source
1,ADSL,USUBJID,,,,,,,,,,Required,DM.USUBJID,Copied from Source
2,ADSL,ARM,,,,,,,,,,Required,DM.ARM,Copied from Source
3,ADSL,ARMCD,,,,,,,,,,Permissible,DM.ARMCD,Copied from Source
4,ADSL,ACTARM,,,,,,,,,,Permissible,DM.ACTARM,Copied from Source


## Fetch ADSL derivations

In [15]:
# Load the derivation rules (one entry per derived variable) from the JSON file
with open('derivations.json') as f:
    derivations = json.loads(f.read())
derivations

{'AGE_RACE': {'broadcast': ['SEX', 'RACE']},
 'AGEGR1': {'category': {'source': 'AGE', 'bins': [18, 40, 60, 80]}},
 'RANDFL': {'code': "\nds = sdtm['ds']\nif len ( ds[ (ds['USUBJID'] == adsl['USUBJID']) & (ds['DSDECOD'] == 'RANDOMIZED') ] ) > 0: \n    value = 'Y'\nelse: \n    value = 'N'\n"},
 'SAFFL': {'code': "\nvalue='Not done yet'\n", 'dependency': ['RANDFL']}}

## Main Class

In [18]:
class create_adam():
    
    def __init__(self, metadata, source):
        
        self.source = source.copy()
        
        # variable that should be copied from source
        vars_to_copy = set(metadata [metadata ['Derived or Copied from Source Variable']=='Copied from Source']['Variable Name'])
        
        # variables from source
        source_variables = set(source.columns)
        
        # variables that should and can be copied from source
        self.copy_variables = vars_to_copy.intersection(source_variables)
        
        # variables that should be copied but cannot
        not_in_source = vars_to_copy.difference(source_variables)
        if len(not_in_source) >0: 
            print(f"The following variables are not available in source: {not_in_source}")           

    
    def source_variables(self):
        ''' Filter the SDTM source dataset to keep only the 'copied_from_source' variables '''
        self.adsl = self.source[self.copy_variables]
        self.adsl = self.adsl.copy() # avoid the ugly warning
        return self.adsl
    

    def apply_new_variable(self, __adsl):

        store = {'sdtm': sdtm, 'adsl': __adsl}
        
        try: 
            exec(self.code, store) # the executed code should create a 'value' variable, to load into the new variable

        except Exception as error: 
            print(repr(error))
            return None
        
        return store['value']

    
    def derived_variables(self, derivations):

        self.derivations = derivations.copy()
        
        # loop at long as we have variables to produce and making progress
        making_progress= True
        while len(self.derivations) > 0 and making_progress==True:
        
            making_progress==False
            derivations_done = []
        
            for variable in self.derivations:

                self.variable = variable            
                specs = self.derivations[variable]

                # assuming we can derive the variable, we'll check that
                dependency_fullfilled = True

                if 'dependency' in specs: 

                    for var_dependency in specs['dependency']:

                        if not var_dependency in self.adsl: 

                            dependency_fullfilled = False
                            break


                if dependency_fullfilled == True:
                    
                    if 'category' in specs:

                        source = specs['category']['source']
                        bins = specs['category']['bins']
                        self.adsl[variable] = pd.cut(self.adsl[source], bins)

                    elif 'broadcast' in specs:        

                        items = ["self.adsl['" + item +"']" for item in specs['broadcast']]
                        concat = ' + "-" + '.join(items)
                        self.adsl[variable] = eval(concat)

                    elif 'code' in self.derivations[variable]:

                        self.code = self.derivations[variable]['code']
                        self.adsl[variable] = self.adsl.apply(self.apply_new_variable, axis=1)               
                
                    # making progress here!
                    derivations_done.append(variable)
                    making_progress==True
            
            # clean our todo list
            for var in derivations_done:
                del self.derivations[var]
                    
        
        return self.adsl

# Instantiate class
create_adsl = create_adam(metadata=adsl_metadata, source=sdtm['dm'])

# Add variables copied from source
adsl = create_adsl.source_variables() 

# Add derived variables
adsl = create_adsl.derived_variables(derivations)

SyntaxError: invalid syntax (<ipython-input-18-161d0b4aedd3>, line 97)

In [14]:
# Preview the first five rows of the resulting dataset
adsl.head(n=5)

Unnamed: 0,USUBJID,AGE,ACTARMCD,RACE,DTHFL,DTHDTC,STUDYID,COUNTRY,SEX,SITEID,ARM,ETHNIC,ACTARM,SUBJID,AGEU,ARMCD,AGE_RACE,AGEGR1,RANDFL,SAFFL
0,01-701-1015,63.0,Pbo,WHITE,,NaT,CDISCPILOT01,USA,F,701,Placebo,HISPANIC OR LATINO,Placebo,1015,YEARS,Pbo,F-WHITE,"(60, 80]",N,Not done yet
1,01-701-1023,64.0,Pbo,WHITE,,NaT,CDISCPILOT01,USA,M,701,Placebo,HISPANIC OR LATINO,Placebo,1023,YEARS,Pbo,M-WHITE,"(60, 80]",N,Not done yet
2,01-701-1028,71.0,Xan_Hi,WHITE,,NaT,CDISCPILOT01,USA,M,701,Xanomeline High Dose,NOT HISPANIC OR LATINO,Xanomeline High Dose,1028,YEARS,Xan_Hi,M-WHITE,"(60, 80]",N,Not done yet
3,01-701-1033,74.0,Xan_Lo,WHITE,,NaT,CDISCPILOT01,USA,M,701,Xanomeline Low Dose,NOT HISPANIC OR LATINO,Xanomeline Low Dose,1033,YEARS,Xan_Lo,M-WHITE,"(60, 80]",N,Not done yet
4,01-701-1034,77.0,Xan_Hi,WHITE,,NaT,CDISCPILOT01,USA,F,701,Xanomeline High Dose,NOT HISPANIC OR LATINO,Xanomeline High Dose,1034,YEARS,Xan_Hi,F-WHITE,"(60, 80]",N,Not done yet
