In [2]:
import pandas as pd
import sqlalchemy as sa


In [16]:
def get_outcome_and_intervention_modules(studies):
    """Split study records into outcome-measure and arms/interventions modules.

    A study that carries ``ResultsSection -> OutcomeMeasuresModule`` contributes
    that module to the first list and its ``NCTId`` to the third list, so those
    two lists are index-aligned. Any other study contributes its
    ``ProtocolSection -> ArmsInterventionsModule`` to the second list when that
    key is present; no id is recorded for such studies.

    Args:
        studies: iterable of dicts, each with a top-level ``"Study"`` key.

    Returns:
        Tuple ``(outcome_modules, intervention_modules, study_ids)``.

    Raises:
        KeyError: if a record lacks ``"Study"``, or lacks a ``ProtocolSection``
            entry that has to be read for that record.
    """
    outcomes, interventions, nct_ids = [], [], []

    for entry in studies:
        record = entry["Study"]
        has_outcomes = (
            "ResultsSection" in record
            and "OutcomeMeasuresModule" in record["ResultsSection"]
        )

        if not has_outcomes:
            # Studies without posted results: keep their arms/interventions only.
            protocol = record["ProtocolSection"]
            if "ArmsInterventionsModule" in protocol:
                interventions.append(protocol["ArmsInterventionsModule"])
            continue

        outcomes.append(record["ResultsSection"]["OutcomeMeasuresModule"])
        nct_ids.append(record["ProtocolSection"]["IdentificationModule"]["NCTId"])

    return outcomes, interventions, nct_ids

In [17]:
def create_measurements_table_helper(studies):
    """Flatten the outcome measures of ``studies`` into one DataFrame.

    Emits one row per entry of ``OutcomeMeasureList -> OutcomeMeasure`` in every
    outcome module returned by ``get_outcome_and_intervention_modules``. Fields
    missing from a measure are filled with the string ``"NA"``.

    Args:
        studies: study records accepted by ``get_outcome_and_intervention_modules``.

    Returns:
        DataFrame with columns ``study_id``, ``measure``, ``type``,
        ``description``, ``dispersion_param``, ``measure_param`` and ``units``.
    """
    outcome_modules, _intervention_modules, study_ids = get_outcome_and_intervention_modules(studies)

    # Output column -> key read from each OutcomeMeasure dict (dict order is the
    # column order that follows "study_id").
    source_key_by_column = {
        "measure": "OutcomeMeasureTitle",
        "type": "OutcomeMeasureType",
        "description": "OutcomeMeasureDescription",
        "dispersion_param": "OutcomeMeasureDispersionType",
        "measure_param": "OutcomeMeasureParamType",
        "units": "OutcomeMeasureUnitOfMeasure",
    }
    columns = {"study_id": []}
    columns.update({column: [] for column in source_key_by_column})

    for position, module in enumerate(outcome_modules):
        for measure in module["OutcomeMeasureList"]["OutcomeMeasure"]:
            for column, source_key in source_key_by_column.items():
                columns[column].append(measure.get(source_key, "NA"))
            # study_ids is index-aligned with outcome_modules.
            columns["study_id"].append(study_ids[position])

    # Studies without results are intentionally not emitted. A disabled
    # experiment mapped each ArmGroup of the intervention modules to a row
    # (ArmGroupLabel -> measure, ArmGroupDescription -> description, "NA"
    # elsewhere); re-enabling it would need ids collected for those studies,
    # because study_ids only covers studies that have outcome modules.

    return pd.DataFrame.from_dict(columns).reset_index(drop=True)

In [18]:
from tqdm import tqdm
import os
import pickle

# Root directory that contains the `clinical_trials/` folder of pickled studies.
# Set the DATA_PATH environment variable to run on another machine; the
# fallback is the original author's local path.
DATA_PATH = os.environ.get('DATA_PATH', '/Users/porterhunley/datasets')

def create_measurements_table():
    """Build the measurements table from every pickled study batch on disk.

    Each regular, non-hidden file in ``DATA_PATH + "/clinical_trials/"`` is
    unpickled and passed to ``create_measurements_table_helper``; the per-file
    frames are concatenated.

    Returns:
        DataFrame with a fresh 0..n-1 index. When the directory holds no usable
        files, an empty frame with the helper's columns is returned.

    Raises:
        OSError: if the directory or one of its files cannot be read.
    """
    directory = DATA_PATH + "/clinical_trials/"
    print("Deserializing studies...")

    # os.listdir returns entries in arbitrary order and includes hidden files
    # (e.g. macOS ".DS_Store") and sub-directories, none of which can be
    # unpickled. Filter them out and sort so the row order is reproducible.
    pickle_file_names = sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith(".") and os.path.isfile(os.path.join(directory, name))
    )

    measurements_table_dfs = []
    for studies_data_pickle_file in tqdm(pickle_file_names):
        studies_file = os.path.join(directory, studies_data_pickle_file)
        # SECURITY: pickle.load can execute arbitrary code; only read trusted files.
        with open(studies_file, "rb") as f:
            studies_data = pickle.load(f)
        # The file is closed before the (slow) flattening step runs.
        measurements_table_dfs.append(
            create_measurements_table_helper(studies=studies_data)
        )

    if not measurements_table_dfs:
        # pd.concat raises ValueError on an empty list; return an empty table
        # that still has the expected columns.
        return create_measurements_table_helper(studies=[])

    return pd.concat(measurements_table_dfs, ignore_index=True)


In [19]:
# Build the full measurements table from the pickles on disk. The recorded run
# processed 87 files in a little over two minutes.
measurements_table = create_measurements_table()

Deserializing studies...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [02:09<00:00,  1.49s/it]


In [22]:
# Length of the longest `dispersion_param` string in the table.
# NOTE(review): presumably used to size a text column in a database schema — confirm.
measurements_table['dispersion_param'].str.len().max()

34

## More info from studies


In [32]:
# Columns of the studies table, in the order they appear in the DataFrame.
_STUDY_COLUMNS = (
    'nct_id', 'official_title', 'short_title', 'conditions',
    'verified_date', 'responsible_party', 'sponsor', 'phase', 'type', 'description',
    'interventions', 'purpose', 'intervention_type', 'mesh_terms',
    'criteria', 'min_age', 'max_age', 'gender', 'completion_date', 'completion_date_type',
    'status', 'stopped_reason',
)

# Columns copied verbatim from a record: column -> key path below study['Study'].
_STUDY_FIELD_PATHS = {
    'nct_id': ('ProtocolSection', 'IdentificationModule', 'NCTId'),
    'official_title': ('ProtocolSection', 'IdentificationModule', 'OfficialTitle'),
    'short_title': ('ProtocolSection', 'IdentificationModule', 'BriefTitle'),
    'conditions': ('ProtocolSection', 'ConditionsModule', 'ConditionList', 'Condition'),
    'verified_date': ('ProtocolSection', 'StatusModule', 'StatusVerifiedDate'),
    'responsible_party': ('ProtocolSection', 'SponsorCollaboratorsModule', 'ResponsibleParty',
                          'ResponsiblePartyInvestigatorFullName'),
    'sponsor': ('ProtocolSection', 'SponsorCollaboratorsModule', 'LeadSponsor', 'LeadSponsorName'),
    'type': ('ProtocolSection', 'DesignModule', 'StudyType'),
    'description': ('ProtocolSection', 'DescriptionModule', 'BriefSummary'),
    'purpose': ('ProtocolSection', 'DesignModule', 'DesignInfo', 'DesignPrimaryPurpose'),
    'intervention_type': ('ProtocolSection', 'DesignModule', 'DesignInfo', 'DesignInterventionModel'),
    'criteria': ('ProtocolSection', 'EligibilityModule', 'EligibilityCriteria'),
    'min_age': ('ProtocolSection', 'EligibilityModule', 'MinimumAge'),
    'max_age': ('ProtocolSection', 'EligibilityModule', 'MaximumAge'),
    'gender': ('ProtocolSection', 'EligibilityModule', 'Gender'),
    'completion_date': ('ProtocolSection', 'StatusModule', 'PrimaryCompletionDateStruct',
                        'PrimaryCompletionDate'),
    'completion_date_type': ('ProtocolSection', 'StatusModule', 'PrimaryCompletionDateStruct',
                             'PrimaryCompletionDateType'),
    'status': ('ProtocolSection', 'StatusModule', 'OverallStatus'),
    'stopped_reason': ('ProtocolSection', 'StatusModule', 'WhyStopped'),
}


def _dig(node, path, default):
    """Follow ``path`` through nested containers; return ``default`` if a step is missing."""
    for key in path:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            # TypeError covers an intermediate node that is None or a scalar.
            return default
    return node


def _mesh_terms(study, browse_module, list_key, item_key, term_key):
    """Return the mesh terms of one DerivedSection browse module ([] when absent)."""
    items = _dig(study, ('Study', 'DerivedSection', browse_module, list_key, item_key), [])
    return [item.get(term_key, 'NA') for item in items]


def _phase_label(study):
    """Collapse a study's phase list into one label; 'NA' when absent, empty or not applicable."""
    phases = _dig(study, ('Study', 'ProtocolSection', 'DesignModule', 'PhaseList', 'Phase'), None)
    if isinstance(phases, str):
        # Guard against a bare string, which ' '.join would split into characters.
        phases = [phases]
    if not phases:
        return 'NA'
    if len(phases) > 1:
        return ' '.join(phases)
    return 'NA' if phases[0] == 'Not Applicable' else phases[0]


def create_studies_table_helper(studies) -> pd.DataFrame:
    """Extract one row of study-level metadata per study record.

    Scalar columns are read from fixed key paths under ``study['Study']`` and
    default to the string ``'NA'`` when any part of the path is missing.
    ``conditions`` holds whatever is stored under ``ConditionList -> Condition``
    (or ``'NA'``). ``mesh_terms`` and ``interventions`` are lists of mesh terms
    (``[]`` when the browse module is missing). ``phase`` joins multiple phases
    with a space and maps ``'Not Applicable'`` to ``'NA'``.

    Args:
        studies: iterable of study records (dicts with a top-level ``'Study'`` key).

    Returns:
        DataFrame with one row per study, the columns listed in
        ``_STUDY_COLUMNS`` and a 0..n-1 index.
    """
    buffer = {column: [] for column in _STUDY_COLUMNS}

    for study in studies:
        for column, path in _STUDY_FIELD_PATHS.items():
            buffer[column].append(_dig(study, ('Study',) + path, 'NA'))

        buffer['phase'].append(_phase_label(study))
        buffer['mesh_terms'].append(
            _mesh_terms(study, 'ConditionBrowseModule', 'ConditionMeshList',
                        'ConditionMesh', 'ConditionMeshTerm'))
        buffer['interventions'].append(
            _mesh_terms(study, 'InterventionBrowseModule', 'InterventionMeshList',
                        'InterventionMesh', 'InterventionMeshTerm'))

    return pd.DataFrame.from_dict(buffer).reset_index(drop=True)


In [33]:
def create_studies_table() -> pd.DataFrame:
    """Build the study-level table from every pickled study batch on disk.

    Each regular, non-hidden file in ``DATA_PATH + '/clinical_trials/'`` is
    unpickled and passed to ``create_studies_table_helper``; the per-file
    frames are concatenated.

    Returns:
        DataFrame with a fresh 0..n-1 index. When the directory holds no usable
        files, an empty frame with the helper's columns is returned.

    Raises:
        OSError: if the directory or one of its files cannot be read.
    """
    directory = DATA_PATH + '/clinical_trials/'

    # os.listdir returns entries in arbitrary order and includes hidden files
    # (e.g. macOS ".DS_Store") and sub-directories, none of which can be
    # unpickled. Filter them out and sort so the row order is reproducible.
    pickle_file_names = sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith('.') and os.path.isfile(os.path.join(directory, name))
    )

    studies_table_dfs = []
    for studies_data_pickle_file in tqdm(pickle_file_names):
        studies_file = os.path.join(directory, studies_data_pickle_file)
        # SECURITY: pickle.load can execute arbitrary code; only read trusted files.
        with open(studies_file, 'rb') as f:
            studies_data = pickle.load(f)
        # The file is closed before the (slow) extraction step runs.
        studies_table_dfs.append(create_studies_table_helper(studies=studies_data))

    if not studies_table_dfs:
        # pd.concat raises ValueError on an empty list; return an empty table
        # that still has the expected columns.
        return create_studies_table_helper(studies=[])

    return pd.concat(studies_table_dfs, ignore_index=True)


In [34]:
# Build the study-level table from the pickles on disk. The recorded run
# processed 87 files in roughly two and a half minutes.
studies_table = create_studies_table()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [02:22<00:00,  1.64s/it]


In [36]:
# Preview the first rows to sanity-check the extracted columns.
studies_table.head()

Unnamed: 0,nct_id,official_title,short_title,conditions,verified_date,responsible_party,sponsor,phase,type,description,...,intervention_type,mesh_terms,criteria,min_age,max_age,gender,completion_date,completion_date_type,status,stopped_reason
0,NCT00633971,A Randomized Trial to Determine the Impact of ...,Treatment Trial for Post-Thrombotic Syndrome,[Post Thrombotic Syndrome],September 2020,Chris Holmes,University of Vermont,,Interventional,The purpose of this study is to determine if c...,...,Parallel Assignment,"[Postthrombotic Syndrome, Postphlebitic Syndro...",Inclusion Criteria:\n\nAge >18 years.\nDiagnos...,18 Years,,All,June 2010,Actual,Completed,
1,NCT00631761,Standardized Criteria to Judge Diagnostic Uret...,Standardized Criteria to Judge Diagnostic Uret...,[Urethrocystoscopy Skills],August 2008,,University of Cincinnati,,Interventional,Develop a tool to objectively quantify technic...,...,Single Group Assignment,[],Inclusion Criteria:\n\nAll Obstetrics & Gyneco...,20 Years,50 Years,All,June 2008,Actual,Completed,
2,NCT00636948,Rectal Cancer Trial On Defunctioning Stoma,Rectal Cancer Trial On Defunctioning Stoma,[Rectal Cancer],March 2008,,Rectal Cancer Trial on Defunctioning Stoma Stu...,,Observational,The hypothesis of the present trial was that t...,...,,[Rectal Neoplasms],Inclusion Criteria:\n\nAbsence of intraoperati...,18 Years,,All,June 2005,Actual,Completed,
3,NCT00632008,"A Randomised, Double-blind, Placebo-controlled...",Soluble Beta-glucan (SBG) as Treatment for Dia...,[Chronic Diabetic Foot Ulcers],January 2010,,Biotec Pharmacon ASA,Phase 3,Interventional,The purpose of this study is to determine whet...,...,Parallel Assignment,"[Diabetic Foot, Foot Ulcer, Ulcer]",Inclusion Criteria:\n\nType 1 or Type 2 diabet...,18 Years,,All,August 2009,Actual,Completed,
4,NCT00634426,Surgical Versus Nonoperative Treatment of Meta...,Surgical Versus Nonoperative Treatment of Meta...,[Metastatic Epidural Spinal Cord Compression],March 2015,,AOSpine North America Research Network,,Observational,The aim of this trial is to evaluate the diffe...,...,,[Spinal Cord Compression],Inclusion Criteria:\n\nSingle symptomatic meta...,18 Years,,All,March 2013,Actual,Completed,


In [38]:
# Number of studies per overall status, most frequent first.
studies_table['status'].value_counts()

Completed                    235473
Recruiting                    63026
Unknown status                55225
Terminated                    25042
Not yet recruiting            18665
Active, not recruiting        18442
Withdrawn                     12062
Enrolling by invitation        3770
Suspended                      1601
Withheld                        833
No longer available             395
Available                       246
Approved for marketing          187
Temporarily not available        33
Name: status, dtype: int64

In [39]:
# Number of studies per primary-completion-date type. 'NA' is the placeholder
# written by create_studies_table_helper when the field is missing.
studies_table['completion_date_type'].value_counts()

Actual         261069
Anticipated    152068
NA              21863
Name: completion_date_type, dtype: int64

In [41]:
# Preview the status values upper-cased with spaces turned into underscores.
# Only spaces are replaced, so the comma in "Active, not recruiting" survives
# (it shows up as "ACTIVE,_NOT_RECRUITING" in the recorded output).
studies_table['status'].str.upper().str.replace(' ', '_').value_counts()

COMPLETED                    235473
RECRUITING                    63026
UNKNOWN_STATUS                55225
TERMINATED                    25042
NOT_YET_RECRUITING            18665
ACTIVE,_NOT_RECRUITING        18442
WITHDRAWN                     12062
ENROLLING_BY_INVITATION        3770
SUSPENDED                      1601
WITHHELD                        833
NO_LONGER_AVAILABLE             395
AVAILABLE                       246
APPROVED_FOR_MARKETING          187
TEMPORARILY_NOT_AVAILABLE        33
Name: status, dtype: int64

In [42]:
# Same normalisation (upper-case, spaces -> underscores) applied to the
# completion-date type; the counts are unchanged by it in the recorded output.
studies_table['completion_date_type'].str.upper().str.replace(' ', '_').value_counts()

ACTUAL         261069
ANTICIPATED    152068
NA              21863
Name: completion_date_type, dtype: int64