In [3]:
import pandas as pd
import sqlalchemy as sa


In [4]:
def get_outcome_and_intervention_modules(studies):
    outcome_modules = []
    intervention_modules = []
    study_ids = []
    for study in studies:
        if (
            "ResultsSection" in study["Study"]
            and "OutcomeMeasuresModule" in study["Study"]["ResultsSection"]
        ):
            outcome_modules.append(study["Study"]["ResultsSection"]["OutcomeMeasuresModule"])
            study_ids.append(study['Study']['ProtocolSection']['IdentificationModule']['NCTId'])

        elif "ArmsInterventionsModule" in study["Study"]["ProtocolSection"]:
            intervention_modules.append(study["Study"]["ProtocolSection"]["ArmsInterventionsModule"])

    return outcome_modules, intervention_modules, study_ids

In [5]:
def create_measurements_table_helper(studies):
    outcome_modules, intervention_modules, study_ids = get_outcome_and_intervention_modules(studies)
    df = {
        "study_id": [],
        "measure": [],
        "type": [],
        "description": [],
        "dispersion_param": [],
        "measure_param": [],
        "units": [],
    }

    for i, module in enumerate(outcome_modules):
        for measure in module["OutcomeMeasureList"]["OutcomeMeasure"]:
            df["type"].append(measure.get("OutcomeMeasureType", "NA"))
            df["measure"].append(measure.get("OutcomeMeasureTitle", "NA"))
            df["description"].append(measure.get("OutcomeMeasureDescription", "NA"))
            df["measure_param"].append(measure.get("OutcomeMeasureParamType", "NA"))
            df["dispersion_param"].append(measure.get("OutcomeMeasureDispersionType", "NA"))
            df["units"].append(measure.get("OutcomeMeasureUnitOfMeasure", "NA"))
            df["study_id"].append(study_ids[i])

    # This is for studies without results
    # for i, module in enumerate(intervention_modules):
    #     for measure in module.get("ArmGroupList", {"ArmGroup": []})["ArmGroup"]:
    #         # Measure data is unstructured and often has other fields in the description.
    #         df["type"].append("NA")
    #         df["measure"].append(measure.get("ArmGroupLabel", "NA"))
    #         df["description"].append(measure.get("ArmGroupDescription", "NA"))
    #         df["measure_param"].append("NA")
    #         df["dispersion_param"].append("NA")
    #         df["units"].append("NA")
    #         df["study_id"].append(study_ids[i])

    return pd.DataFrame.from_dict(df).reset_index(drop=True)

In [6]:
from tqdm import tqdm
import os
import pickle

DATA_PATH='/Users/porterhunley/datasets'

def create_measurements_table():
    measurements_table_dfs = []
    directory = DATA_PATH + "/clinical_trials/"
    print("Deserializing studies...")
    for studies_data_pickle_file in tqdm(os.listdir(directory)):
        studies_file = os.path.join(directory, studies_data_pickle_file)
        with open(studies_file, "rb") as f:
            studies_data = pickle.load(f)
            measurements_table_df = create_measurements_table_helper(
                studies=studies_data
            )
            measurements_table_dfs.append(measurements_table_df)

    measurements_table = pd.concat(measurements_table_dfs).reset_index(drop=True)
    return measurements_table


In [19]:
measurements_table = create_measurements_table()

Deserializing studies...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [02:09<00:00,  1.49s/it]


In [22]:
measurements_table['dispersion_param'].str.len().max()

34

## More info from studies


In [61]:
def create_studies_table_helper(studies) -> pd.DataFrame:
    buffer = {
        'nct_id': [], 'official_title': [], 'short_title': [], 'conditions': [],
        'verified_date': [], 'responsible_party': [], 'sponsor': [], 'phase': [], 'type': [], 'description': [],
        'interventions': [], 'purpose': [], 'intervention_type': [], 'mesh_terms': [],
        'criteria': [], 'min_age': [], 'max_age': [], 'gender': [], 'completion_date': [], 'completion_date_type':[],
        'status': [], 'stopped_reason': [], 'design_allocation': [], 'design_masking': [], 'design_time_perspective': [],
        'who_masked': [], 'observational_model': [], 'masking_description': [], 'model_description': []
    }
    for _, study in enumerate(studies):
        try:
            buffer['nct_id'].append(study['Study']['ProtocolSection']['IdentificationModule']['NCTId'])
        except KeyError as e:
            buffer['nct_id'].append('NA')

        try:
            buffer['official_title'].append(study['Study']['ProtocolSection']['IdentificationModule']['OfficialTitle'])
        except KeyError as e:
            buffer['official_title'].append('NA')

        try:
            buffer['short_title'].append(study['Study']['ProtocolSection']['IdentificationModule']['BriefTitle'])
        except KeyError as e:
            buffer['short_title'].append('NA')

        try:
            buffer['verified_date'].append(study['Study']['ProtocolSection']['StatusModule']['StatusVerifiedDate'])
        except KeyError as e:
            buffer['verified_date'].append('NA')

        try:
            buffer['responsible_party'].append(
                study['Study']['ProtocolSection']['SponsorCollaboratorsModule']['ResponsibleParty'][
                    'ResponsiblePartyInvestigatorFullName'])
        except KeyError as e:
            buffer['responsible_party'].append('NA')

        try:
            buffer['sponsor'].append(
                study['Study']['ProtocolSection']['SponsorCollaboratorsModule']['LeadSponsor']['LeadSponsorName'])
        except KeyError as e:
            buffer['sponsor'].append('NA')

        try:
            buffer['conditions'].append(
                study['Study']['ProtocolSection']['ConditionsModule']['ConditionList']['Condition'])
        except KeyError as e:
            buffer['conditions'].append('NA')

        try:
            phases = study['Study']['ProtocolSection']['DesignModule']['PhaseList']['Phase']
            if len(phases) > 1:
                phase = ' '.join(phases)
            else:
                phase = 'NA' if phases[0] == 'Not Applicable' else phases[0]
            buffer['phase'].append(phase)
        except KeyError as e:
            buffer['phase'].append('NA')

        try:
            buffer['type'].append(study['Study']['ProtocolSection']['DesignModule']['StudyType'])
        except KeyError as e:
            buffer['type'].append('NA')

        try:
            buffer['purpose'].append(
                study['Study']['ProtocolSection']['DesignModule']['DesignInfo'].get('DesignPrimaryPurpose', 'NA'))
        except KeyError as e:
            buffer['purpose'].append('NA')

        try:
            buffer['intervention_type'].append(
                study['Study']['ProtocolSection']['DesignModule']['DesignInfo'].get('DesignInterventionModel', 'NA'))
        except KeyError as e:
            buffer['intervention_type'].append('NA')

        try:
            buffer['mesh_terms'].append([x.get('ConditionMeshTerm', 'NA') for x in
                                         study['Study']['DerivedSection']['ConditionBrowseModule']['ConditionMeshList'][
                                             'ConditionMesh']])
        except KeyError as e:
            buffer['mesh_terms'].append([])

        try:
            buffer['description'].append(study['Study']['ProtocolSection']['DescriptionModule']['BriefSummary'])
        except KeyError as e:
            buffer['description'].append('NA')

        try:
            buffer['interventions'].append([x.get('InterventionMeshTerm', 'NA') for x in
                                            study['Study']['DerivedSection']['InterventionBrowseModule'][
                                                'InterventionMeshList']['InterventionMesh']])
        except KeyError as e:
            buffer['interventions'].append([])

        try:
            buffer['criteria'].append(study['Study']['ProtocolSection']['EligibilityModule']['EligibilityCriteria'])
        except KeyError as e:
            buffer['criteria'].append('NA')

        try:
            buffer['gender'].append(study['Study']['ProtocolSection']['EligibilityModule']['Gender'])
        except KeyError as e:
            buffer['gender'].append('NA')

        try:
            buffer['min_age'].append(study['Study']['ProtocolSection']['EligibilityModule']['MinimumAge'])
        except KeyError as e:
            buffer['min_age'].append('NA')

        try:
            buffer['max_age'].append(study['Study']['ProtocolSection']['EligibilityModule']['MaximumAge'])
        except KeyError as e:
            buffer['max_age'].append('NA')
            
        try:
            buffer['status'].append(study['Study']['ProtocolSection']['StatusModule']['OverallStatus'])
        except KeyError as e:
            buffer['status'].append('NA')
            
        try:
            buffer['completion_date'].append(study['Study']['ProtocolSection']['StatusModule']['PrimaryCompletionDateStruct']['PrimaryCompletionDate'])
        except KeyError as e:
            buffer['completion_date'].append('NA')
            
        try:
            buffer['completion_date_type'].append(study['Study']['ProtocolSection']['StatusModule']['PrimaryCompletionDateStruct']['PrimaryCompletionDateType'])
        except KeyError as e:
            buffer['completion_date_type'].append('NA')

        try:
            buffer['stopped_reason'].append(study['Study']['ProtocolSection']['StatusModule']['WhyStopped'])
        except KeyError as e:
            buffer['stopped_reason'].append('NA')
            
        try:
            buffer['design_allocation'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo']['DesignAllocation'])
        except KeyError as e:
            buffer['design_allocation'].append('NA')
        
        try:
            buffer['design_masking'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo']['DesignMaskingInfo']['DesignMasking'])
        except KeyError as e:
            buffer['design_masking'].append('NA')

        try:
            buffer['design_time_perspective'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo']['DesignTimePerspectiveList']['DesignTimePerspective'])
        except KeyError as e:
            buffer['design_time_perspective'].append('NA')
            
        try:
            buffer['who_masked'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo']['DesignMaskingInfo']['DesignWhoMaskedList']['DesignWhoMasked'])
        except KeyError as e:
            buffer['who_masked'].append('NA')
                        
        try:
            buffer['observational_model'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo']['DesignInterventionModel'])
        except KeyError as e:
            buffer['observational_model'].append('NA')
                                    
        try:
            buffer['masking_description'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo']['DesignMaskingInfo']['DesignMaskingDescription'])
        except KeyError as e:
            buffer['masking_description'].append('NA')
                                    
        try:
            buffer['model_description'].append(study['Study']['ProtocolSection']['DesignModule']['DesignInfo']['DesignInterventionModelDescription'])
        except KeyError as e:
            buffer['model_description'].append('NA')
            
            
    return pd.DataFrame.from_dict(buffer).reset_index(drop=True)


In [62]:
def create_studies_table() -> pd.DataFrame:
    studies_table_dfs = []
    directory = DATA_PATH + '/clinical_trials/'
    for studies_data_pickle_file in tqdm(os.listdir(directory)):
        studies_file = os.path.join(directory, studies_data_pickle_file)
        with open(studies_file, 'rb') as f:
            studies_data = pickle.load(f)
            studies_table_df = create_studies_table_helper(studies=studies_data)
            studies_table_dfs.append(studies_table_df)

    studies_table = pd.concat(studies_table_dfs).reset_index(drop=True)
    return studies_table


In [63]:
studies_table = create_studies_table()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [03:29<00:00,  2.40s/it]


In [78]:
studies_table['who_masked'].apply(lambda x: [y.upper().replace(' ', '_') for y in x if x != 'NA']).value_counts()

[]                                                               296947
[PARTICIPANT, CARE_PROVIDER, INVESTIGATOR, OUTCOMES_ASSESSOR]     31191
[PARTICIPANT, INVESTIGATOR]                                       22794
[OUTCOMES_ASSESSOR]                                               20375
[PARTICIPANT]                                                     18052
[PARTICIPANT, CARE_PROVIDER, INVESTIGATOR]                         9585
[PARTICIPANT, INVESTIGATOR, OUTCOMES_ASSESSOR]                     9314
[PARTICIPANT, OUTCOMES_ASSESSOR]                                   8702
[INVESTIGATOR]                                                     5522
[INVESTIGATOR, OUTCOMES_ASSESSOR]                                  4069
[PARTICIPANT, CARE_PROVIDER]                                       3044
[PARTICIPANT, CARE_PROVIDER, OUTCOMES_ASSESSOR]                    2616
[CARE_PROVIDER]                                                    1019
[CARE_PROVIDER, INVESTIGATOR, OUTCOMES_ASSESSOR]                

In [None]:
studies_table[]

In [64]:
studies_table.head()

Unnamed: 0,nct_id,official_title,short_title,conditions,verified_date,responsible_party,sponsor,phase,type,description,...,completion_date_type,status,stopped_reason,design_allocation,design_masking,design_time_perspective,who_masked,observational_model,masking_description,model_description
0,NCT00633971,A Randomized Trial to Determine the Impact of ...,Treatment Trial for Post-Thrombotic Syndrome,[Post Thrombotic Syndrome],September 2020,Chris Holmes,University of Vermont,,Interventional,The purpose of this study is to determine if c...,...,Actual,Completed,,Randomized,Single,,[Outcomes Assessor],Parallel Assignment,,
1,NCT00631761,Standardized Criteria to Judge Diagnostic Uret...,Standardized Criteria to Judge Diagnostic Uret...,[Urethrocystoscopy Skills],August 2008,,University of Cincinnati,,Interventional,Develop a tool to objectively quantify technic...,...,Actual,Completed,,Randomized,None (Open Label),,,Single Group Assignment,,
2,NCT00636948,Rectal Cancer Trial On Defunctioning Stoma,Rectal Cancer Trial On Defunctioning Stoma,[Rectal Cancer],March 2008,,Rectal Cancer Trial on Defunctioning Stoma Stu...,,Observational,The hypothesis of the present trial was that t...,...,Actual,Completed,,,,[Prospective],,,,
3,NCT00632008,"A Randomised, Double-blind, Placebo-controlled...",Soluble Beta-glucan (SBG) as Treatment for Dia...,[Chronic Diabetic Foot Ulcers],January 2010,,Biotec Pharmacon ASA,Phase 3,Interventional,The purpose of this study is to determine whet...,...,Actual,Completed,,Randomized,Quadruple,,"[Participant, Care Provider, Investigator, Out...",Parallel Assignment,,
4,NCT00634426,Surgical Versus Nonoperative Treatment of Meta...,Surgical Versus Nonoperative Treatment of Meta...,[Metastatic Epidural Spinal Cord Compression],March 2015,,AOSpine North America Research Network,,Observational,The aim of this trial is to evaluate the diffe...,...,Actual,Completed,,,,[Prospective],,,,


In [70]:
studies_table['design_allocation'].str.upper().str.replace(' ','_').str.replace('/','').str.replace('-','').value_counts()


RANDOMIZED       218804
NA               179419
NONRANDOMIZED     36777
Name: design_allocation, dtype: int64

In [72]:
studies_table['design_masking'].apply(lambda x: 'None' if x == 'None (Open Label)' else x ).str.upper().str.replace(' ','_').value_counts()

NONE         185373
NA           104174
SINGLE        46240
DOUBLE        45773
QUADRUPLE     31191
TRIPLE        22249
Name: design_masking, dtype: int64

In [73]:
studies_table['design_time_perspective'].str.upper().str.replace(' ', '_').value_counts()

NA    340740
Name: design_time_perspective, dtype: int64

In [None]:
studies_table['design_time_perspective'].str.upper().str.replace(' ', '_').value_counts()

In [67]:
studies_table[studies_table['model_description'] != 'NA']

Unnamed: 0,nct_id,official_title,short_title,conditions,verified_date,responsible_party,sponsor,phase,type,description,...,completion_date_type,status,stopped_reason,design_allocation,design_masking,design_time_perspective,who_masked,observational_model,masking_description,model_description
101,NCT00634283,Physiologic Monitoring of Antidepressant Medic...,Study of the Effects of an Antidepressant Medi...,[Depression],February 2020,Andrew F. Leuchter,"University of California, Los Angeles",Phase 4,Interventional,This study examines the effects of an antidepr...,...,Actual,Completed,,Randomized,Triple,,"[Participant, Care Provider, Outcomes Assessor]",Parallel Assignment,We compared EEG outcomes for those subjects wh...,Subjects are assigned to one of two groups (an...
136,NCT00635518,Randomized Controlled Trial of Dietary Advice ...,Randomized Controlled Trial of Dietary Advice ...,[Breast Feeding],August 2021,Marcia Regina Vitolo,Federal University of Health Science of Porto ...,,Interventional,A cluster randomized field trial to evaluate t...,...,Actual,Withdrawn,Funding was canceled and no resources are avai...,,None (Open Label),,,Single Group Assignment,,Study Withdrawn
684,NCT00776087,European Health Economic Trial on Home Monitor...,European Health Economic Trial on Home Monitor...,"[Ventricular Fibrillation, Tachycardia, Ventri...",June 2017,,Biotronik SE & Co. KG,,Interventional,BIOTRONIK Home Monitoring (HM) service enables...,...,Actual,Terminated,it is unethical to continue a study with deact...,Randomized,None (Open Label),,,Parallel Assignment,HM ON vs. HM OFF,Randomized prospective multicenter internation...
945,NCT00775476,Treatment of Systemic Lupus Erythematosus (SLE...,Treatment of Systemic Lupus Erythematosus (SLE...,[Systemic Lupus Erythematosus],June 2022,,State University of New York - Upstate Medical...,Phase 2,Interventional,Systemic lupus erythematosus (SLE) is a chroni...,...,Anticipated,Recruiting,,Randomized,Quadruple,,"[Participant, Care Provider, Investigator, Out...",Parallel Assignment,,This study will titrate to tolerance during an...
1113,NCT00772824,Study of the Effect of Glutamine Supplementati...,Study of the Effect of Glutamine Supplementati...,[Breast Cancer],July 2018,Manoj Pandey,Banaras Hindu University,Phase 4,Interventional,"Glutamine, a non essential branched chain amin...",...,Actual,Completed,,Randomized,Single,,[Participant],Parallel Assignment,,Case Control
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434925,NCT05169385,Improving Outcomes of Adolescents in Residenti...,Parent SMART (Substance Misuse in Adolescents ...,"[Substance Use, Adolescent Behavior]",July 2022,,Brown University,,Interventional,Adolescents in residential substance use treat...,...,Anticipated,Recruiting,,Randomized,Single,,[Outcomes Assessor],Parallel Assignment,Outcomes assessors will be masked to study con...,Effectiveness Trial
434930,NCT05168150,Testing the Efficacy of an Artificial Intellig...,Testing the Efficacy of an Artificial Intellig...,[Surgical Education],August 2022,Rolando Del Maestro,McGill University,,Interventional,Background:\n\nTrainees learn surgical technic...,...,Actual,Completed,,Randomized,Double,,"[Participant, Outcomes Assessor]",Parallel Assignment,Double (Participant and Expert Rater)\n\nParti...,Randomized Control trial
434939,NCT05167708,The Effect of Single and Repeated Micro-osteop...,Effects of Micro-osteoperforation on the Maxil...,[Accretions; Teeth],December 2021,Alaa alkasaby,Mansoura University,,Interventional,the study aimed to evaluate the effect of diff...,...,Actual,Completed,,Randomized,Single,,[Outcomes Assessor],Parallel Assignment,"single blinded, the assessor of the out comes ...","single-center, prospective, single blinded, wi..."
434949,NCT02702271,Protection Against Embolism for Non-valvular A...,Investigational Device Evaluation of the WATCH...,[Atrial Fibrillation],May 2022,,Boston Scientific Corporation,,Interventional,"The study is a prospective, non-randomized, mu...",...,Actual,Completed,,Non-Randomized,None (Open Label),,,Sequential Assignment,,A total of 400 main cohort subjects and 58 rol...


In [14]:
studies_table['has_allocation'] = studies_table['design_section'].apply(lambda x: 'DesignAllocation' in x)

In [None]:
studies_table['has_time_per'] = studies_table['design_section'].apply(lambda x: 'DesignAllocation' in x)

In [35]:
# Enumerate all the keys in the design section
def is_dict(obj):
    if isinstance(obj, dict):
        return True
    else:
        return False

studies_table['is_dict'] = studies_table['design_section'].apply(is_dict)

In [37]:
design_table = studies_table[studies_table['is_dict']]

In [39]:
design_table['design_section'].apply(lambda x: list(x.keys())).explode().value_counts()

DesignMaskingInfo                     330835
DesignAllocation                      330360
DesignInterventionModel               329481
DesignPrimaryPurpose                  328714
DesignTimePerspectiveList              94260
DesignObservationalModelList           91305
DesignInterventionModelDescription     50682
Name: design_section, dtype: int64

In [40]:
## Look at masking
masking_table = design_table[design_table['design_section'].apply(lambda x: 'DesignMaskingInfo')]

In [44]:
masking_table['design_section'].apply(lambda x: list(x['DesignMaskingInfo'].keys())).explode().value_counts()

DesignMasking               330826
DesignWhoMaskedList         138053
DesignMaskingDescription     26013
Name: design_section, dtype: int64

In [48]:
## Masking types
masking_table['design_section'].apply(lambda x: x.get('DesignInterventionModel', 'N/A')).explode().value_counts()

Parallel Assignment        194688
Single Group Assignment     93724
Crossover Assignment        28028
Sequential Assignment        7410
Factorial Assignment         4828
N/A                          2157
Name: design_section, dtype: int64

In [49]:
## Masking types
masking_table['design_section'].apply(lambda x: x.get('DesignMaskingInfo', 'N/A')).explode().value_counts()

Treatment                          213244
Prevention                          35753
Other                               16239
Basic Science                       16043
Supportive Care                     16020
Diagnostic                          14652
Health Services Research             8011
N/A                                  6870
Screening                            2813
Device Feasibility                   1010
Educational/Counseling/Training       180
Name: design_section, dtype: int64

In [50]:
masking_table['design_section'].apply(lambda x: x['DesignMaskingInfo'].get(
    'DesignWhoMaskedList', {'DesignWhoMasked': []})['DesignWhoMasked']).explode().value_counts()

Participant          105298
Investigator          83594
Outcomes Assessor     77652
Care Provider         49225
Name: design_section, dtype: int64

In [51]:
masking_table['design_section'].apply(lambda x: x['DesignMaskingInfo'].get(
    'DesignMasking', 'NA')).explode().value_counts()

None (Open Label)    185373
Single                46240
Double                45773
Quadruple             31191
Triple                22249
NA                        9
Name: design_section, dtype: int64

In [52]:
design_table['design_section'].apply(lambda x: x.get(
    'DesignTimePerspectiveList', {'DesignTimePerspective': []})['DesignTimePerspective']).explode().value_counts()

Prospective        65205
Retrospective      13912
Cross-Sectional    11233
Other               3910
Name: design_section, dtype: int64

In [53]:
design_table['design_section'].apply(lambda x: x.get(
    'DesignObservationalModelList', {'DesignObservationalModel': []})['DesignObservationalModel']).explode().value_counts()


Cohort                   53563
Case-Control             13318
Case-Only                13186
Other                     7915
Ecologic or Community     1207
Case-Crossover             883
Defined Population         651
Family-Based               492
Natural History             90
Name: design_section, dtype: int64

In [54]:
## Masking types
masking_table['design_section'].apply(lambda x: x.get('DesignAllocation', 'N/A')).explode().value_counts()

Randomized        217611
N/A                76560
Non-Randomized     36664
Name: design_section, dtype: int64

In [57]:
len(max(masking_table['design_section'].apply(lambda x: x['DesignMaskingInfo'].get(
    'DesignMaskingDescription', 'NA')), key=len))

1003

In [60]:
len(max(design_table['design_section'].apply(lambda x: x.get('DesignInterventionModelDescription', 'NA')), key=len))

1000

In [21]:
# Vast majority of intervential
studies_table[studies_table['has_allocation'] == True]['type'].value_counts()

Interventional    330360
Name: type, dtype: int64

In [22]:
studies_table['design_section']

0         {'DesignAllocation': 'Randomized', 'DesignInte...
1         {'DesignAllocation': 'Randomized', 'DesignInte...
2         {'DesignTimePerspectiveList': {'DesignTimePers...
3         {'DesignAllocation': 'Randomized', 'DesignInte...
4         {'DesignObservationalModelList': {'DesignObser...
                                ...                        
434995    {'DesignAllocation': 'Randomized', 'DesignInte...
434996    {'DesignAllocation': 'Randomized', 'DesignInte...
434997    {'DesignTimePerspectiveList': {'DesignTimePers...
434998    {'DesignAllocation': 'Randomized', 'DesignInte...
434999    {'DesignAllocation': 'Non-Randomized', 'Design...
Name: design_section, Length: 435000, dtype: object

In [38]:
studies_table['status'].value_counts()

Completed                    235473
Recruiting                    63026
Unknown status                55225
Terminated                    25042
Not yet recruiting            18665
Active, not recruiting        18442
Withdrawn                     12062
Enrolling by invitation        3770
Suspended                      1601
Withheld                        833
No longer available             395
Available                       246
Approved for marketing          187
Temporarily not available        33
Name: status, dtype: int64

In [39]:
studies_table['completion_date_type'].value_counts()

Actual         261069
Anticipated    152068
NA              21863
Name: completion_date_type, dtype: int64

In [41]:
studies_table['status'].str.upper().str.replace(' ', '_').value_counts()

COMPLETED                    235473
RECRUITING                    63026
UNKNOWN_STATUS                55225
TERMINATED                    25042
NOT_YET_RECRUITING            18665
ACTIVE,_NOT_RECRUITING        18442
WITHDRAWN                     12062
ENROLLING_BY_INVITATION        3770
SUSPENDED                      1601
WITHHELD                        833
NO_LONGER_AVAILABLE             395
AVAILABLE                       246
APPROVED_FOR_MARKETING          187
TEMPORARILY_NOT_AVAILABLE        33
Name: status, dtype: int64

In [42]:
studies_table['completion_date_type'].str.upper().str.replace(' ', '_').value_counts()

ACTUAL         261069
ANTICIPATED    152068
NA              21863
Name: completion_date_type, dtype: int64