In [1]:
# from openhealthdata import ClinicalTrialsGov

import os

import pandas as pd
import plotly.express as px
pd.options.display.max_columns = None

def protocol_feature_query():
    """
    Download the cached protocol-feature table and return it as a DataFrame.

    Prints a short progress notice first (the download is remote and, per
    that notice, takes roughly 20 seconds), then reads the parquet file
    straight from its public URL.
    """
    cache_url = "https://storage.googleapis.com/data.openhealthdata.org/clinical_trials_gov/protocol_feature_cache.parquet"
    print("loads selected features. Phase 2, 3, 4 Trials. Takes ~20 seconds to load")
    return pd.read_parquet(cache_url)

# Load the cached feature table once; the bare name on the last line makes
# the notebook render the frame as this cell's output.
protocol_feature = protocol_feature_query()
protocol_feature

loads selected features. Phase 2, 3, 4 Trials. Takes ~20 seconds to load


Unnamed: 0,_trial_id,_org_name,_org_class,_sponsor_name,_sponsor_class,_collaborator,_condition,_location,_eligibility,_status,_arm,_random,_enrollment,_phase,_location_count,_arm_count,_start_yr,_end_yr,_last_yr,_enrollment_z,_location_count_z,_arm_count_z
0,NCT04834349,M.D. Anderson Cancer Center,OTHER,M.D. Anderson Cancer Center,OTHER,,[Recurrent Head and Neck Squamous Cell Carcino...,"[{'LocationCity': 'Houston', 'LocationContactL...",Inclusion Criteria:\n\nPatients with biopsy pr...,Recruiting,[{'ArmGroupDescription': 'Patients receive NBT...,Non-Randomized,80.0,Phase 2,1.0,2.0,2021.0,2025.0,2021,-0.044914,-0.197093,-0.048174
1,NCT04837924,General Hospital Sveti Duh,OTHER,General Hospital Sveti Duh,OTHER,,[Hip Fractures],"[{'LocationCity': 'Zagreb', 'LocationContactLi...",Inclusion Criteria:\n\nhip fracture after mino...,Recruiting,[{'ArmGroupDescription': 'Participants receivi...,Randomized,80.0,Phase 4,1.0,2.0,2021.0,2021.0,2021,-0.044914,-0.197093,-0.048174
2,NCT04838444,Valneva Austria GmbH,INDUSTRY,Valneva Austria GmbH,INDUSTRY,,[Chikungunya Virus Infection],"[{'LocationCity': 'Phoenix', 'LocationContactL...",Inclusion Criteria:\n\nSubject participated in...,Enrolling by invitation,"[{'ArmGroupDescription': None, 'ArmGroupInterv...",,375.0,Phase 3,11.0,1.0,2021.0,2025.0,2021,0.002014,0.138348,-0.826292
3,NCT04831736,NYU Langone Health,OTHER,NYU Langone Health,OTHER,,[Postoperative Pain],"[{'LocationCity': 'New York', 'LocationContact...","Inclusion Criteria:\n\nAdult women, aged 18 to...",Recruiting,"[{'ArmGroupDescription': None, 'ArmGroupInterv...",Randomized,30.0,Phase 4,1.0,2.0,2021.0,2022.0,2022,-0.052868,-0.197093,-0.048174
4,NCT04837820,Memorial Sloan Kettering Cancer Center,OTHER,Memorial Sloan Kettering Cancer Center,OTHER,,[Breast Cancer],"[{'LocationCity': 'Commack', 'LocationContactL...",Inclusion Criteria:\n\nEnglish-proficient adul...,Recruiting,[{'ArmGroupDescription': 'The intervention wil...,Randomized,260.0,Phase 2,4.0,3.0,2021.0,2025.0,2021,-0.016280,-0.096461,0.729943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117240,NCT00554723,CHIMES Society,OTHER,CHIMES Society,OTHER,"[{'CollaboratorClass': 'OTHER_GOV', 'Collabora...","[Cerebral Infarction, Stroke]","[{'LocationCity': 'Hong Kong', 'LocationContac...",Inclusion Criteria:\n\nSubject is aged 18 year...,Completed,"[{'ArmGroupDescription': 'NeuroAid', 'ArmGroup...",Randomized,1100.0,Phase 3,24.0,2.0,2007.0,2012.0,2014,0.162484,0.470072,-0.059196
117241,NCT00559845,Hoffmann-La Roche,INDUSTRY,Hoffmann-La Roche,INDUSTRY,,[Breast Cancer],"[{'LocationCity': 'Napoli', 'LocationContactLi...","Inclusion Criteria:\n\nfemale participants, >=...",Completed,[{'ArmGroupDescription': 'Participants will re...,,56.0,Phase 2,8.0,1.0,2008.0,2015.0,2017,-0.071838,0.022932,-0.799786
117242,NCT00552136,Nova Scotia Health Authority,OTHER,Ross Leighton,OTHER,"[{'CollaboratorClass': 'OTHER', 'CollaboratorN...",[Ankle Arthritis],"[{'LocationCity': 'Halifax', 'LocationContactL...",Inclusion Criteria:\n\nEighteen (18) or older\...,Unknown status,,Randomized,100.0,Phase 4,1.0,,2005.0,2020.0,2016,-0.061963,-0.172691,-1.540376
117243,NCT00556933,University of Nebraska,OTHER,"R. Brian Stevens, MD",OTHER,"[{'CollaboratorClass': 'INDUSTRY', 'Collaborat...",[End-stage Renal Disease],"[{'LocationCity': 'Omaha', 'LocationContactLis...",Inclusion Criteria:\n\nPrimary renal transplan...,Completed,[{'ArmGroupDescription': 'Kidney transplant re...,Randomized,180.0,Phase 4,1.0,4.0,2004.0,2011.0,2015,-0.044007,-0.172691,1.421984


In [2]:
# Number of rows per `_org_name` value, most frequent first.
protocol_feature['_org_name'].value_counts()

Novartis                                    2306
GlaxoSmithKline                             1930
Pfizer                                      1553
National Cancer Institute (NCI)             1501
Sanofi                                      1484
                                            ... 
Igy Inc.                                       1
Entegrion, Inc.                                1
Shenzhen Zhongshan Urology Hospital            1
Centro en Insuficiencia Cardiaca, Mexico       1
CHIMES Society                                 1
Name: _org_name, Length: 11071, dtype: int64

In [3]:
# Restrict to trials whose start year lies strictly between 1999 and 2023.
# Rows with a missing `_start_yr` fail both comparisons and are dropped.
protocol_feature = protocol_feature.loc[
    lambda frame: (frame['_start_yr'] > 1999) & (frame['_start_yr'] < 2023)
]

# Distribution of arm counts per start year, one facet per `_random` value.
px.box(
    protocol_feature,
    x="_start_yr",
    y="_arm_count",
    facet_col="_random",
    title=f"n={len(protocol_feature)} Studies",
)

In [4]:
def _get_us_locations(protocol_feature: pd.DataFrame, country: "list | str | None" = None) -> pd.DataFrame:
    """
    Build a one-row-per-site table of trial locations with zip-code coordinates.

    Steps:
        1. Keep trials whose ``_location_count_z`` is below 3 (rows where it
           is missing are dropped by the comparison as well).
        2. Explode the ``_location`` list so every site becomes its own row,
           and flatten each site dict into columns.
        3. Keep sites whose ``LocationCountry`` is in ``country`` and whose
           ``LocationStatus`` is not null.
        4. Inner-join on ``LocationZip`` against the zip-level census table,
           which contributes the ``_lat`` / ``_lng`` columns. Sites whose zip
           is not in that table are dropped by the inner join.

    Parameters
    ----------
    protocol_feature : pd.DataFrame
        Trial-level frame containing at least ``_trial_id``, ``_location``,
        ``_phase``, ``_location_count``, ``_org_name`` and
        ``_location_count_z``.
    country : list, str or None
        Country name(s) to keep. ``None`` (the default) means
        ``["United States"]``; a single string is treated as a one-item list.

    Returns
    -------
    pd.DataFrame
        One row per retained site. The ``index`` column is the position of
        the site's trial after sorting by ``_location_count``.
    """
    # A None sentinel avoids sharing one mutable default list across calls;
    # a bare string is wrapped because Series.isin rejects non-list-likes.
    if country is None:
        country = ["United States"]
    elif isinstance(country, str):
        country = [country]

    trials = protocol_feature
    trials = trials[trials['_location_count_z'] < 3]  # filter < 3 sigma
    trials = trials.sort_values("_location_count")
    trials = trials[['_trial_id', '_location', '_phase', '_location_count', '_org_name']].reset_index(drop=True)

    # After explode + reset_index the frame has a fresh 0..n-1 index, and the
    # former index (one value per trial) is kept as the "index" column.
    sites = trials.explode("_location").reset_index()

    location = pd.json_normalize(sites['_location'])
    location = location.join(sites[['index', '_trial_id', '_location_count', '_phase', '_org_name']])
    location = location[location['LocationCountry'].isin(country)]
    # Which contact-list columns exist depends on the data: a nested dict
    # flattens to "LocationContactList.LocationContact", a null value yields a
    # plain "LocationContactList". Ignore whichever is absent instead of
    # raising KeyError.
    location = location.drop(
        columns=['LocationContactList.LocationContact', 'LocationCountry', 'LocationContactList'],
        errors="ignore",
    )
    location = location.dropna(subset=["LocationStatus"]).reset_index(drop=True)

    def _geolocate_zip5(df):
        """Attach the zip-level census columns by matching ``LocationZip`` to ``_zip5``."""
        zip5_census = pd.read_feather("https://storage.googleapis.com/ph-cdn/us_census_zips_geo/adi_stats_zip5.feather")
        zip5_census = zip5_census.drop(columns=['count', 'adi_mean', 'min', 'max', 'std'])
        return df.merge(zip5_census, left_on="LocationZip", right_on="_zip5")

    location = _geolocate_zip5(location)

    return location

# Build the site-level table with the default country filter (United States);
# the bare name on the last line renders it as the cell output.
location = _get_us_locations(protocol_feature)
location

Unnamed: 0,LocationCity,LocationFacility,LocationState,LocationStatus,LocationZip,index,_trial_id,_location_count,_phase,_org_name,_state,_zip5,adi_median,_lat,_lng,_census_total
0,Houston,M D Anderson Cancer Center,Texas,Recruiting,77030,0,NCT04834349,1.0,Phase 2,M.D. Anderson Cancer Center,TX,77030,10.0,29.705557,-95.401754,10258
1,Houston,M D Anderson Cancer Center,Texas,Recruiting,77030,13,NCT04947254,1.0,Phase 2,M.D. Anderson Cancer Center,TX,77030,10.0,29.705557,-95.401754,10258
2,Houston,M D Anderson Cancer Center,Texas,Recruiting,77030,24,NCT04940286,1.0,Phase 2,M.D. Anderson Cancer Center,TX,77030,10.0,29.705557,-95.401754,10258
3,Houston,M D Anderson Cancer Center,Texas,Recruiting,77030,70,NCT04940299,1.0,Phase 2,M.D. Anderson Cancer Center,TX,77030,10.0,29.705557,-95.401754,10258
4,Houston,The University of Texas Health Science Center ...,Texas,Recruiting,77030,361,NCT04570475,1.0,Phase 3,"The University of Texas Health Science Center,...",TX,77030,10.0,29.705557,-95.401754,10258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37251,Solvang,Sansum Clinic,California,"Active, not recruiting",93463,100043,NCT03706365,117.0,Phase 2,Eli Lilly and Company,CA,93463,6.0,34.606422,-120.135431,7646
37252,Palm Bay,Cancer Care Center of Brevard,Florida,"Active, not recruiting",32909,100043,NCT03706365,117.0,Phase 2,Eli Lilly and Company,FL,32909,62.0,27.899533,-80.659568,30120
37253,Naples,Advanced Research for Health Improvement,Florida,"Active, not recruiting",34109,100072,NCT03832946,120.0,Phase 2,Galecto Biotech AB,FL,34109,20.0,26.240842,-81.763823,23338
37254,Chicago,Northwestern Memorial Hospital,Illinois,Recruiting,97232,100072,NCT03832946,120.0,Phase 2,Galecto Biotech AB,OR,97232,14.0,45.528929,-122.643927,11472


In [5]:
# The Mapbox access token is read from the environment instead of being
# committed with the notebook.
# NOTE(review): a token was previously hard-coded in this cell and remains in
# the file history; consider rotating it.
mapbox_token = os.environ.get("MAPBOX_ACCESS_TOKEN")
if not mapbox_token:
    raise RuntimeError(
        "Set the MAPBOX_ACCESS_TOKEN environment variable before running this cell."
    )
px.set_mapbox_access_token(mapbox_token)

# One marker per site row, coloured by the site's recruiting status.
px.scatter_mapbox(
        location,
        lat="_lat",
        lon="_lng",
        title=f"Where do Clinical Trials take place? {len(location)} locations",
        color='LocationStatus',
        hover_data=['LocationFacility'],
        size_max=15,
        height=700,
        zoom=3,
    )