# 0B 
get interventional_trials with descendents (which is processing trials and disease, biomarker, etc information based on ontology definitions in NCI Theasurus
- also further clean data 

In [1]:
from owlready2 import get_ontology
from datetime import datetime
import json
import pandas as pd

In [2]:
import os, glob
# os.getcwd()
# glob.glob(

from https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/archive/24.02d_Release/

In [3]:
# ontology = get_ontology("/Users/margaretguo/Documents/clinical_trial_llm/clinical_data/ThesaurusInferred.owl").load()


In [4]:
%%time
ontology_path = "/Users/margaretguo/Documents/clinical_trial_llm/clinical_data/Thesaurus.owl"
ontology = get_ontology(ontology_path).load()

# Verify if the ontology is loaded
print(f"Ontology loaded: {ontology}")

Ontology loaded: get_ontology("http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#")
CPU times: user 58.9 s, sys: 1.72 s, total: 1min
Wall time: 1min


In [5]:
# Check ontology metadata
print(f"Ontology IRI: {ontology.base_iri}")
print(f"Number of classes: {len(list(ontology.classes()))}")

Ontology IRI: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#
Number of classes: 189646


In [6]:
for cls in list(ontology.classes())[:10]:
    print(cls)

Thesaurus.C1000
Thesaurus.C10000
Thesaurus.C100000
Thesaurus.C100001
Thesaurus.C100002
Thesaurus.C100003
Thesaurus.C100004
Thesaurus.C100005
Thesaurus.C100006
Thesaurus.C100007


In [7]:
glob.glob("*intervention*")

['interventional_trials_2024-12-26_12-10-35.json',
 'interventional_trials_with_descendants2024-07-26.json',
 'active_interventional_trials_2024-12-26_12-11-31.json',
 'active_interventional_trials_with_descendants2024-12-26.json.zip',
 'interventional_trials_with_descendants2024-08-23.json',
 'interventional_trials_simple_dx_bm_2024-12-26_17-42-04.json',
 'interventional_trials_2024-07-26_13-02-20.json',
 'active_interventional_trials_with_descendants2024-12-26.json',
 'active_interventional_trials_2024-07-26_13-02-57.json',
 'interventional_trials_with_descendants2024-12-26.json',
 'interventional_trials_with_descendants2024-07-26.json.zip',
 'interventional_trials_with_descendants2024-12-26.json.zip',
 'filtered_interventional_trials.json',
 'interventional_trials_simple_dx_bm_2024-07-26_13-14-44.json']

In [80]:
active_trials = json.load(open('interventional_trials_simple_dx_bm_2024-07-26_13-14-44.json','r'))



In [81]:
description = [active_trials['data'][0]['eligibility']['unstructured'][i]['description'] for i in range(len(active_trials['data'][0]['eligibility']['unstructured']))]

In [82]:
subclasses = ontology.search_one(iri="*C4571")
print(f"Subclasses of {cls}: {subclasses}")
ancestors = list(cls.ancestors())
print(f"Ancestors of {cls}: {ancestors}")
subclasses = list(cls.subclasses())

Subclasses of Thesaurus.C100007: Thesaurus.C4571
Ancestors of Thesaurus.C100007: [owl.Thing, Thesaurus.C36291, Thesaurus.C3367, Thesaurus.C35552, Thesaurus.C7057, Thesaurus.C36278, Thesaurus.C99896, Thesaurus.C100007, Thesaurus.C101883]


In [83]:
def save_data_to_file(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

In [84]:
def get_all_subclasses(cls, depth=0, max_depth=2, subclasses=None):
    if subclasses is None:
        subclasses = set()
    if depth < max_depth:  # Only proceed if the current depth is less than the max depth
        for subclass in cls.subclasses():
            if subclass not in subclasses:
                subclasses.add(subclass)
                # Increment the depth when making the recursive call
                get_all_subclasses(subclass, depth + 1, max_depth, subclasses)
    return subclasses


In [85]:
# Step 1: Create a mapping of class names to class objects
class_name_to_object = {cls.name: cls for cls in ontology.classes()}


descendant_dict = {}
# Assuming active_trials['data'] is a list or similar iterable
all_nci_disease_codes = set()
print(f"Processing {len(active_trials['data'])} active trials for disease codes...")
for i, trial in enumerate(active_trials['data'], 1):  # Enumerate to keep track of the iteration count
    cur_nci_codes = set(trial['diseases_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_disease_codes = all_nci_disease_codes.union(cur_nci_codes)
    cur_nci_codes = set(trial['diseases_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_disease_codes = all_nci_disease_codes.union(cur_nci_codes)

    if i % 1000 == 0:  # Print every 1000 iterations
        print(f"Processed {i} out of {len(active_trials['data'])} trials for disease codes...")


# Step 2: Process the NCI disease codes using the mapping for fast lookups
print(f"Processing {len(all_nci_disease_codes)} NCI disease codes...")
descendant_dict['diseases'] = {}
bad_names = []

for i, c in enumerate(all_nci_disease_codes, 1):
    # Use the mapping to find the class object directly
    cls_class = class_name_to_object.get(c)

    if cls_class:
        name_cur = cls_class.name
        all_subclasses = get_all_subclasses(cls_class)
        descendant_dict['diseases'][name_cur] = [cc.name for cc in all_subclasses]
    else:
        bad_names.append(c)  # Keep track of codes that don't have a corresponding class

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(all_nci_disease_codes)} NCI disease codes...")

Processing 20894 active trials for disease codes...
Processed 1000 out of 20894 trials for disease codes...
Processed 2000 out of 20894 trials for disease codes...
Processed 3000 out of 20894 trials for disease codes...
Processed 4000 out of 20894 trials for disease codes...
Processed 5000 out of 20894 trials for disease codes...
Processed 6000 out of 20894 trials for disease codes...
Processed 7000 out of 20894 trials for disease codes...
Processed 8000 out of 20894 trials for disease codes...
Processed 9000 out of 20894 trials for disease codes...
Processed 10000 out of 20894 trials for disease codes...
Processed 11000 out of 20894 trials for disease codes...
Processed 12000 out of 20894 trials for disease codes...
Processed 13000 out of 20894 trials for disease codes...
Processed 14000 out of 20894 trials for disease codes...
Processed 15000 out of 20894 trials for disease codes...
Processed 16000 out of 20894 trials for disease codes...
Processed 17000 out of 20894 trials for disea

In [89]:
# Step 1: Create a mapping of class names to class objects
class_name_to_object = {cls.name: cls for cls in ontology.classes()}


descendant_dict = {}
# Assuming active_trials['data'] is a list or similar iterable
all_nci_disease_codes = set()
print(f"Processing {len(active_trials['data'])} active trials for disease codes...")
for i, trial in enumerate(active_trials['data'], 1):  # Enumerate to keep track of the iteration count
    cur_nci_codes = set(trial['diseases_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_disease_codes = all_nci_disease_codes.union(cur_nci_codes)
    cur_nci_codes = set(trial['diseases_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_disease_codes = all_nci_disease_codes.union(cur_nci_codes)

    if i % 1000 == 0:  # Print every 1000 iterations
        print(f"Processed {i} out of {len(active_trials['data'])} trials for disease codes...")


# Step 2: Process the NCI disease codes using the mapping for fast lookups
print(f"Processing {len(all_nci_disease_codes)} NCI disease codes...")
descendant_dict['diseases'] = {}
bad_names = []

for i, c in enumerate(all_nci_disease_codes, 1):
    # Use the mapping to find the class object directly
    cls_class = class_name_to_object.get(c)

    if cls_class:
        name_cur = cls_class.name
        all_subclasses = get_all_subclasses(cls_class)
        descendant_dict['diseases'][name_cur] = [cc.name for cc in all_subclasses]
    else:
        bad_names.append(c)  # Keep track of codes that don't have a corresponding class

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(all_nci_disease_codes)} NCI disease codes...")
bad_disease_names = bad_names
# Step 2: Process the NCI disease codes using the mapping for fast lookups
# Assuming active_trials['data'] is a list or similar iterable
all_nci_biomarker_codes = set()
print(f"Processing {len(active_trials['data'])} active trials for biomarker codes...")
for i, trial in enumerate(active_trials['data'], 1):
    cur_nci_codes = set(trial['biomarkers_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_biomarker_codes = all_nci_biomarker_codes.union(cur_nci_codes)
    cur_nci_codes = set(trial['biomarkers_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_biomarker_codes = all_nci_biomarker_codes.union(cur_nci_codes)

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(active_trials['data'])} trials for biomarker codes...")

print(f"Processing {len(all_nci_biomarker_codes)} NCI biomarker codes...")
descendant_dict['biomarkers'] = {}
bad_names = []

for i, c in enumerate(all_nci_biomarker_codes, 1):
    # Use the mapping to find the class object directly
    cls_class = class_name_to_object.get(c)

    if cls_class:
        name_cur = cls_class.name
        all_subclasses = get_all_subclasses(cls_class)
        descendant_dict['biomarkers'][name_cur] = [cc.name for cc in all_subclasses]
    else:
        bad_names.append(c)  # Keep track of codes that don't have a corresponding class

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(all_nci_biomarker_codes)} NCI biomarker codes...")

        
#INTERVENTIONS
# Step 2: Process the NCI disease codes using the mapping for fast lookups
# Assuming active_trials['data'] is a list or similar iterable
all_nci_intervention_codes = set()
print(f"Processing {len(active_trials['data'])} active trials for intervention codes...")
for i, trial in enumerate(active_trials['data'], 1):
    cur_nci_codes = set(trial['interventions_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_intervention_codes = all_nci_intervention_codes.union(cur_nci_codes)
    cur_nci_codes = set(trial['interventions_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_intervention_codes = all_nci_intervention_codes.union(cur_nci_codes)

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(active_trials['data'])} trials for interventions codes...")

print(f"Processing {len(all_nci_intervention_codes)} NCI intervention codes...")
descendant_dict['interventions'] = {}
bad_names = []

for i, c in enumerate(all_nci_intervention_codes, 1):
    # Use the mapping to find the class object directly
    cls_class = class_name_to_object.get(c)

    if cls_class:
        name_cur = cls_class.name
        all_subclasses = get_all_subclasses(cls_class)
        descendant_dict['interventions'][name_cur] = [cc.name for cc in all_subclasses]
    else:
        bad_names.append(c)  # Keep track of codes that don't have a corresponding class

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(all_nci_intervention_codes)} NCI intervention codes...")        
        
#PREVIOUS THERAPIES
# Step 2: Process the NCI disease codes using the mapping for fast lookups
# Assuming active_trials['data'] is a list or similar iterable
all_nci_prior_therapy_codes = set()
print(f"Processing {len(active_trials['data'])} active trials for intervention codes...")
for i, trial in enumerate(active_trials['data'], 1):
    cur_nci_codes = set(trial['prior_therapy_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_prior_therapy_codes = all_nci_prior_therapy_codes.union(cur_nci_codes)
    cur_nci_codes = set(trial['prior_therapy_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id'])
    all_nci_prior_therapy_codes = all_nci_prior_therapy_codes.union(cur_nci_codes)

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(active_trials['data'])} trials for prior therapy codes...")

print(f"Processing {len(all_nci_prior_therapy_codes)} NCI prior therapy codes...")
descendant_dict['prior_therapy'] = {}
bad_names = []

for i, c in enumerate(all_nci_intervention_codes, 1):
    # Use the mapping to find the class object directly
    cls_class = class_name_to_object.get(c)

    if cls_class:
        name_cur = cls_class.name
        all_subclasses = get_all_subclasses(cls_class)
        descendant_dict['prior_therapy'][name_cur] = [cc.name for cc in all_subclasses]
    else:
        bad_names.append(c)  # Keep track of codes that don't have a corresponding class

    if i % 1000 == 0:
        print(f"Processed {i} out of {len(all_nci_prior_therapy_codes)} NCI prior therapy codes...")        
        
        
        
# Assuming existing_dict is your existing dictionary and it has items with 'nci_thesaurus_concept_id' key
for trial in active_trials['data']:
    # Initialize a set to store unique elements
    unique_elements = set()
    
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['diseases_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['diseases']:
            unique_elements.update(descendant_dict['diseases'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['diseases_new']['inclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['diseases_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['diseases']:
            unique_elements.update(descendant_dict['diseases'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['diseases_new']['exclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    #BIOMARKERS
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['biomarkers_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['biomarkers']:
            unique_elements.update(descendant_dict['biomarkers'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['biomarkers_new']['inclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['biomarkers_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['biomarkers']:
            unique_elements.update(descendant_dict['biomarkers'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['biomarkers_new']['exclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    #INTERVENTIONS
    unique_elements = set()
    
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['interventions_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['interventions']:
            unique_elements.update(descendant_dict['interventions'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['interventions_new']['inclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['interventions_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['interventions']:
            unique_elements.update(descendant_dict['interventions'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['interventions_new']['exclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['interventions_new']['inclusion']['TREE']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['interventions']:
            unique_elements.update(descendant_dict['interventions'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['interventions_new']['inclusion']['TREE']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['interventions_new']['exclusion']['TREE']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['interventions']:
            unique_elements.update(descendant_dict['interventions'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['interventions_new']['exclusion']['TREE']['descendants'] = list(unique_elements)
    
    #PRIOR THERAPY
    unique_elements = set()
    
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['prior_therapy_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['prior_therapy']:
            unique_elements.update(descendant_dict['prior_therapy'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['prior_therapy_new']['inclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['prior_therapy_new']['exclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['prior_therapy']:
            unique_elements.update(descendant_dict['prior_therapy'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['prior_therapy_new']['exclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['prior_therapy_new']['inclusion']['TRIAL']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['prior_therapy']:
            unique_elements.update(descendant_dict['prior_therapy'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['prior_therapy_new']['inclusion']['TRIAL']['descendants'] = list(unique_elements)
    
    unique_elements = set()
    # Iterate over each concept ID in the item's 'nci_thesaurus_concept_id'
    for concept_id in trial['prior_therapy_new']['exclusion']['TREE']['nci_thesaurus_concept_id']:
        # Fetch the corresponding list from descendant_dict and add elements to the set
        # This example assumes 'diseases' is the relevant key in descendant_dict, adjust as necessary
        if concept_id in descendant_dict['prior_therapy']:
            unique_elements.update(descendant_dict['prior_therapy'][concept_id])
    
    # Assign the set of unique elements as a list to a new key, e.g., 'descendants'
    trial['prior_therapy_new']['exclusion']['TREE']['descendants'] = list(unique_elements)


Processing 20894 active trials for disease codes...
Processed 1000 out of 20894 trials for disease codes...
Processed 2000 out of 20894 trials for disease codes...
Processed 3000 out of 20894 trials for disease codes...
Processed 4000 out of 20894 trials for disease codes...
Processed 5000 out of 20894 trials for disease codes...
Processed 6000 out of 20894 trials for disease codes...
Processed 7000 out of 20894 trials for disease codes...
Processed 8000 out of 20894 trials for disease codes...
Processed 9000 out of 20894 trials for disease codes...
Processed 10000 out of 20894 trials for disease codes...
Processed 11000 out of 20894 trials for disease codes...
Processed 12000 out of 20894 trials for disease codes...
Processed 13000 out of 20894 trials for disease codes...
Processed 14000 out of 20894 trials for disease codes...
Processed 15000 out of 20894 trials for disease codes...
Processed 16000 out of 20894 trials for disease codes...
Processed 17000 out of 20894 trials for disea

# Part D - add phase information

In [90]:
unique_phases = list(set([active_trials['data'][i]['phase'] for i in range(len(active_trials['data']))]))

In [91]:
def translate_phase(phase_value):
    phase_translation = {
        'O': ['O'],
        'I': ['I'],
        'II': ['II'],
        'III': ['III'],
        'IV': ['IV'],
        'I_II': ['I', 'II'],
        'II_III': ['II', 'III'],
        'NA': ['NA']
    }
    return phase_translation.get(phase_value, [phase_value])  # Default to original if not in translation dict

# Update trials with translated phases
for trial in active_trials['data']:
    trial['phase_new'] = translate_phase(trial['phase'])


In [98]:
len(active_trials['data'])

20894

# Part - E add disease and biomarker label to trials

- see memo: 
- diseases are 36 manually defined ones (see 0A)
- biomarkers are now mapped to hgnc symbols, but can only be biomarkers if the particular disease is being studied 


Consider - right now "otherdiseases" and "othercancer", makes the diseases-specific biomarker filter very weak - so consider if there are diseases, specific to the trial itself (aka maybe only one or two diseases types to delete the other possible markers)

In [105]:
diseases_groups = pd.read_excel('diseases_df_new.xlsx', sheet_name=1, header=0).fillna('')
diseases_groups['code_list'] = diseases_groups.apply(lambda row: [ x for x in [row['code1'],row['code2'],row['code3'] ]if x !=''],axis=1)
# display(diseases_groups)
group_to_ids = pd.Series(diseases_groups.code_list.values, index=diseases_groups.group_name.values).to_dict()
id_to_group = {}
for group, ids in group_to_ids.items():
    for dz_id in ids:
        id_to_group[dz_id] = group

In [106]:
nci_biomarker_df = pd.read_csv('nci_biomarker_df.csv',index_col=0)
nci_to_sym = pd.Series(nci_biomarker_df.sym.values,index=nci_biomarker_df.nci_id.values).to_dict()
nci_biomarker_df['diseases_arr'] = nci_biomarker_df.diseases_str.map(lambda string: string.split('|'))
nci_to_diseases = pd.Series(nci_biomarker_df.diseases_arr.values,index=nci_biomarker_df.nci_id.values).to_dict()

# nci_biomarker_df



In [107]:
def group_diseases(nci_ids):
    disease_group = set()
    for nci_id in nci_ids:
        if nci_id in id_to_group:
            disease_group.add(id_to_group[nci_id])
    return sorted(disease_group)


def filter_biomarker_disease_relevant(nci_ids,diseases_trial, nci_to_diseases=nci_to_diseases):
    # filter nci biomarker ids to make sure disease relevant
    # diseases_trial = arr of string of disease maingroups for the trial
    nci_ids_filt = []
    diseases = set(diseases_trial)
    for nci in nci_ids:
        if nci in nci_to_diseases:
            test_diseases = nci_to_diseases[nci] #diseases associated with biomarker
            if len(set(test_diseases).intersection(diseases))>0:
                nci_ids_filt.append(nci)
    return nci_ids_filt
                
        
        
        
def get_symbols(nci_ids, nci_to_sym=nci_to_sym):
    sym_list = []
    for nci in nci_ids:
        if nci in nci_to_sym:
            sym_list.append(nci_to_sym[nci])
    return sym_list

    

In [110]:
%%time
for trial in active_trials['data']:
    # do for both inclusion and exclusion
    for criteria in ['inclusion','exclusion']:
        # get disease names
        diseases_trial = group_diseases(trial['diseases_new'][criteria]['TREE']['nci_thesaurus_concept_id'])
        # get biomarkers associated with diseases
        nci_ids = trial['biomarkers_new'][criteria]['TREE']['nci_thesaurus_concept_id']
        nci_ids_dz = filter_biomarker_disease_relevant(nci_ids,diseases_trial)
        # get gene symbols
        symbols_dz =  get_symbols(nci_ids_dz)
       
        #assign back to tree to store info
        trial['diseases_new'][criteria]['TREE']['dz_groups'] = diseases_trial
        trial['biomarkers_new'][criteria]['TREE']['nci_ids_dz'] = nci_ids_dz
        trial['biomarkers_new'][criteria]['TREE']['symbols_dz'] = symbols_dz

                                            
 

CPU times: user 164 ms, sys: 8.37 ms, total: 173 ms
Wall time: 173 ms


In [112]:
%%time
#takes awhile - saving
# filename = f'interventional_trials_with_descendants{datetime.now().strftime("%Y-%m-%d")}.json'
# filename = 'interventional_trials_with_descendants2024-12-26.json'
filename = 'interventional_trials_with_descendants2024-07-26.json'


print(filename)
save_data_to_file({"total": len(active_trials['data']), "data": active_trials['data']}, filename)

interventional_trials_with_descendants2024-07-26.json
CPU times: user 1min 4s, sys: 2.6 s, total: 1min 7s
Wall time: 1min 7s


In [113]:
len(active_trials['data'])

20894

In [114]:
%%time
filename = 'active_interventional_trials_with_descendants2024-07-26.json'

active_interv_trials_data = [active_trials['data'][i] for i in range(len(active_trials['data'])) if active_trials['data'][i]['current_trial_status']=='Active']
print(len(active_interv_trials_data))
save_data_to_file({"total": len(active_interv_trials_data), "data": active_interv_trials_data}, filename)

5731
CPU times: user 18.4 s, sys: 694 ms, total: 19.1 s
Wall time: 19.2 s


some information that is stored in trial info

In [116]:
sorted(test_trial_x.keys())


['_current_trial_status_sort_order',
 '_phase_sort_order',
 '_primary_purpose_sort_order',
 '_study_protocol_type_sort_order',
 'acronym',
 'active_sites_count',
 'amendment_date',
 'anatomic_sites',
 'arms',
 'associated_studies',
 'biomarkers',
 'biomarkers_new',
 'brief_summary',
 'brief_title',
 'ccr_id',
 'central_contact',
 'classification_code',
 'collaborators',
 'completion_date',
 'completion_date_type_code',
 'ctep_id',
 'current_trial_status',
 'current_trial_status_date',
 'dcp_id',
 'detail_description',
 'diseases',
 'diseases_new',
 'eligibility',
 'interventional_model',
 'interventions_new',
 'keywords',
 'lead_org',
 'lead_org_cancer_center',
 'masking',
 'minimum_target_accrual_number',
 'nci_funded',
 'nci_id',
 'nci_programs',
 'nct_id',
 'number_of_arms',
 'official_title',
 'other_ids',
 'outcome_measures',
 'phase',
 'phase_new',
 'primary_purpose',
 'principal_investigator',
 'prior_therapy',
 'prior_therapy_new',
 'protocol_id',
 'record_verification_date',
 

In [117]:
years_arr = []
for trial in active_trials['data']:
    year = trial['start_date'].split('-')[0]
    years_arr.append(year)


In [118]:
from collections import Counter
Counter(years_arr)

Counter({'2021': 1900,
         '2018': 1888,
         '2017': 1874,
         '2020': 1812,
         '2019': 1803,
         '2022': 1677,
         '2023': 1639,
         '2016': 1632,
         '2015': 1613,
         '2014': 1320,
         '2013': 979,
         '2024': 974,
         '2012': 679,
         '2011': 408,
         '2010': 261,
         '2009': 158,
         '2008': 86,
         '2007': 56,
         '2006': 38,
         '2025': 25,
         '2005': 18,
         '2004': 17,
         '2003': 8,
         '2001': 4,
         '1999': 4,
         '2000': 4,
         '1995': 3,
         '2002': 3,
         '1998': 2,
         '2026': 2,
         '1979': 1,
         '1993': 1,
         '2029': 1,
         '1989': 1,
         '1988': 1,
         '1991': 1,
         '1997': 1})