# 1 - biomarkers

In [21]:
import pandas as pd
import os,glob
import json
from collections import defaultdict,Counter
import seaborn as sns
import numpy as np

## get trials

In [22]:
%%time
filename = 'interventional_trials_with_descendants2024-07-26.json'
active_trials = json.load(open(filename,'r'))




CPU times: user 24.9 s, sys: 12 s, total: 36.9 s
Wall time: 41.5 s


In [23]:
len(active_trials['data']), active_trials['total']


(20894, 20894)

## define some disease groups and prettify names

In [24]:
diseases_groups = pd.read_excel('diseases_df_new.xlsx', sheet_name=1, header=0).fillna('')
diseases = list(diseases_groups.group_name.values)
names_pretty = ['Biliary', 'Anal', 'Appendix', 'Bladder', 
                'Bone', 'Breast', 'Cervical', 'Colon', 'Germ Cell',
                'Esophageal', 'Eye', 'Gastric', 'Head and Neck', 
                 'Renal', 'Liver', 'Lung', 'Skin',
                'Leukemia','Lymphoma','Myeloid', 'Plasma Cell',
                'Neuro', 'Neuroendocrine', 'Ovarian', 'Pancreas', 'Penile',
                'Peritoneal', 'Prostate', 'Sarcoma', 'Small Intestine', 
                'Testicular', 'Thymus', 'Uterine', 'Covid', 'Misc Cancer', 'Unspecified']
diseases_to_namespretty = dict(zip(diseases,names_pretty))
diseases_to_namespretty

{'biliary': 'Biliary',
 'anal': 'Anal',
 'appendix': 'Appendix',
 'bladder_uu': 'Bladder',
 'bone': 'Bone',
 'breast': 'Breast',
 'cervix_vv': 'Cervical',
 'colon': 'Colon',
 'embryo': 'Germ Cell',
 'esophageal': 'Esophageal',
 'eye': 'Eye',
 'stomach': 'Gastric',
 'head_neck': 'Head and Neck',
 'kidney': 'Renal',
 'liver': 'Liver',
 'lung': 'Lung',
 'skin': 'Skin',
 'leukemia': 'Leukemia',
 'lymphoma': 'Lymphoma',
 'myeloid': 'Myeloid',
 'plasma': 'Plasma Cell',
 'neuro': 'Neuro',
 'neuroendocrine': 'Neuroendocrine',
 'ovarian_f': 'Ovarian',
 'pancreas': 'Pancreas',
 'penile': 'Penile',
 'peritoneal': 'Peritoneal',
 'prostate': 'Prostate',
 'sarcoma': 'Sarcoma',
 'small_intest': 'Small Intestine',
 'testicular': 'Testicular',
 'thymus': 'Thymus',
 'uterus': 'Uterine',
 'covid': 'Covid',
 'othercancer': 'Misc Cancer',
 'otherdisease': 'Unspecified'}

In [25]:
def get_phase_study(study):
    """
    get phase of trial as a string, if multi-phase trial, trial gets upgraded to highest phase
    input:
    study <dict>
    return <str> phase
    """
    phase_str = str(study['phase']).strip()
    if phase_str=='I':
        return 'I'
    elif phase_str=='II' or phase_str =='I_II':
        return 'II'
    elif phase_str=='III' or phase_str =='II_III':
        return 'III'
    elif phase_str=='IV':
        return 'IV'
    elif phase_str=='O':
        return 'other'
    elif phase_str=='NA':
        return 'other'
    else:
        print(phase_str, type(phase_str), phase_str=='0' )
        return 'other'
    

In [26]:
# get all genes
all_genes = set()
gene_to_study = defaultdict(list)
phase_arr= []
study_to_phase = {}
for study in active_trials['data']:
    genes = [g for g in study['biomarkers_new']['inclusion']['TREE']['symbols_dz'] if type(g)==str]
    phase = get_phase_study(study)
    phase_arr.append(phase)
    study_to_phase[study['nct_id']]= phase 
    for g in genes:
        gene_to_study[g].append(study['nct_id'])
    
    all_genes = all_genes.union(set(genes))

# all_genes = list([gene for gene in all_genes if type(gene)==str])
all_genes = sorted(all_genes)
print(len(all_genes))


dz_to_gene_to_numstudy = defaultdict(dict)
dz_to_gene_to_numstudy_phase1 = defaultdict(dict)
dz_to_gene_to_numstudy_phase2 = defaultdict(dict)
dz_to_gene_to_numstudy_phase3 = defaultdict(dict)
dz_to_gene_to_numstudy_phase4 = defaultdict(dict)

for dz in names_pretty:
    for gene in all_genes:
        dz_to_gene_to_numstudy[dz][gene] = 0
        dz_to_gene_to_numstudy_phase1[dz][gene] = 0
        dz_to_gene_to_numstudy_phase2[dz][gene] = 0
        dz_to_gene_to_numstudy_phase3[dz][gene] = 0
        dz_to_gene_to_numstudy_phase4[dz][gene] = 0        
    

586


In [27]:
#fill in the tables
phase_counter = []
phase_counter_biomarker = []

disease_phase_df = pd.DataFrame(index=diseases_to_namespretty.values(), columns=['I','II','III','IV','other']).fillna(0)



for study in active_trials['data']:
    phase = get_phase_study(study)
    phase_counter.append(phase)
    dz_names = [diseases_to_namespretty[dz] for dz in study['diseases_new']['inclusion']['TREE']['dz_groups']]
    genes = [g for g in study['biomarkers_new']['inclusion']['TREE']['symbols_dz'] if type(g)==str]
    if len(genes)>0:
        phase_counter_biomarker.append(phase)
    for dz in dz_names:
        for gene in genes:
            dz_to_gene_to_numstudy[dz][gene]+=1
            disease_phase_df.loc[dz,phase]+=1
            if phase =='I':
                dz_to_gene_to_numstudy_phase1[dz][gene]+=1
            elif phase =='II':
                dz_to_gene_to_numstudy_phase2[dz][gene]+=1
            elif phase =='III':
                dz_to_gene_to_numstudy_phase3[dz][gene]+=1
            elif phase =='IV':
                dz_to_gene_to_numstudy_phase4[dz][gene]+=1
disease_phase_df['total_num_trials'] = disease_phase_df.sum(axis=1)


  disease_phase_df = pd.DataFrame(index=diseases_to_namespretty.values(), columns=['I','II','III','IV','other']).fillna(0)


In [28]:
print(len(phase_counter))

Counter(phase_counter)

20894


Counter({'II': 9097, 'I': 5156, 'other': 4406, 'III': 2036, 'IV': 199})

In [29]:
print(len(phase_counter_biomarker))
Counter(phase_counter_biomarker)

5057


Counter({'II': 2738, 'I': 1522, 'III': 517, 'other': 257, 'IV': 23})

In [30]:
for phase in Counter(phase_counter).keys():
    print(phase)
    print(Counter(phase_counter_biomarker)[phase]/Counter(phase_counter)[phase])
          

II
0.3009783445091789
III
0.2539292730844794
I
0.2951900698215671
IV
0.11557788944723618
other
0.05832955061280073


In [31]:
disease_phase_df.to_csv('disease_phase_df.csv')
# disease_phase_df[:5]

In [32]:
dz_to_gene_to_numstudy_df = pd.DataFrame.from_dict(dz_to_gene_to_numstudy,orient='index').fillna(0)
dz_to_gene_to_numstudy_df.to_csv('dz_to_gene_to_numstudy_df.csv')

dz_to_gene_to_numstudy_phase1_df = pd.DataFrame.from_dict(dz_to_gene_to_numstudy_phase1,orient='index').fillna(0)
dz_to_gene_to_numstudy_phase1_df.to_csv('dz_to_gene_to_numstudy_phase1_df.csv')

dz_to_gene_to_numstudy_phase2_df = pd.DataFrame.from_dict(dz_to_gene_to_numstudy_phase2,orient='index').fillna(0)
dz_to_gene_to_numstudy_phase2_df.to_csv('dz_to_gene_to_numstudy_phase2_df.csv')

dz_to_gene_to_numstudy_phase3_df = pd.DataFrame.from_dict(dz_to_gene_to_numstudy_phase3,orient='index').fillna(0)
dz_to_gene_to_numstudy_phase3_df.to_csv('dz_to_gene_to_numstudy_phase3_df.csv')

dz_to_gene_to_numstudy_phase4_df = pd.DataFrame.from_dict(dz_to_gene_to_numstudy_phase4,orient='index').fillna(0)
dz_to_gene_to_numstudy_phase4_df.to_csv('dz_to_gene_to_numstudy_phase4_df.csv')


## break down gene study by phase
make table of 
- gene 
- nci study ids 
- num studies total 
- num studies phase1
- num studies phase2
- num studies phase3
- num studies phase4


In [33]:
gene_info_dict = {}
for gene, studies in gene_to_study.items():
    # print(gene)
    study_phases = [study_to_phase[study] for study in studies]
    # print(Counter(study_phases))
    gene_info_dict[gene] = Counter(study_phases)
    gene_info_dict[gene]['tot_studies'] = len(studies)
    
gene_study_info_df = pd.DataFrame.from_dict(gene_info_dict,orient='index').fillna(0)
gene_study_info_df = gene_study_info_df[['I','II', 'III', 'IV',  'other', 'tot_studies']]


#save
gene_study_info_df.to_csv('gene_study_info_df.csv')
gene_study_info_df[:15]

Unnamed: 0,I,II,III,IV,other,tot_studies
BRCA2,44.0,96.0,15.0,0.0,25.0,180
BRCA1,44.0,96.0,15.0,0.0,25.0,180
PALB2,13.0,51.0,1.0,0.0,10.0,75
NF1,10.0,35.0,1.0,0.0,3.0,49
MAP2K1,0.0,9.0,0.0,0.0,0.0,9
BRAF,79.0,153.0,25.0,2.0,8.0,267
HRAS,117.0,147.0,17.0,0.0,2.0,283
RAF1,34.0,59.0,3.0,0.0,2.0,98
NRAS,120.0,155.0,17.0,1.0,2.0,295
MAP2K2,0.0,5.0,0.0,0.0,0.0,5


In [34]:
gene_study_info_df.shape

(586, 6)

In [20]:
num_unique_markers_per_phase = (gene_study_info_df[['I', 'II', 'III', 'IV','other']]>0).sum(axis=0)
num_unique_markers_per_phase

I        371
II       443
III      101
IV        14
other    115
dtype: int64

In [17]:
sorted(gene_study_info_df[ 'IV'][gene_study_info_df[ 'IV']>0].index.values)

['ABL1',
 'ALK',
 'BCR',
 'BRAF',
 'CD22',
 'CD33',
 'EGFR',
 'ERBB2',
 'ESR1',
 'FLT3',
 'KLK3',
 'MKI67',
 'NRAS',
 'PGR']

There have been a total of 374 biomarkers listed as eligibility criteria for phase I clinical trials, 449 for phase II, 107 for phase III, and only 14 for phase IV. The  14 biomarkers studies in phase IV clinical trials include: ABL1 and BCR (involved in the BCL-ABL1 translocation); ALK, EGFR, and ROS1 (biomarkers in NSCLC); BRAF and NRAS (biomarkers most commonly used in melanoma, but also associated with other solid tumor malignancies); CD22 (B cell marker useful in leukemias and triple negative breast cancer); CD33 (biomarker used for monitoring in AML); ERBB2 (also known as HER2), ESR1 (estrogen receptor gene), and PGR (progesterone receptor gene) most well-known as markers in breast cancer but also associated with other solid tumor malignancies; KLK3 (biomarker associated with prostate cancer), and MKI67 (or Ki-67 a proliferation marker). 

In [159]:
gene_study_info_df

Unnamed: 0,I,II,III,IV,other,tot_studies
BRCA2,44.0,96.0,15.0,0.0,25.0,180
BRCA1,44.0,96.0,15.0,0.0,25.0,180
PALB2,13.0,51.0,1.0,0.0,10.0,75
NF1,10.0,35.0,1.0,0.0,3.0,49
MAP2K1,0.0,9.0,0.0,0.0,0.0,9
...,...,...,...,...,...,...
TMEM127,0.0,0.0,0.0,0.0,1.0,1
HOXB@,0.0,0.0,0.0,0.0,1.0,1
BMPR1A,0.0,0.0,0.0,0.0,1.0,1
PHOX2B,0.0,0.0,0.0,0.0,1.0,1


In [187]:
tissuegroup_numbiomarkers = (dz_to_gene_to_numstudy_df.T[['Biliary', 'Anal', 'Appendix', 'Bladder', 'Bone', 'Breast', 'Cervical',
       'Colon', 'Germ Cell', 'Esophageal', 'Eye', 'Gastric', 'Head and Neck',
       'Renal', 'Liver', 'Lung', 'Skin','Leukemia','Lymphoma','Myeloid', 'Plasma Cell',
                                                          'Neuro',
       'Neuroendocrine', 'Ovarian', 'Pancreas', 'Penile', 'Peritoneal',
       'Prostate', 'Sarcoma', 'Small Intestine', 'Testicular', 'Thymus',
       'Uterine']]>0).sum(axis=0).sort_values()
tissuegroup_numbiomarkers.to_csv('tissuegroup_numbiomarkers.csv')
tissuegroup_numbiomarkers

Thymus               4
Appendix             9
Penile              11
Small Intestine     12
Testicular          14
Anal                16
Cervical            49
Liver               65
Esophageal          66
Peritoneal          72
Biliary             75
Bone                79
Neuroendocrine      81
Eye                 85
Uterine            101
Gastric            111
Prostate           113
Renal              118
Germ Cell          119
Bladder            131
Colon              144
Head and Neck      150
Pancreas           177
Skin               187
Ovarian            187
Sarcoma            187
Breast             203
Neuro              216
Lung               229
Malignant Heme     296
dtype: int64

# tissue specificity of biomarkers

In [179]:
dz_to_gene_to_numstudy_df.index

Index(['Biliary', 'Anal', 'Appendix', 'Bladder', 'Bone', 'Breast', 'Cervical',
       'Colon', 'Germ Cell', 'Esophageal', 'Eye', 'Gastric', 'Head and Neck',
       'Malignant Heme', 'Renal', 'Liver', 'Lung', 'Skin', 'Neuro',
       'Neuroendocrine', 'Ovarian', 'Pancreas', 'Penile', 'Peritoneal',
       'Prostate', 'Sarcoma', 'Small Intestine', 'Testicular', 'Thymus',
       'Uterine', 'Covid', 'Misc Cancer', 'Unspecified'],
      dtype='object')

In [195]:
#numberbiomarkers 
biomarker_to_numcancer = (dz_to_gene_to_numstudy_df.T[['Biliary', 'Anal', 'Appendix', 'Bladder', 'Bone', 'Breast', 'Cervical',
       'Colon', 'Germ Cell', 'Esophageal', 'Eye', 'Gastric', 'Head and Neck',
       'Malignant Heme', 'Renal', 'Liver', 'Lung', 'Skin', 'Neuro',
       'Neuroendocrine', 'Ovarian', 'Pancreas', 'Penile', 'Peritoneal',
       'Prostate', 'Sarcoma', 'Small Intestine', 'Testicular', 'Thymus',
       'Uterine']]>0).sum(axis=1).sort_values()

In [199]:
len(biomarker_to_numcancer[biomarker_to_numcancer==0].index)

62

62 biomarkers were associated with an unspecified cancer type.

183 biomarkers are associated with a specific cancer group while 3 biomarkers ( ERBB2, PGR, and ESR1 or HER2, PR, and ER, respectively) are associated with 29 cancer types. 

In [218]:
biomarker_to_numcancer_nozero

PDS5B      1
FAN1       1
FAAP20     1
ENPP3      1
PDGFA      1
          ..
NRAS      27
CD274     28
ERBB2     29
PGR       29
ESR1      29
Length: 524, dtype: int64

In [217]:
biomarker_to_numcancer_nozero = biomarker_to_numcancer[biomarker_to_numcancer>0]
biomarker_to_numcancer_nozero_count = Counter(biomarker_to_numcancer_nozero.values)
biomarker_to_numcancer_nozero_count = pd.DataFrame(pd.Series(biomarker_to_numcancer_nozero_count),columns = ['num_biomarkers'])
biomarker_to_numcancer_nozero_count.to_csv('biomarkers_tissue_frequency.csv')
biomarker_to_numcancer_nozero_count

Unnamed: 0,num_biomarkers
1,183
2,56
3,29
4,24
5,20
6,20
7,23
8,8
9,7
10,8
