# 3. Failure causes
Identify most common causes of failure for these trials using the “reason_for_stop” field in the CTRP API.

- Lack of accrual
- Lack of efficacy
- Toxicity
- Sponsor withdrawal
- Others?


In [1]:
from collections import defaultdict, Counter
import pandas as pd
import os, glob
import requests
import json
import re
from scipy import stats


In [2]:
%%time
active_trials = json.load(open('interventional_trials_with_descendants2024-07-26.json','r'))


CPU times: user 18.3 s, sys: 4.85 s, total: 23.2 s
Wall time: 24 s


In [3]:
len(active_trials['data']), active_trials['total']


(20894, 20894)

In [23]:
biomarker_trials = []
nonbiomarker_trials = []

for study in active_trials['data']:
    genes = [g for g in study['biomarkers_new']['inclusion']['TREE']['symbols_dz'] if type(g)==str]
    if len(genes) >0:
        biomarker_trials.append(study)
    else:
        nonbiomarker_trials.append(study)
  

In [24]:
trial_data = active_trials['data']

In [26]:
keys_arr = sorted(trial_data[0].keys())
keys_to_values = {}
for key in keys_arr:
    # print('******')
    # print(key)
    key_list = []
    for trial in trial_data:
        key_list.append(trial[key])
    try:
        key_list_counter = Counter(key_list)
        keys_to_values[key] = key_list_counter
        # if len(key_list_counter)<20:
        #     print(key_list_counter)
    except:
        print('ERROR: not typable list',key)

ERROR: not typable list anatomic_sites
ERROR: not typable list arms
ERROR: not typable list associated_studies
ERROR: not typable list biomarkers
ERROR: not typable list biomarkers_new
ERROR: not typable list central_contact
ERROR: not typable list collaborators
ERROR: not typable list diseases
ERROR: not typable list diseases_new
ERROR: not typable list eligibility
ERROR: not typable list interventions_new
ERROR: not typable list keywords
ERROR: not typable list masking
ERROR: not typable list nci_programs
ERROR: not typable list other_ids
ERROR: not typable list outcome_measures
ERROR: not typable list phase_new
ERROR: not typable list prior_therapy
ERROR: not typable list prior_therapy_new
ERROR: not typable list sites
ERROR: not typable list status_history


In [27]:
studystop_dict = keys_to_values['why_study_stopped']
len(studystop_dict)

2334

In [28]:
studystop_dict[None], 

(17196,)

In [34]:
sum(studystop_dict.values())

20894

In [30]:
sum(studystop_dict.values())-studystop_dict[None]

3698

In [31]:
1-studystop_dict[None]/sum(studystop_dict.values())

0.17698860917009673

clean up the 2334 reasons studies stopped
17.7% of studies failed or 17196 of the 20894 interventional trials

In [32]:
studystop_dict_filt = defaultdict(int)
for key, value in studystop_dict.items():
    studystop_dict_filt[str(key).lower()]+=value


In [33]:
len(studystop_dict_filt)

2186

In [14]:
studystop_dict_filt_df = pd.DataFrame({'reason':studystop_dict_filt.keys(), 'num_trials':studystop_dict_filt.values()}).sort_values(by=['num_trials'],ascending=False)
display(studystop_dict_filt_df[:5])

Unnamed: 0,reason,num_trials
0,none,17196
77,not provided,130
205,low accrual,124
374,slow accrual,106
2,pi request,95


In [15]:
# studystop_dict_filt_df.to_csv('studystop_reason_df.csv')

manual annotation

# then divide by phase

In [35]:
studystop_dict_filt_df_annon  = pd.read_excel('studystop_reason_df.xlsx')
studystop_dict_filt_df_annon['reason'] = studystop_dict_filt_df_annon['reason'].str.replace("_x000D_", "\r")
studystop_dict_filt_df_annon['reason'] = studystop_dict_filt_df_annon['reason'].str.replace('\W', '')
studystop_dict_filt_df_annon.reason.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
studystop_dict_filt_df_annon.reason.replace({r'[^\w\s]':''}, regex=True, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  studystop_dict_filt_df_annon.reason.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)


In [17]:
reason_to_group = pd.Series(studystop_dict_filt_df_annon.group.values, index=studystop_dict_filt_df_annon.reason.values).to_dict()


In [40]:
def get_biomarker_status(study):
    genes = [g for g in study['biomarkers_new']['inclusion']['TREE']['symbols_dz'] if type(g)==str]
    if len(genes) >0:
        return 'biomarker'
    else:
        return 'nonbiomarker'

In [37]:
def get_phase_study(study):
    """
    get phase of trial as a string, if multi-phase trial, trial gets upgraded to highest phase
    input:
    study <dict>
    return <str> phase
    """
    phase_str = str(study['phase']).strip()
    if phase_str=='I':
        return 'I'
    elif phase_str=='II' or phase_str =='I_II':
        return 'II'
    elif phase_str=='III' or phase_str =='II_III':
        return 'III'
    elif phase_str=='IV':
        return 'IV'
    elif phase_str=='O':
        return 'other'
    elif phase_str=='NA':
        return 'other'
    else:
        print(phase_str, type(phase_str), phase_str=='0' )
        return 'other'
    

In [41]:
phase_to_reason_count = defaultdict(list)
phase_to_reason_count_biomarker = defaultdict(list)
phase_to_reason_count_nonbiomarker = defaultdict(list)


trial_id_to_phasegene_info = {}
for trial in active_trials['data']:
    phase = get_phase_study(trial)
    biomarker_status = get_biomarker_status(trial)
    genes = [g for g in trial['biomarkers_new']['inclusion']['TREE']['symbols_dz'] if type(g)==str]
    str_reason = trial['why_study_stopped']
    if str_reason is not None:
        str_reason = str_reason.lower()
        str_reason = re.sub(r'[^\w\s]','',str_reason)
        str_reason = re.sub(r'[^\x00-\x7F]','',str_reason)
        

        if str_reason not in reason_to_group:
            group = 'other'
        else:
            group = reason_to_group[str_reason]
        phase_to_reason_count[phase].append(group)
        phase_to_reason_count['total'].append(group)
        if biomarker_status =='biomarker':
            phase_to_reason_count_biomarker[phase].append(group)
            phase_to_reason_count_biomarker['total'].append(group)
        else:
            phase_to_reason_count_nonbiomarker[phase].append(group)
            phase_to_reason_count_nonbiomarker['total'].append(group)
            
    
        trial_id_to_phasegene_info[trial['nci_id']] = {'failure':group,'phase':phase,'biomarker_status':biomarker_status,'genes_arr':genes}

In [58]:
def process_and_save_files(phase_to_reason_dict, prefix='phase_to_reason_df'):
    phase_to_reason_counter = {}
    for phase, reason_list in phase_to_reason_dict.items():
        phase_to_reason_counter[phase] = Counter(reason_list)
        
    phase_to_reason_df = pd.DataFrame(phase_to_reason_counter).fillna(0)
    phase_to_reason_df.to_csv(prefix+'.csv')
    phase_to_reason_df_total = phase_to_reason_df.total
    phase_to_reason_df = phase_to_reason_df[['I', 'II','III','IV','other']]
    nl = "\n"
    chi2, p, dof, expected = stats.chi2_contingency(phase_to_reason_df)
    print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")
    phase_to_reason_df['I'] = phase_to_reason_df['I']/phase_to_reason_df_total/sum(phase_to_reason_df['I'])*10000
    phase_to_reason_df['II'] = phase_to_reason_df['II']/phase_to_reason_df_total/sum(phase_to_reason_df['II'])*10000
    phase_to_reason_df['III'] = phase_to_reason_df['III']/phase_to_reason_df_total/sum(phase_to_reason_df['III'])*10000
    phase_to_reason_df['IV'] = phase_to_reason_df['IV']/phase_to_reason_df_total/sum(phase_to_reason_df['IV'])*10000
    phase_to_reason_df['other'] = phase_to_reason_df['other']/phase_to_reason_df_total/sum(phase_to_reason_df['other'])*10000
    phase_to_reason_df.to_csv(prefix+'_norm.csv')
    
    # display(phase_to_reason_df)

    
    phase_to_reason_df["reason"] = phase_to_reason_df.index
    phase_to_reason_df_long = phase_to_reason_df.melt('reason')
    phase_to_reason_df_long.columns = ['reason', 'phase','norm']
    phase_to_reason_df_long.to_csv(prefix+'_long.csv')
    
    display(phase_to_reason_df_total/sum(phase_to_reason_df_total))

    return phase_to_reason_df_total

In [60]:
failure_all = process_and_save_files(phase_to_reason_count, prefix='phase_to_reason_df')

Chi2 value= 343.0498098637228
p-value= 9.664911966370125e-52
Degrees of freedom= 36



sponsor      0.247161
funding      0.052461
other        0.219037
accrual      0.287453
safety       0.025419
pi           0.022985
logistics    0.046782
efficacy     0.082477
soc          0.008924
none         0.007301
Name: total, dtype: float64

In [61]:
failure_biomarker = process_and_save_files(phase_to_reason_count_biomarker, prefix='phase_to_reason_df_biomarker')

Chi2 value= 85.0961019851055
p-value= 7.464997853593164e-06
Degrees of freedom= 36



sponsor      0.302793
safety       0.032402
funding      0.042458
other        0.202235
accrual      0.263687
logistics    0.039106
efficacy     0.089385
soc          0.006704
none         0.004469
pi           0.016760
Name: total, dtype: float64

In [62]:
failure_nonbiomarker = process_and_save_files(phase_to_reason_count_nonbiomarker, prefix='phase_to_reason_df_nonbiomarker')


Chi2 value= 264.72118233635285
p-value= 1.2431183314559828e-36
Degrees of freedom= 36



sponsor      0.229397
funding      0.055655
other        0.224402
accrual      0.295041
pi           0.024973
safety       0.023189
logistics    0.049233
efficacy     0.080271
soc          0.009633
none         0.008205
Name: total, dtype: float64

In [78]:
comp_failure = pd.DataFrame({'biomarker':failure_biomarker,'nonbiomarker':failure_nonbiomarker})
chi2, p, dof, expected = stats.chi2_contingency(comp_failure)
nl = "\n"
print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")

print('table after adding all which is sum of biomarker and nonbiomarker')
comp_failure['all'] = comp_failure.sum(axis=1)
comp_failure/comp_failure.sum(axis=0)

Chi2 value= 29.255529462321245
p-value= 0.0005867513159805689
Degrees of freedom= 9

table after adding all which is sum of biomarker and nonbiomarker


Unnamed: 0,biomarker,nonbiomarker,all
accrual,0.263687,0.295041,0.287453
efficacy,0.089385,0.080271,0.082477
funding,0.042458,0.055655,0.052461
logistics,0.039106,0.049233,0.046782
none,0.004469,0.008205,0.007301
other,0.202235,0.224402,0.219037
pi,0.01676,0.024973,0.022985
safety,0.032402,0.023189,0.025419
soc,0.006704,0.009633,0.008924
sponsor,0.302793,0.229397,0.247161


biomarker trials and nonbiomarker trials have have different failure causes (Chi squared test, p value = 5.8e-3). Interestingly, while overall, accrual was the most likely cause of trial stoppage at 28.7%, problems with sponsorship was the most common cause of failure for biomarker trials (30.3%) and a higher percentage compared to that for nonbiomarker trials (22.9%). 

# find genes go term for phase 1 failure reasons
hypothesis; there are pathways we shouldn't target in early phases 


In [24]:
trial_id_to_phasegene_info_df = pd.DataFrame.from_dict(trial_id_to_phasegene_info,orient='index')

trial_id_to_phasegene_info_df.to_csv('trial_id_to_phasegene_info_df.csv')

In [25]:
trial_id_to_phasegene_info_df[:5]

Unnamed: 0,failure,phase,genes_arr
NCI-2021-08586,sponsor,II,"[PGR, ERBB2, CD274, ESR1]"
NCI-2018-01041,sponsor,II,[]
NCI-2021-10925,sponsor,I,[]
NCI-2022-04731,funding,II,[]
NCI-2023-01818,other,II,[]


In [26]:
trial_id_to_phasegene_info_df.failure.value_counts()

failure
accrual      1063
sponsor       914
other         810
efficacy      305
funding       194
logistics     173
safety         94
pi             85
soc            33
none           27
Name: count, dtype: int64

In [38]:
genes_failure = []
genes_failure_phase1 = []
for idx, row in trial_id_to_phasegene_info_df.iterrows():
    genes_failure +=row['genes_arr']
    if row['phase'] =='I':
        genes_failure_phase1 +=row['genes_arr']

In [40]:
len(set(genes_failure)), len(set(genes_failure_phase1))

(299, 198)

In [33]:
genes_failure_efficacy = []
genes_failure_phase1_efficacy = []
genes_failure_otherphase_efficacy = []

genes_failure_accrual = []
genes_failure_phase1_accrual = []
genes_failure_otherphase_accrual = []


genes_failure_safety = []
genes_failure_phase1_safety = []
genes_failure_otherphase_safety = []

for idx, row in trial_id_to_phasegene_info_df.iterrows():
    if row['failure'] =='efficacy':
    
        genes_failure_efficacy +=row['genes_arr']
        if row['phase'] =='I':
            genes_failure_phase1_efficacy +=row['genes_arr']
        elif (row['phase'] =='II') or (row['phase'] =='III') or (row['phase'] =='IV') :
            genes_failure_otherphase_efficacy +=row['genes_arr']

    elif row['failure'] =='accrual':
    
        genes_failure_accrual +=row['genes_arr']
        if row['phase'] =='I':
            genes_failure_phase1_accrual +=row['genes_arr']
        elif (row['phase'] =='II') or (row['phase'] =='III') or (row['phase'] =='IV') :
            genes_failure_otherphase_accrual +=row['genes_arr']

            
    elif row['failure'] =='safety':
    
        genes_failure_safety +=row['genes_arr']
        if row['phase'] =='I':
            genes_failure_phase1_safety +=row['genes_arr']
        elif (row['phase'] =='II') or (row['phase'] =='III') or (row['phase'] =='IV') :
            genes_failure_otherphase_safety +=row['genes_arr']
            
            
genes_failure_efficacy = set(genes_failure_efficacy)
genes_failure_phase1_efficacy = set(genes_failure_phase1_efficacy)
genes_failure_otherphase_efficacy = set(genes_failure_otherphase_efficacy)


genes_failure_accrual = set(genes_failure_accrual)
genes_failure_phase1_accrual = set(genes_failure_phase1_accrual)
genes_failure_otherphase_accrual = set(genes_failure_otherphase_accrual)


genes_failure_safety = set(genes_failure_safety)
genes_failure_phase1_safety = set(genes_failure_phase1_safety)
genes_failure_otherphase_safety = set(genes_failure_otherphase_safety)



In [59]:
len(genes_failure_efficacy), len(genes_failure_phase1_efficacy)

(59, 30)

In [36]:
len(genes_failure_safety), len(genes_failure_phase1_safety), len(genes_failure_otherphase_safety)

(31, 6, 29)

In [37]:
genes_failure_phase1_safety

{'ALK', 'ERBB2', 'ERBB3', 'ESR1', 'PGR', 'PRLR'}

In [28]:
# genes_failure_otherphase_efficacy

In [29]:
# these are phase 1 only failure genes for efficacy
print(len(genes_failure_phase1_efficacy - genes_failure_otherphase_efficacy))
for g in sorted(genes_failure_phase1_efficacy - genes_failure_otherphase_efficacy):
    print(g)
    
sorted(genes_failure_phase1_efficacy - genes_failure_otherphase_efficacy)

14
ARID1A
CCNE1
EML4
EZH2
GUCY2C
KDM6A
KMT2C
KMT2D
PDE3A
SF3B1
SLFN12
SMARCA2
SMARCA4
SRSF2


['ARID1A',
 'CCNE1',
 'EML4',
 'EZH2',
 'GUCY2C',
 'KDM6A',
 'KMT2C',
 'KMT2D',
 'PDE3A',
 'SF3B1',
 'SLFN12',
 'SMARCA2',
 'SMARCA4',
 'SRSF2']

In [64]:
# these are phase 2+ only failure genes - for efficacy
print(len(genes_failure_otherphase_efficacy - genes_failure_phase1_efficacy))
for g in sorted(genes_failure_otherphase_efficacy - genes_failure_phase1_efficacy):
    print(g)
    
sorted(genes_failure_otherphase_efficacy - genes_failure_phase1_efficacy)

29
AR
ARAF
ATF1
BAIAP2L1
BICC1
BRCA1
BRCA2
CASP7
CDKN2A
CTAG2
ERBB3
EWSR1
FGFR1
FGFR2
FGFR3
FLT3
KEAP1
MS4A1
NFE2L2
NTRK1
NTRK2
NTRK3
PIK3CA
POLD1
POLE
RAF1
ROS1
TACC3
TNFRSF8


['AR',
 'ARAF',
 'ATF1',
 'BAIAP2L1',
 'BICC1',
 'BRCA1',
 'BRCA2',
 'CASP7',
 'CDKN2A',
 'CTAG2',
 'ERBB3',
 'EWSR1',
 'FGFR1',
 'FGFR2',
 'FGFR3',
 'FLT3',
 'KEAP1',
 'MS4A1',
 'NFE2L2',
 'NTRK1',
 'NTRK2',
 'NTRK3',
 'PIK3CA',
 'POLD1',
 'POLE',
 'RAF1',
 'ROS1',
 'TACC3',
 'TNFRSF8']

In [67]:
#all phase genes
genes_failure_phase1_efficacy.intersection( genes_failure_otherphase_efficacy)

{'ALK',
 'BRAF',
 'CD274',
 'CTAG1A',
 'CTAG1B',
 'EGFR',
 'ERBB2',
 'ESR1',
 'HLA-A',
 'HRAS',
 'KLK3',
 'KRAS',
 'NRAS',
 'PGR',
 'SMARCB1',
 'TP53'}

In [30]:
# these are phase 1 only failure genes for accrual
print(len(genes_failure_phase1_accrual - genes_failure_otherphase_accrual))
for g in sorted(genes_failure_phase1_accrual - genes_failure_otherphase_accrual):
    print(g)
    
sorted(genes_failure_phase1_accrual - genes_failure_otherphase_accrual)

20
BTK
CCNE1
CD33
CD38
CTAG1A
CTAG1B
CTAG2
EPCAM
ETV6
FGFR2
FGFR3
IL2RA
IRAK1
MSH3
POLD1
POLE
PTPRC
ROR1
TYK2
UGT1A1


['BTK',
 'CCNE1',
 'CD33',
 'CD38',
 'CTAG1A',
 'CTAG1B',
 'CTAG2',
 'EPCAM',
 'ETV6',
 'FGFR2',
 'FGFR3',
 'IL2RA',
 'IRAK1',
 'MSH3',
 'POLD1',
 'POLE',
 'PTPRC',
 'ROR1',
 'TYK2',
 'UGT1A1']

In [31]:
# these are phase 2+ only failure genes - for accrual
print(len(genes_failure_otherphase_accrual - genes_failure_phase1_accrual))
for g in sorted(genes_failure_otherphase_accrual - genes_failure_phase1_accrual):
    print(g)
    
sorted(genes_failure_otherphase_accrual - genes_failure_phase1_accrual)

96
ABL1
AKAP4
AR
ARAF
ARID1A
ATM
BARD1
BCR
BRIP1
CBFB
CBL
CCND1
CD22
CD274
CDK12
CEACAM5
CGA
CGB3
CHEK2
COL7A1
CREBBP
CSF3R
CTLA4
DDR2
EP300
EWSR1
EZH2
FANCA
FBXW7
FGFR1
FH
FLCN
FUS
GATA1
GATA2
GEN1
H3-3A
H3-3B
IDH1
IDH2
KMT2A
MAGEA1
MAGEA4
MAP2K1
MAP2K2
MAPK1
MAPK3
MGMT
MLANA
MPL
MYCN
NBN
NF1
NOTCH1
NOTCH2
NOTCH3
NOTCH4
NPM1
PALB2
PDCD1
PDGFA
PDGFB
PDGFRA
PDGFRB
PHF6
PIK3CA
PTPN11
PTTG1
RAD51B
RAD51C
RAD51D
RAD54L
RAF1
RB1
RET
ROPN1
RUNX2
RUNX3
SDHA
SDHAF2
SDHB
SDHC
SDHD
SF3B1
SLC45A2
SPA17
SPANXB1
SRSF2
STAG2
STAT3
TAF15
TG
TNFRSF8
U2AF1
WT1
ZRSR2


['ABL1',
 'AKAP4',
 'AR',
 'ARAF',
 'ARID1A',
 'ATM',
 'BARD1',
 'BCR',
 'BRIP1',
 'CBFB',
 'CBL',
 'CCND1',
 'CD22',
 'CD274',
 'CDK12',
 'CEACAM5',
 'CGA',
 'CGB3',
 'CHEK2',
 'COL7A1',
 'CREBBP',
 'CSF3R',
 'CTLA4',
 'DDR2',
 'EP300',
 'EWSR1',
 'EZH2',
 'FANCA',
 'FBXW7',
 'FGFR1',
 'FH',
 'FLCN',
 'FUS',
 'GATA1',
 'GATA2',
 'GEN1',
 'H3-3A',
 'H3-3B',
 'IDH1',
 'IDH2',
 'KMT2A',
 'MAGEA1',
 'MAGEA4',
 'MAP2K1',
 'MAP2K2',
 'MAPK1',
 'MAPK3',
 'MGMT',
 'MLANA',
 'MPL',
 'MYCN',
 'NBN',
 'NF1',
 'NOTCH1',
 'NOTCH2',
 'NOTCH3',
 'NOTCH4',
 'NPM1',
 'PALB2',
 'PDCD1',
 'PDGFA',
 'PDGFB',
 'PDGFRA',
 'PDGFRB',
 'PHF6',
 'PIK3CA',
 'PTPN11',
 'PTTG1',
 'RAD51B',
 'RAD51C',
 'RAD51D',
 'RAD54L',
 'RAF1',
 'RB1',
 'RET',
 'ROPN1',
 'RUNX2',
 'RUNX3',
 'SDHA',
 'SDHAF2',
 'SDHB',
 'SDHC',
 'SDHD',
 'SF3B1',
 'SLC45A2',
 'SPA17',
 'SPANXB1',
 'SRSF2',
 'STAG2',
 'STAT3',
 'TAF15',
 'TG',
 'TNFRSF8',
 'U2AF1',
 'WT1',
 'ZRSR2']

In [32]:
#all phase genes - accrual
genes_failure_phase1_accrual.intersection( genes_failure_otherphase_accrual)

{'ALK',
 'BCL2',
 'BCL6',
 'BRAF',
 'BRCA1',
 'BRCA2',
 'CD19',
 'CDKN2A',
 'EGFR',
 'ERBB2',
 'ESR1',
 'FLT3',
 'HLA-A',
 'HRAS',
 'JAK1',
 'JAK2',
 'JAK3',
 'KIT',
 'KLK3',
 'KRAS',
 'MET',
 'MLH1',
 'MS4A1',
 'MSH2',
 'MSH6',
 'MSLN',
 'MYC',
 'MYD88',
 'NRAS',
 'PGR',
 'PMS2',
 'PTEN',
 'ROS1',
 'RUNX1',
 'TP53'}