In [None]:
import pandas as pd
import numpy as np

In [21]:
# Read the terminated-trials ground-truth CSV and preview its first ten rows.
pilot_df = pd.read_csv(filepath_or_buffer='terminated_ground_truth.csv')
pilot_df.head(n=10)

Unnamed: 0,nct_id,brief_title,why_stopped,termination_category,brief_summary,detailed_description
0,NCT00025493,S0029 Docetaxel in Treating Older Women With M...,poor accrual,Enrollment,RATIONALE: Drugs used in chemotherapy use diff...,
1,NCT00726986,"Sorafenib, Cisplatin, and Etoposide in Treatin...",Extreme toxicity,Safety,RATIONALE: Sorafenib may stop the growth of tu...,
2,NCT04218825,REACH: Study to Determine the Aetiology of Chl...,After an extensive assessment of the study fea...,Enrollment,Adult patients with early stage MF-CTCL (stage...,
3,NCT01548651,Effect of Saxagliptin Treatment on Myocardial ...,difficulty with enrollment of subjects,Enrollment,The purpose of the study is to examine the eff...,
4,NCT00531271,ABI-008 Trial in Patients With Metastatic Brea...,,Unknown,To determine the maximum tolerated dose (MTD) ...,
5,NCT00204230,MMF and Calcineurin Inhibitor Withdrawal in CAN,,Unknown,"Prospective, randomised study: Effect of mycop...","Prospective, randomised study: Effect of mycop..."
6,NCT04130438,Efficacy of Medical Therapy in Women and Men W...,"Low recruitment, insufficient funding",Enrollment,The proposed clinical trial is relevant to pub...,
7,NCT00386659,Immune Reconstitution in naïve HIV Patients Wi...,,Unknown,"Pilot, randomized, parallel, open-label, contr...",
8,NCT02590003,Single Agent Versus Combination Chemotherapy t...,Trial was stopped after non-response to treatm...,Other/Unclear,This study will enroll elderly patients with a...,The primary objective of this trial is to comp...
9,NCT04973137,A Study to Evaluate the Efficacy and Safety of...,The AFFIRM-AL clinical trial did not meet its ...,Efficacy,A Phase 3 study to evaluate the efficacy and s...,


In [24]:
# Count the trials whose termination_category is `Other/Unclear`.
# A boolean sum is used instead of value_counts()[label] so that the cell
# yields 0 rather than raising KeyError when the label is absent.
pilot_df['termination_category'].eq('Other/Unclear').sum()


np.int64(5309)

In [30]:
# Keep only the trials labelled `Other/Unclear`.
unclear_df = pilot_df.loc[pilot_df['termination_category'].eq('Other/Unclear')]

In [35]:
# Preview the first five `Other/Unclear` trials.
unclear_df.head(n=5)

Unnamed: 0,nct_id,brief_title,why_stopped,termination_category,brief_summary,detailed_description
8,NCT02590003,Single Agent Versus Combination Chemotherapy t...,Trial was stopped after non-response to treatm...,Other/Unclear,This study will enroll elderly patients with a...,The primary objective of this trial is to comp...
11,NCT01639664,COMPACT 2 - COMbining Plasma-filtration and Ad...,The interim analysis requested by the EDSMC sh...,Other/Unclear,The study objective is to clarify whether the ...,Septic shock is a life-threatening clinical co...
13,NCT02822482,Copanlisib in Association With Cetuximab in Pa...,futilty of the treatment,Other/Unclear,The study consists of two distinct and sequent...,
16,NCT01094262,A Safety and Tolerability Study of JNJ-4216044...,Logistic reasons associated with the FDA-impos...,Other/Unclear,The purpose of this study is to evaluate pain ...,This is a double-blind (neither the physician ...
18,NCT02259348,Repeat Transplantation for Relapsed or Refract...,Investigator's decision.,Other/Unclear,This pilot phase II trial studies how well a n...,PRIMARY OBJECTIVE:~* To estimate engraftment b...


In [None]:
# Show the complete record for trial NCT02590003.
# The column-width limit is lifted only for this display, so long text fields
# are rendered in full without relying on (or changing) a notebook-wide option.
with pd.option_context('display.max_colwidth', None):
    display(unclear_df[unclear_df['nct_id'] == 'NCT02590003'])

Unnamed: 0,nct_id,brief_title,why_stopped,termination_category,brief_summary,detailed_description
8,NCT02590003,Single Agent Versus Combination Chemotherapy to Treat High-risk Elderly With Non-small Cell Lung Cancer,Trial was stopped after non-response to treatment.,Other/Unclear,This study will enroll elderly patients with advanced non-small cell lung cancer who are at high risk of developing chemotherapy toxicity (side effects). Patients will receive treatment with either a platinum-based doublet chemotherapy with carboplatin/nab-paclitaxel or single agent nab-paclitaxel of chemotherapy. Response to treatment and treatment toxicity will be compared in the two treatment groups to determine the best treatment strategy for this group of patients.,"The primary objective of this trial is to compare the treatment failure-free survival rate in high-risk elderly patients, identified by geriatric assessment, treated with either a platinum-based doublet chemotherapy with carboplatin/nab-paclitaxel or single agent nab-paclitaxel in advanced non-small cell lung cancer. Treatment failure-free survival is the most appropriate primary outcome as it captures excessive toxicity due to chemotherapy in addition to death and disease progression.~The secondary objectives are to evaluate grade 3-5 toxicities, overall response rate, progression free survival, symptom assessment, and overall survival between the two randomization arms."


In [40]:
# Directory holding the pipe-delimited studies.txt and designs.txt files that
# analyze_full_pipeline reads.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
DATA_DIR = r"c:/Users/1234/OneDrive - Vanderbilt/Projects/LLM-clinical trials/CT_data_full/main_data"

import os
from collections import Counter
import re

def analyze_full_pipeline(data_dir=None):
    """Select terminated interventional treatment trials and print the filter funnel.

    Reads ``studies.txt`` and ``designs.txt`` (both pipe-delimited) from
    ``data_dir``, left-joins them on ``nct_id`` and applies, in order:

    1. ``study_type == "INTERVENTIONAL"``
    2. ``overall_status == "TERMINATED"``
    3. ``primary_purpose == "TREATMENT"``
    4. removal of rows whose ``why_stopped`` text matches, case-insensitively,
       ``covid``, ``coronavirus``, ``pandemic`` or ``sars-cov-2``

    The number of rows remaining after each step is printed.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing the two input files. When omitted, the
        notebook-level ``DATA_DIR`` constant is used.

    Returns
    -------
    pandas.DataFrame
        The rows that pass all four steps, with the columns read from the two
        files. Missing ``why_stopped`` values are replaced by empty strings.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    print("Loading studies.txt...")
    studies = pd.read_csv(
        os.path.join(data_dir, "studies.txt"),
        sep="|",
        usecols=["nct_id", "overall_status", "study_type", "why_stopped"],
        low_memory=False
    )
    print(f"Total Studies in file: {len(studies)}")
    print(f"Unique NCT IDs: {studies['nct_id'].nunique()}")

    print("Loading designs.txt...")
    designs = pd.read_csv(
        os.path.join(data_dir, "designs.txt"),
        sep="|",
        usecols=["nct_id", "primary_purpose"],
        low_memory=False
    )

    # Left join keeps every study even when it has no design record.
    df = studies.merge(designs, on="nct_id", how="left")

    # Step 1: Filter Interventional
    interventional = df[df["study_type"] == "INTERVENTIONAL"]
    print(f"Step 1 (Interventional): {len(interventional)}")

    # Step 2: Filter Terminated
    terminated = interventional[interventional["overall_status"] == "TERMINATED"]
    print(f"Step 2 (Terminated): {len(terminated)}")

    # Step 3: Filter Treatment
    # Copy so the why_stopped clean-up below writes to an independent frame
    # rather than to a slice of the merged data.
    treatment = terminated[terminated["primary_purpose"] == "TREATMENT"].copy()
    print(f"Step 3 (Treatment): {len(treatment)}")

    # Step 4: COVID exclusion
    # Missing reasons become "" so the string match below never sees NaN.
    treatment["why_stopped"] = treatment["why_stopped"].fillna("").astype(str)
    covid_mask = treatment["why_stopped"].str.contains(
        "covid|coronavirus|pandemic|sars-cov-2", case=False
    )
    print(f"Step 4 (COVID Excluded): {covid_mask.sum()}")

    # The final count is computed and reported exactly once.
    final_prospects = treatment[~covid_mask]
    print(f"Final Candidates for Ground Truth: {len(final_prospects)}")
    return final_prospects

In [15]:
# Run the filtering pipeline (it prints the row count after each step) and
# keep the returned candidate trials for the cells that follow.
final_prospects = analyze_full_pipeline()

Loading studies.txt...
Total Studies in file: 557381
Unique NCT IDs: 557381
Loading designs.txt...
Step 1 (Interventional): 425719
Step 2 (Terminated): 28088
Step 3 (Treatment): 21041
Step 4 (COVID Excluded): 841
Final Candidates for Ground Truth: 20200

--- Why Stopped Analysis (Final Candidates) ---
Final Candidates for Ground Truth: 20200


In [16]:
# Explore the free-text termination reasons of the final candidate trials.
print("\n--- Why Stopped Analysis (Final Candidates) ---")

# Words too generic to be informative in the frequency table below.
stop_words = {
    'the', 'was', 'and', 'for', 'due',
    'not', 'study', 'with', 'this', 'were',
    'from', 'that', 'are', 'have', 'been',
}

# Most frequent verbatim reasons (exact string match, so differently-cased
# spellings of the same reason are counted separately).
print("\nTop 20 Exact Reasons:")
print(final_prospects["why_stopped"].value_counts().head(20))

# Most frequent words: lower-case everything, then keep alphabetic tokens of
# three or more letters that are not stop words.
text = " ".join(final_prospects["why_stopped"].tolist()).lower()
words = re.compile(r'\b[a-z]{3,}\b').findall(text)
filtered_words = list(filter(lambda token: token not in stop_words, words))

common_words = Counter(filtered_words).most_common(20)
print("\nTop 20 Common Words (Filtered):")
print(common_words)


--- Why Stopped Analysis (Final Candidates) ---

Top 20 Exact Reasons:
why_stopped
                                                   2321
Slow accrual                                        178
Sponsor decision                                    171
Low accrual                                         106
See termination reason in detailed description.      96
slow accrual                                         96
Sponsor Decision                                     96
low accrual                                          93
Lack of enrollment                                   89
Slow enrollment                                      75
Poor accrual                                         74
Business decision                                    72
Lack of efficacy                                     70
Slow recruitment                                     69
Low enrollment                                       69
Slow Accrual                                         64
Lack of funding     

In [43]:
# Load the pilot ground-truth table that carries the medical field columns.
# NOTE(review): this rebinds pilot_df, which an earlier cell bound to
# terminated_ground_truth.csv — confirm the overwrite is intended.
pilot_df = pd.read_csv(
    filepath_or_buffer=r"c:/Users/1234/OneDrive - Vanderbilt/Projects/LLM-clinical trials/pilot_ground_truth_with_fields.csv"
)

In [48]:
# Widen text columns to 200 characters for every later display, then preview
# the first five rows.
pd.options.display.max_colwidth = 200
pilot_df.head(n=5)

Unnamed: 0,nct_id,brief_title,why_stopped,brief_summary,detailed_description,medical_field,medical_subfield,field_source
0,NCT04566133,Combination of Trametinib (MEK Inhibitor) and Hydroxychloroquine (HCQ) (Autophagy Inhibitor) in Patients With KRAS Mutation Refractory Bile Tract Carcinoma (BTC).,Slow accrual,Background:~Bile duct cancer is cancer of the slender tubes of the biliary tract. These tubes carry bile through the liver. Such cancer tumors often have an abnormal or mutated gene. Researchers t...,"Background:~* Among the new cases of bile tract carcinoma (BTC) that are diagnosed every year in the United States, there are approximately 6,500 cases of gallbladder carcinoma, 3,000 cases of ext...",Oncology,Other cancer,MeSH
1,NCT02815488,"A Study to Investigate Safety, Tolerability, Pharmacokinetics and Pharmacodynamics of Single and Repeat Doses of CHF6297 in Healthy Subjects and Patients With COPD",Very poor recruitment in the Part 4 of the study,CHF6297 is a potent and selective inhibitor of human MAP kinase p38 being developed as an anti-inflammatory agent for the treatment of inflammatory airways diseases. The purpose of this study is t...,,Pulmonology,Pulmonology disorder,MeSH
2,NCT02721888,Effect of Liraglutide on Fatty Liver Content and Lipoprotein Metabolism,Failure of inclusion,"Non-alcoholic fatty liver disease (NAFLD) is commonly associated with obesity, metabolic syndrome and type 2 diabetes. NAFLD, in patients with type 2 diabetes, has been shown to be associated with...",,Endocrinology,Endocrinology disorder,MeSH
3,NCT01760304,Changes in Cardiac Function in COPD Patients After Administration of Budesonide/Formoterol (Symbicort®) Versus Placebo,Sponsor decided to stop the study due to expiration of blinded placebo .,To investigate whether Budesonide/Formoterol (Symbicort ®) therapy can improve heart function at rest by decreasing lung hyperinflation in patients with COPD (Chronic Obstructive Pulmonary Disease).,Patients with moderate to advanced COPD are known to have static hyperinflation (at rest) as a consequence of expiratory flow limitation. Hyperinflation is easily detected by measuring lung volume...,Pulmonology,Pulmonology disorder,MeSH
4,NCT00799942,Open-lable Extension Study on Safety and Efficacy of Neramexane to Treat Congenital and Acquired Nystagmus,,"The purpose of this study is to investigate the long-term safety, tolerability and efficacy of neramexane mesylate in the treatment of congenital idiopathic nystagmus (CIN). In addition, a subgrou...",,Ophthalmology,Ophthalmology disorder,MeSH
