In [1]:
import requests
import pandas as pd
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
def fetch_clinical_trials(max_results=100, timeout=30):
    """Fetch clinical trials data from ClinicalTrials.gov API v2.

    Queries the ``/studies`` endpoint for trials whose last update was posted
    on or after 2023-01-01, most recently updated first, requesting only the
    text-rich fields listed below.

    Args:
        max_results: Number of studies wanted. The API caps a single page at
            1000 studies, so larger values are satisfied by following the
            ``nextPageToken`` returned with each page.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error. Pass ``None`` to wait indefinitely.

    Returns:
        The decoded JSON payload (a dict with a ``'studies'`` list). When only
        one page is needed the payload is returned exactly as received. When
        several pages are fetched, their studies are merged into the first
        page's payload and trimmed to ``max_results``; ``'nextPageToken'`` then
        reflects the last page fetched (and is removed if there are no more).

    Raises:
        Exception: If any request returns a non-200 status code.
        ValueError: If a response body is not valid JSON (the decode error
            raised by ``response.json()`` is re-raised after logging).
    """
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    max_page_size = 1000  # API limits pageSize to 1000

    # Define the fields we want to retrieve - focusing on text-rich fields for LLM processing
    fields = [
        "NCTId",
        "BriefTitle",
        "OfficialTitle",
        "BriefSummary",
        "DetailedDescription",
        "Condition",
        "InterventionDescription",
        "EligibilityCriteria",
        "StudyType",
        "OverallStatus",
        "EnrollmentCount",
        "StartDate",
        "CompletionDate",
        "LastUpdatePostDate",
        "LeadSponsorName",
        "LocationFacility",
        "LocationCity",
        "LocationState",
        "LocationCountry"
    ]

    params = {
        "query.term": "AREA[LastUpdatePostDate]RANGE[2023-01-01,MAX]",  # Recent trials
        "fields": ",".join(fields),
        # Kept constant across pages; any surplus is trimmed after the loop.
        "pageSize": min(max_results, max_page_size),
        "format": "json",
        "markupFormat": "markdown",
        "sort": ["LastUpdatePostDate:desc"]  # Get most recent trials first
    }

    result = None      # payload of the first page; later pages are merged into it
    fetched = 0        # number of studies collected so far
    pages = 0          # number of pages requested so far
    next_token = None

    while True:
        # A timeout keeps a stalled connection from hanging the kernel forever.
        response = requests.get(base_url, params=params, timeout=timeout)

        # Print debug information
        print(f"Status Code: {response.status_code}")
        print(f"Response URL: {response.url}")
        print(f"Response Content: {response.text[:500]}...")  # Print first 500 chars of response

        if response.status_code != 200:
            raise Exception(f"API request failed with status code {response.status_code}")

        try:
            page = response.json()
        except ValueError as e:
            # ValueError is the common base of json.JSONDecodeError and the
            # simplejson variant that requests raises when simplejson is installed.
            print(f"Error decoding JSON: {str(e)}")
            print(f"Response content: {response.text}")
            raise

        pages += 1
        page_studies = page.get("studies") or []
        if result is None:
            result = page
        else:
            result["studies"].extend(page_studies)
        fetched += len(page_studies)

        next_token = page.get("nextPageToken")
        # Stop when we have enough, the API has nothing more, or a page came
        # back empty (guards against looping on a token that yields no data).
        if fetched >= max_results or not page_studies or not next_token:
            break
        params["pageToken"] = next_token

    if pages > 1:
        # Only the multi-page path rewrites the payload, so single-page calls
        # return exactly what the API sent.
        del result["studies"][max_results:]
        if next_token:
            result["nextPageToken"] = next_token
        else:
            result.pop("nextPageToken", None)

    return result

In [16]:
def _get_section(parent, key):
    """Return ``parent[key]``, or an empty dict when the key is missing or null."""
    return parent.get(key) or {}


def preprocess_trial_data(trials_data):
    """Preprocess the clinical trials data for LLM processing.

    Flattens each study in a ClinicalTrials.gov API v2 ``/studies`` response
    into a single-level dictionary.

    Args:
        trials_data: Decoded JSON response containing a ``'studies'`` list.
            ``None``, a missing ``'studies'`` key or a null value all yield
            an empty result.

    Returns:
        A list with one dict per study, with the keys ``nct_id``, ``title``,
        ``official_title``, ``brief_summary``, ``detailed_description``,
        ``conditions``, ``interventions``, ``eligibility_criteria``,
        ``study_type``, ``status``, ``enrollment``, ``start_date``,
        ``completion_date``, ``last_update``, ``sponsor`` and ``locations``.
        Scalar fields absent from a study are ``None``; list fields
        (``conditions``, ``interventions``, ``locations``) default to ``[]``.
    """
    # Extract studies from the response
    studies = (trials_data or {}).get('studies') or []

    # Create a list to store processed trials
    processed_trials = []

    for study in studies:
        # The protocol section contains most of the text data; look up each
        # module once and treat missing/null modules as empty.
        protocol = _get_section(study, 'protocolSection')
        identification = _get_section(protocol, 'identificationModule')
        description = _get_section(protocol, 'descriptionModule')
        design = _get_section(protocol, 'designModule')
        status = _get_section(protocol, 'statusModule')

        # API v2 reports enrollment under designModule.enrollmentInfo.count.
        # Fall back to the previously used statusModule location so payloads
        # shaped that way keep working. Compare against None explicitly:
        # an enrollment of 0 is a legitimate value.
        enrollment = _get_section(design, 'enrollmentInfo').get('count')
        if enrollment is None:
            enrollment = status.get('enrollmentCount')

        # Create a structured dictionary for each trial
        trial = {
            'nct_id': identification.get('nctId'),
            'title': identification.get('briefTitle'),
            'official_title': identification.get('officialTitle'),
            'brief_summary': description.get('briefSummary'),
            'detailed_description': description.get('detailedDescription'),
            'conditions': _get_section(protocol, 'conditionsModule').get('conditions') or [],
            'interventions': _get_section(protocol, 'armsInterventionsModule').get('interventions') or [],
            'eligibility_criteria': _get_section(protocol, 'eligibilityModule').get('eligibilityCriteria'),
            'study_type': design.get('studyType'),
            'status': status.get('overallStatus'),
            'enrollment': enrollment,
            'start_date': _get_section(status, 'startDateStruct').get('date'),
            'completion_date': _get_section(status, 'completionDateStruct').get('date'),
            'last_update': _get_section(status, 'lastUpdatePostDateStruct').get('date'),
            'sponsor': _get_section(
                _get_section(protocol, 'sponsorCollaboratorsModule'), 'leadSponsor'
            ).get('name'),
            'locations': _get_section(protocol, 'contactsLocationsModule').get('locations') or []
        }

        processed_trials.append(trial)

    return processed_trials

In [None]:
# Example usage: pull a small batch of recent trials, then flatten each study
trials_data = fetch_clinical_trials(max_results=50)
processed_trials = preprocess_trial_data(trials_data)

# Tabulate the flattened trials: one row per trial, one column per extracted field
df = pd.DataFrame(data=processed_trials)

# Preview the first rows; the leading newline separates this from the fetch debug output
print("\nFirst few trials:", df.head(), sep="\n")

Status Code: 200
Response URL: https://clinicaltrials.gov/api/v2/studies?query.term=AREA%5BLastUpdatePostDate%5DRANGE%5B2023-01-01%2CMAX%5D&fields=NCTId%2CBriefTitle%2COfficialTitle%2CBriefSummary%2CDetailedDescription%2CCondition%2CInterventionDescription%2CEligibilityCriteria%2CStudyType%2COverallStatus%2CStudyType%2CEnrollmentCount%2CStartDate%2CCompletionDate%2CLastUpdatePostDate%2CLeadSponsorName%2CLocationFacility%2CLocationCity%2CLocationState%2CLocationCountry&pageSize=50&format=json&markupFormat=markdown&sort=LastUpdatePostDate%3Adesc
Response Content: {"studies":[
{"protocolSection":{"identificationModule":{"nctId":"NCT05621291","briefTitle":"A Study to Evaluate Next-Generation Sequencing (NGS) Testing and Monitoring of B-cell Recovery to Guide Management Following Chimeric Antigen Receptor T-cell (CART) Induced Remission in Children and Young Adults With B Lineage Acute Lymphoblastic Leu...","officialTitle":"A Pilot Trial to Evaluate Next-Generation Sequencing (NGS) Testing 