In [55]:
import requests
import pandas as pd
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import sent_tokenize
from typing import List, Dict
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nickj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [15]:
def fetch_clinical_trials(max_results=100):
    """Fetch clinical trials data from ClinicalTrials.gov API v2"""
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    
    # Define the fields we want to retrieve - focusing on text-rich fields for LLM processing
    fields = [
        "NCTId",
        "BriefTitle",
        "OfficialTitle",
        "BriefSummary",
        "DetailedDescription",
        "Condition",
        "InterventionDescription",
        "EligibilityCriteria",
        "StudyType",
        "OverallStatus",
        "StudyType",
        "EnrollmentCount",
        "StartDate",
        "CompletionDate",
        "LastUpdatePostDate",
        "LeadSponsorName",
        "LocationFacility",
        "LocationCity",
        "LocationState",
        "LocationCountry"
    ]
    
    params = {
        "query.term": "AREA[LastUpdatePostDate]RANGE[2023-01-01,MAX]",  # Recent trials
        "fields": ",".join(fields),
        "pageSize": min(max_results, 1000),  # API limits pageSize to 1000
        "format": "json",
        "markupFormat": "markdown",
        "sort": ["LastUpdatePostDate:desc"]  # Get most recent trials first
    }
    
    response = requests.get(base_url, params=params)
    
    # Print debug information
    print(f"Status Code: {response.status_code}")
    print(f"Response URL: {response.url}")
    print(f"Response Content: {response.text[:500]}...")  # Print first 500 chars of response
    
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")
        
    try:
        return response.json()
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {str(e)}")
        print(f"Response content: {response.text}")
        raise

In [51]:
def preprocess_trial_data(trials_data):
    """Preprocess the clinical trials data for LLM processing"""
    # Extract studies from the response
    studies = trials_data.get('studies', [])
    
    # Create a list to store processed trials
    processed_trials = []
    
    for study in studies:
        # Extract the protocol section which contains most of the text data
        protocol = study.get('protocolSection', {})
        
        # Create a structured dictionary for each trial
        trial = {
            'nct_id': study.get('protocolSection', {}).get('identificationModule', {}).get('nctId'),
            'title': study.get('protocolSection', {}).get('identificationModule', {}).get('briefTitle'),
            'official_title': study.get('protocolSection', {}).get('identificationModule', {}).get('officialTitle'),
            'brief_summary': study.get('protocolSection', {}).get('descriptionModule', {}).get('briefSummary'),
            'detailed_description': study.get('protocolSection', {}).get('descriptionModule', {}).get('detailedDescription'),
            'conditions': study.get('protocolSection', {}).get('conditionsModule', {}).get('conditions', []),
            'interventions': [intervention.get('description') for intervention in study.get('protocolSection', {}).get('armsInterventionsModule', {}).get('interventions', [])],
            'eligibility_criteria': study.get('protocolSection', {}).get('eligibilityModule', {}).get('eligibilityCriteria'),
            'study_type': study.get('protocolSection', {}).get('designModule', {}).get('studyType'),
            'status': study.get('protocolSection', {}).get('statusModule', {}).get('overallStatus'),
            'enrollment': study.get('protocolSection', {}).get('statusModule', {}).get('enrollmentCount'),
            'start_date': study.get('protocolSection', {}).get('statusModule', {}).get('startDateStruct', {}).get('date'),
            'completion_date': study.get('protocolSection', {}).get('statusModule', {}).get('completionDateStruct', {}).get('date'),
            'last_update': study.get('protocolSection', {}).get('statusModule', {}).get('lastUpdatePostDateStruct', {}).get('date'),
            'sponsor': study.get('protocolSection', {}).get('sponsorCollaboratorsModule', {}).get('leadSponsor', {}).get('name'),
            'facility': [location.get('facility') for location in study.get('protocolSection', {}).get('contactsLocationsModule', {}).get('locations', [])],
            'primary_outcome': study.get('protocolSection', {}).get('outcomesModule', {}).get('primaryOutcomes', [])
        }
        
        processed_trials.append(trial)
    
    return processed_trials

In [None]:
def clean_text(text: str) -> str:
    """Clean and normalize text for embedding"""
    if not isinstance(text, str):
        return ""
    
    # Remove special characters and extra whitespace
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    text = text.strip()
    
    return text

def combine_trial_text(trial: Dict) -> str:
    """Combine relevant text fields into a single document"""
    text_parts = []
    
    # Add title and summary
    if trial.get('title'):
        text_parts.append(f"Title: {clean_text(trial['title'])}")
    if trial.get('brief_summary'):
        text_parts.append(f"Summary: {clean_text(trial['brief_summary'])}")
    if trial.get('detailed_description'):
        text_parts.append(f"Description: {clean_text(trial['detailed_description'])}")
    
    # Add conditions
    if trial.get('conditions'):
        conditions = [clean_text(cond) for cond in trial['conditions'] if cond]
        if conditions:
            text_parts.append(f"Conditions: {', '.join(conditions)}")
    
    # Add interventions
    if trial.get('interventions'):
        interventions = [clean_text(interv.get('description', '')) for interv in trial['interventions'] if interv]
        if interventions:
            text_parts.append(f"Interventions: {', '.join(interventions)}")
    
    # Add eligibility criteria
    if trial.get('eligibility_criteria'):
        text_parts.append(f"Eligibility: {clean_text(trial['eligibility_criteria'])}")
    
    # Add study type and status
    if trial.get('study_type'):
        text_parts.append(f"Study Type: {clean_text(trial['study_type'])}")
    if trial.get('status'):
        text_parts.append(f"Status: {clean_text(trial['status'])}")
    
    return "\n".join(text_parts)

def chunk_text(text: str, max_tokens: int = 512) -> List[str]:
    """Split text into chunks of approximately max_tokens"""
    # Split into sentences
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        # Rough estimate of tokens (words + punctuation)
        sentence_length = len(sentence.split())
        
        if current_length + sentence_length > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

def prepare_trials_for_embedding(df: pd.DataFrame) -> List[Dict]:
    """Prepare trials data for embedding creation"""
    processed_trials = []
    
    for _, trial in df.iterrows():
        # Combine text fields
        combined_text = combine_trial_text(trial.to_dict())
        
        # Split into chunks if needed
        chunks = chunk_text(combined_text)
        
        # Create embedding-ready documents
        for i, chunk in enumerate(chunks):
            processed_trial = {
                'nct_id': trial['nct_id'],
                'chunk_id': i,
                'text': chunk,
                'metadata': {
                    'title': trial['title'],
                    'study_type': trial['study_type'],
                    'status': trial['status'],
                    'conditions': trial['conditions'],
                    'start_date': trial['start_date'],
                    'completion_date': trial['completion_date']
                }
            }
            processed_trials.append(processed_trial)
    
    return processed_trials

In [52]:
# Example usage
trials_data = fetch_clinical_trials(max_results=50)
processed_trials = preprocess_trial_data(trials_data)

# Convert to DataFrame for easier analysis
df = pd.DataFrame(processed_trials)

# Display first few rows
print("\nFirst few trials:")
print(df.head())

Status Code: 200
Response URL: https://clinicaltrials.gov/api/v2/studies?query.term=AREA%5BLastUpdatePostDate%5DRANGE%5B2023-01-01%2CMAX%5D&fields=NCTId%2CBriefTitle%2COfficialTitle%2CBriefSummary%2CDetailedDescription%2CCondition%2CInterventionDescription%2CEligibilityCriteria%2CStudyType%2COverallStatus%2CStudyType%2CEnrollmentCount%2CStartDate%2CCompletionDate%2CLastUpdatePostDate%2CLeadSponsorName%2CLocationFacility%2CLocationCity%2CLocationState%2CLocationCountry&pageSize=50&format=json&markupFormat=markdown&sort=LastUpdatePostDate%3Adesc
Response Content: {"studies":[
{"protocolSection":{"identificationModule":{"nctId":"NCT06964126","briefTitle":"Psycho-Social Effects of Gardening Activities in the Elderly","officialTitle":"\"Psycho-Social Effects of Gardening Activities in the Elderly: An Investigation on Self-Esteem and Life Satisfaction"},"statusModule":{"overallStatus":"RECRUITING","startDateStruct":{"date":"2025-04-01"},"completionDateStruct":{"date":"2025-08-30"},"lastUpdat

In [54]:
df.iloc[4]

nct_id                                                        NCT06507748
title                   A Study to Evaluate the Feasibility of a Physi...
official_title          A Study to Evaluate the Feasibility of a Physi...
brief_summary           Background:\n\nNeurofibromatosis type 1 (NF1) ...
detailed_description    Background:\n\n* Chronic pain is a common and ...
conditions                                     [Neurofibromatosis Type 1]
interventions                   [Novel objective pain measurement device]
eligibility_criteria    * INCLUSION CRITERIA:\n* History of clinical o...
study_type                                                 INTERVENTIONAL
status                                                 NOT_YET_RECRUITING
enrollment                                                           None
start_date                                                     2025-05-14
completion_date                                                2026-05-01
last_update                           