In [1]:
%pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# --- Setup and Imports ---
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re # For cleaning text
import math # For isnan

In [14]:

# --- Configuration ---
# Workbook to load; rename your file to match or change the value here.
XLSX_FILENAME = 'trials.xlsx'

# Free-text columns used to build the text that is embedded for semantic matching.
# Only 'Conditions' is listed at present. 'Brief Summary', 'Interventions' or
# 'Study Title' are possible additions if more context is wanted.
TEXT_COLUMNS_FOR_EMBEDDING = ['Conditions']

# --- Structured filters ---
# Primary outcome: column to inspect and the term it has to mention.
FILTER_PRIMARY_OUTCOME_COLUMN = 'Primary Outcome Measures'
FILTER_PRIMARY_OUTCOME_TERM = 'Overall Survival'

# Phases: column to inspect and the phase values regarded as acceptable.
# Combined phases are written with a '|' separator, e.g. 'PHASE2|PHASE3'.
FILTER_PHASES_COLUMN = 'Phases'
ACCEPTABLE_PHASES = [
    'PHASE1|PHASE2',
    'PHASE2',
    'PHASE2|PHASE3',
    'PHASE3',
    'PHASE4',
]
# This is an alias of the list above (the very same object), NOT a set of
# split-out individual phases: 'PHASE1' on its own is therefore not acceptable.
ACCEPTABLE_INDIVIDUAL_PHASES = ACCEPTABLE_PHASES

# Study type: column to inspect and the required value.
FILTER_STUDY_TYPE_COLUMN = 'Study Type'
FILTER_STUDY_TYPE_VALUE = 'INTERVENTIONAL'

# Minimum semantic-similarity score for a trial to count as relevant
# (scores AT or ABOVE the threshold pass). This is a starting value that needs
# tuning: evaluate the results and adjust.
RELEVANCE_SCORE_THRESHOLD = 0.5



In [11]:

# --- Helper Functions ---

def clean_text(text):
    """Normalise free text: lowercase, drop unwanted characters, collapse whitespace.

    Lowercase letters, digits, whitespace, hyphens and forward slashes are kept
    (so notations such as "I/II" or "PD-L1" survive); every other character is
    removed. Runs of whitespace are collapsed to a single space and the result
    is stripped.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None`` or
            a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # The hyphen must be the last character of the class. Written as `\s-/` it is
    # parsed as a range that starts at `\s`, which `re` rejects with
    # "bad character range" before any text is processed.
    text = re.sub(r'[^a-z0-9\s/-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Helper for sorting with None/NaN values (None/NaN goes to the end)
def sort_key_with_none(value, reverse=True):
    """Return a sort key that substitutes an infinity for missing values.

    ``None`` and float NaN are mapped to ``-inf`` when ``reverse`` is true and
    to ``+inf`` otherwise, so that they land at the end of a sort performed in
    the matching direction. Any other value is returned unchanged.
    """
    is_missing = value is None or (isinstance(value, float) and math.isnan(value))
    if not is_missing:
        return value
    infinity = float('inf')
    return -infinity if reverse else infinity

# Function to check if a trial's phases match the acceptable list
def check_phases(trial_phases_raw):
    """Return True when a trial's phase value matches the acceptable phases.

    The raw value is stripped, upper-cased and has ``/`` rewritten to ``|`` so
    that ``'Phase2/Phase3'`` and ``'PHASE2|PHASE3'`` are treated alike. The
    trial passes when either

    * the whole normalised value is listed in ``ACCEPTABLE_PHASES``, or
    * any single token (split on ``|`` and whitespace) is listed in
      ``ACCEPTABLE_INDIVIDUAL_PHASES``.

    Args:
        trial_phases_raw: Phase cell value for one trial. Non-string values
            (e.g. NaN for a missing cell) never match.

    Returns:
        ``True`` if the trial passes the phase filter, ``False`` otherwise.
    """
    if not isinstance(trial_phases_raw, str):
        return False

    normalised = trial_phases_raw.strip().upper().replace('/', '|')

    # Exact combination listed as acceptable (e.g. 'PHASE1|PHASE2').
    if normalised in ACCEPTABLE_PHASES:
        return True

    # Otherwise accept the trial if any individual phase it mentions is acceptable.
    tokens = re.split(r'[|\s]+', normalised)
    return any(token in ACCEPTABLE_INDIVIDUAL_PHASES for token in tokens)



In [4]:

# --- Data Loading and Preprocessing ---

print(f"Loading data from {XLSX_FILENAME}...")
try:
    # Use pd.read_excel for .xlsx files
    df = pd.read_excel(XLSX_FILENAME)
except FileNotFoundError:
    print(f"Error: {XLSX_FILENAME} not found. Please make sure the XLSX file is in the same directory.")
    # Re-raise instead of calling exit(): the cell must stop here so that the
    # statements below never run without `df`.
    # NOTE(review): under a notebook kernel exit() appears to request a kernel
    # shutdown rather than halting the cell immediately - confirm.
    raise
print("Data loaded successfully.")
print(f"Initial data shape: {df.shape}")

# Keep only the configured text columns that actually exist in the workbook.
text_cols_present = [col for col in TEXT_COLUMNS_FOR_EMBEDDING if col in df.columns]

if not text_cols_present:
    raise ValueError(
        f"Error: None of the specified text columns for embedding ({TEXT_COLUMNS_FOR_EMBEDDING}) found in the file."
    )

# Only the FIRST available text column is used; missing cells become empty strings.
# To combine several columns instead:
# df['combined_text_for_embedding'] = df[text_cols_present].fillna('').agg(' '.join, axis=1)
df['combined_text_for_embedding'] = df[text_cols_present[0]].fillna('').astype(str)
# Despite the column name, no cleaning is applied: the text is embedded as-is.
df['combined_text_cleaned_for_embedding'] = df['combined_text_for_embedding']



Loading data from trials.xlsx...
Data loaded successfully.
Initial data shape: (7498, 256)


In [5]:

# Load the Sentence Transformer model used for every embedding in this notebook.
# NOTE: 'all-MiniLM-L6-v2' is a general-purpose sentence-embedding model, not a
# medical-domain one; swap the name here if a biomedical model is wanted.
print("\nLoading Sentence Transformer model...")
try:
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading Sentence Transformer model: {e}")
    print("Please ensure you have internet access or the model files are cached, or try a different model name.")
    # Re-raise instead of calling exit() so the cell stops here and the original
    # traceback is shown; nothing below can work without `model`.
    raise

# Generate embeddings for the combined text (done once for the whole dataset)
print("\nGenerating text embeddings for trial data...")
# Only rows whose text is non-empty after stripping whitespace are embedded.
non_empty_indices = df.index[df['combined_text_cleaned_for_embedding'].str.strip() != ''].tolist()
non_empty_texts = df.loc[non_empty_indices, 'combined_text_cleaned_for_embedding'].tolist()

if not non_empty_texts:
    print("Warning: No non-empty text found in the specified columns for embedding. Cannot generate embeddings.")
    drug_embeddings = np.array([])  # Empty placeholder so later cells still find the name
else:
    drug_embeddings = model.encode(non_empty_texts, show_progress_bar=True, convert_to_numpy=True)
    print(f"Embeddings generated for {len(non_empty_indices)} trials.")

# Map each original DataFrame index label to its row position in `drug_embeddings`.
index_to_embedding_index = {original_idx: emb_idx for emb_idx, original_idx in enumerate(non_empty_indices)}




Loading Medical Domain Sentence Transformer model...
Model loaded successfully.

Generating text embeddings for trial data...


Batches:   0%|          | 0/231 [00:00<?, ?it/s]

Embeddings generated for 7388 trials.


In [16]:

# --- Search Function ---

def find_relevant_trials(df: pd.DataFrame, drug_embeddings: np.ndarray, index_to_embedding_index: dict,
                         model: SentenceTransformer,
                         user_cancer_type_raw: str, user_stage_raw: str, user_biomarkers_raw: str,
                         relevance_threshold: float = RELEVANCE_SCORE_THRESHOLD):
    """
    Finds, ranks and prints clinical trials relevant to a patient profile.

    A trial is kept only if it has an embedding, passes all three structured
    filters (primary outcome mentions FILTER_PRIMARY_OUTCOME_TERM, phases accepted
    by check_phases, study type equal to FILTER_STUDY_TYPE_VALUE) and its cosine
    similarity to the user query is at or above ``relevance_threshold``.
    Results are sorted by similarity, highest first, and printed.

    Args:
        df: The pre-processed DataFrame.
        drug_embeddings: Pre-calculated embeddings for the trial data (non-empty texts).
        index_to_embedding_index: Mapping from DataFrame index to embedding index.
        model: The loaded Sentence Transformer model.
        user_cancer_type_raw: The raw string input for cancer type (used as given).
        user_stage_raw: The raw string input for stage (used as given).
        user_biomarkers_raw: The raw string input for biomarkers (comma-separated).
        relevance_threshold: The minimum semantic similarity score for a trial to be considered relevant.

    Returns:
        A list of dictionaries, each representing a relevant trial result with details and scores.
        The list is empty when the query is empty, the query cannot be embedded,
        or no trial passes the filters and threshold.
    """
    # Biomarkers arrive comma-separated; trim each one and drop blanks.
    user_biomarkers_list = [b.strip() for b in user_biomarkers_raw.split(',') if b.strip()]

    # The query embeds the whole profile: cancer type, stage and biomarkers.
    user_query_text = f"{user_cancer_type_raw} {user_stage_raw} {' '.join(user_biomarkers_list)}"

    print(f"\n--- Searching for trials for profile: {user_query_text.strip()} ---")
    if not user_query_text.strip():
        print("Warning: User query is empty after cleaning. Cannot perform search.")
        return []

    # Generate embedding for the user query
    try:
        user_embedding = model.encode(user_query_text, convert_to_numpy=True)
    except Exception as e:
        print(f"Error generating user query embedding: {e}")
        return []

    # Loop-invariant filter values, normalised once.
    outcome_term = FILTER_PRIMARY_OUTCOME_TERM.lower()
    required_study_type = FILTER_STUDY_TYPE_VALUE.upper()

    potential_results = []

    for index, row in df.iterrows():

        # Skip rows that did not have embeddings generated
        if index not in index_to_embedding_index:
            continue

        # --- Structured filters ---

        # 1. Primary outcome must mention the configured term (case-insensitive substring).
        primary_outcome_text = str(row.get(FILTER_PRIMARY_OUTCOME_COLUMN, '')).lower()
        if outcome_term not in primary_outcome_text:
            continue

        # 2. Phases must be acceptable.
        if not check_phases(row.get(FILTER_PHASES_COLUMN, '')):
            continue

        # 3. Study type must match exactly (case-insensitive).
        trial_study_type = str(row.get(FILTER_STUDY_TYPE_COLUMN, '')).upper()
        if trial_study_type != required_study_type:
            continue

        # --- Semantic similarity (only for trials passing the filters) ---
        embedding_index = index_to_embedding_index[index]
        semantic_sim = cosine_similarity([user_embedding], [drug_embeddings[embedding_index]])[0][0]

        if semantic_sim < relevance_threshold:
            continue

        potential_results.append({
            'index': index,
            'semantic_similarity': semantic_sim,
            # Original data kept for display/explanation
            'NCT Number': row.get('NCT Number', 'N/A'),
            'Study Title': row.get('Study Title', 'N/A'),
            'Study Status': row.get('Study Status', 'N/A'),
            'Conditions': row.get('Conditions', 'N/A'),
            'Interventions': row.get('Interventions', 'N/A'),
            'Phases': row.get('Phases', 'N/A'),
            'Brief Summary': row.get('Brief Summary', 'N/A'),
            'Primary Outcome Measures': row.get('Primary Outcome Measures', 'N/A'),
        })

    # --- Ranking ---
    # Ranked by semantic similarity only, highest first. No secondary key
    # (phase, study status) is applied.
    potential_results.sort(key=lambda result: result['semantic_similarity'], reverse=True)

    # --- Present Results ---
    print(f"\nFound {len(potential_results)} relevant trials (Semantic Similarity >= {relevance_threshold:.2f}):")

    if not potential_results:
        print("No relevant trials found for this profile based on the filters and similarity threshold.")
    else:
        formatted_output = []
        for i, result in enumerate(potential_results):
            formatted_output.append(f"\n--- Result {i+1} ---")
            formatted_output.append(f"NCT Number: {result['NCT Number']}")
            formatted_output.append(f"Study Title: {result['Study Title']}")
            formatted_output.append(f"Status: {result['Study Status']}")
            formatted_output.append(f"Phases: {result['Phases']}")
            formatted_output.append(f"Interventions: {result['Interventions']}")
            formatted_output.append(f"Conditions: {result['Conditions']}")
            formatted_output.append(f"Brief Summary: {result['Brief Summary']}")
            formatted_output.append(f"Primary Outcome: {result['Primary Outcome Measures']}")
            formatted_output.append(f"Relevance Score (Semantic Sim): {result['semantic_similarity']:.4f}")
            formatted_output.append("-" * 50)  # Separator
        # Printed inside this branch: the list only exists when there are results,
        # so printing it unconditionally would fail on an empty result set.
        print("\n".join(formatted_output))

    return potential_results



In [17]:
# --- Manual Input Section ---

print("\nPlease enter patient information for clinical trial matching.")

# --- Example 1: NSCLC, Stage IV, specific mutations ---
# The profile is hard-coded here; edit the three values to search for another patient.
user_cancer_type, user_stage, user_biomarkers = (
    "Non-Small Cell Lung Cancer",
    "Stage 4",
    "EGFR mutation, PD-L1 positive",
)

# Left as the cell's final expression so the returned result list is displayed.
find_relevant_trials(
    df,
    drug_embeddings,
    index_to_embedding_index,
    model,
    user_cancer_type_raw=user_cancer_type,
    user_stage_raw=user_stage,
    user_biomarkers_raw=user_biomarkers,
)

# --- Example 2: Breast Cancer, metastatic, HER2-low ---
# user_cancer_type = "Breast Cancer"
# user_stage = "metastatic"
# user_biomarkers = "HER2 low"

# find_relevant_trials(df, drug_embeddings, index_to_embedding_index, model, user_cancer_type, user_stage, user_biomarkers)

# --- Example 3: Prostate cancer, mCRPC, PSMA positive ---
# user_cancer_type = "Prostate cancer"
# user_stage = "metastatic castration-resistant"
# user_biomarkers = "PSMA positive"

# find_relevant_trials(df, drug_embeddings, index_to_embedding_index, model, user_cancer_type, user_stage, user_biomarkers)

# --- Example 4: Urothelial Carcinoma (from your sample data) ---
# user_cancer_type = "Urothelial Carcinoma"
# user_stage = "operable high-risk" # Or try 'bladder cancer' in type
# user_biomarkers = "" # No specific biomarkers mentioned in the sample brief summary

# find_relevant_trials(df, drug_embeddings, index_to_embedding_index, model, user_cancer_type, user_stage, user_biomarkers)


Please enter patient information for clinical trial matching.

--- Searching for trials for profile: Non-Small Cell Lung Cancer Stage 4 EGFR mutation PD-L1 positive ---

Found 36 relevant trials (Semantic Similarity >= 0.50):

--- Result 1 ---
NCT Number: NCT06712355
Study Title: Safety and Effectiveness of BNT327, an Investigational Therapy in Combination With Chemotherapy for Patients With Untreated Small-cell Lung Cancer
Status: RECRUITING
Phases: PHASE3
Interventions: DRUG: BNT327|DRUG: Atezolizumab|DRUG: Etoposide|DRUG: Carboplatin
Conditions: Extensive-stage Small-cell Lung Cancer
Brief Summary: This is a Phase III, multisite, randomized, double-blinded study to investigate BNT327 combined with chemotherapy (etoposide/carboplatin) compared to atezolizumab combined with chemotherapy (etoposide/carboplatin) for the treatment of participants with previously untreated extensive-stage small-cell lung cancer (ES-SCLC).
Primary Outcome: Overall survival (OS), OS defined as the time fro

[{'index': 4219,
  'semantic_similarity': np.float32(0.61991215),
  'NCT Number': 'NCT06712355',
  'Study Title': 'Safety and Effectiveness of BNT327, an Investigational Therapy in Combination With Chemotherapy for Patients With Untreated Small-cell Lung Cancer',
  'Study Status': 'RECRUITING',
  'Conditions': 'Extensive-stage Small-cell Lung Cancer',
  'Interventions': 'DRUG: BNT327|DRUG: Atezolizumab|DRUG: Etoposide|DRUG: Carboplatin',
  'Phases': 'PHASE3',
  'Brief Summary': 'This is a Phase III, multisite, randomized, double-blinded study to investigate BNT327 combined with chemotherapy (etoposide/carboplatin) compared to atezolizumab combined with chemotherapy (etoposide/carboplatin) for the treatment of participants with previously untreated extensive-stage small-cell lung cancer (ES-SCLC).',
  'Primary Outcome Measures': 'Overall survival (OS), OS defined as the time from randomization to death from any cause., Up to approximately 39 months'},
 {'index': 4217,
  'semantic_simila