In [None]:
# Cell 1: Install required libraries (run this once if needed)
%pip install pandas sentence-transformers scikit-learn

In [11]:

# Cell 2: Import necessary modules
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Cell 3: Load the Excel file and extract Study Titles
df = pd.read_excel('trials_filtered_with_coordinates.xlsx')

# Blank title cells are read as float NaN, which the sentence encoder cannot
# tokenize. Drop those rows from `df` itself (rather than only from the list)
# so that position i in `study_titles` still refers to row i of `df` when the
# matches are looked up later with `df['Study Title'].iloc[i]`.
df = df.dropna(subset=['Study Title']).reset_index(drop=True)

# Coerce to str so any non-text cell (e.g. a purely numeric title) is encodable.
study_titles = df['Study Title'].astype(str).tolist()

# Cell 4: Load the sentence-embedding model
# The checkpoint name is kept in one place so it is easy to swap. The BioBERT
# NLI checkpoint below is a disabled alternative:
# EMBEDDING_MODEL_NAME = 'gsarti/biobert-nli'
EMBEDDING_MODEL_NAME = 'neuml/pubmedbert-base-embeddings'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Cell 5: Compute embeddings for study titles
# One embedding per entry of `study_titles`, in the same order.
study_embeddings = model.encode(study_titles)


In [12]:

# Cell 6: Define user input and compute its embedding
# Each variable holds the value its name describes (stage vs. biomarkers).
cancer_type = "Non-Small Cell Lung Cancer"
stage = "Stage 4"
biomarkers = "EGFR mutation, PD-L1 positive"

# Build the free-text query as "<cancer type> <stage> <biomarkers>", skipping
# any field left empty so the query carries no stray whitespace.
query_parts = [part.strip() for part in (cancer_type, stage, biomarkers)]
user_input = " ".join(part for part in query_parts if part)

# encode() takes a list of sentences; take the single resulting vector.
user_embedding = model.encode([user_input])[0]

# Cell 7: Calculate similarities and find top matches
# cosine_similarity returns a (1, n_titles) matrix for a single query row;
# keep only that row.
score_matrix = cosine_similarity([user_embedding], study_embeddings)
similarities = score_matrix[0]

threshold = 0.5  # Adjust this threshold as needed

# Positions of every title whose similarity reaches the threshold.
top_indices = [
    idx for idx in range(len(similarities)) if similarities[idx] >= threshold
]

# Pair each surviving title with its score, best match first. sorted() is
# stable, so equal scores keep their original row order.
title_column = df['Study Title']
top_matches = sorted(
    [(title_column.iloc[idx], similarities[idx]) for idx in top_indices],
    key=lambda match: match[1],
    reverse=True,
)

# Cell 8: Display top matching study titles
# Handle the empty case first; otherwise list every match in the order
# `top_matches` already has, with the score shown to four decimals.
if not top_matches:
    print("No matches found above the threshold.")
else:
    print("Top Matching Study Titles:")
    for title, sim in top_matches:
        print(f"Study Title: {title}\nSimilarity: {sim:.4f}\n")
    
    
    
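# --- Manual Input Section ---
# NOTE: the examples below are disabled. They call find_relevant_trials with
# semantic_trial_embeddings and semantic_index_to_embedding_index, none of
# which are defined in the cells above; define them before uncommenting.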

# print("\nPlease enter patient information for clinical trial matching.")

# # # --- Example 1: NSCLC, Stage IV, specific mutations ---
# user_cancer_type = "Non-Small Cell Lung Cancer"
# user_stage = "Stage 4"
# user_biomarkers = "EGFR mutation, PD-L1 positive"

# find_relevant_trials(df, semantic_trial_embeddings, semantic_index_to_embedding_index,
#                      model, user_cancer_type, user_stage, user_biomarkers)

# print("\n" + "="*80 + "\n") # Separator for multiple searches

# # --- Example 2: Breast Cancer, metastatic, HER2-low ---
# user_cancer_type = "Breast Cancer"
# user_stage = "metastatic"
# user_biomarkers = "HER2 low"

# find_relevant_trials(df, semantic_trial_embeddings, semantic_index_to_embedding_index,
#                      model, user_cancer_type, user_stage, user_biomarkers)

# print("\n" + "="*80 + "\n") # Separator for multiple searches


# # --- Example 3: Prostate cancer, mCRPC, PSMA positive ---
# # Using the example Conditions value directly for high similarity expectation
# user_cancer_type = "Prostate-specific membrane antigen (PSMA)-positive metastatic castration-resistant prostate cancer (mCRPC)"
# user_stage = "" # Stage is included in the Conditions text
# user_biomarkers = "" # Biomarkers are included in the Conditions text

# find_relevant_trials(df, semantic_trial_embeddings, semantic_index_to_embedding_index,
#                      model, user_cancer_type, user_stage, user_biomarkers)

# print("\n" + "="*80 + "\n") # Separator for multiple searches

# # --- Example 4: Urothelial Carcinoma (from your sample data) ---
# user_cancer_type = "Urothelial Carcinoma" # Or try 'bladder cancer' in type
# user_stage = "operable high-risk"
# user_biomarkers = "" # No specific biomarkers mentioned in the sample brief summary

# find_relevant_trials(df, semantic_trial_embeddings, semantic_index_to_embedding_index,
#                      model, user_cancer_type, user_stage, user_biomarkers)

# print("\n" + "="*80 + "\n") # Separator for multiple searches

Top Matching Study Titles:
Study Title: A Study to Compare the Efficacy of Nivolumab and Relatlimab Plus Chemotherapy vs Pembrolizumab Plus Chemotherapy for Stage IV/Recurrent Non-squamous Non-small Cell Lung Cancer With PD-L1 Expression ≥ 1%
Similarity: 0.7325

Study Title: Study to Compare Furmonertinib to Platinum-Based Chemotherapy for Patients with Locally Advanced or Metastatic Non-Small Cell Lung Cancer (NSCLC) with Epidermal Growth Factor Receptor (EGFR) Exon 20 Insertion Mutations (FURVENT)
Similarity: 0.7190

Study Title: Testing the Addition of Radiation Therapy to the Usual Treatment (Immunotherapy With or Without Chemotherapy) for Advanced Stage Non-small Cell Lung Cancer Patients Who Are PD-L1 Negative
Similarity: 0.7000

Study Title: Safety and Efficacy of Combining APL-101 With Frontline Osimertinib in Patients With EGFR-mutated Metastatic Non-small Cell Lung Cancer (NSCLC)
Similarity: 0.6956

Study Title: Defactinib, Avutometinib and Nivolumab for the Treatment of An