In [None]:
%pip install pandas sentence-transformers scikit-learn ipywidgets

In [None]:
%pip install langchain langchain-community

In [8]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re 
from collections import Counter 
import ipywidgets as widgets
from IPython.display import display, clear_output
import math
from langchain.embeddings import HuggingFaceEmbeddings

In [9]:
# CONFIG

CSV_FILENAME = 'drug_data.csv' 

# TEXT_COLUMNS_FOR_EMBEDDING = ['Cancer Type', 'Brief Study Summary', 'Formatted Study Results']
TEXT_COLUMNS_FOR_EMBEDDING = ['Cancer Type']

OUTCOME_COLUMNS = ['Treatment_OS', 'Control_OS', 'OS_Improvement (%)', 'Treatment_PFS', 'Control_PFS', 'PFS_Improvement (%)']

RELEVANCE_SCORE_THRESHOLD = 0.5

In [10]:
# HELPER FUNCTIONS

# Basic text cleaning // to improve semanting matching quality
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s-]', '', text) # Remove special characters, except hyphens
        text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
        return text
    return ''


def parse_time_to_months(time_str):
    # Parses strings like '18.6 months', '2.1 years', 'N/A', 'Not reached' into numerical months.
    # Returns None for non-numeric or unclear values.

    if isinstance(time_str, (int, float)):
        return float(time_str) # Assume it's already in months if numeric onlu
    
    if not isinstance(time_str, str):
        return None

    time_str = time_str.strip().lower()

    # Handle specific non-numeric values
    if time_str in ['n/a', 'not applicable', 'not reported', 'not reached', 'nr']:
        return None

    # Look for numbers followed by units
    match = re.match(r'(\d+(\.\d+)?)\s*(month|year)s?', time_str)
    if match:
        value = float(match.group(1))
        unit = match.group(3)
        if unit == 'year':
            return value * 12
        elif unit == 'month':
            return value

    return None # Could not parse

def parse_improvement_percentage(perc_str):
    # Parses strings like '41.80%', 'Not statistically significant' into numerical percentage.
    # Returns None for non-numeric or 'Not statistically significant'.

    if isinstance(perc_str, (int, float)):
         return float(perc_str) # Assume it's already a percentage as a float (e.g., 41.8)
    if not isinstance(perc_str, str):
        return None

    perc_str = perc_str.strip().lower()

    if perc_str in ['n/a', 'not applicable', 'not reported', 'not statistically significant', 'nss']:
        return None

    # Look for number potentially ending with %
    match = re.match(r'(\d+(\.\d+)?)\s*%', perc_str)
    if match:
        return float(match.group(1))

    # Handle just a number string without %
    num_match = re.match(r'^(\d+(\.\d+)?)$', perc_str)
    if num_match:
        return float(num_match.group(1)) 

    return None # Could not parse

# those with None values goes to end
def sort_key_with_none(value, reverse=True):
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return float('-inf') if reverse else float('inf')
    return value

In [11]:
print(f"Loading data from {CSV_FILENAME}...")
try:
    df = pd.read_csv(CSV_FILENAME)
    print("Data loaded successfully.")
except FileNotFoundError:
    print(f"Error: {CSV_FILENAME} not found. Please make sure the CSV file is in the same directory.")
    exit()

# Preprocess text columns for embedding
# Combine relevant text columns
df['combined_text_for_embedding'] = df[TEXT_COLUMNS_FOR_EMBEDDING].fillna('').agg(' '.join, axis=1)
df['combined_text_cleaned_for_embedding'] = df['combined_text_for_embedding'].apply(clean_text)

# Parse Outcome Metrics into numerical format
df['Treatment_OS_Months_Parsed'] = df['Treatment_OS'].apply(parse_time_to_months)
df['Control_OS_Months_Parsed'] = df['Control_OS'].apply(parse_time_to_months)

# Calculate numerical OS Improvement where possible
df['Calculated_OS_Improvement_Months'] = df.apply(
    lambda row: row['Treatment_OS_Months_Parsed'] - row['Control_OS_Months_Parsed']
    if row['Treatment_OS_Months_Parsed'] is not None and row['Control_OS_Months_Parsed'] is not None else None,
    axis=1
)

df['OS_Improvement_Percentage_Parsed'] = df['OS_Improvement (%)'].apply(parse_improvement_percentage)
df['PFS_Improvement_Percentage_Parsed'] = df['PFS_Improvement (%)'].apply(parse_improvement_percentage)

Loading data from drug_data.csv...
Data loaded successfully.


In [12]:
# Load Sentence Transformer model
print("\nLoading Sentence Transformer model...")

try:
    model = SentenceTransformer('all-MiniLM-L6-v2') # for quic test
    # model = SentenceTransformer('biobert-v1.1') 
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading Sentence Transformer model: {e}")
    exit()

# Generate embeddings for the combined text
print("\nGenerating text embeddings for drug data...")
drug_embeddings = model.encode(df['combined_text_cleaned_for_embedding'].tolist(), show_progress_bar=True, convert_to_numpy=True)
print("Embeddings generated.")


Loading Sentence Transformer model...
Model loaded successfully.

Generating text embeddings for drug data...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Embeddings generated.


In [13]:
def find_relevant_drugs(df: pd.DataFrame, drug_embeddings: np.ndarray, model: SentenceTransformer,
                        user_cancer_type_raw: str, user_stage_raw: str, user_biomarkers_raw: str,
                        relevance_threshold: float = RELEVANCE_SCORE_THRESHOLD):
   
    # Clean user inputs
    user_cancer_type_cleaned = clean_text(user_cancer_type_raw)
    user_stage_cleaned = clean_text(user_stage_raw)
    user_biomarkers_cleaned_list = [clean_text(b.strip()) for b in user_biomarkers_raw.split(',') if b.strip()]

    # Create the full user query string for embedding
    # Include all parts of the user's profile for semantic matching
    user_query_text = f"{user_cancer_type_cleaned} {user_stage_cleaned} {' '.join(user_biomarkers_cleaned_list)}"

    print(f"\n--- Searching for drugs for profile: {user_query_text.strip()} ---")
    if not user_query_text.strip():
         print("Warning: User query is empty after cleaning. Cannot perform search.")
         return []


    try:
        user_embedding = model.encode(user_query_text, convert_to_numpy=True)
    except Exception as e:
        print(f"Error generating user query embedding: {e}")
        return []

    potential_results = []

    # Iterate through the pre-processed DataFrame and calculate semantic similarity
    for index, row in df.iterrows():
        # Calculate Semantic Similarity
        semantic_sim = cosine_similarity([user_embedding], [drug_embeddings[index]])[0][0]

        # Filter by Relevance Threshold
        if semantic_sim >= relevance_threshold:
            potential_results.append({
                'index': index,
                'semantic_similarity': semantic_sim,
                'calculated_os_improvement_months': row['Calculated_OS_Improvement_Months'],
                'os_improvement_percentage_parsed': row['OS_Improvement_Percentage_Parsed'],
                'pfs_improvement_percentage_parsed': row['PFS_Improvement_Percentage_Parsed'],
                # Include original data needed for display/explanation
                'Drug Name': row['Drug Name'],
                'Cancer Type': row['Cancer Type'],
                'Brief Study Summary': row['Brief Study Summary'],
                'Treatment_OS': row['Treatment_OS'],
                'Control_OS': row['Control_OS'],
                'OS_Improvement (%)': row['OS_Improvement (%)'],
                'Treatment_PFS': row['Treatment_PFS'],
                'Control_PFS': row['Control_PFS'],
                'PFS_Improvement (%)': row['PFS_Improvement (%)'],
            })

    # --- Ranking ---
    # Sort results (which are already filtered by the threshold):
    
    # 1. Primarily by Semantic Similarity (descending) - This implicitly handles cancer type, stage, biomarkers matching
    # 2. Secondly by Calculated OS Improvement in months (descending, handling None)
    # 3. Secondarily by whether OS Improvement % is a number (numeric first)
    # 4. Thirdly by OS Improvement % value (descending, handling None)
    


    potential_results.sort(key=lambda x: (
        x['semantic_similarity'], # 1: Semantic Similarity (desc)
        sort_key_with_none(x['calculated_os_improvement_months'], reverse=True), # 2: Calc OS diff months (desc)
        sort_key_with_none(x['os_improvement_percentage_parsed'] is not None, reverse=True), # 3: Has numeric OS% (True first)
        sort_key_with_none(x['os_improvement_percentage_parsed'], reverse=True), # 4: OS% value (desc)
        

    ), reverse=True) 

    formatted_output = []
    # --- Present Results ---
    print(f"\nFound {len(potential_results)} potentially relevant drugs (Semantic Similarity >= {relevance_threshold:.2f}):")

    if not potential_results:
        print("No relevant drugs found for this profile based on the specified threshold.")
    else:
        # Prepare formatted output
        
        for i, result in enumerate(potential_results):
            explanation_parts = []
            explanation_parts.append(f"Relevance Score (Semantic Sim): {result['semantic_similarity']:.4f}")

            # Add OS/PFS info to explanation if available
            os_imp_perc_orig = result.get('OS_Improvement (%)', 'N/A')
            pfs_imp_perc_orig = result.get('PFS_Improvement (%)', 'N/A')

            os_imp_str = ""
            if result['calculated_os_improvement_months'] is not None:
                 os_imp_str += f"{result['calculated_os_improvement_months']:.2f} months difference (Treatment OS: {result.get('Treatment_OS', 'N/A')}, Control OS: {result.get('Control_OS', 'N/A')})"
            elif os_imp_perc_orig != 'N/A':
                 os_imp_str += f"OS Improvement: {os_imp_perc_orig}"

            if os_imp_str:
                 explanation_parts.append("OS: " + os_imp_str)

            if pfs_imp_perc_orig != 'N/A':
                 explanation_parts.append(f"PFS Improvement: {pfs_imp_perc_orig}")


            formatted_output.append(f"\n--- Result {i+1} ---")
            formatted_output.append(f"Drug Name: {result['Drug Name']}")
            formatted_output.append(f"Cancer Type: {result['Cancer Type']}") 
            formatted_output.append(f"Summary: {result['Brief Study Summary']}") 

            formatted_output.append("Explanation: " + " | ".join(explanation_parts))


    print("\n".join(formatted_output))
    return potential_results 


In [15]:

# --- TESTING

print("\nPlease enter patient information for drug matching.")

# --- TEst 1
# user_cancer_type = "HR-positive, HER2-negative breast cancer"
# user_stage = "metastatic"
# user_biomarkers = "" # Testing biomarker match

# find_relevant_drugs(df, drug_embeddings, model, user_cancer_type, user_stage, user_biomarkers)


# --- Test 2: A 
user_cancer_type = "Non-Small Cell Lung Cancer"
user_stage = "Stage IV"
user_biomarkers = "EGFR mutation, PD-L1 positive"

find_relevant_drugs(df, drug_embeddings, model, user_cancer_type, user_stage, user_biomarkers)


# --- Test 3:
# user_cancer_type = "Melanoma"
# user_stage = "advanced unresectable"
# user_biomarkers = "BRAF V600E mutation"

# find_relevant_drugs(df, drug_embeddings, model, user_cancer_type, user_stage, user_biomarkers)



# --- Test 5
# user_cancer_type = "Prostate cancer"
# user_stage = "metastatic castration-resistant"
# user_biomarkers = "PSMA-positive, mCRPC"

# find_relevant_drugs(df, drug_embeddings, model, user_cancer_type, user_stage, user_biomarkers)


# --- Test 6
# user_cancer_type = "NSCLC"
# user_stage = "Stage 4 lung cancer"
# user_biomarkers = "PDL1 high"

# find_relevant_drugs(df, drug_embeddings, model, user_cancer_type, user_stage, user_biomarkers)


Please enter patient information for drug matching.

--- Searching for drugs for profile: non-small cell lung cancer stage iv egfr mutation pd-l1 positive ---

Found 17 potentially relevant drugs (Semantic Similarity >= 0.50):

--- Result 1 ---
Drug Name: Osimertinib (TAGRISSO)
Cancer Type: Non-small cell lung cancer (NSCLC) with EGFR exon 19 deletions or exon 21 L858R mutations
Summary: Osimertinib was approved for adjuvant therapy in NSCLC patients with specific mutations after tumor resection. The ADAURA trial showed a significant improvement in disease-free survival with osimertinib compared to placebo, leading to the FDA approval of the drug for this indication.
Explanation: Relevance Score (Semantic Sim): 0.6659 | OS: nan months difference (Treatment OS: Median DFS not reached in the osimertinib arm, Control OS: Median DFS of 19.6 months in the placebo arm) | PFS Improvement: Not provided

--- Result 2 ---
Drug Name: Osimertinib (Tagrisso)
Cancer Type: Non-small cell lung cancer

[{'index': 122,
  'semantic_similarity': np.float32(0.66592246),
  'calculated_os_improvement_months': nan,
  'os_improvement_percentage_parsed': nan,
  'pfs_improvement_percentage_parsed': nan,
  'Drug Name': 'Osimertinib (TAGRISSO)',
  'Cancer Type': 'Non-small cell lung cancer (NSCLC) with EGFR exon 19 deletions or exon 21 L858R mutations',
  'Brief Study Summary': 'Osimertinib was approved for adjuvant therapy in NSCLC patients with specific mutations after tumor resection. The ADAURA trial showed a significant improvement in disease-free survival with osimertinib compared to placebo, leading to the FDA approval of the drug for this indication.',
  'Treatment_OS': 'Median DFS not reached in the osimertinib arm',
  'Control_OS': 'Median DFS of 19.6 months in the placebo arm',
  'OS_Improvement (%)': 'HR 0.17 (95% CI: 0.12, 0.23; <0.0001)',
  'Treatment_PFS': 'Not provided',
  'Control_PFS': 'Not provided',
  'PFS_Improvement (%)': 'Not provided'},
 {'index': 226,
  'semantic_similar