In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_trial_similarity_tfidf(patient_description, trial_mappings):
  """Score every trial description against one patient description.

  A fresh TfidfVectorizer is fitted on the patient description together with
  all trial descriptions, and each trial is scored by the cosine similarity
  between its TF-IDF vector and the patient's vector.

  Args:
    patient_description: free-text description of the patient's condition.
    trial_mappings: mapping of trial ID -> trial description text.

  Returns:
    dict mapping each trial ID to its cosine similarity with the patient
    description, in the iteration order of ``trial_mappings``. An empty
    ``trial_mappings`` yields an empty dict.
  """
  trial_ids = list(trial_mappings.keys())
  if not trial_ids:
    # nothing to compare against; without this guard the trial matrix below
    # would have zero rows, which cosine_similarity rejects
    return {}

  # row 0 is the patient, rows 1..n are the trials (same order as trial_ids)
  all_descriptions = [patient_description] + list(trial_mappings.values())

  # fit-transform the TF-IDF vectorizer on the patient AND trial descriptions,
  # so both share one vocabulary
  vectorizer = TfidfVectorizer()
  tfidf_matrix = vectorizer.fit_transform(all_descriptions)

  # get similarities: one patient row against all trial rows
  patient_vector = tfidf_matrix[0]
  trial_vectors = tfidf_matrix[1:]
  similarities = cosine_similarity(patient_vector, trial_vectors).flatten()

  # convert to dict for later use
  return dict(zip(trial_ids, similarities))

# Smoke test: score one made-up patient description against three toy trials.
# Only "Trial 2" shares words with the patient text ("chronic", "pain").
example_desc = "Chronic fatigue and muscle pain"

trial_dict = dict([
    ("Trial 1", "Study on fibromyalgia treatment"),
    ("Trial 2", "Research on chronic pain management"),
    ("Trial 3", "Clinical trial for asthma patients"),
])

print(compute_trial_similarity_tfidf(patient_description=example_desc,
                                     trial_mappings=trial_dict))

{'Trial 1': 0.0, 'Trial 2': 0.3069923851791559, 'Trial 3': 0.0}


In [22]:
# Clean and make sense of clinical trial data
import pandas as pd

trial_df = pd.read_csv('ctg-studies.csv')
# No dropna() here: rows with missing fields are kept and the blanks in the
# columns we use are normalised to '' below. (A bare
# `trial_df.dropna(how='any')` returns a new frame and leaves trial_df
# untouched, so it never removed anything.)
trial_dict = {} # maps from trial ID to the population it treats

# normalise the text columns: missing -> '', everything lower-case
for column in ('Conditions', 'Sex', 'Age'):
  trial_df[column] = trial_df[column].fillna('').astype(str).str.lower()

# work on a reproducible random subset of at most 3000 trials; capping at the
# frame length keeps sample() from raising when the CSV has fewer rows
trial_df = trial_df.sample(n=min(3000, len(trial_df)), random_state=42)

# age-group label -> inclusive (min_age, max_age) range (presumably years)
age_dict = {"child": (0, 18),
            "child, adult": (0, 65),
            "child, adult, older_adult": (0, 100),
            "adult": (18, 65),
            "adult, older_adult": (18, 100),
            "older_adult": (65, 100)}

# sex label -> code; any other value falls back to 2 below
sex_dict = {"male": 0,
            "female": 1}

# one pass over the needed columns instead of repeated trial_df.iloc[row][...]
for nct_id, condition, sex_label, age_label in zip(trial_df['NCT Number'],
                                                   trial_df['Conditions'],
                                                   trial_df['Sex'],
                                                   trial_df['Age']):
  trial_dict[nct_id] = {"Condition": condition,
                        # unrecognised age label -> widest range
                        "Age": age_dict.get(age_label, (0, 100)),
                        # unrecognised sex label -> 2 (not male/female specific)
                        "Sex": sex_dict.get(sex_label, 2)}

print(trial_dict)

  trial_df = pd.read_csv('ctg-studies.csv')




In [24]:
import pandas as pd

# Patient table; the loop below reads its 'DISEASE_DISORDER', 'AGE' and 'SEX' columns.
patient_df = pd.read_csv('patient_population_data.csv')

# Running SUM of per-patient reciprocal ranks (not yet a mean): the patient
# loop adds to it and a later cell divides it by len(patient_df).
mrr = 0

def calculate_mrr(patient_condition, ranked_trials):
    """Return the reciprocal rank of the first matching trial for one patient.

    A trial matches when ANY whitespace-separated token of the lower-cased
    ``patient_condition`` occurs as a substring of the trial's lower-cased
    description (so a short token such as "and" also matches inside longer
    words).

    Args:
        patient_condition: condition text of the patient.
        ranked_trials: iterable of ``(trial_id, score, trial_description)``
            tuples, best-ranked first.

    Returns:
        ``1 / position`` (1-based) of the first matching trial, or ``0.0``
        when no trial matches.
    """
    condition_tokens = patient_condition.lower().split()

    for rank, (_trial_id, _score, trial_description) in enumerate(ranked_trials, start=1):
        # the first hit decides the score; later trials are never inspected
        if any(token in trial_description.lower() for token in condition_tokens):
            return 1 / rank

    # no trial mentions any token of the condition
    return 0.0

# Trial ID -> condition text. This is the same for every patient, so it is
# built once here instead of once per patient.
trial_conditions = {trial_id: details['Condition'] for trial_id, details in trial_dict.items()}

for _, patient_row in patient_df.iterrows():
    patient_desc = patient_row['DISEASE_DISORDER']

    # TF-IDF cosine similarity of this patient's condition to every trial
    matching_results = compute_trial_similarity_tfidf(patient_desc, trial_conditions)

    # Patient-level eligibility inputs: they do not depend on the trial, so
    # they are worked out once per patient rather than once per trial.
    patient_age = patient_row['AGE']
    age_known = pd.notna(patient_age)

    # 0 = male, 1 = female, 2 = any other recorded value, None = not recorded
    patient_sex = None
    if pd.notna(patient_row['SEX']):
        sex_text = patient_row['SEX'].lower()
        if sex_text in ['male', 'man']:
            patient_sex = 0
        elif sex_text in ['female', 'woman']:
            patient_sex = 1
        else:
            patient_sex = 2

    # list to store eligible trials for this patient
    patient_trial_similarities = []

    for trial_id, trial_details in trial_dict.items():
        # check age match: a missing patient age never excludes a trial
        min_age, max_age = trial_details["Age"]
        age_match = (not age_known) or (min_age <= patient_age <= max_age)

        # check sex match: a missing patient sex never excludes a trial, and a
        # trial coded 2 accepts everyone
        trial_sex = trial_details["Sex"]
        sex_match = patient_sex is None or trial_sex == 2 or trial_sex == patient_sex

        # include trial if both age and sex match
        if age_match and sex_match:
            patient_trial_similarities.append(
                (trial_id, matching_results[trial_id], trial_details['Condition']))

    # best match first; ties keep trial_dict order because the sort is stable
    patient_trial_similarities.sort(key=lambda x: x[1], reverse=True)

    top_10_trials = patient_trial_similarities[:10]

    # the reciprocal rank is computed over the full eligible ranking, not just the top 10
    mrr += calculate_mrr(patient_desc, patient_trial_similarities)

    print(f"\nTop 10 trials for patient w condition '{patient_desc}':")
    for trial_id, score, condition in top_10_trials:
        print(f"Trial ID: {trial_id}, similarity: {score:.4f}, condition: {condition}")



Top 10 Trials for Patient 'tuberculosis':
Trial ID: NCT06700577, Similarity Score: 1.0000, Condition: tuberculosis
Trial ID: NCT06702774, Similarity Score: 0.6727, Condition: tuberculosis (tb)
Trial ID: NCT06618573, Similarity Score: 0.5401, Condition: tuberculosis|systemic lupus erythematosus
Trial ID: NCT06608069, Similarity Score: 0.5184, Condition: tuberculosis, pulmonary|hiv coinfection
Trial ID: NCT06701136, Similarity Score: 0.4722, Condition: rifampicin-resistant pulmonary tuberculosis patients
Trial ID: NCT06700876, Similarity Score: 0.4594, Condition: tuberculosis (tb)|end-stage kidney disease
Trial ID: NCT06637189, Similarity Score: 0.0000, Condition: stimulation in the ovary|embryo|oocyte|oocyte retrieval|fertilization in vitro|blastocyst|pgt-a
Trial ID: NCT06609863, Similarity Score: 0.0000, Condition: advanced hepatocellular carcinoma|atezolizumab|bevacizumab|chemotherapy
Trial ID: NCT06684197, Similarity Score: 0.0000, Condition: dexmedetomidine|total intravenous anesth

In [25]:
# `mrr` holds the SUM of per-patient reciprocal ranks accumulated by the
# patient loop; dividing by the number of patients gives the mean reciprocal
# rank. Raises ZeroDivisionError if patient_df has no rows.
print(mrr / len(patient_df))

0.7727660861495764
