In [6]:
import pandas as pd
import torch
import random
from sentence_transformers import SentenceTransformer, util

# === Load Ontology CSV ===
# Only the label and the parent reference are needed; rows lacking either
# one are discarded.
df = pd.read_csv("ENDOH.csv")[['Preferred Label', 'Parents']].dropna()
df['Preferred Label'] = df['Preferred Label'].astype(str)
# A concept's cluster is whatever follows the last '#' in its parent value.
df['Cluster'] = [parent.strip().split('#')[-1] for parent in df['Parents']]

# === Build full cluster dictionary ===
# Maps every cluster name to the list of labels that belong to it.
full_cluster_dict = {
    cluster: list(labels)
    for cluster, labels in df.groupby('Cluster')['Preferred Label']
}

# === Redundancy Function ===
def jaccard_sim(a, b):
    """Return the Jaccard similarity of two underscore-delimited labels.

    Each label is lower-cased and split on ``_``; the score is the number of
    tokens the labels share divided by the number of distinct tokens across
    both of them.
    """
    tokens_a, tokens_b = (set(label.lower().split('_')) for label in (a, b))
    combined = tokens_a.union(tokens_b)
    if not combined:
        return 0
    shared = tokens_a.intersection(tokens_b)
    return len(shared) / len(combined)

# === Initialize SentenceTransformer ===
# Shared embedding model: compute_utility_within_cluster() calls
# model.encode() on the concept label and on its cluster siblings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Weights ===
# Coefficients of the utility score built in compute_utility_within_cluster():
#   utility = w1 * avg_similarity - w2 * max_redundancy
w1 = 1  # weight of the mean cosine-similarity term
w2 = 1  # weight of the maximum Jaccard-redundancy term

# === Compute Utility Within Original Cluster ===
def compute_utility_within_cluster(concept_label, cluster_terms):
    """Score a concept against the other members of its own cluster.

    The utility is ``w1 * avg_similarity - w2 * max_redundancy`` where
    ``avg_similarity`` is the mean cosine similarity between the embedding of
    ``concept_label`` and the embeddings of its siblings, and
    ``max_redundancy`` is the highest ``jaccard_sim`` score against any
    sibling.

    Returns:
        ``(utility_score, avg_similarity, max_redundancy, max_redundant_term)``,
        or four ``None`` values when ``cluster_terms`` holds nothing other
        than ``concept_label``.
    """
    # Siblings only: a concept must not be compared with itself.
    siblings = [candidate for candidate in cluster_terms if candidate != concept_label]
    if len(siblings) == 0:
        return (None,) * 4

    # Semantic similarity: mean cosine between the concept and each sibling.
    concept_vec = model.encode(concept_label, convert_to_tensor=True)
    sibling_vecs = model.encode(siblings, convert_to_tensor=True)
    avg_similarity = torch.mean(util.cos_sim(concept_vec, sibling_vecs)).item()

    # Redundancy: strongest token overlap; the earliest sibling wins ties.
    overlaps = [jaccard_sim(concept_label, sibling) for sibling in siblings]
    max_redundancy = max(overlaps)
    max_redundant_term = siblings[overlaps.index(max_redundancy)]

    utility_score = (w1 * avg_similarity) - (w2 * max_redundancy)
    return utility_score, avg_similarity, max_redundancy, max_redundant_term

# === Main Experiment ===
def estimate_lambda(n=35):
    """Estimate lambda as the mean within-cluster utility of random concepts.

    Draws ``n`` rows of ``df`` uniformly at random (with replacement), scores
    each drawn concept against the other members of its own cluster with
    ``compute_utility_within_cluster``, prints a short report per scored
    concept, and averages the utility scores.

    Args:
        n: Number of random draws. A draw whose cluster contains no other
            concept is skipped, so fewer than ``n`` results may be collected.

    Returns:
        A ``(lambda_estimate, results_df)`` tuple. ``lambda_estimate`` is the
        mean utility score, or ``nan`` when no draw produced a score (``n`` is
        0, ``df`` is empty, or every draw was skipped). ``results_df`` has one
        row per scored draw and always carries the result columns.
    """
    columns = [
        "Concept",
        "Cluster",
        "Semantic Similarity",
        "Max Redundancy",
        "Redundant With",
        "Utility Score",
    ]
    results = []
    n_rows = len(df)
    # Nothing can be drawn from an empty ontology table.
    draws = n if n_rows else 0

    for _ in range(draws):
        # Draw a row position instead of a label. This avoids re-scanning the
        # whole table to find the label again, and a label that occurs under
        # several parents keeps the cluster of the row that was actually
        # drawn rather than always resolving to its first occurrence.
        picked_row = df.iloc[random.randrange(n_rows)]
        picked = picked_row['Preferred Label']
        picked_cluster = picked_row['Cluster']
        cluster_terms = full_cluster_dict[picked_cluster]

        us, sim, red, red_term = compute_utility_within_cluster(picked, cluster_terms)
        if us is None:
            continue  # Skip clusters with only one concept

        print(f"\n🔹 Concept: {picked}")
        print(f"   🏷️  Cluster: {picked_cluster}")
        print(f"   ✅ Semantic Similarity: {sim:.4f}")
        print(f"   ⚠️  Max Redundancy: {red:.4f} → with '{red_term}'")
        print(f"   📈 Utility Score: {us:.4f}")

        results.append({
            "Concept": picked,
            "Cluster": picked_cluster,
            "Semantic Similarity": sim,
            "Max Redundancy": red,
            "Redundant With": red_term,
            "Utility Score": us,
        })

    results_df = pd.DataFrame(results, columns=columns)
    if not results:
        # The mean of zero scores is undefined; report NaN instead of
        # failing with ZeroDivisionError.
        return float("nan"), results_df

    lambda_estimate = sum(r['Utility Score'] for r in results) / len(results)
    return lambda_estimate, results_df

# === Run it ===
# 100 random draws (overriding the default n=35); lambda_value is the mean
# utility score and results_df keeps the per-concept rows behind it.
lambda_value, results_df = estimate_lambda(n=100)
print(f"\n🧪 Estimated Lambda (Avg. Utility Score of Concepts vs. Own Cluster): {lambda_value:.4f}")


🔹 Concept: Poor_indoor_light_quality
   🏷️  Cluster: Home_as_microenvironment
   ✅ Semantic Similarity: 0.3023
   ⚠️  Max Redundancy: 0.2000 → with 'water_quality'
   📈 Utility Score: 0.1023

🔹 Concept: Construction_dust_hazrad
   🏷️  Cluster: Exposure_to_pollutants_workplace
   ✅ Semantic Similarity: 0.3423
   ⚠️  Max Redundancy: 0.0000 → with 'Industrial_noise_pollution'
   📈 Utility Score: 0.3423

🔹 Concept: Airborne_disease_transmission_in_public_spaces
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.2828
   ⚠️  Max Redundancy: 0.2857 → with 'Public_green_spaces'
   📈 Utility Score: -0.0029

🔹 Concept: Community_noise_pollution
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.3414
   ⚠️  Max Redundancy: 0.2000 → with 'Lead_pipes_community'
   📈 Utility Score: 0.1414

🔹 Concept: Disaster_prepardness_in_urban_area
   🏷️  Cluster: Community_resilience
   ✅ Semantic Similarity: 0.0732
   ⚠️  Max Redundancy: 0.0000 → with 'Social_Suppo


🔹 Concept: Organic_pollutants
   🏷️  Cluster: Trans_boundary_pollution
   ✅ Semantic Similarity: 0.4608
   ⚠️  Max Redundancy: 0.0000 → with 'Ozone_Depeletion'
   📈 Utility Score: 0.4608

🔹 Concept: Waste_recycling_facilities
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.3322
   ⚠️  Max Redundancy: 0.2000 → with 'Waste_managment_facility'
   📈 Utility Score: 0.1322

🔹 Concept: workplace_as_macroenvironment
   🏷️  Cluster: At_the_microenvironmental_level
   ✅ Semantic Similarity: 0.4208
   ⚠️  Max Redundancy: 0.2000 → with 'Home_as_microenvironment'
   📈 Utility Score: 0.2208

🔹 Concept: Access_to_clean_Water
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.3228
   ⚠️  Max Redundancy: 0.3333 → with 'Access_to_nutritious_food'
   📈 Utility Score: -0.0106

🔹 Concept: regulations_on_water_quality
   🏷️  Cluster: policy_governance_effect
   ✅ Semantic Similarity: 0.7297
   ⚠️  Max Redundancy: 0.6000 → with 'regulations_on_air_Quality'
  


🔹 Concept: Bacterial_contamination_in_water_storage_tanks
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.3333
   ⚠️  Max Redundancy: 0.2222 → with 'Contaminants_in_public_water_source'
   📈 Utility Score: 0.1111

🔹 Concept: Respiratory_disease
   🏷️  Cluster: Chronic_Disease
   ✅ Semantic Similarity: 0.1822
   ⚠️  Max Redundancy: 0.0000 → with 'toxin_induced_cancer_risk'
   📈 Utility Score: 0.1822

🔹 Concept: Global_heat_Wave
   🏷️  Cluster: Extreme_weather_Events
   ✅ Semantic Similarity: 0.3197
   ⚠️  Max Redundancy: 0.0000 → with 'Climatic_changes'
   📈 Utility Score: 0.3197

🔹 Concept: Accessibility_to_green_space
   🏷️  Cluster: At_the_mesoenvironmental_level
   ✅ Semantic Similarity: 0.2640
   ⚠️  Max Redundancy: 0.4000 → with 'Accessibility_to_cleanstreets'
   📈 Utility Score: -0.1360

🔹 Concept: Unregulated_cosmetic_use
   🏷️  Cluster: At_the_Individual_level
   ✅ Semantic Similarity: 0.2802
   ⚠️  Max Redundancy: 0.0000 → with 'genetic_predispositio

In [1]:
import pandas as pd
import torch
import random
from sentence_transformers import SentenceTransformer, util

# === Load Ontology CSV ===
# Restrict the table to the label / parent columns and discard incomplete rows.
df = pd.read_csv("ENDOH.csv")[['Preferred Label', 'Parents']].dropna()
df['Preferred Label'] = df['Preferred Label'].astype(str)
# Cluster name = text after the last '#' of the (whitespace-trimmed) parent.
df['Cluster'] = [parent_ref.strip().split('#')[-1] for parent_ref in df['Parents']]

# === Build full cluster dictionary ===
# cluster name -> labels of every concept filed under that cluster
full_cluster_dict = {
    name: list(members)
    for name, members in df.groupby('Cluster')['Preferred Label']
}

# === Redundancy Function ===
def jaccard_sim(a, b):
    """Jaccard overlap between the underscore-separated tokens of two labels.

    Comparison is case-insensitive. Returns shared tokens / distinct tokens.
    """
    left = frozenset(a.lower().split('_'))
    right = frozenset(b.lower().split('_'))
    n_distinct = len(left | right)
    if n_distinct == 0:
        return 0
    return len(left & right) / n_distinct

# === Initialize SentenceTransformer ===
# Shared embedding model: compute_utility_within_cluster() calls
# model.encode() on the concept label and on its cluster siblings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Weights ===
# Coefficients of the utility score built in compute_utility_within_cluster():
#   utility = w1 * avg_similarity - w2 * max_redundancy
w1 = 1  # weight of the mean cosine-similarity term
w2 = 1  # weight of the maximum Jaccard-redundancy term

# === Compute Utility Within Original Cluster ===
def compute_utility_within_cluster(concept_label, cluster_terms):
    """Compute the utility of ``concept_label`` relative to its cluster mates.

    Utility = ``w1`` * (mean cosine similarity to the other cluster terms)
    minus ``w2`` * (largest ``jaccard_sim`` score against any of them).

    Returns:
        ``(utility_score, avg_similarity, max_redundancy, max_redundant_term)``;
        all four are ``None`` when no term other than ``concept_label`` is
        present in ``cluster_terms``.
    """
    others = [term for term in cluster_terms if term != concept_label]
    if not others:
        # The concept is alone in its cluster; there is nothing to compare.
        return None, None, None, None

    # Embed the concept and its cluster mates, then average the cosines.
    target = model.encode(concept_label, convert_to_tensor=True)
    mates = model.encode(others, convert_to_tensor=True)
    similarities = util.cos_sim(target, mates)
    avg_similarity = torch.mean(similarities).item()

    # Locate the first cluster mate with the highest token overlap.
    best_pos, max_redundancy = max(
        enumerate(jaccard_sim(concept_label, other) for other in others),
        key=lambda pair: pair[1],
    )
    max_redundant_term = others[best_pos]

    reward = w1 * avg_similarity
    penalty = w2 * max_redundancy
    utility_score = reward - penalty
    return utility_score, avg_similarity, max_redundancy, max_redundant_term

# === Main Experiment ===
def estimate_lambda(n=35):
    """Estimate lambda as the mean within-cluster utility of random concepts.

    Draws ``n`` rows of ``df`` uniformly at random (with replacement), scores
    each drawn concept against the other members of its own cluster with
    ``compute_utility_within_cluster``, prints a short report per scored
    concept, and averages the utility scores.

    Args:
        n: Number of random draws. A draw whose cluster contains no other
            concept is skipped, so fewer than ``n`` results may be collected.

    Returns:
        A ``(lambda_estimate, results_df)`` tuple. ``lambda_estimate`` is the
        mean utility score, or ``nan`` when no draw produced a score (``n`` is
        0, ``df`` is empty, or every draw was skipped). ``results_df`` has one
        row per scored draw and always carries the result columns.
    """
    columns = [
        "Concept",
        "Cluster",
        "Semantic Similarity",
        "Max Redundancy",
        "Redundant With",
        "Utility Score",
    ]
    results = []
    n_rows = len(df)
    # Nothing can be drawn from an empty ontology table.
    draws = n if n_rows else 0

    for _ in range(draws):
        # Draw a row position instead of a label. This avoids re-scanning the
        # whole table to find the label again, and a label that occurs under
        # several parents keeps the cluster of the row that was actually
        # drawn rather than always resolving to its first occurrence.
        picked_row = df.iloc[random.randrange(n_rows)]
        picked = picked_row['Preferred Label']
        picked_cluster = picked_row['Cluster']
        cluster_terms = full_cluster_dict[picked_cluster]

        us, sim, red, red_term = compute_utility_within_cluster(picked, cluster_terms)
        if us is None:
            continue  # Skip clusters with only one concept

        print(f"\n🔹 Concept: {picked}")
        print(f"   🏷️  Cluster: {picked_cluster}")
        print(f"   ✅ Semantic Similarity: {sim:.4f}")
        print(f"   ⚠️  Max Redundancy: {red:.4f} → with '{red_term}'")
        print(f"   📈 Utility Score: {us:.4f}")

        results.append({
            "Concept": picked,
            "Cluster": picked_cluster,
            "Semantic Similarity": sim,
            "Max Redundancy": red,
            "Redundant With": red_term,
            "Utility Score": us,
        })

    results_df = pd.DataFrame(results, columns=columns)
    if not results:
        # The mean of zero scores is undefined; report NaN instead of
        # failing with ZeroDivisionError.
        return float("nan"), results_df

    lambda_estimate = sum(r['Utility Score'] for r in results) / len(results)
    return lambda_estimate, results_df

# === Run it ===
# 102 random draws (overriding the default n=35); lambda_value is the mean
# utility score and results_df keeps the per-concept rows behind it.
lambda_value, results_df = estimate_lambda(n=102)
print(f"\n🧪 Estimated Lambda (Avg. Utility Score of Concepts vs. Own Cluster): {lambda_value:.4f}")

  from .autonotebook import tqdm as notebook_tqdm



🔹 Concept: Waste_managment_facility
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.3658
   ⚠️  Max Redundancy: 0.2000 → with 'Clean_sanitation_facility'
   📈 Utility Score: 0.1658

🔹 Concept: At_the_microenvironmental_level
   🏷️  Cluster: Environmental_Determinants_of_Health
   ✅ Semantic Similarity: 0.5469
   ⚠️  Max Redundancy: 0.6000 → with 'At_the_macroenvironmental_level'
   📈 Utility Score: -0.0531

🔹 Concept: Physical_safety_measures_in_workplace
   🏷️  Cluster: At_the_macroenvironmental_level
   ✅ Semantic Similarity: 0.3668
   ⚠️  Max Redundancy: 0.2500 → with 'Mental_health_support_in_workplace'
   📈 Utility Score: 0.1168

🔹 Concept: Erosion_in_irrigation_channels
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.2481
   ⚠️  Max Redundancy: 0.1250 → with 'Contaminants_in_public_water_source'
   📈 Utility Score: 0.1231

🔹 Concept: Indoor_air_purification
   🏷️  Cluster: Home_as_microenvironment
   ✅ Semantic Similarity: 0.31


🔹 Concept: Exposure_to_pollutants_home
   🏷️  Cluster: At_the_Individual_level
   ✅ Semantic Similarity: 0.4427
   ⚠️  Max Redundancy: 0.6000 → with 'Exposure_to_pollutants_workplace'
   📈 Utility Score: -0.1573

🔹 Concept: air_quality
   🏷️  Cluster: Home_as_microenvironment
   ✅ Semantic Similarity: 0.4134
   ⚠️  Max Redundancy: 0.3333 → with 'water_quality'
   📈 Utility Score: 0.0800

🔹 Concept: Industrial_noise_pollution
   🏷️  Cluster: Exposure_to_pollutants_workplace
   ✅ Semantic Similarity: 0.4213
   ⚠️  Max Redundancy: 0.0000 → with 'Construction_dust_hazrad'
   📈 Utility Score: 0.4213

🔹 Concept: At_the_Individual_level
   🏷️  Cluster: Environmental_Determinants_of_Health
   ✅ Semantic Similarity: 0.4840
   ⚠️  Max Redundancy: 0.6000 → with 'At_the_microenvironmental_level'
   📈 Utility Score: -0.1160

🔹 Concept: Accessibility_to_cleanstreets
   🏷️  Cluster: At_the_mesoenvironmental_level
   ✅ Semantic Similarity: 0.2535
   ⚠️  Max Redundancy: 0.4000 → with 'Accessibility_to


🔹 Concept: regulations_on_air_Quality
   🏷️  Cluster: policy_governance_effect
   ✅ Semantic Similarity: 0.7297
   ⚠️  Max Redundancy: 0.6000 → with 'regulations_on_water_quality'
   📈 Utility Score: 0.1297

🔹 Concept: Radon_Exposure_through_soil
   🏷️  Cluster: Home_as_microenvironment
   ✅ Semantic Similarity: 0.2572
   ⚠️  Max Redundancy: 0.1429 → with 'Exposure_to_synthetic_materials'
   📈 Utility Score: 0.1143

🔹 Concept: Rising_sea_level
   🏷️  Cluster: At_the_global_level
   ✅ Semantic Similarity: 0.2970
   ⚠️  Max Redundancy: 0.0000 → with 'Trans_boundary_pollution'
   📈 Utility Score: 0.2970

🔹 Concept: Arsenic_drinking_Water
   🏷️  Cluster: Exposure_to_toxins_home
   ✅ Semantic Similarity: 0.3747
   ⚠️  Max Redundancy: 0.5000 → with 'Microplastic_drinking_Water'
   📈 Utility Score: -0.1253

🔹 Concept: Sedentary_lifestyle_consequence
   🏷️  Cluster: At_the_Individual_level
   ✅ Semantic Similarity: 0.3094
   ⚠️  Max Redundancy: 0.1429 → with 'Exposure_to_hazards_lifestyle_fac

In [2]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# === Load Ontology CSV ===
# Rows without a label are discarded before the string cast: astype(str)
# would otherwise turn a missing label into the literal text "nan", which
# would then be grouped, embedded and scored as if it were a real concept.
df = pd.read_csv("ENDOH.csv").dropna(subset=['Preferred Label'])
df['Preferred Label'] = df['Preferred Label'].astype(str)

# === Assign Clusters ===
# A concept's cluster is whatever follows the last '#' in its parent value;
# concepts with no parent are filed under the "ROOT" sentinel so that they
# can be excluded later.
# NOTE(review): if a Parents cell can hold several references, only the last
# one is used here — confirm against the export format.
df['Cluster'] = df['Parents'].apply(lambda x: x.strip().split('#')[-1] if pd.notna(x) else "ROOT")

# === Build full cluster dictionary ===
# Maps every cluster name (including "ROOT") to the labels filed under it.
full_cluster_dict = df.groupby('Cluster')['Preferred Label'].apply(list).to_dict()

# === Redundancy Function ===
def jaccard_sim(a, b):
    """Token-level Jaccard score of two labels written with underscores.

    Labels are lower-cased before being split on ``_``. The result is the
    size of the token intersection over the size of the token union.
    """
    first = set(a.lower().split('_'))
    second = set(b.lower().split('_'))
    overlap = len(first & second)
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    distinct = len(first) + len(second) - overlap
    return overlap / distinct if distinct else 0

# === Initialize SentenceTransformer ===
# Shared embedding model: compute_utility_within_cluster() calls
# model.encode() on the concept label and on its cluster siblings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Weights ===
# Coefficients of the utility score built in compute_utility_within_cluster():
#   utility = w1 * avg_similarity - w2 * max_redundancy
w1 = 1  # weight of the mean cosine-similarity term
w2 = 1  # weight of the maximum Jaccard-redundancy term

# === Compute Utility Within Original Cluster ===
def compute_utility_within_cluster(concept_label, cluster_terms):
    """Rate how well ``concept_label`` fits among the rest of its cluster.

    Two quantities are combined into ``w1 * avg_similarity - w2 *
    max_redundancy``: the mean cosine similarity between the concept's
    embedding and those of its peers, and the largest ``jaccard_sim`` score
    against any peer.

    Returns:
        ``(utility_score, avg_similarity, max_redundancy, max_redundant_term)``
        or a tuple of four ``None`` when the concept has no peers in
        ``cluster_terms``.
    """
    peers = [entry for entry in cluster_terms if entry != concept_label]
    if not peers:
        return None, None, None, None  # no peers, nothing to score

    # --- semantic similarity ---
    concept_embedding = model.encode(concept_label, convert_to_tensor=True)
    peer_embeddings = model.encode(peers, convert_to_tensor=True)
    avg_similarity = torch.mean(util.cos_sim(concept_embedding, peer_embeddings)).item()

    # --- redundancy ---
    # Start from the first peer and replace it only on a strictly larger
    # score, so the earliest peer is kept when several share the maximum.
    remaining = iter(peers)
    max_redundant_term = next(remaining)
    max_redundancy = jaccard_sim(concept_label, max_redundant_term)
    for peer in remaining:
        overlap = jaccard_sim(concept_label, peer)
        if overlap > max_redundancy:
            max_redundancy, max_redundant_term = overlap, peer

    # --- utility ---
    utility_score = (w1 * avg_similarity) - (w2 * max_redundancy)
    return utility_score, avg_similarity, max_redundancy, max_redundant_term

# === Leave-One-Out Estimation for All Non-Root Concepts ===
def estimate_lambda_leave_one_out():
    """Estimate lambda as the mean utility over every non-root concept.

    Each row of ``df`` whose cluster is not ``"ROOT"`` is scored once against
    the remaining members of its own cluster with
    ``compute_utility_within_cluster``; a short report is printed for every
    scored concept and the utility scores are averaged.

    Returns:
        A ``(lambda_estimate, results_df)`` tuple. ``lambda_estimate`` is the
        mean utility score, or ``nan`` when no concept could be scored.
        ``results_df`` has one row per scored concept and always carries the
        result columns.
    """
    columns = [
        "Concept",
        "Cluster",
        "Semantic Similarity",
        "Max Redundancy",
        "Redundant With",
        "Utility Score",
    ]

    # Exclude root concepts (those whose cluster is the "ROOT" sentinel).
    # The selection is only read, so no defensive copy is needed.
    non_root_df = df[df['Cluster'] != "ROOT"]

    results = []

    for picked, picked_cluster in zip(non_root_df['Preferred Label'], non_root_df['Cluster']):
        cluster_terms = full_cluster_dict.get(picked_cluster, [])

        # Skip if this is the only concept in the cluster
        if len(cluster_terms) <= 1:
            continue

        # NOTE(review): the whole cluster is re-encoded for every member;
        # caching embeddings per cluster would avoid the repeated work.
        us, sim, red, red_term = compute_utility_within_cluster(picked, cluster_terms)
        if us is None:
            continue

        print(f"\n🔹 Concept: {picked}")
        print(f"   🏷️  Cluster: {picked_cluster}")
        print(f"   ✅ Semantic Similarity: {sim:.4f}")
        print(f"   ⚠️  Max Redundancy: {red:.4f} → with '{red_term}'")
        print(f"   📈 Utility Score: {us:.4f}")

        results.append({
            "Concept": picked,
            "Cluster": picked_cluster,
            "Semantic Similarity": sim,
            "Max Redundancy": red,
            "Redundant With": red_term,
            "Utility Score": us,
        })

    results_df = pd.DataFrame(results, columns=columns)
    if not results:
        # The mean of zero scores is undefined; report NaN instead of
        # failing with ZeroDivisionError.
        return float("nan"), results_df

    lambda_estimate = sum(r['Utility Score'] for r in results) / len(results)
    return lambda_estimate, results_df

# === Run it ===
# Scores every non-root concept once; lambda_value is the mean utility score
# and results_df keeps the per-concept rows behind it.
lambda_value, results_df = estimate_lambda_leave_one_out()
print(f"\n🧪 Final Estimated Lambda (Avg. Utility Score across all non-root concepts): {lambda_value:.4f}")


🔹 Concept: Waste_managment_facility
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.3658
   ⚠️  Max Redundancy: 0.2000 → with 'Clean_sanitation_facility'
   📈 Utility Score: 0.1658

🔹 Concept: Industrial_noise_pollution
   🏷️  Cluster: Exposure_to_pollutants_workplace
   ✅ Semantic Similarity: 0.4213
   ⚠️  Max Redundancy: 0.0000 → with 'Construction_dust_hazrad'
   📈 Utility Score: 0.4213

🔹 Concept: Ozone_Depeletion
   🏷️  Cluster: Trans_boundary_pollution
   ✅ Semantic Similarity: 0.4023
   ⚠️  Max Redundancy: 0.0000 → with 'acid_rain'
   📈 Utility Score: 0.4023

🔹 Concept: Airborne_disease_transmission_in_public_spaces
   🏷️  Cluster: Local_community_infrastructre
   ✅ Semantic Similarity: 0.2828
   ⚠️  Max Redundancy: 0.2857 → with 'Public_green_spaces'
   📈 Utility Score: -0.0029

🔹 Concept: Built_in_environment
   🏷️  Cluster: At_the_mesoenvironmental_level
   ✅ Semantic Similarity: 0.1753
   ⚠️  Max Redundancy: 0.0000 → with 'Community_resilience'
   


🔹 Concept: Community_resilience
   🏷️  Cluster: At_the_mesoenvironmental_level
   ✅ Semantic Similarity: 0.1901
   ⚠️  Max Redundancy: 0.0000 → with 'Built_in_environment'
   📈 Utility Score: 0.1901

🔹 Concept: loss_of_urban_forest
   🏷️  Cluster: Accessibility_to_green_space
   ✅ Semantic Similarity: 0.2627
   ⚠️  Max Redundancy: 0.0000 → with 'Parks_and_recreational_facility_gaps'
   📈 Utility Score: 0.2627

🔹 Concept: Exposure_to_toxins_workplace
   🏷️  Cluster: At_the_Individual_level
   ✅ Semantic Similarity: 0.4637
   ⚠️  Max Redundancy: 0.6000 → with 'Exposure_to_pollutants_workplace'
   📈 Utility Score: -0.1363

🔹 Concept: Health_outcomes
   🏷️  Cluster: Environmental_Determinants_of_Health
   ✅ Semantic Similarity: 0.2346
   ⚠️  Max Redundancy: 0.0000 → with 'At_the_microenvironmental_level'
   📈 Utility Score: 0.2346

🔹 Concept: At_the_macroenvironmental_level
   🏷️  Cluster: Environmental_Determinants_of_Health
   ✅ Semantic Similarity: 0.5368
   ⚠️  Max Redundancy: 0.6000 


🔹 Concept: Level_of_pollution
   🏷️  Cluster: At_the_mesoenvironmental_level
   ✅ Semantic Similarity: 0.2569
   ⚠️  Max Redundancy: 0.0000 → with 'Built_in_environment'
   📈 Utility Score: 0.2569

🔹 Concept: air_quality
   🏷️  Cluster: Home_as_microenvironment
   ✅ Semantic Similarity: 0.4134
   ⚠️  Max Redundancy: 0.3333 → with 'water_quality'
   📈 Utility Score: 0.0800

🔹 Concept: Global_heat_Wave
   🏷️  Cluster: Extreme_weather_Events
   ✅ Semantic Similarity: 0.3197
   ⚠️  Max Redundancy: 0.0000 → with 'Climatic_changes'
   📈 Utility Score: 0.3197

🔹 Concept: At_the_global_level
   🏷️  Cluster: Environmental_Determinants_of_Health
   ✅ Semantic Similarity: 0.4724
   ⚠️  Max Redundancy: 0.6000 → with 'At_the_microenvironmental_level'
   📈 Utility Score: -0.1276

🔹 Concept: Social_Support_network
   🏷️  Cluster: Community_resilience
   ✅ Semantic Similarity: 0.0732
   ⚠️  Max Redundancy: 0.0000 → with 'Disaster_prepardness_in_urban_area'
   📈 Utility Score: 0.0732

🔹 Concept: High_