In [None]:
# Mount Google Drive at /content/drive; later cells in this notebook read
# project files from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Make the thesis project on Drive importable. The membership check keeps the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
PROJECT_ROOT = '/content/drive/MyDrive/master_thesis2'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

Collecting spacy<3.8.0,>=3.7.0 (from scispacy)
  Downloading spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy<3.8.0,>=3.7.0->scispacy)
  Downloading thinc-8.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading thinc-8.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (922 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m922.4/922.4 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.1.12
    Uninstalling thinc-8.1.12:
      Successfully uninstalled thinc-8.1.12
  Attempting uninstall: spacy
    Found existing ins

1. Build the breast cancer taxonomy (using the domain and reference corpora)

In [None]:
!pip install rdflib



In [None]:
import os
import re
import numpy as np
import spacy
import networkx as nx

from tqdm import tqdm
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import chi2_contingency

# For RDF output
from rdflib import Graph as RDFGraph, Namespace, URIRef, RDF
from rdflib.namespace import RDFS
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector

# 1) LOAD SCISPACY MODEL (WITH UMLS LINKER)
def load_scispacy_model_with_linker(threshold=0.8):
    """
    Load the SciSpaCy ``en_core_sci_sm`` model and attach UMLS entity linking.

    Two components are appended to the loaded pipeline:
      - ``abbreviation_detector``
      - ``scispacy_linker`` configured for the UMLS knowledge base, with
        abbreviation resolution enabled.

    Args:
        threshold: Minimum similarity score forwarded to the linker as its
            ``threshold`` setting. Previously this argument was accepted but
            never passed on, so the linker silently ran with its own default.

    Returns:
        The spaCy pipeline object with both components added.
    """
    print("Loading SciSpaCy model...")
    nlp = spacy.load("en_core_sci_sm")
    print("Initial pipeline components:", nlp.pipe_names)

    nlp.add_pipe("abbreviation_detector", last=True)
    nlp.add_pipe(
        "scispacy_linker",
        config={
            "resolve_abbreviations": True,
            "linker_name": "umls",
            # Actually honour the caller's threshold.
            "threshold": threshold,
        },
    )
    return nlp

# 2. TEXT NORMALIZATION
def normalize_text(term):
    """
    Return ``term`` with underscores turned into spaces, surrounding
    whitespace removed, runs of whitespace collapsed to a single space and
    every word capitalized (first character upper-cased, the rest lower-cased).
    """
    spaced = term.replace("_", " ").strip()
    capitalized_words = [piece.capitalize() for piece in spaced.split()]
    return " ".join(capitalized_words)

# 3. CONTEXTUAL DISAMBIGUATION
def disambiguate_entity(entity, context, linker):
    """
    Pick the first linked KB concept of ``entity`` that is a neoplastic
    process (semantic type T191), provided ``context`` mentions "breast"
    (case-insensitive).

    Candidates are read from ``entity._.kb_ents`` in order and resolved through
    ``linker.kb.cui_to_entity``. Returns the matching concept, or ``None`` when
    the entity has no candidates or none of them qualifies.
    """
    cancer_semtypes = {"T191"}  # Neoplastic Processes

    candidates = entity._.kb_ents
    if not candidates:
        return None

    for cui, _score in candidates:
        concept = linker.kb.cui_to_entity[cui]
        if not cancer_semtypes.intersection(concept.types):
            continue
        # Context is only inspected once a neoplasm candidate has been found.
        if "breast" in context.lower():
            return concept

    return None

# 4. EXTRACT CANCER ENTITIES
def extract_cancer_entities(text, nlp):
    """
    Run ``nlp`` over ``text`` and return the normalized surface forms of all
    entities that ``disambiguate_entity`` accepts.

    The whole input ``text`` is passed as the disambiguation context for every
    entity. The result is a list of strings, one per accepted mention, in
    document order (duplicates are kept).
    """
    doc = nlp(text)
    linker = nlp.get_pipe("scispacy_linker")

    return [
        normalize_text(mention.text)
        for mention in doc.ents
        if disambiguate_entity(mention, text, linker)
    ]

# 5. CHI-SQUARE SIGNIFICANCE
def calculate_statistical_significance(domain_freq, ref_freq, total_domain_terms, total_ref_terms):
    """
    Calculates the chi-square p-value for term frequency significance.

    The 2x2 contingency table is::

                        domain                            reference
        term            domain_freq                       ref_freq
        other terms     total_domain_terms - domain_freq  total_ref_terms - ref_freq

    Args:
        domain_freq: Occurrences of the term in the domain corpus.
        ref_freq: Occurrences of the term in the reference corpus.
        total_domain_terms: Total term count of the domain corpus.
        total_ref_terms: Total term count of the reference corpus.

    Returns:
        The p-value reported by ``chi2_contingency``. When the table is
        degenerate (all counts non-negative but a whole row or column sums to
        zero, e.g. an empty reference corpus) the test is undefined and
        ``chi2_contingency`` would raise; in that case ``1.0`` is returned,
        i.e. "no evidence of a difference".

    Raises:
        ValueError: Propagated from ``chi2_contingency`` for invalid tables
            such as negative counts.
    """
    other_domain = total_domain_terms - domain_freq
    other_ref = total_ref_terms - ref_freq

    cells = (domain_freq, ref_freq, other_domain, other_ref)
    marginals = (
        domain_freq + ref_freq,      # row: the term
        other_domain + other_ref,    # row: every other term
        domain_freq + other_domain,  # column: domain corpus
        ref_freq + other_ref,        # column: reference corpus
    )
    # A zero marginal yields a zero expected frequency, which scipy rejects.
    if min(cells) >= 0 and 0 in marginals:
        return 1.0

    table = [
        [domain_freq, ref_freq],
        [other_domain, other_ref],
    ]
    _, p_val, _, _ = chi2_contingency(table)
    return p_val

# 6. TAXONOMY BUILDING
def initialize_taxonomy():
    """
    Create an empty directed graph that contains only the taxonomy root,
    the node "Breast Cancer", and return it.
    """
    taxonomy = nx.DiGraph()
    root_label = "Breast Cancer"
    taxonomy.add_node(root_label)
    return taxonomy

def insert_concept_recursive(G, concept, descriptions, threshold=0.5, start_node="Breast Cancer", depth_limit=50, current_depth=0):
    """
    Inserts a concept under the most semantically similar child node.

    Starting at ``start_node``, the concept's embedding is compared with the
    embeddings of the current node's children. While the best-matching child
    is more similar than ``threshold`` the walk descends into that child;
    otherwise (or when the current node has no children) the concept is
    attached to the current node.

    A concept that is already a node of ``G`` is left untouched. Without this
    guard, re-inserting an existing node (for example the root, when the root
    label is also one of the extracted terms) adds an edge that points back at
    an ancestor; the resulting cycle makes later insertions circle until
    ``depth_limit`` is hit.

    Args:
        G: Directed taxonomy graph, modified in place.
        concept: Label of the node to insert.
        descriptions: Mapping from node label to embedding vector.
        threshold: Similarity a child must exceed for the walk to descend.
        start_node: Node at which the walk starts.
        depth_limit: Maximum depth before the concept is attached where the
            walk currently stands.
        current_depth: Depth already consumed when the walk starts.

    Returns:
        None.

    Raises:
        KeyError: If ``concept`` or a visited child has no entry in
            ``descriptions``.
    """
    if concept in G:
        return

    node = start_node
    depth = current_depth
    concept_emb = None  # looked up lazily: not needed when attaching to a leaf

    # Iterative descent; equivalent to the former tail recursion.
    while True:
        if depth >= depth_limit:
            print(f"Depth limit reached for concept '{concept}'. Adding under '{node}'.")
            break

        children = list(G.successors(node))
        if not children:
            break

        if concept_emb is None:
            concept_emb = descriptions[concept].reshape(1, -1)
        child_embs = np.array([descriptions[child] for child in children])
        sims = cosine_similarity(concept_emb, child_embs)[0]
        best = int(np.argmax(sims))

        # Written as "not >" so a NaN similarity stops the descent.
        if not sims[best] > threshold:
            break

        node = children[best]
        depth += 1

    G.add_edge(node, concept)

# 7. TAXONOMY POSTPROCESSING
def clean_ambiguous_nodes(taxonomy, descriptions, threshold=0.8):
    """
    Merge or reassign nodes with ambiguous or redundant meanings.

    Every pair of nodes whose embeddings have cosine similarity above
    ``threshold`` is collapsed: the later node is relabelled to the earlier
    one in place, so its edges are taken over by the surviving node.

    Differences from a naive pairwise loop:
      - Nodes without an entry in ``descriptions`` (e.g. a root that was never
        embedded) are skipped instead of raising ``KeyError``.
      - A node that has already been merged away is neither compared again nor
        used as a merge target, so it cannot be re-created.
      - A self-loop produced by merging a child into its parent is removed.
      - Similarities are computed one row at a time rather than one pair at a
        time.

    NOTE(review): merging a node into one of its descendants can still leave a
    longer cycle in the graph; confirm whether that needs handling.

    Args:
        taxonomy: Directed taxonomy graph, modified in place.
        descriptions: Mapping from node label to embedding vector.
        threshold: Similarity above which two nodes are merged.

    Returns:
        None.
    """
    nodes = [node for node in taxonomy.nodes() if node in descriptions]
    merged = set()

    for i, keeper in enumerate(nodes):
        if keeper in merged:
            continue

        rest = [node for node in nodes[i + 1:] if node not in merged]
        if not rest:
            continue

        sims = cosine_similarity(
            [descriptions[keeper]],
            [descriptions[node] for node in rest],
        )[0]

        for other, sim in zip(rest, sims):
            if sim > threshold:
                nx.relabel_nodes(taxonomy, {other: keeper}, copy=False)
                merged.add(other)
                # keeper -> other becomes keeper -> keeper after the merge.
                if taxonomy.has_edge(keeper, keeper):
                    taxonomy.remove_edge(keeper, keeper)

def remove_singleton_nodes(taxonomy):
    """
    Drop every node of ``taxonomy`` whose degree is zero, i.e. nodes that
    take part in no edge at all. The graph is modified in place.
    """
    isolated = [label for label in taxonomy.nodes() if taxonomy.degree(label) == 0]
    taxonomy.remove_nodes_from(isolated)

# 8. EXPORT TAXONOMY TO RDF
def export_taxonomy_to_rdf(nx_graph, output_path):
    """
    Exports the taxonomy to RDF format.

    Every node becomes an ``rdfs:Class`` in the ``http://example.org/taxonomy#``
    namespace and every edge ``parent -> child`` becomes
    ``child rdfs:subClassOf parent``. The result is serialized as RDF/XML.

    Node names are sanitized to build the URI (every character outside
    ``[a-zA-Z0-9_-]`` becomes ``_``). Because that mapping is lossy, the
    original name is also stored as an ``rdfs:label`` so it can be recovered
    from the file.

    Args:
        nx_graph: Directed taxonomy graph providing ``nodes()`` and ``edges()``.
        output_path: Destination file for the RDF/XML document.

    Returns:
        None.
    """
    # Local import: Literal is only needed here and is not part of the
    # notebook's top-level rdflib import.
    from rdflib import Literal

    rdf_graph = RDFGraph()
    TAXO = Namespace("http://example.org/taxonomy#")

    def safe(node):
        return re.sub(r"[^a-zA-Z0-9_-]", "_", str(node))

    # Build each URI once and reuse it for the class and edge triples.
    uris = {node: TAXO[safe(node)] for node in nx_graph.nodes()}

    for node, node_uri in uris.items():
        rdf_graph.add((node_uri, RDF.type, RDFS.Class))
        rdf_graph.add((node_uri, RDFS.label, Literal(str(node))))

    for parent, child in nx_graph.edges():
        parent_uri = uris[parent]
        child_uri = uris[child]
        # Skip self-references (self-loops, or two names that sanitize to the
        # same URI); "X subClassOf X" carries no information.
        if parent_uri == child_uri:
            continue
        rdf_graph.add((child_uri, RDFS.subClassOf, parent_uri))

    rdf_graph.serialize(destination=output_path, format='xml')
    print(f"RDF Taxonomy saved to {output_path}")


In [None]:
# MAIN PIPELINE

# Label of the taxonomy root; must match the node created by initialize_taxonomy().
ROOT_CONCEPT = "Breast Cancer"

# Load SciSpaCy model
nlp = load_scispacy_model_with_linker(threshold=0.8)

# Load SentenceTransformer
print("Loading SentenceTransformer...")
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded.\n")

# Load domain and reference corpora (one document per line)
domain_corpus_path = "/content/drive/MyDrive/master_thesis2/data/pubmed/pubmed_cancer_corpus.txt"
reference_corpus_path = "/content/drive/MyDrive/master_thesis2/statistical_method/data/reference_corpus.txt"

# Explicit encoding so the result does not depend on the platform default.
with open(domain_corpus_path, "r", encoding="utf-8") as f:
    domain_corpus = f.readlines()
with open(reference_corpus_path, "r", encoding="utf-8") as f:
    reference_corpus = f.readlines()

# Extract cancer entities from domain corpus
print("Extracting cancer entities from domain corpus...")
domain_cancer_entities = []
for text in tqdm(domain_corpus, desc="Domain Corpus"):
    domain_cancer_entities.extend(extract_cancer_entities(text, nlp))

# Compute reference token frequencies (used for the reference corpus size)
print("Computing term frequencies for reference corpus...")
reference_term_freq = Counter()
for text in tqdm(reference_corpus, desc="Reference Corpus"):
    reference_term_freq.update(text.split())

# Compute domain term frequencies
domain_term_freq = Counter(domain_cancer_entities)
total_domain_terms = sum(domain_term_freq.values())
total_ref_terms = sum(reference_term_freq.values())

# Domain terms are normalized, frequently multi-word phrases ("Breast Cancer"),
# so looking them up among raw whitespace tokens almost always yields 0 and
# makes the significance test meaningless. Count case-insensitive phrase
# occurrences in the reference corpus instead.
reference_corpus_lower = [text.lower() for text in reference_corpus]


def count_in_reference(term):
    """Count case-insensitive whole-phrase occurrences of ``term`` in the reference corpus."""
    words = [re.escape(word) for word in term.lower().split()]
    if not words:
        return 0
    # Lookarounds instead of \b so terms that start or end with punctuation still match.
    pattern = re.compile(r"(?<!\w)" + r"\s+".join(words) + r"(?!\w)")
    hits = sum(len(pattern.findall(text)) for text in reference_corpus_lower)
    # Keep the contingency table non-negative.
    return min(hits, total_ref_terms)


reference_phrase_freq = {term: count_in_reference(term) for term in domain_term_freq}

# Filter by chi-square significance
print("Identifying domain-specific concepts...")
new_concepts = []
for term, d_freq in domain_term_freq.items():
    r_freq = reference_phrase_freq[term]
    # The chi-square test is two-sided: also require the term to be relatively
    # MORE frequent in the domain corpus than in the reference corpus.
    over_represented = d_freq * total_ref_terms > r_freq * total_domain_terms
    if over_represented and calculate_statistical_significance(
        d_freq, r_freq, total_domain_terms, total_ref_terms
    ) < 0.05:
        new_concepts.append(term)

# Compute embeddings in one batch; the root is always embedded so that
# post-processing can look it up even when it is not an extracted term.
print("Computing embeddings...")
terms_to_embed = list(dict.fromkeys(new_concepts + [ROOT_CONCEPT]))
descriptions = dict(zip(terms_to_embed, model.encode(terms_to_embed)))

# Build taxonomy
taxonomy = initialize_taxonomy()
print("Building taxonomy...")
for concept in new_concepts:
    if concept == ROOT_CONCEPT:
        # The root already exists; inserting it again would add an edge back
        # to the root and create a cycle.
        continue
    insert_concept_recursive(taxonomy, concept, descriptions, threshold=0.5)

# Postprocessing
print("Postprocessing taxonomy...")
clean_ambiguous_nodes(taxonomy, descriptions, threshold=0.8)
remove_singleton_nodes(taxonomy)

# Export to RDF
output_path = "taxonomy.rdf"
export_taxonomy_to_rdf(taxonomy, output_path)

Loading SciSpaCy model...
Initial pipeline components: ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmprh8ghzrm


100%|██████████| 492M/492M [00:16<00:00, 32.1MiB/s]


Finished download, copying /tmp/tmprh8ghzrm to cache at /root/.scispacy/datasets/2b79923846fb52e62d686f2db846392575c8eb5b732d9d26cd3ca9378c622d40.87bd52d0f0ee055c1e455ef54ba45149d188552f07991b765da256a1b512ca0b.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin not found in cache, downloading to /tmp/tmp_0pvoo09


100%|██████████| 724M/724M [00:14<00:00, 53.6MiB/s]


Finished download, copying /tmp/tmp_0pvoo09 to cache at /root/.scispacy/datasets/7e8e091ec80370b87b1652f461eae9d926e543a403a69c1f0968f71157322c25.6d801a1e14867953e36258b0e19a23723ae84b0abd2a723bdd3574c3e0c873b4.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmpuhg6_ouc


100%|██████████| 1.32M/1.32M [00:00<00:00, 16.1MiB/s]

Finished download, copying /tmp/tmpuhg6_ouc to cache at /root/.scispacy/datasets/37bc06bb7ce30de7251db5f5cbac788998e33b3984410caed2d0083187e01d38.f0994c1b61cc70d0eb96dea4947dddcb37460fb5ae60975013711228c8fe3fba.tfidf_vectorizer.joblib



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json not found in cache, downloading to /tmp/tmpiv8bbxnr


100%|██████████| 264M/264M [00:04<00:00, 62.6MiB/s]


Finished download, copying /tmp/tmpiv8bbxnr to cache at /root/.scispacy/datasets/6238f505f56aca33290aab44097f67dd1b88880e3be6d6dcce65e56e9255b7d4.d7f77b1629001b40f1b1bc951f3a890ff2d516fb8fbae3111b236b31b33d6dcf.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2023-04-23/umls_2022_ab_cat0129.jsonl not found in cache, downloading to /tmp/tmpq74p278d


100%|██████████| 628M/628M [00:11<00:00, 56.9MiB/s]


Finished download, copying /tmp/tmpq74p278d to cache at /root/.scispacy/datasets/d5e593bc2d8adeee7754be423cd64f5d331ebf26272074a2575616be55697632.0660f30a60ad00fffd8bbf084a18eb3f462fd192ac5563bf50940fc32a850a3c.umls_2022_ab_cat0129.jsonl
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /tmp/tmpc5hz0vp2


100%|██████████| 4.26k/4.26k [00:00<00:00, 8.73MiB/s]


Finished download, copying /tmp/tmpc5hz0vp2 to cache at /root/.scispacy/datasets/21a1012c532c3a431d60895c509f5b4d45b0f8966c4178b892190a302b21836f.330707f4efe774134872b9f77f0e3208c1d30f50800b3b39a6b8ec21d9adf1b7.umls_semantic_type_tree.tsv
Loading SentenceTransformer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded.

Extracting cancer entities from domain corpus...


  global_matches = self.global_matcher(doc)
Domain Corpus: 100%|██████████| 11642/11642 [03:36<00:00, 53.72it/s]


Computing term frequencies for reference corpus...


Reference Corpus: 100%|██████████| 1000/1000 [00:00<00:00, 2469.30it/s]


Identifying domain-specific concepts...
Computing embeddings...
Building taxonomy...
Depth limit reached for concept 'Primary Breast Cancer'. Adding under 'Lobular Breast Cancer'.
Depth limit reached for concept 'Breast Cancer Surgery'. Adding under 'Lobular Breast Cancer'.
Depth limit reached for concept 'Breast Cancer Database'. Adding under 'Lobular Breast Cancer'.
Depth limit reached for concept 'Breast Cancer Gene'. Adding under 'Lobular Breast Cancer'.
Depth limit reached for concept 'Black Breast Cancer'. Adding under 'Lobular Breast Cancer'.
Depth limit reached for concept 'Breast Cancers'. Adding under 'Lobular Breast Cancer'.
Depth limit reached for concept 'Breast Cancer Stem Cells'. Adding under 'Lobular Breast Cancer'.
Postprocessing taxonomy...
RDF Taxonomy saved to taxonomy.rdf
