In [None]:
import os
os.environ['TRANSFORMERS_CACHE'] = 'models/cache/'
import medspacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans

import re
import nltk
from transformers import AutoTokenizer, pipeline

import pandas as pd
import numpy as np

import joblib
from tqdm.auto import tqdm

import preprocessing_utils
import utils
import preprocessing
from download import Downloader, download_study_info

In [None]:
memory = joblib.Memory(".")

def ParallelExecutor(use_bar="tqdm", **joblib_args):
    """Build a factory for ``joblib.Parallel`` runners with an optional progress bar.

    Typical use: ``ParallelExecutor(n_jobs=4)(total=len(tasks))(tasks)`` where
    ``tasks`` is an iterable of ``joblib.delayed(...)`` calls.

    Args:
        use_bar: Default progress-bar type. ``"tqdm"`` wraps the task iterable
            in ``tqdm``; ``"False"``/``"None"`` (or the values ``False``/``None``,
            which are compared via ``str``) iterate without a bar.
        **joblib_args: Keyword arguments forwarded to ``joblib.Parallel``
            (``n_jobs``, ``backend``, ...). ``n_jobs`` defaults to 10 when omitted.

    Returns:
        ``aprun(bar=use_bar, **tq_args)``. Calling it returns a function that
        takes the task iterable and returns whatever ``joblib.Parallel`` returns;
        ``tq_args`` are passed to ``tqdm`` when a bar is used.

    Raises:
        ValueError: when the runner is invoked with an unsupported bar type.
    """
    all_bar_funcs = {
        "tqdm": lambda args: lambda x: tqdm(x, **args),
        "False": lambda args: iter,
        "None": lambda args: iter,
    }
    # Forward every joblib option, not only n_jobs; keep the historical default of 10 workers.
    parallel_args = {"n_jobs": 10, **joblib_args}

    def aprun(bar=use_bar, **tq_args):
        def tmp(op_iter):
            bar_key = str(bar)
            if bar_key not in all_bar_funcs:
                raise ValueError("Value %s not supported as bar type" % bar)
            bar_func = all_bar_funcs[bar_key](tq_args)
            return joblib.Parallel(**parallel_args)(bar_func(op_iter))

        return tmp
    return aprun


In [None]:
#load dataset
# df = pd.read_csv("../data/clinicaltrials_parsed.csv")
# nct_ids = df["trials.nct_id"].unique().tolist()

# Trial IDs are the names (extension stripped) of every regular file that sits
# directly inside the XML folder; sub-directories are ignored.
folder_path = '../data/trials_xmls/'  # Replace this with the path to your folder
file_names = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
]

nct_ids = file_names

def parallel_downloader(
    n_jobs,
    nct_ids,
):
    """Call ``download_study_info`` for every trial ID in parallel.

    Args:
        n_jobs: Number of joblib workers (passed through ``ParallelExecutor``).
        nct_ids: Sized collection of trial IDs; each one is passed as the single
            argument of ``download_study_info``.

    Returns:
        A flat NumPy array made by stacking the per-trial results with
        ``np.vstack`` and flattening. An empty array when ``nct_ids`` is empty.
    """
    if len(nct_ids) == 0:
        # np.vstack raises ValueError on an empty sequence, so short-circuit here.
        return np.array([])

    parallel_runner = ParallelExecutor(n_jobs=n_jobs)(total=len(nct_ids))
    results = parallel_runner(
        joblib.delayed(download_study_info)(nct_id)
        for nct_id in nct_ids
    )
    return np.vstack(results).flatten()

def parallel_preprocessing(
    n_jobs,
    nct_ids
):
    """Run ``preprocessing.eic_text_preprocessing`` once per trial ID, in parallel.

    Each job receives a one-element list ``[nct_id]``. The results returned by
    the workers are discarded, so the function itself returns ``None``.

    Args:
        n_jobs: Number of joblib workers (passed through ``ParallelExecutor``).
        nct_ids: Sized collection of trial IDs.
    """
    run_in_parallel = ParallelExecutor(n_jobs=n_jobs)(total=len(nct_ids))
    # One delayed call per trial; built lazily so the progress bar drives consumption.
    jobs = (
        joblib.delayed(preprocessing.eic_text_preprocessing)([trial_id])
        for trial_id in nct_ids
    )
    run_in_parallel(jobs)
    
# updated_cts = parallel_downloader(n_jobs=-1, nct_ids = nct_ids)

parallel_preprocessing(
    n_jobs=-1,
    nct_ids = nct_ids
)

In [None]:
preprocessing_utils.eic_text_preprocessing(_ids=["NCT05201183"])

In [None]:
_ids = ["NCT00001586"]
for _, nid in enumerate(_ids):
    print(nid)
    eic_text = preprocessing_utils.extract_eligibility_criteria(nid)
    # if eic_text is not None:
    print(eic_text)

In [None]:
regex_patterns = list(preprocessing_utils.load_regex_patterns("../data/regex_patterns.json").values())

In [None]:
text = """Inclusion Criteria:

          1. Documentation of Disease: Patients must be diagnosed with one of the following
             conditions:

               1. Acute Myeloid Leukemia (AML), with no history of extramedullary disease, who are
                  not in complete remission, who have either primary refractory or relapsed
                  disease, and who do not have more than one of the following adverse factors:

                    -  Duration of first CR < 6 months (if previously in CR), based on the best
                       overall clinical assessment of the disease course, not solely based on blood
                       test or bone marrow biopsy results

                    -  Poor risk karyotype including any of the following: complex karyotype with
                       ≥3 clonal abnormalities, 5q-/-5, 7q-/-7, 11q23 abnormalities, inv(3q), 20q
                       or 21q abnormalities, t (6;9), t (9;22), 17p abnormalities [or TP53
                       mutations] or monosomal karyotype. Molecular typing (except for TP53
                       mutation) will not be used for eligibility criteria determination.

                    -  Circulating peripheral blood blasts at time of enrollment

                    -  Karnofsky performance status <90%

               2. Acute Lymphocytic Leukemia (ALL) who are not in complete remission, who have
                  either primary refractory or relapsed disease, and who do not have more than one
                  of the following adverse factors:

                    -  Primary refractory or first relapse. Patients in second or subsequent
                       relapse are excluded.

                    -  Bone marrow blasts >25% within 30 days before the start of the conditioning
                       regimen

                    -  Age >40 years

               3. Myelodysplasia with a Revised International Prognostic Score (IPSS-R) of greater
                  than 4.5 (i.e., high- or very-high risk).

               4. Chronic Myelogenous Leukemia (CML) in accelerated phase, defined by any of the
                  following:

                    -  10-19% blasts in peripheral blood white cells or bone marrow

                    -  Peripheral blood basophils at least 20%

                    -  Persistent thrombocytopenia (< 100 x 109/l) unrelated to therapy, or
                       persistent thrombocytosis (>1000 x 109/l) unresponsive to therapy

                    -  Increasing spleen size and increasing white blood cell (WBC) count
                       unresponsive to therapy

                    -  Cytogenetic evidence of clonal evolution (i.e., the appearance of an
                       additional genetic abnormality that was not present in the initial specimen
                       at the time of diagnosis of chronic phase)

          2. The patient must be 18-65 years old at time of consent

          3. Signed written informed consent: Patient must be capable of understanding the
             investigational nature of this study, potential risks and benefits of the study, and
             be able to provide a valid informed consent.

          4. Availability of a consenting human leukocyte antigens (HLA)-matched donor

          5. Karnofsky Performance Status 70% or higher

          6. Required baseline laboratory values:

               -  Estimated creatinine clearance ≥ 60 ml/min

               -  Aspartate aminotransferase and alanine aminotransferase ≤ 2.5 x upper limit of
                  normal value

               -  Bilirubin ≤ 1.5 x upper limit of normal value (unless determined to be related to
                  Gilbert's disease)

          7. Required baseline cardiac function values:

               -  Required baseline cardiac function of left ventricular ejection fraction (LVEF) >
                  45 % corrected

          8. Required baseline pulmonary function values:

               -  Required baseline pulmonary function of lung diffusing capacity (DLCO) > 45 %
                  predicted (corrected for hemoglobin))

        Exclusion Criteria:

          1. HIV seropositive patients

          2. Pregnant or nursing females.

          3. Prior radiation therapy

          4. Patients who have had a prior autologous or allogeneic bone marrow or stem cell
             transplantation

          5. Gemtuzumab ozogamicin (trade name: Mylotarg) and/or inotuzumab ozogamicin (trade name:
             Besponsa) use within 60 days before start of the conditioning regimen

          6. Though this is NOT an exclusion criterion, we strongly recommend discontinuation of
             any steroidal oral contraceptives at least 7 days before start of the conditioning
             regimen. Use of therapeutic alternatives, including leuprolide should be considered to
             reduce the risk of SOS/VOD. Of note, for patients already on steroidal oral
             contraceptives for excessive menorrhagia, the switch to leuprolide should occur at
             least 2 weeks before the start of the conditioning regimen"""
             
preprocessing_utils.split_to_sentences(text, regex_patterns)

In [None]:
# Load the medspaCy pipeline plus two Hugging Face token-classification ("ner")
# pipelines: a general biomedical tagger and a mutation/variant tagger.
med_nlp = medspacy.load()
# NOTE(review): `aggregation_strategy` is passed to the tokenizer here, while the
# pipelines below are later called with aggregation_strategy="simple" — confirm
# whether the tokenizer argument has any effect or can be dropped.
tokenizer_biomedical = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all",  aggregation_strategy="first")
biomedical_pipeline = pipeline("ner", model="d4data/biomedical-ner-all", tokenizer=tokenizer_biomedical)
mutations_tokenizer = AutoTokenizer.from_pretrained("Brizape/tmvar-PubMedBert-finetuned-24-02")
mutations_pipeline = pipeline("ner", model="Brizape/tmvar-PubMedBert-finetuned-24-02", tokenizer=mutations_tokenizer)

In [None]:
import spacy

_SENTENCE_MODELS = {}  # spaCy pipelines already loaded by split_into_sentences, keyed by model name


def split_into_sentences(text, model_name="en_core_web_sm"):
    """Split ``text`` into sentences using a spaCy pipeline.

    The pipeline is loaded on first use and reused afterwards; loading it on
    every call (as before) repeats the expensive ``spacy.load`` for each text.

    Args:
        text: The string to segment.
        model_name: Name of the spaCy model to load (default ``en_core_web_sm``).

    Returns:
        A list with the text of each sentence found by the pipeline.
    """
    nlp = _SENTENCE_MODELS.get(model_name)
    if nlp is None:
        nlp = _SENTENCE_MODELS[model_name] = spacy.load(model_name)
    return [sent.text for sent in nlp(text).sents]

In [None]:
split_into_sentences(text)

In [None]:
# Run the auxiliary biomedical NER model on a sample criterion. The previous
# name `aux_pipeline` is never defined in this notebook; `biomedical_pipeline`
# is the pipeline whose output is stored in `aux_entities` elsewhere.
aux_entities = biomedical_pipeline("Engraftment including >95% myeloid cell donor chimerism and Absolute neutrophil count {ANC} > 1.0 x 109/L")

In [None]:
tokenizer_biomedical.tokenize("Engraftment including >95% myeloid cell donor chimerism and Absolute neutrophil count {ANC} > 1.0 x 109/L")

In [None]:
import requests
from multiner_server import start_multiner_server
# start_multiner_server()
def query_plain(text, url="http://localhost:8888/plain", timeout=None):
    """POST ``text`` as a JSON body (``{"text": ...}``) to the NER server.

    Args:
        text: The text to annotate.
        url: Endpoint to post to.
        timeout: Seconds to wait for the server, forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before; pass a number
            to keep a stalled server from blocking the notebook.

    Returns:
        The ``requests.Response`` object; call ``.json()`` on it to get the
        parsed payload.
    """
    return requests.post(url, json={'text': text}, timeout=timeout)

In [None]:
import requests
import time

# Sleep for 3 seconds
# time.sleep(4)
ent_dict = query_plain("Currently receiving iron and vitamin B12 infusions for anemia with no resolution of fatigue".lower())

In [None]:
ent_dict.content.decode('utf-8')

In [None]:
# Remember the current directory, then switch into the BERN2 scripts folder.
current_directory = os.getcwd()
run_path = "../resources/BERN2/scripts/run_bern2.sh"
stop_path = "../resources/BERN2/scripts/stop_bern2.sh"
working_directory = "../resources/BERN2/scripts/"
# NOTE(review): the working directory is changed here and not changed back in
# this cell; the next cell calls os.chdir() with the same relative path, which
# would then be resolved from the new location — confirm this cell is still needed.
os.chdir(working_directory)

In [None]:
import subprocess
import os
import time
import requests
# Restart the local Multi-NER (BERN2) server and wait until it answers HTTP requests.
current_directory = os.getcwd()
working_directory = "../resources/BERN2/scripts/"
run_path = "run_bern2.sh"
stop_path = "stop_bern2.sh"
timeout = 120  # Seconds to wait for the server; adjust this value as needed
server_url = "http://localhost:8888"  # Update with the actual URL

# The scripts are launched from their own folder; the original directory is
# restored in the `finally` clause even if something below fails.
os.chdir(working_directory)
try:
    print("Stopping any existing Multi-NER server instance.")
    stop_process = subprocess.Popen(["bash", stop_path], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    stop_process.wait()
    print("Activating Mutli-NER Server... This can take approx. 1 minute")
    subprocess.Popen(["bash", run_path], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    # Poll the server until it responds with a 2xx status or the timeout expires.
    start_time = time.time()
    server_available = False
    while not server_available:
        try:
            # Per-request timeout so a stalled connection cannot block the loop forever.
            response = requests.get(server_url, timeout=5)
            response.raise_for_status()  # Raises an exception for non-2xx status codes
            server_available = True
        except (requests.ConnectionError, requests.HTTPError, requests.Timeout):
            if time.time() - start_time >= timeout:
                break  # Timeout reached
            time.sleep(1)  # Wait for a short time before checking again

    # Report the real outcome (previously "available" was printed even after a timeout).
    if server_available:
        print("Server is now available.")
    else:
        print(f"Server did not become available within {timeout} seconds.")
except (OSError, subprocess.CalledProcessError) as e:
    # Popen raises OSError (e.g. FileNotFoundError) when the command cannot be started.
    print(f"Error executing the script: {e}")
finally:
    os.chdir(current_directory)

In [None]:
def mtner_normalize_format(json_data):
    """Convert a Multi-NER server result into the entity-dict layout used in this notebook.

    Args:
        json_data: Either the parsed JSON payload (a dict with ``"text"`` and
            ``"annotations"`` keys) or an object exposing ``.json()`` that
            returns it, such as the ``requests.Response`` from ``query_plain``.
            Each annotation must provide ``span.begin``, ``span.end``, ``obj``,
            ``mention``, ``prob`` and ``id``.

    Returns:
        ``{"text": <original text>, "ents": [...]}`` where every entity is a
        dict with ``entity_group``, ``text``, ``score``, ``start``, ``end`` and
        ``normalized_id``.

    Raises:
        KeyError: if an expected key is missing from the payload.
    """
    # Callers in this notebook pass the raw HTTP response; unwrap it to the payload.
    if not isinstance(json_data, dict) and callable(getattr(json_data, "json", None)):
        json_data = json_data.json()

    entities = [
        {
            "entity_group": annotation["obj"],
            "text": annotation["mention"],
            "score": annotation["prob"],
            "start": annotation["span"]["begin"],
            "end": annotation["span"]["end"],
            "normalized_id": annotation["id"],
        }
        for annotation in json_data["annotations"]
    ]
    return {
        "text": json_data["text"],
        "ents": entities,
    }

def post_process_entities(entity_list):
    """Merge sub-word pieces (words starting with ``"##"``) into the entity they continue.

    Previously the "current" entity was rebuilt on every iteration, so a ``##``
    piece was never attached to the preceding entity and was dropped from the
    result altogether.

    Args:
        entity_list: Iterable of dicts with ``entity_group``, ``score``,
            ``word``, ``start`` and ``end`` keys.

    Returns:
        A list of new dicts with ``entity_group``, ``score``, ``text``,
        ``start`` and ``end``. A ``##`` piece that starts exactly where the
        previous entity ends extends that entity (text appended, ``end`` moved,
        ``score`` set to the maximum of the two). Any other item becomes its
        own entity.
    """
    merged_entities = []
    for entity in entity_list:
        word = entity["word"]
        is_continuation = word.startswith("##")
        previous = merged_entities[-1] if merged_entities else None

        # Only glue the piece on when it is directly adjacent to the previous entity.
        if is_continuation and previous is not None and previous["end"] == entity["start"]:
            previous["text"] += word.replace("##", "")
            previous["end"] = entity["end"]
            previous["score"] = max(previous["score"], entity["score"])
            continue

        merged_entities.append({
            "entity_group": entity["entity_group"],
            "score": entity["score"],
            # An orphan piece loses its marker; markers inside a word become spaces as before.
            "text": word.replace("##", "") if is_continuation else word.replace("##", " "),
            "start": entity["start"],
            "end": entity["end"],
        })

    return merged_entities

def merge_lists_with_priority_to_first(list1, list2):
    """Combine two entity lists, dropping entries of ``list2`` that collide with ``list1``.

    Two entries collide when the ``start`` of one lies within the other's
    ``[start, end]`` range, both bounds inclusive. Entries of ``list2`` are
    compared against ``list1`` only, never against each other.

    Args:
        list1: Entities that always survive (dicts with ``start``/``end``).
        list2: Entities added only when they collide with nothing in ``list1``.

    Returns:
        A new list: a shallow copy of ``list1`` followed by the surviving
        ``list2`` entries in their original order.
    """
    def _collides(kept, candidate):
        kept_starts_inside_candidate = candidate['start'] <= kept['start'] <= candidate['end']
        candidate_starts_inside_kept = kept['start'] <= candidate['start'] <= kept['end']
        return kept_starts_inside_candidate or candidate_starts_inside_kept

    merged_list = list1.copy()
    merged_list.extend(
        candidate
        for candidate in list2
        if not any(_collides(kept, candidate) for kept in list1)
    )
    return merged_list



In [None]:
df = pd.read_csv("../data/preprocessed_data/NCT05786924_preprocessed.csv")

In [None]:
text = "For participants in the NSCLC Cohort: Known tumor programmed death-ligand 1 {PD-L1} expression status as determined by an immunohistochemistry assay during participation in other clinical studies {e.g., participants whose PD-L1 expression status was determined during screening for entry into a study with anti-programmed death 1 or anti-PD-L1 antibodies but were not eligible are excluded}"


In [None]:
# Entity groups kept from the auxiliary biomedical NER model's output.
AUX_ENTITY_GROUPS = [
    "Age", "Sex", "Sign_symptom", "Biological_structure", "Date",
    "Duration", "Frequency", "Severity", "Lab_value", "Diagnostic_procedure",
    "Therapeutic_procedure", "Personal_background", "Clinical_event", "Outcome",
]

# For every sentence fragment, combine entities from three sources with
# decreasing priority: mutation model, Multi-NER server, auxiliary biomedical model.
ent_list = []
for _, row in df.iterrows():
    # NOTE(review): splitting on "." also cuts decimals such as "1.0 x 109/L" —
    # confirm whether a proper sentence splitter should be used here.
    for sent in row["sentence"].split("."):
        if not sent.strip():
            # split(".") yields empty fragments (e.g. after a trailing period);
            # there is nothing to annotate in them.
            continue

        # query_plain returns the HTTP response; parse it before normalising.
        main_entities = mtner_normalize_format(query_plain(sent).json())["ents"]
        variants_entities = mutations_pipeline(sent, aggregation_strategy="simple")
        combined_entities = merge_lists_with_priority_to_first(variants_entities, main_entities)

        # NOTE(review): get_dictionaries_with_values is not defined in the visible
        # part of this notebook — confirm where it comes from.
        aux_entities = post_process_entities(
            get_dictionaries_with_values(
                biomedical_pipeline(sent, aggregation_strategy="simple"),
                "entity_group",
                AUX_ENTITY_GROUPS,
            )
        )
        combined_entities = merge_lists_with_priority_to_first(combined_entities, aux_entities)

        # Keep only fragments in which at least one entity was found.
        if len(combined_entities) > 0:
            ent_list.append({"sentence": sent, "annotations": combined_entities})

In [None]:
ent_list

In [None]:
# Per-sentence entity extraction over the preprocessed trial file.
# NOTE(review): this cell depends on names that are not defined in the visible
# part of the notebook before this point: `nlp` (first assigned in a later cell),
# the `doc._.bow` extension, `convert_to_spacy_format`, `merge_lists_with_priority`
# and `get_dictionaries_with_values`. The CSV path also differs from the
# "../data/preprocessed_data/..." path used in an earlier cell — confirm before running.
df = pd.read_csv("preprocessed_data/NCT05786924_preprocessed.csv")
all_dict = []
for idx, row in df.iterrows():
    sent_dict = {}
    # 1-based position of the sentence within the file
    sent_dict["index"]= idx + 1
    doc = nlp(row["sentence"])
    # The sentence is rebuilt by joining the items of the `bow` extension with spaces
    text = " ".join(doc._.bow)
    sent_dict["sentence"] = text
    bern_entities= convert_to_spacy_format(query_plain(text))["ents"]
    mutation_entities = mutations_pipeline(text, aggregation_strategy="simple")
    combined_entities = merge_lists_with_priority(mutation_entities, bern_entities)
    aux_entities = biomedical_pipeline(text, aggregation_strategy="simple")
    
    # Keep only the listed entity groups from the auxiliary model, then post-process them
    aux_entities = post_process_entities(get_dictionaries_with_values(aux_entities, "entity_group", ["Age", "Sex", "Sign_symptom", "Biological_structure", "Date", 
                                                                                    "Duration", "Frequency", "Severity", "Lab_value", "Diagnostic_procedure", 
                                                                                    "Therapeutic_procedure", "Personal_background", "Clinical_event", "Outcome"]))
    combined_entities  = merge_lists_with_priority(combined_entities,aux_entities)
    # Store the combined entities; sentences without any entity are skipped
    sent_dict["annotations"] = combined_entities
    if len(sent_dict["annotations"]) > 0:
        all_dict.append(sent_dict)
    


In [None]:
all_dict

In [None]:
doc = nlp("Recurrent NSCLC with BRAF Class II alterations KRAS mutations other than TP53RK and G12C {ie, G12D, G12V} mutations {with Sponsor approval for KRAS mutations} without small cell lung cancer transformation with progressive disease confirmed by radiographic assessment.".lower())
text = " ".join(doc._.bow)
mutation_entities = mutations_pipeline(text, aggregation_strategy="simple")

In [None]:
mutation_entities

In [None]:
med_nlp = medspacy.load()

In [None]:
med_nlp.pipe_names

In [None]:
med_nlp = medspacy.load()
med_nlp.disable_pipe('medspacy_target_matcher')
med_nlp.disable_pipe('medspacy_pyrush')
# med_nlp.add_pipe('sentencizer')
print(med_nlp.pipe_names)
def _append_char_offset_entities(doc, entities):
    """Add entities given by character offsets to ``doc.ents`` and return ``doc``.

    Args:
        doc: The spaCy ``Doc`` being processed.
        entities: Iterable of dicts with ``entity_group`` (label), ``start`` and
            ``end`` (character offsets into ``doc.text``).

    Offsets are mapped to the first token containing ``start`` and the first
    token whose character range (end inclusive) contains ``end``; entities that
    cannot be mapped are skipped. Existing and new spans are passed through
    ``filter_spans`` so overlapping spans no longer make the ``doc.ents``
    assignment fail.
    """
    new_spans = []
    for entity in entities:
        label, start, end = entity['entity_group'], entity['start'], entity['end']
        start_token = None
        end_token = None
        # Find the corresponding tokens for the start and end positions
        for token in doc:
            token_end = token.idx + len(token.text)
            if start_token is None and token.idx <= start < token_end:
                start_token = token
            if end_token is None and token.idx <= end <= token_end:
                end_token = token

        # Skip offsets that fall outside the tokenization or map to an inverted range
        if start_token is None or end_token is None or end_token.i < start_token.i:
            continue

        new_spans.append(Span(doc, start=start_token.i, end=end_token.i + 1, label=label))

    doc.ents = filter_spans(list(doc.ents) + new_spans)
    return doc


@Language.component("gene-ner")
def gene_ner(doc):
    """Pipeline component that adds the entities in the global ``entities_resolved`` to ``doc``.

    NOTE(review): ``entities_resolved`` is read from module scope at call time
    and is not assigned in the visible part of this notebook — confirm its source.
    """
    return _append_char_offset_entities(doc, entities_resolved)


med_nlp.add_pipe("gene-ner", before='medspacy_context') 

@Language.component("biomed-ner")
def biomedical_ner(doc):
    """Pipeline component that adds the entities in the global ``entities_biomedical`` to ``doc``.

    NOTE(review): ``entities_biomedical`` is read from module scope at call time
    and is not assigned in the visible part of this notebook — confirm its source.
    """
    return _append_char_offset_entities(doc, entities_biomedical)

# med_nlp.add_pipe("biomed-ner", before='medspacy_context') 

_ABERRATION_PATTERN_CACHE = {}  # TSV path -> list of (label, compiled regex)


def _load_aberration_patterns(path="../data/regex_variants.tsv"):
    """Read the variant regex table once per path and return ``[(label, compiled_regex), ...]``.

    The TSV is read without a header: column 1 is the entity label, column 2
    the regex and column 0 is dropped. Every row is kept, so several patterns
    may share one label (building a label->pattern dict kept only one pattern
    per label). Remove the path from ``_ABERRATION_PATTERN_CACHE`` to force a
    re-read after editing the file.
    """
    if path not in _ABERRATION_PATTERN_CACHE:
        df_regex = pd.read_csv(path, sep="\t", header=None)
        df_regex = df_regex.rename(columns={1 : "label", 2:"regex_pattern"}).drop(columns=[0])
        _ABERRATION_PATTERN_CACHE[path] = [
            (label, re.compile(pattern))
            for label, pattern in zip(df_regex["label"], df_regex["regex_pattern"])
        ]
    return _ABERRATION_PATTERN_CACHE[path]


@Language.component("aberrations-ner")
def regex_pattern_matcher_for_aberrations(doc):
    """Pipeline component that tags regex matches in ``doc.text`` as entities.

    Every match that aligns with token boundaries (``doc.char_span`` returns a
    span) is added with the pattern's label. Existing and new entities are then
    reduced with ``filter_spans`` so that ``doc.ents`` holds no overlapping spans.

    Returns:
        The same ``doc`` with ``doc.ents`` replaced.
    """
    candidate_ents = list(doc.ents)
    # Patterns are read and compiled once, not for every document.
    for label, pattern in _load_aberration_patterns():
        for found in pattern.finditer(doc.text):
            start, end = found.span()
            span = doc.char_span(start, end, label=label)
            if span is not None:
                candidate_ents.append(span)

    doc.ents = filter_spans(candidate_ents)

    return doc

med_nlp.add_pipe("aberrations-ner", before='medspacy_context') 

In [None]:
entities_resolved

In [None]:
# Flatten the entities of `doc` into plain dicts, recording the negation flag as "yes"/"no".
ent_list = [
    {
        "entity_group" : entity.label_,
        "text" : entity.text,
        "start": entity.start_char,
        "end": entity.end_char,
        "is_negated" : "yes" if entity._.is_negated else "no",
    }
    for entity in doc.ents
]

In [None]:
ent_list

In [None]:
def aberration_recognizer(text):
    """Run medspaCy plus a regex-based aberration tagger over ``text``.

    The pipeline (medspaCy with ``medspacy_target_matcher`` disabled and the
    ``aberrations-ner`` component inserted before ``medspacy_context``) is built
    on the first call and reused afterwards; it used to be rebuilt, and the
    regex table re-read, on every call.

    Args:
        text: The text to analyse.

    Returns:
        The processed spaCy ``Doc``.
    """
    nlp = getattr(aberration_recognizer, "_cached_nlp", None)
    if nlp is None:
        # Header-less TSV: column 1 is the label, column 2 the regex, column 0 is dropped.
        df_regex = pd.read_csv("../data/regex_variants.tsv", sep="\t", header=None)
        df_regex = df_regex.rename(columns={1 : "label", 2:"regex_pattern"}).drop(columns=[0])
        # Keep every row: a label->pattern dict would retain only one regex per label.
        labelled_patterns = [
            (label, re.compile(pattern))
            for label, pattern in zip(df_regex["label"], df_regex["regex_pattern"])
        ]

        @Language.component("aberrations-ner")
        def regex_pattern_matcher_for_aberrations(doc):
            """Add token-aligned regex matches to ``doc.ents``, dropping overlaps via ``filter_spans``."""
            candidate_ents = list(doc.ents)
            for label, pattern in labelled_patterns:
                for found in pattern.finditer(doc.text):
                    start, end = found.span()
                    span = doc.char_span(start, end, label=label)
                    if span is not None:
                        candidate_ents.append(span)
            doc.ents = filter_spans(candidate_ents)
            return doc

        nlp = medspacy.load()
        nlp.disable_pipe('medspacy_target_matcher')
        nlp.add_pipe("aberrations-ner", before='medspacy_context')
        aberration_recognizer._cached_nlp = nlp

    return nlp(text)

        

In [None]:
doc = aberration_recognizer("Cohort 1: Recurrent NSCLC with BRAF Class II alterations or KRAS mutations other than G12C {ie, G12D, G12V} mutations {with Sponsor approval for KRAS mutations} without small cell lung cancer transformation with progressive disease confirmed by radiographic assessment.")

In [None]:
doc.ents

In [None]:
# Build the options used by displaCy's entity visualizer.
def get_entity_options():
    """Return displaCy ``ent`` options: the labels to render and a CSS gradient for each.

    Returns:
        ``{"ents": [...labels...], "colors": {label: "linear-gradient(...)"}}``,
        with the labels listed in the same order as the colour table.
    """
    # (label, gradient angle in degrees, first colour, second colour)
    palette = [
        ("Disease_disorder", 180, "#66ffcc", "#abf763"),
        ("CHEMICAL", 90, "#aa9cfc", "#fc9ce7"),
        ("Age", 180, "#ff9a8f", "#ffb55e"),
        ("GENETIC", 90, "#9cd1fc", "#9cfcf6"),
        ("Duration", 180, "#fe8ce6", "#fe8cd9"),
        ("Date", 90, "#fca79c", "#fcc59c"),
        ("Sex", 180, "#9cfdfe", "#9c9dfc"),
        ("Diagnostic_procedure", 90, "#fcb69c", "#fcec9c"),
        ("Lab_value", 180, "#9cfc9c", "#e3fc9c"),
        ("Protein", 90, "#fc9cb0", "#fc9cbe"),
        ("DNA", 180, "#9c9cfc", "#a39cfc"),
        ("cell_type", 90, "#9ccdfc", "#9cc3fc"),
        ("Sign_symptom", 180, "#9cfc9e", "#d4fc9c"),
        ("expression", 90, "#9cfc9e", "#fc9c9c"),
        ("mutation", 180, "#ffc766", "#fc9c9c"),
        ("NEG_ENTITY", 180, "#ffc766", "#fc9c9c"),
    ]
    colors = {
        label: f"linear-gradient({angle}deg, {first}, {second})"
        for label, angle, first, second in palette
    }
    return {"ents": list(colors), "colors": colors}

options = get_entity_options()#visualizing identified Named Entities in clinical input text 
displacy.render(doc, style='ent', options=options)

In [None]:
def negation_handling(doc):
    """Collect the text of every entity in ``doc.ents`` that is flagged as negated.

    An entity counts as negated when ``str(ent._.is_negated)`` equals ``"True"``.

    Returns:
        List of entity texts, in document order.
    """
    return [ent.text for ent in doc.ents if str(ent._.is_negated) == "True"]

results0 = negation_handling(doc)

In [None]:
# Build a PhraseMatcher for the negated phrases found in the clinical note.
def match(nlp, terms, label):
    """Create a ``PhraseMatcher`` that finds any of ``terms`` under the key ``label``.

    Args:
        nlp: spaCy pipeline; its tokenizer (``make_doc``) and vocab are used.
        terms: Iterable of phrase strings to match.
        label: Match key under which all phrases are registered.

    Returns:
        The configured ``PhraseMatcher``.
    """
    patterns = [nlp.make_doc(text) for text in terms]
    matcher = PhraseMatcher(nlp.vocab)
    # spaCy v3 call form: the patterns are passed as a list (the old
    # `add(label, None, *patterns)` form is the legacy v2 signature).
    matcher.add(label, patterns)
    return matcher
    
# Replace the labels of entities that overlap a matched (negated) phrase.
def overwrite_ent_lbl(matcher, doc):
    """Relabel matched phrases in ``doc`` and drop the entities they overlap.

    For every match whose tokens were not already claimed by an earlier match,
    a new span labelled with the match id is created and every existing entity
    overlapping it is removed. ``doc`` is modified in place.

    Args:
        matcher: Callable returning ``(match_id, start, end)`` token-index triples for ``doc``.
        doc: The spaCy ``Doc`` to update.

    Returns:
        The same ``doc`` object, with ``doc.ents`` replaced.
    """
    seen_tokens = set()
    new_entities = []
    entities = list(doc.ents)
    for match_id, start, end in matcher(doc):
        # Skip a match that shares any token with an accepted one. Checking only
        # the two end tokens let a longer match swallow an accepted span, which
        # produced overlapping entities.
        if not seen_tokens.isdisjoint(range(start, end)):
            continue
        new_entities.append(Span(doc, start, end, label=match_id))
        entities = [
            e for e in entities if not (e.start < end and e.end > start)
        ]
        seen_tokens.update(range(start, end))

    # Assign once, after all matches are processed.
    doc.ents = tuple(entities) + tuple(new_entities)
    return doc

matcher = match(med_nlp, results0, "NEG_ENTITY")
#doc0: new doc object with added "NEG_ENTITY label"
doc0 = overwrite_ent_lbl(matcher, doc) #visualizing identified Named Entities in clinical input text 
displacy.render(doc0, style='ent', options=options)

In [None]:
doc0.label

In [None]:
#function to add custom negation terms to the existing model
from negspacy.termsets import termset
ts = termset("en_clinical")
ts.add_patterns({
            "preceding_negations": ["deny", "refuse", "neither", "nor", "do not have"],
            "following_negations": ["absence of", "deny", "decline"],
        })
def neg_model2(nlp_model):
    """Load ``nlp_model`` without its parser and add a sentencizer plus negex.

    The negex component is configured with the patterns of the module-level
    termset ``ts``; previously ``ts`` was extended with custom terms but never
    handed to the component, so the additions had no effect.

    Args:
        nlp_model: Name or path of the spaCy model to load.

    Returns:
        The configured spaCy pipeline.
    """
    # negspacy's documented usage imports Negex before add_pipe("negex") so that
    # the factory is registered with spaCy.
    from negspacy.negation import Negex  # noqa: F401

    nlp = spacy.load(nlp_model, disable = ['parser'])
    nlp.add_pipe('sentencizer')
    nlp.add_pipe("negex", config={"neg_termset": ts.get_patterns()})
    return nlp
results1 = negation_handling("en_ner_bc5cdr_md", lem_clinical_note, neg_model2)
matcher = match(nlp1, results1, "NEG_ENTITY")
#doc1: new doc object with added custom concepts for "NEG_ENTITY label"
doc1 = overwrite_ent_lbl(matcher, doc) #visualizing identified Named Entities in clinical input text 
displacy.render(doc, style='ent', options=options)

In [None]:
import re
import spacy
from spacy.tokens import Span
import pandas as pd
from utils import tokenize

nlp = spacy.load("en_ner_bc5cdr_md")
# Text to search for matches
text = "A Randomized Phase II Trial of a Mutated gp100 Melanoma Peptide g209-217210M With Hight Dose Interleukin-2 IL-2 in HLA-A2.1+Patients With Metastatic Melanoma"

@Language.component("aberrations-ner")
def regex_pattern_matcher_for_aberrations(doc):
    """Pipeline component that tags regex matches in ``doc.text`` as entities.

    The patterns come from ``../data/regex_variants.tsv`` (no header; column 1
    is the label, column 2 the regex, column 0 is dropped). Matches that align
    with token boundaries are added with the pattern's label, and
    ``filter_spans`` removes overlaps before ``doc.ents`` is replaced.

    Returns:
        The same ``doc`` with ``doc.ents`` replaced.
    """
    df_regex = pd.read_csv("../data/regex_variants.tsv", sep="\t", header=None)
    df_regex = df_regex.rename(columns={1 : "label", 2:"regex_pattern"}).drop(columns=[0])
    # Iterate over rows rather than a label->pattern dict: the dict kept only
    # one regex per label when several rows share a label.
    labelled_patterns = [
        (label, re.compile(pattern))
        for label, pattern in zip(df_regex["label"], df_regex["regex_pattern"])
    ]

    candidate_ents = list(doc.ents)
    for label, pattern in labelled_patterns:
        for found in pattern.finditer(doc.text):
            start, end = found.span()
            span = doc.char_span(start, end, label=label)
            if span is not None:
                candidate_ents.append(span)

    doc.ents = filter_spans(candidate_ents)

    return doc


nlp.add_pipe("aberrations-ner", last=True) 
doc = nlp(text)

In [None]:
displacy.render(doc, style='ent')

In [None]:
import base64

# Replace 'input_file.bin' and 'output_file.txt' with your file names

# Stream the binary file through Base64 into a text file. The chunk size is a
# multiple of 3 bytes, so each chunk encodes without padding and the concatenated
# output equals encoding the whole file at once, without loading it all into memory.
_BASE64_CHUNK_BYTES = 3 * 1024 * 1024

with open('wikipedia-pubmed-and-PMC-w2v.bin', 'rb') as binary_file, \
        open('output_vectors.txt', 'w') as text_file:
    for chunk in iter(lambda: binary_file.read(_BASE64_CHUNK_BYTES), b''):
        text_file.write(base64.b64encode(chunk).decode('utf-8'))


In [None]:
# Byte position to start printing from, and number of bytes to print
start_byte = 6000000
num_bytes_to_print = 100

# Seek to the offset and read only the requested bytes instead of loading the
# whole file into memory just to slice it.
with open('wikipedia-pubmed-and-PMC-w2v.bin', 'rb') as binary_file:
    binary_file.seek(start_byte)
    sliced_data = binary_file.read(num_bytes_to_print)
print(sliced_data)

In [None]:
from gensim.models import KeyedVectors

# Load the binary word2vec model and dump it as text: one line per word,
# "<word> <comma-separated vector values>".
model = KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

with open('output_file.txt', 'w') as output_file:
    # A progress bar replaces printing one index per word, which flooded the
    # cell output with one line for every vocabulary entry.
    for word in tqdm(model.index_to_key, desc="exporting vectors"):
        # Convert the word vector to a comma-separated string
        vector_str = ','.join(str(val) for val in model.get_vector(word))

        # Write the word and its vector to the file
        output_file.write(f"{word} {vector_str}\n")


In [None]:
import gensim
import spacy
from spacy.vocab import Vocab

# Replace 'path_to_word2vec_model.bin' and 'spacy_word2vec_model' with appropriate values
gensim_model = KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

# Create a new spaCy Language object with a blank Vocab
nlp = spacy.blank("en")
vocab = Vocab()


In [None]:
# Copy word vectors from the Gensim model into the pipeline's own vocab.
# Writing into `nlp.vocab` directly keeps the tokenizer (created together with
# `nlp`) and the pipeline on one shared vocab, which replacing `nlp.vocab` with
# a separate Vocab object did not.
vocab = nlp.vocab
for word in tqdm(gensim_model.index_to_key, desc="copying vectors"):
    # Progress bar instead of printing every word of the vocabulary
    vocab.set_vector(word, gensim_model[word])

# # Save the spaCy model


In [None]:
nlp.to_disk("spacy_word2vec_biomed_model")

In [None]:
import rdflib

MESH_RDF_URL = "https://id.nlm.nih.gov/mesh/sparql"
MESH_GRAPH = rdflib.Graph()

def fetch_mesh_data():
    """Parse the resource at ``MESH_RDF_URL`` as RDF/XML into the module-level ``MESH_GRAPH``.

    NOTE(review): ``MESH_RDF_URL`` ends in "/sparql", which looks like a query
    endpoint rather than an RDF/XML document — confirm that parsing it with
    ``format="xml"`` yields the intended data.
    """
    MESH_GRAPH.parse(MESH_RDF_URL, format="xml")

# Fetch MeSH data
fetch_mesh_data()

In [None]:
import spacy
py_text = "Cohort 1: Recurrent NSCLC with BRAF Class II alterations or KRAS mutations other than G12C {ie, G12D, G12V} mutations {with Sponsor approval for KRAS mutations} without small cell lung cancer transformation with progressive disease confirmed by radiographic assessment."
py_nlp = spacy.load("en_core_web_sm")
py_doc = py_nlp(py_text)
displacy.render(py_doc, style="dep", jupyter=True)

In [None]:
import os.path
import sys
target_directory = "/home/mabdallah/BERN2/multi_ner/"

In [None]:
sys.path.append(target_directory)

In [None]:
import subprocess
import os
import time
import requests
# Restart the local Bio-NER (BERN2) server and wait until it answers HTTP requests.
# Specify the path to your .sh script and the directory you want to change to
current_directory = os.getcwd()
run_path = "/home/mabdallah/BERN2/scripts/run_bern2.sh"
stop_path = "/home/mabdallah/BERN2/scripts/stop_bern2.sh"
working_directory = "/home/mabdallah/BERN2/scripts/"
timeout = 60  # Seconds to wait for the server to answer; adjust this value as needed
server_url = "http://localhost:8888"  # Update with the actual URL

os.chdir(working_directory)
try:
    print("Stopping existing Bio-NER server instance.")
    stop_process = subprocess.Popen(["bash", stop_path], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    stop_process.wait()
    print("Activating Bio-NER Server. This can take between 30 seconds and 1 minute.")
    subprocess.Popen(["bash", run_path], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    # Poll the server until it responds with a 2xx status or the timeout elapses
    start_time = time.time()
    server_available = False
    while True:
        try:
            # The per-request timeout keeps a single stalled connection from
            # blocking the loop beyond the overall deadline.
            response = requests.get(server_url, timeout=5)
            response.raise_for_status()  # Raises an exception for non-2xx status codes
            server_available = True
            break
        except requests.RequestException:
            if time.time() - start_time >= timeout:
                break  # Timeout reached
            # Wait for a short time before checking again
            time.sleep(1)

    # Report the real outcome: success is only announced when a request succeeded
    if server_available:
        print("Server is now available.")
    else:
        print(f"Server did not become available within {timeout} seconds.")
except (OSError, subprocess.SubprocessError) as e:
    # Popen signals launch failures (e.g. a missing executable) through OSError
    print(f"Error executing the script: {e}")
finally:
    # Always return to the original working directory, even if something failed above
    os.chdir(current_directory)

In [None]:
def resolve_overlap(tagged_docs, tmvar_docs):
    """
    Keep a single winning annotation per character span, then attach mutation tags.

    Only the first element of each argument is used. ``tagged_docs[0]['entities']``
    maps an entity type to a list of mention dicts (keys read here: 'start', 'end',
    'id', 'is_neural_normalized'), and ``tagged_docs[0]['prob'][type][i][1]`` is
    read as the probability of the i-th mention of that type.

    Step 1: for mentions sharing the same start/end, prefer one that has a CUI
            (id other than 'CUI-less'), then the higher probability.
    Step 2: overwrite the 'mutation' entities with those from ``tmvar_docs[0]``.

    ``tagged_docs`` is modified in place and also returned.
    """
    first_doc = tagged_docs[0]
    entities = first_doc['entities']

    # [Step 1] collect every candidate annotation under its "start-end" key
    span2mentions = {}
    for entity_type, mentions in entities.items():
        for position, mention in enumerate(mentions):
            span_key = "%d-%d" % (mention['start'], mention['end'])
            bucket = span2mentions.setdefault(span_key, [])
            bucket.append({
                "type": entity_type,
                "CUI": mention['id'],
                "check_CUI": 0 if mention['id'] == 'CUI-less' else 1,
                "prob": first_doc['prob'][entity_type][position][1],
                "is_neural_normalized": mention['is_neural_normalized'],
            })

    # rank candidates of each span: CUI-bearing first, then by descending probability
    for candidates in span2mentions.values():
        candidates.sort(key=lambda cand: (cand['check_CUI'], cand['prob']), reverse=True)

    # keep a mention only when it is the top-ranked candidate of its span
    for entity_type, mentions in entities.items():
        survivors = []
        for mention in mentions:
            best = span2mentions["%d-%d" % (mention['start'], mention['end'])][0]
            if best['CUI'] == mention['id'] and best['type'] == entity_type:
                survivors.append(mention)
        # the key already exists, so the dict keeps its size while being iterated
        entities[entity_type] = survivors

    # [Step 2] add mutation annotation
    entities['mutation'] = tmvar_docs[0]['entities']['mutation']
    print(tmvar_docs)
    return tagged_docs

In [None]:
from downloader import Downloader

In [None]:
Downloader(id_list=["NCT05786924"], n_jobs=5).download_and_update_trials()

In [None]:
rootfile = "../data/preprocessed_data/"
rootfile + "/pre"

In [None]:
from preprocessing import Preprocessor

In [None]:
import os

# Replace 'path_to_directory' with the path of your directory
directory_path = '/mnt/cbib/EOSC4Cancer/synthetic_data/'

# Get the list of folders in the directory
folder_list = [folder for folder in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, folder))]
X = Preprocessor(id_list=folder_list, n_jobs=5).preprocess_patient_clinical_notes()

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def find_and_remove_overlaps(dictionary_list, preferred_groups):
    """Collapse annotations that share the same 'text' down to one entry each.

    The first entry seen for a text is kept; a later entry with the same text
    replaces it only when its 'entity_group' is one of ``preferred_groups``.
    Returns the surviving entries as a list, ordered by first appearance of
    each text.
    """
    favoured = set(preferred_groups)
    kept_by_text = {}

    for item in dictionary_list:
        key = item['text']
        label = item['entity_group']
        # New text: always store. Known text: overwrite only for a preferred group.
        if key not in kept_by_text or label in favoured:
            kept_by_text[key] = item

    return list(kept_by_text.values())

In [None]:
find_and_remove_overlaps(sentence["annotations"], preferred_groups=["gene", "ProteinMutation", "DNAMutation", "SNP"])

In [None]:
def find_and_remove_overlaps(dictionary_list, preferred_groups):
    """Deduplicate entries by 'text', letting preferred groups win ties.

    Entries are scanned in order. The first entry for a text is stored; a
    subsequent entry with the same text replaces the stored one only if its
    'group' belongs to ``preferred_groups``. The result is a list of the
    stored entries in order of each text's first occurrence.
    """
    preferred_set = frozenset(preferred_groups)
    chosen = {}

    for candidate in dictionary_list:
        surface, label = candidate['text'], candidate['group']
        already_seen = surface in chosen
        # A repeated text from a non-preferred group never displaces the stored entry
        if already_seen and label not in preferred_set:
            continue
        chosen[surface] = candidate

    return [*chosen.values()]

# Example usage:
input_list = [
    {'text': 'apple', 'group': 'A'},
    {'text': 'banana', 'group': 'B'},
    {'text': 'apple', 'group': 'B'},
    {'text': 'banana', 'group': 'C'},
    {'text': 'date', 'group': 'C'},
]

preferred_groups = ['A', 'B']

result = find_and_remove_overlaps(input_list, preferred_groups)
print(result)


In [None]:
sent_dict

In [None]:
ent_list[0]

In [None]:
def negation_handling(sentence, entity):
    """Mark whether `entity` is negated in `sentence` using a medspaCy pipeline.

    `entity` is a dict; the keys read here are 'start' and 'end' (character
    offsets into `sentence`) and 'entity_group' (used as the span label).
    The entity is injected into the doc by a custom component placed before
    'medspacy_context', and the `is_negated` extension of the resulting doc
    entities is then read.

    Returns the same `entity` dict. 'is_negated' is set to "yes"/"no" when the
    doc ends up with at least one entity; if the offsets could not be mapped to
    tokens, the dict is returned without that key instead of raising.
    """
    med_nlp = medspacy.load()
    med_nlp.disable_pipe('medspacy_target_matcher')
    # med_nlp.disable_pipe('medspacy_pyrush')

    # NOTE(review): the component is registered under a fixed name on every call and
    # closes over this call's `entity`. Confirm how spaCy handles re-registration of
    # the same name — a previously registered closure (bound to an earlier `entity`)
    # may be reused on later calls.
    @Language.component("add_custom_entity")
    def add_custom_entity(doc):
        start_token = None
        end_token = None
        # Find the corresponding tokens for the start and end positions
        for token in doc:
            token_end = token.idx + len(token.text)
            if start_token is None and token.idx <= entity["start"] < token_end:
                start_token = token
            if end_token is None and token.idx <= entity["end"] <= token_end:
                end_token = token
        # Only annotate when both boundaries were located; otherwise `.i` would be
        # read from None and the whole pipeline call would fail.
        if start_token is not None and end_token is not None:
            doc.set_ents([Span(doc, start_token.i, end_token.i + 1, entity["entity_group"])])
        return doc

    med_nlp.add_pipe("add_custom_entity", before='medspacy_context')
    doc = med_nlp(sentence)
    for e in doc.ents:
        entity["is_negated"] = "yes" if str(e._.is_negated) == "True" else "no"
    return entity

In [None]:
ent_dict = ent_list[0]

In [None]:
# Show only the label and surface text of the selected entity.
# (The original line had an unclosed bracket, and a list cannot be used as a dict key.)
{key: ent_dict[key] for key in ("entity_group", "text")}

In [None]:
ent_list[0]["sentence"] = "No Histologically or cytologically confirmed recurrent/advanced metastatic solid tumors nor histiocytic neoplasms with documented BRAF or RAS {NRAS or KRAS} mutations"

In [None]:
ent_list[0]["sentence"]

In [None]:
import re

# Sample text
text = "Hello, World! (This) is an example sentence."

# Remove punctuation using regex
clean_text = re.sub(r'[^\w\s]', '', text)

print(clean_text)

In [None]:
from entity_recognition import EntityRecognizer

In [None]:
import pandas as pd
import os
folder_path = '../data/trials_xmls/'  # Replace this with the path to your folder
file_names = []
# List all files in the folder
for file in os.listdir(folder_path):
    if os.path.isfile(os.path.join(folder_path, file)):
        file_name, file_extension = os.path.splitext(file)
        file_names.append(file_name)

reco = EntityRecognizer(n_jobs=5, id_list=file_names, data_source="clinical trials")

KeyboardInterrupt: 

In [None]:
entities

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first", device=0) # pass device=0 if using gpu
pipe("""has not undergone a hysterectomy or bilateral oophorectomy""")

In [None]:
entities.head(50)

In [None]:
regexs = pd.read_csv("../data/regex_variants.tsv", sep="\t", header=None)

In [None]:
# Extra regex row for pregnancy-related terms (keys 0, 1, 2 are the column positions).
# The pattern must be a raw string: in a normal string literal "\b" is the backspace
# character, so the word-boundary anchors would never match.
# NOTE(review): the first alternative requires "pregnanc…", so the word "pregnant" is not matched — confirm this is intended.
new_row_dict = {0: "pregnancy", 1: "pregnancy", 2: r"\b(?:pregnanc(?:y|ies|ial)?|expect(?:ing|ant)|matern(?:al|ity)|gravid|gestation(?:al)?|prenatal|antenatal|postpartum|lactat(?:e|ing|ion)|nurs(?:ing)?|breastfeed(?:ing|s)?)\b"}

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same extended frame.
# The new row is wrapped in a one-row DataFrame whose columns are the dict keys (0, 1, 2).
# Re-running this cell adds the row again, so run it once after loading `regexs`.
regexs = pd.concat([regexs, pd.DataFrame([new_row_dict])], ignore_index=True)

In [None]:
import medspacy
from spacy.tokens import Span
import spacy
entity = {'entity_group': 'Diagnostic_procedure', 'score': 0.99992955, 'text': 'eastern cooperative oncology group', 'start': 0, 'end': 34}
sentence = "Eastern Cooperative Oncology Group {ECOG} Performance Score {PS} of 0, 1 or 2"

def negation_handling(sentence, entity):
    """Locate `entity["text"]` in the lower-cased sentence and read its negation flag.

    A spaCy "en_core_web_sm" pipeline (NER disabled) is wrapped with medspaCy,
    with the target matcher and PyRuSH pipes disabled. A custom component placed
    before 'medspacy_context' finds the entity text with `str.find`, maps the
    character range to tokens and sets it as the doc's only entity.

    Returns the same `entity` dict; 'is_negated' is set to "yes"/"no" only if
    the doc ended up with at least one entity. Intermediate values are printed.
    """
    pipeline = medspacy.load(spacy.load("en_core_web_sm", disable={"ner"}))
    for pipe_name in ('medspacy_target_matcher', 'medspacy_pyrush'):
        pipeline.disable_pipe(pipe_name)
    print(entity["text"])

    @Language.component("add_custom_entity")
    def add_cutom_entity(doc):
        print(doc)
        needle = entity["text"]
        start_char = doc.text.find(needle)
        print(start_char)
        # inclusive position of the last character of the match
        end_char = start_char + len(needle) - 1
        print(end_char)

        first_tok = last_tok = None
        # first token containing the start offset / first token containing the end offset
        for tok in doc:
            tok_stop = tok.idx + len(tok.text)
            if first_tok is None and tok.idx <= start_char < tok_stop:
                first_tok = tok
            if last_tok is None and tok.idx <= end_char <= tok_stop:
                last_tok = tok

        # both boundaries are fixed once found, so the span is set once after the scan
        if first_tok is not None and last_tok is not None:
            doc.set_ents([Span(doc, first_tok.i, last_tok.i + 1, entity["entity_group"])])
        return doc

    pipeline.add_pipe("add_custom_entity", before='medspacy_context')
    doc = pipeline(sentence.lower())
    print(doc.ents)
    for ent in doc.ents:
        # print(rs)
        entity["is_negated"] = "yes" if str(ent._.is_negated) == "True" else "no"
    return entity

In [None]:
entss = negation_handling(sentence=sentence, entity=entity)

In [None]:
entss

In [None]:
import medspacy
import spacy
nlp = spacy.load("en_core_web_sm", disable={"ner"})
nlp = medspacy.load(nlp)
nlp.disable_pipe('medspacy_target_matcher')
nlp.disable_pipe('medspacy_pyrush')

In [None]:
doc = nlp("Platelets > 50 x 109/L with no platelet transfusions in the prior 7 days")


In [None]:

for token in doc:
    print(token)

In [None]:
sentence = "This is an example sentence with some text in it."
substring = "example sentence"

# Find the start and end positions of the substring
start = sentence.find(substring)
end = start + len(substring) - 1  # Subtract 1 to get the inclusive end position

if start != -1:
    print(f"Start position: {start}")
    print(f"End position: {end}")
else:
    print("Substring not found in the sentence.")

In [None]:
import spacy


nlp = spacy.load("en_core_web_sm", disable={"ner"})
nlp = medspacy.load(nlp)
nlp.disable_pipe('medspacy_target_matcher')
nlp.disable_pipe('medspacy_pyrush')
doc=nlp(sentence)

In [None]:
doc = nlp(sentence.lower())

In [None]:

# start_char = doc.text.find(entity["text"])

In [None]:
start_char

In [None]:
words = [token.text for token in doc]
combined_sentence = ' '.join(words)

In [None]:
combined_sentence

In [None]:
import spacy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def negation_handling(sentence, entity):
    """Fuzzy-match `entity["text"]` against token windows and read its negation flag.

    The custom component slides a window of as many tokens as the entity text
    has whitespace-separated words over the doc, scores each window with
    `fuzz.partial_ratio`, and sets the first window scoring at least 90 as the
    doc's only entity. The pipeline is "en_core_web_sm" (NER disabled) wrapped
    with medspaCy, with the target matcher and PyRuSH pipes disabled, and is
    run on the lower-cased sentence.

    Returns the same `entity` dict; 'is_negated' is set to "yes"/"no" only if
    the doc ended up with at least one entity. Match details are printed.
    """
    @Language.component("add_custom_entity")
    def add_custom_entity(doc):
        threshold = 90
        entity_text = entity["text"].lower()
        width = len(entity_text.split())
        words = [tok.text for tok in doc]

        # start offsets of every token window that is similar enough to the entity text
        hits = [
            offset
            for offset in range(len(words) - width + 1)
            if fuzz.partial_ratio(entity_text, " ".join(words[offset:offset + width])) >= threshold
        ]

        if hits:
            print(hits)
            # the first matching window wins; further matches are ignored
            first_tok = doc[hits[0]]
            print(first_tok)
            last_tok = doc[hits[0] + width - 1]
            print(doc[first_tok.i:last_tok.i + 1])
            doc.set_ents([Span(doc, first_tok.i, last_tok.i + 1, entity["entity_group"])])
        return doc

    pipeline = medspacy.load(spacy.load("en_core_web_sm", disable={"ner"}))
    for pipe_name in ('medspacy_target_matcher', 'medspacy_pyrush'):
        pipeline.disable_pipe(pipe_name)
    pipeline.add_pipe("add_custom_entity", before='medspacy_context')

    doc = pipeline(sentence.lower())
    for ent in doc.ents:
        entity["is_negated"] = "yes" if str(ent._.is_negated) == "True" else "no"
    return entity

In [None]:
sentence="willing and able to adhere to the study visit schedule and other protocol requirements."
entity = {'entity_group': 'Lab_value', 'score': 0.9387666, 'text': 'willing', 'start': 0, 'end': 7}
# negation_handling(sentence, entity)

In [None]:
entity["text"].split()

In [None]:
import spacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Your text
text = "engraftment including >95% myeloid cell donor chimerism and absolute neutrophil count {anc} > 1.0 x 109/l"
entity_text = "> 1. 0 x 109 / l"
# Process the text with SpaCy
doc = nlp(text)

# Character indices
start_char = 92
end_char = 105

# Find the token indices corresponding to the character span
token_indices = [i for i, token in enumerate(doc) if start_char <= token.idx < end_char]

# Print the result
print(f"Token indices corresponding to character span ({start_char}, {end_char}): {token_indices}")


In [None]:
import re
import spacy
import medspacy
entity= {'entity_group': 'Lab_value', 'score': 0.8586178, 'text': '- 14. 2', 'start': 17, 'end': 22}
sentence = "06:30AM BLOOD WBC-14.2* RBC-3.82* Hgb-11.0*# Hct-33.5*"
nlp = spacy.load("en_core_web_sm", disable={"ner"})
doc = nlp(sentence)
nlp = medspacy.load(nlp)
# doc = nlp(sentence)
entity["text"] = re.sub(r'([,.-])\s+', r'\1', entity["text"]) 
# print(entity["text"])
entity_text = entity["text"].lower()
start_char = entity["start"] 
end_char = entity["end"] 
start_indices = [i for i, token in enumerate(doc) if (start_char <= token.idx <= end_char) or (entity_text in token.text and token.idx <= start_char and token.idx + len(token.text) <= end_char)]
print(start_indices)
if start_indices:
# You can choose the first matching window or handle multiple matches
    start_index = start_indices[0]
    start_token = doc[start_index]
    end_index = min(start_index + len(entity_text.split()) - 1, len(doc) - 1)
    end_token = doc[end_index]
    # print(doc[start_token.i:end_token.i + 1])
    doc.set_ents([Span(doc, start_token.i, end_token.i + 1, entity["entity_group"])])


In [None]:
for i, token in enumerate(doc):
    print(token.text)

In [None]:
import numpy as np
start_indices = [i for i, token in enumerate(doc) if start_char <= token.idx < end_char]
if start_indices:
# You can choose the first matching window or handle multiple matches
    start_index = start_indices[0]
    start_token = doc[start_index]
    end_index = min(start_index + len(entity_text.split()) - 1, len(doc) - 1)
    end_token = doc[end_index]
    # print(doc[start_token.i:end_token.i + 1])


In [None]:
len(entity_text.split()) - 1

In [None]:
entities = [
    {
        "entity_group": "Lab_value",
        "score": 0.9990455508232117,
        "word": "<",
        "start": 11,
        "end": 12
    },
    {
        "entity_group": "Lab_value",
        "score": 0.9935429096221924,
        "word": "1. 5",
        "start": 13,
        "end": 16
    },
    {
        "entity_group": "Lab_value",
        "score": 0.9999258518218994,
        "word": "normal",
        "start": 19,
        "end": 25
    },
]

def combine_entities(entities, max_gap=3):
    """Merge neighbouring entities of the same group into one entity.

    Two consecutive entities are merged when they share 'entity_group' and the
    next one starts at most `max_gap` characters after the current one ends.
    The merged entity keeps the first entity's fields, with 'word' joined by a
    single space and 'end' extended to the last merged entity's end.

    Args:
        entities: iterable of dicts with at least 'entity_group', 'word',
            'start' and 'end'. The input dicts are not modified.
        max_gap: largest allowed `next['start'] - current['end']` for a merge.

    Returns:
        A new list of (copied) entity dicts; an empty list for empty input.
    """
    combined_entities = []
    current_entity = None

    for next_entity in entities:
        if (
            current_entity is not None
            and current_entity['entity_group'] == next_entity['entity_group']
            and next_entity['start'] - current_entity['end'] <= max_gap
        ):
            current_entity['word'] += ' ' + next_entity['word']
            current_entity['end'] = next_entity['end']
        else:
            if current_entity is not None:
                combined_entities.append(current_entity)
            # Work on a copy so that merging never rewrites the caller's dicts
            current_entity = dict(next_entity)

    if current_entity is not None:
        combined_entities.append(current_entity)
    return combined_entities

combined = combine_entities(entities)
print(combined)


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"  # You can choose different models from Hugging Face's repository
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Original sentence and its negation
sentence = "fantastic."
negation_sentence = "not fantastic."

# Tokenize and get IDs for the sentences
inputs_sentence = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
inputs_negation = tokenizer(negation_sentence, return_tensors="pt", padding=True, truncation=True)

# Generate embeddings for the sentences
with torch.no_grad():
    outputs_sentence = model(**inputs_sentence)
    outputs_negation = model(**inputs_negation)

# Extract the embeddings (CLS token) from the last layer
embedding_sentence = outputs_sentence.last_hidden_state[:, 0, :].numpy()
embedding_negation = outputs_negation.last_hidden_state[:, 0, :].numpy()

# Compute cosine similarity between the embeddings
similarity_score = cosine_similarity(embedding_sentence, embedding_negation)

# Print the sentences and their similarity score
print("Sentence:", sentence)
print("Negation Sentence:", negation_sentence)
print("Cosine Similarity:", similarity_score[0][0])


In [None]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
database = client['your_database_name']
collection = database['your_collection_name']


In [None]:
from pymongo import MongoClient

# Connect to MongoDB (assuming it's running locally on default port)
client = MongoClient('mongodb://localhost:27017/')
database = client['test_database']  # Change 'test_database' to your database name
collection = database['test_collection']  # Change 'test_collection' to your collection name

# Insert some sample data into the collection
data_to_insert = [
    {
        "name": "John",
        "age": 30,
        "address": {
            "city": "New York",
            "zipcode": "10001"
        }
    },
    {
        "name": "Alice",
        "age": 25,
        "address": {
            "city": "San Francisco",
            "zipcode": "94107"
        }
    },
    {
        "name": "Bob",
        "age": 35,
        "address": {
            "city": "Los Angeles",
            "zipcode": "90001"
        }
    }
]

# Insert the data into the collection
collection.insert_many(data_to_insert)

# Perform a search/query
# Find documents where the city in the address is "New York"
result = collection.find({"address.city": "New York"})

# Iterate through the results and print them
for doc in result:
    print(doc)

# Close the connection to MongoDB
client.close()


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Score a premise/hypothesis pair with a multilingual NLI model and print the
# class percentages.
device = torch.device("cpu")

model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

premise = "'gene': KRAS, 'is_negated':no"
hypothesis = "'gene': KRAS, 'is_negated':yes"

# Named `nli_inputs` rather than `input` so the builtin input() is not shadowed
# for the rest of the notebook session.
nli_inputs = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
# Inference only: gradients are not needed
with torch.no_grad():
    output = model(nli_inputs["input_ids"].to(device))  # device = "cuda:0" or "cpu"
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "neutral", "contradiction"]
# Pair each probability with its label and express it as a percentage with one decimal
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)


In [None]:
def remove_space_between_numbers(text):
    """Delete runs of whitespace/commas that sit between two digits.

    A run is removed only when the digit after it is followed by another digit
    or by the end of the string (the negative lookahead rejects a following
    non-digit character). Returns the rewritten string.
    """
    import re

    digit_gap = re.compile(r'(\d)[\s,]+(\d)(?!\D)')
    # Re-emit the two captured digits without the separator between them
    return digit_gap.sub(lambda found: found.group(1) + found.group(2), text)


In [None]:
import re

texts_with_spaces = [
   "1- 2"
]

texts_without_spaces = [re.sub(r'([,.-])\s+', r'\1', text) for text in texts_with_spaces]

for text in texts_without_spaces:
    print(text)


In [None]:
import pandas as pd

In [None]:
proc_nct = pd.read_csv("../data/ner_clinical_trials/entities_parsed.csv")

In [None]:
proc_nct

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first") # pass device=0 if using gpu
pipe("""For participants in the NSCLC Cohort: Known tumor programmed death-ligand 1 {PD-L1} expression status as determined by an immunohistochemistry assay during participation in other clinical studies {e.g., participants whose PD-L1 expression status was determined during screening for entry into a study with anti-programmed death 1 or anti-PD-L1 antibodies but were not eligible are excluded}""")


In [None]:
import re
import pandas as pd
from spacy.tokens import Span

def pregnancy_recognizer(self, text):
    """Tag pregnancy-related words in `text` and report whether each is negated.

    A medspaCy pipeline (target matcher disabled) gets a regex-based component
    inserted before 'medspacy_context'. Every regex hit that aligns with token
    boundaries becomes a "pregnancy" span; these are merged with any existing
    entities through `filter_spans`.

    `self` is not used by the body. Returns a list of dicts with the keys
    'entity_group', 'text', 'start', 'end' and 'is_negated' ("yes"/"no"),
    one per entity of the processed doc.
    """
    # Updated regex pattern
    regex_pattern = r"(?i)\b(?:pregn\w+|matern\w+|gestat\w+|lactat\w+|breastfeed\w+|prenat\w+|antenat\w+|postpartum|childbear\w+|parturient|conceiv\w+|obstetr\w+)\b"

    pipeline = medspacy.load()
    pipeline.disable_pipe('medspacy_target_matcher')

    @Language.component("pregnancy-ner")
    def regex_pattern_matcher_for_pregnancy(doc):
        matcher = re.compile(regex_pattern)
        candidate_ents = list(doc.ents)

        for hit in matcher.finditer(doc.text):
            aligned = doc.char_span(*hit.span())
            # char_span yields None when the match does not line up with token boundaries
            if aligned is None:
                continue
            # Assigning the label "pregnancy"
            candidate_ents.append(Span(doc, aligned.start, aligned.end, label="pregnancy"))

        doc.ents = filter_spans(candidate_ents)
        return doc

    pipeline.add_pipe("pregnancy-ner", before='medspacy_context')
    doc = pipeline(text)

    return [
        {
            "entity_group": found.label_,
            "text": found.text,
            "start": found.start_char,
            "end": found.end_char,
            "is_negated": "yes" if found._.is_negated else "no",
        }
        for found in doc.ents
    ]


In [None]:
import requests

def get_cancer_trials():
    """Collect the NCT IDs of all studies matching a fixed cancer/mutation query.

    Pages through the ClinicalTrials.gov `full_studies` endpoint 100 ranks at a
    time until a page comes back without studies, the response lacks a
    "FullStudiesResponse" section, or a non-200 status is returned (both of the
    latter print a message).

    NOTE(review): this targets the classic /api/query/full_studies endpoint;
    confirm it is still served — ClinicalTrials.gov also publishes a newer API
    with a different response layout.

    Returns:
        list of NCT ID strings, in the order they were returned.
    """
    base_url = "https://clinicaltrials.gov/api/query/full_studies"
    trials_list = []

    start = 0
    rows = 100  # Maximum rows allowed per request

    while True:
        search_params = {
            "expr": "(cancer) AND (Interventional) AND (mutation)",
            "min_rnk": start + 1,
            "max_rnk": start + rows,
            "fmt": "json",
            "fields": "NCTId"
        }

        # A timeout keeps a stalled connection from blocking the notebook indefinitely
        response = requests.get(base_url, params=search_params, timeout=30)

        if response.status_code != 200:
            print("Failed to retrieve data. Status code:", response.status_code)
            break

        trials_data = response.json()
        if "FullStudiesResponse" not in trials_data:
            print("No trials found matching the criteria.")
            break

        # Read defensively: a response that omits "FullStudies" is treated as
        # "no more records" instead of raising KeyError.
        studies = trials_data["FullStudiesResponse"].get("FullStudies", [])
        if not studies:  # No more records
            break

        for study in studies:
            trials_list.append(study["Study"]["ProtocolSection"]["IdentificationModule"]["NCTId"])

        start += rows

    return trials_list

# Example usage:
cancer_trials = get_cancer_trials()
print("Clinical trial IDs related to cancer, intervention, and mutations:")
print(cancer_trials)


In [None]:
len(cancer_trials)

In [None]:
cancer_trials

In [None]:
import requests

def get_cancer_trials(max_trials):
    """Collect up to `max_trials` distinct NCT IDs for a fixed cancer/mutation query.

    Pages through the ClinicalTrials.gov `full_studies` endpoint 100 ranks at a
    time. Paging stops when `max_trials` distinct IDs were collected, a page has
    no studies, the response lacks a "FullStudiesResponse" section, or a non-200
    status is returned (both of the latter print a message).

    NOTE(review): this targets the classic /api/query/full_studies endpoint;
    confirm it is still served — ClinicalTrials.gov also publishes a newer API
    with a different response layout.

    Args:
        max_trials: upper bound on the number of IDs returned; values <= 0
            return an empty list without any request.

    Returns:
        Sorted list of distinct NCT ID strings.
    """
    base_url = "https://clinicaltrials.gov/api/query/full_studies"
    trials_set = set()
    page_size = 100  # Number of trials per page
    current_rank = 1

    # Progress is measured on the set itself so duplicate IDs do not count toward the limit
    while len(trials_set) < max_trials:
        search_params = {
            "expr": "((cancer) OR (neoplasm)) AND ((interventional) OR (treatment)) AND ((mutation) OR (variant))",
            "min_rnk": current_rank,
            "max_rnk": current_rank + page_size - 1,
            "fmt": "json",
            "fields": "NCTId"
        }

        # A timeout keeps a stalled connection from blocking the notebook indefinitely
        response = requests.get(base_url, params=search_params, timeout=30)

        if response.status_code != 200:
            print("Failed to retrieve data. Status code:", response.status_code)
            break

        trials_data = response.json()
        if "FullStudiesResponse" not in trials_data:
            print("No trials found matching the criteria.")
            break

        # Read defensively: a response that omits "FullStudies" ends the paging
        studies = trials_data["FullStudiesResponse"].get("FullStudies", [])
        if not studies:
            break  # No more studies found, exit the loop

        for study in studies:
            trials_set.add(study["Study"]["ProtocolSection"]["IdentificationModule"]["NCTId"])
            if len(trials_set) >= max_trials:
                break
        current_rank += page_size

    # Sorted so the result order is the same on every run (set order is arbitrary)
    return sorted(trials_set)

# Example usage: fetch at most `max_trials_to_fetch` trials
max_trials_to_fetch = 1000
cancer_trials = get_cancer_trials(max_trials_to_fetch)
print("Clinical trial IDs related to cancer, intervention, and mutations (up to {} trials):".format(max_trials_to_fetch))
print(cancer_trials)


In [None]:
import requests

def query_plain(text, url="http://localhost:8888/plain"):
    """POST `text` as a JSON body ({'text': ...}) to `url` and return the decoded JSON reply."""
    payload = {'text': text}
    reply = requests.post(url, json=payload)
    return reply.json()

if __name__ == '__main__':
    text = "Autophagy maintains tumour growth through circulating arginine."
    print(query_plain(text))

In [None]:
from multiner_server import start_multiner_server

In [None]:
start_multiner_server()