# Named Entity Tagging and Normalization
- In this notebook, we will develop systems for two subtasks of the Bacteria Biotope Task (https://sites.google.com/view/bb-2019/task-description).
    1) **BB-norm:** Normalization of Microorganism, Habitat, and Phenotype entities with NCBI Taxonomy taxa (for the former) and OntoBiotope habitat concepts (for the last two).
    2) **BB-norm+ner:** Recognition of Microorganism, Habitat, and Phenotype entities and normalization with NCBI Taxonomy taxa and OntoBiotope habitat concepts.
- For the BB-norm subtask, we will develop a biomedical named entity normalizer that links the entities (Microorganism, Habitat, and Phenotype) mentioned in a given text to concepts of a given ontology, assuming that the entity boundaries are already provided.
- We will assume that the entities are embedded as noun phrases to achieve this goal.

In [None]:
import pronto
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
import string
import nltk
from nltk.corpus import stopwords
import os
import spacy
import time

In [None]:
# Fetch the NLTK "stopwords" corpus so that `nltk.corpus.stopwords` can be read.
# NOTE(review): no later cell in this notebook appears to use the stopword
# list — confirm whether this download is still required.
nltk.download('stopwords')

In [None]:
# Root folder for every external resource joined onto it below (ontology file,
# names.dmp, test documents, spaCy model archive).
# NOTE(review): absolute, machine-specific Windows path — adjust before
# running the notebook on another machine.
resource_directory = r"C:\Users\said_\OneDrive\Masaüstü\github\Natural Language Processing\Datasets\Resources"

## 1) Search Mechanism within OntoBiotope Ontology and NCBI Taxonomy

In [None]:
import gensim.downloader as api

# Load the pre-trained "word2vec-google-news-300" embeddings through the gensim
# downloader. The matching functions below read `word2vec_model` as a global.
# NOTE(review): presumably this downloads the model on first use and caches it
# locally afterwards — confirm against the gensim downloader documentation.
word2vec_model = api.load("word2vec-google-news-300")

### a. Check OntoBiotope Ontology

In [None]:
# Parse the OntoBiotope ontology (OBO file) stored in the resource folder
obo_file_path = os.path.join(resource_directory, "OntoBiotope_BioNLP-OST-2019.obo")
ontology = pronto.Ontology(obo_file_path)

# Flatten the ontology into (term id, term name) pairs; this list is what the
# matching functions iterate over.
# NOTE(review): assumes every term has a name — confirm pronto never yields None here.
onto_entities = [(term.id, term.name) for term in ontology.terms()]

In [None]:
def preprocess_input_text(input_text):
    """Normalise a term before matching.

    The text is lowercased, one-character dotted prefixes that are directly
    followed by a word (e.g. the "m. " in "m. pneumoniae", or the "1." in
    "1.25") are removed, and surrounding whitespace is trimmed.

    Args:
        input_text: Raw term or mention text.

    Returns:
        The normalised string.
    """
    lowered = input_text.lower()

    # A single word character plus a dot at a word start, along with any
    # whitespace that separates it from the next word
    without_abbreviations = re.sub(r'\b(\w\.)\s*\w*?\b', '', lowered)

    return without_abbreviations.strip()

def check_ontobiotope_match(input_text, onto_entities):
    """Find the OntoBiotope term most similar to *input_text*.

    A term is a candidate when its normalised name equals the normalised input
    or one of the input tokens, or when it shares at least one token with the
    input. Candidates are ranked by cosine similarity between the averaged
    word2vec vectors of the input tokens and of the term tokens (tokens missing
    from the global ``word2vec_model`` are ignored).

    Args:
        input_text: Mention text to normalise.
        onto_entities: Iterable of ``(term_id, term_name)`` pairs.

    Returns:
        ``(term_id, term_name, similarity)`` for the best candidate (the first
        one encountered wins ties), or ``(None, None, None)`` when no candidate
        could be scored.
    """
    input_text = preprocess_input_text(input_text)
    input_tokens = input_text.split()

    # The query vector does not depend on the ontology entry, so build it once
    # instead of once per candidate.
    input_vectors = [word2vec_model[token] for token in input_tokens if token in word2vec_model]
    if not input_vectors:
        # Nothing to compare against: no candidate could ever be scored
        return None, None, None
    input_vector = np.mean(input_vectors, axis=0).reshape(1, -1)

    # Individual tokens plus the full phrase, de-duplicated for fast lookup
    terms_to_check = set(input_tokens)
    terms_to_check.add(input_text)

    best_match = None
    for term_id, term_name in onto_entities:
        # Unnamed terms can never match and would break the preprocessing
        if not term_name:
            continue

        term_name_processed = preprocess_input_text(term_name)
        term_tokens = term_name_processed.split()

        # Keep only terms whose full name or any token occurs in the query
        if term_name_processed not in terms_to_check and terms_to_check.isdisjoint(term_tokens):
            continue

        term_vectors = [word2vec_model[token] for token in term_tokens if token in word2vec_model]
        if not term_vectors:
            continue
        term_vector = np.mean(term_vectors, axis=0).reshape(1, -1)

        similarity = cosine_similarity(input_vector, term_vector)[0][0]

        # Strict comparison keeps the earliest candidate on ties
        if best_match is None or similarity > best_match[2]:
            best_match = (term_id, term_name, similarity)

    if best_match is None:
        return None, None, None
    return best_match

In [None]:
# Query the ontology with a multi-word sample and report the top-scoring term
input_text = "human animal plant"
best_match_id, best_match_name, similarity_score = check_ontobiotope_match(
    input_text, onto_entities
)

if not best_match_id:
    print("No match found.")
else:
    print(
        f"Best match ID: {best_match_id}",
        f"Best match Name: {best_match_name}",
        f"Cosine Similarity Score: {similarity_score}",
        sep="\n",
    )

### b. Check NCBI Taxonomy

In [None]:
def extract_ncbi_taxonomy_info(file_path):
    """Read ``(id, name)`` pairs from a pipe-delimited taxonomy dump.

    Each line is split on ``|``; the first two fields, stripped of surrounding
    whitespace, form one pair. Lines with fewer than two fields are skipped.

    Args:
        file_path: Path to the UTF-8 encoded dump file (e.g. ``names.dmp``).

    Returns:
        List of ``(entity_id, entity_name)`` string tuples in file order.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = [field.strip() for field in raw_line.strip().split('|')]
            if len(fields) < 2:
                continue
            records.append((fields[0], fields[1]))
    return records

# Parse names.dmp from the resource folder into (id, name) pairs
ncbi_taxonomy_file_path = os.path.join(resource_directory, "names.dmp")
ncbi_taxonomy_entities = extract_ncbi_taxonomy_info(ncbi_taxonomy_file_path)

# Show the first 10 pairs as a quick sanity check of the parsing
print(ncbi_taxonomy_entities[:10])

In [None]:
def preprocess_input_text(input_text):
    """Normalise a term before matching.

    The text is lowercased, one-character dotted prefixes that are directly
    followed by a word (e.g. the "m. " in "m. pneumoniae", or the "1." in
    "1.25") are removed, and surrounding whitespace is trimmed.

    Args:
        input_text: Raw term or mention text.

    Returns:
        The normalised string.
    """
    lowered = input_text.lower()

    # A single word character plus a dot at a word start, along with any
    # whitespace that separates it from the next word
    without_abbreviations = re.sub(r'\b(\w\.)\s*\w*?\b', '', lowered)

    return without_abbreviations.strip()

def check_ncbi_taxonomy_match(input_text, ncbi_taxonomy_entities):
    """Find the NCBI Taxonomy name most similar to *input_text*.

    An entity is a candidate when its normalised name equals the normalised
    input or one of the input tokens, or when it shares at least one token with
    the input. Candidates are ranked by cosine similarity between the averaged
    word2vec vectors of the input tokens and of the entity tokens (tokens
    missing from the global ``word2vec_model`` are ignored).

    Args:
        input_text: Mention text to normalise.
        ncbi_taxonomy_entities: Iterable of ``(entity_id, entity_name)`` pairs.

    Returns:
        ``(entity_id, entity_name, similarity)`` for the best candidate (the
        first one encountered wins ties), or ``(None, None, None)`` when no
        candidate could be scored.
    """
    input_text = preprocess_input_text(input_text)
    input_tokens = input_text.split()

    # The query vector does not depend on the taxonomy entry, so build it once
    # instead of once per candidate.
    input_vectors = [word2vec_model[token] for token in input_tokens if token in word2vec_model]
    if not input_vectors:
        # Nothing to compare against: no candidate could ever be scored
        return None, None, None
    input_vector = np.mean(input_vectors, axis=0).reshape(1, -1)

    # Individual tokens plus the full phrase, de-duplicated for fast lookup
    terms_to_check = set(input_tokens)
    terms_to_check.add(input_text)

    best_match = None
    for entity_id, entity_name in ncbi_taxonomy_entities:
        # Unnamed entries can never match and would break the preprocessing
        if not entity_name:
            continue

        entity_name_processed = preprocess_input_text(entity_name)
        entity_tokens = entity_name_processed.split()

        # Keep only entities whose full name or any token occurs in the query
        if entity_name_processed not in terms_to_check and terms_to_check.isdisjoint(entity_tokens):
            continue

        entity_vectors = [word2vec_model[token] for token in entity_tokens if token in word2vec_model]
        if not entity_vectors:
            continue
        entity_vector = np.mean(entity_vectors, axis=0).reshape(1, -1)

        similarity = cosine_similarity(input_vector, entity_vector)[0][0]

        # Strict comparison keeps the earliest candidate on ties
        if best_match is None or similarity > best_match[2]:
            best_match = (entity_id, entity_name, similarity)

    if best_match is None:
        return None, None, None
    return best_match

In [None]:
# Query the taxonomy with a short sample string and report the top-scoring name
input_text = "abc"
best_match_id, best_match_name, similarity_score = check_ncbi_taxonomy_match(
    input_text, ncbi_taxonomy_entities
)

if not best_match_id:
    print("No match found.")
else:
    print(
        f"Best match ID: {best_match_id}",
        f"Best match Name: {best_match_name}",
        f"Cosine Similarity Score: {similarity_score}",
        sep="\n",
    )

### c. Put them all together

In [None]:
def get_best_match(input_text, onto_entities, ncbi_taxonomy_entities):
    """Pick the better of the OntoBiotope and NCBI Taxonomy matches.

    Args:
        input_text: Mention text to normalise.
        onto_entities: ``(term_id, term_name)`` pairs from OntoBiotope.
        ncbi_taxonomy_entities: ``(entity_id, entity_name)`` pairs from NCBI
            Taxonomy.

    Returns:
        ``(source_label, match_name, match_id)`` where ``source_label`` is
        ``"Habitat or Phenotype"`` for an OntoBiotope match and
        ``"NCBI Taxonomy"`` for a taxonomy match. ``(None, None, None)`` is
        returned when neither resource matches or when both scores are equal.
    """
    onto_match_id, onto_match_name, onto_similarity = check_ontobiotope_match(input_text, onto_entities)
    ncbi_match_id, ncbi_match_name, ncbi_similarity = check_ncbi_taxonomy_match(input_text, ncbi_taxonomy_entities)

    # Cosine similarity can be zero or negative, so a missing match must rank
    # below every real score; a sentinel of 0 would let "no match" beat (or
    # tie with) a genuine match and discard it.
    no_match = float("-inf")
    if onto_similarity is None:
        onto_similarity = no_match
    if ncbi_similarity is None:
        ncbi_similarity = no_match

    if onto_similarity > ncbi_similarity:
        return "Habitat or Phenotype", onto_match_name, onto_match_id
    if ncbi_similarity > onto_similarity:
        return "NCBI Taxonomy", ncbi_match_name, ncbi_match_id

    # Neither resource matched, or both scored exactly the same.
    # NOTE(review): an exact tie between two genuine matches is reported as
    # "no match" — confirm whether one resource should take precedence.
    return None, None, None

In [None]:
# Time one complete lookup (OntoBiotope + NCBI Taxonomy) for a sample mention
input_text = "respiratory syncytial virus"

start_time = time.time()
entity_type, best_match_name, best_match_id = get_best_match(
    input_text, onto_entities, ncbi_taxonomy_entities
)
end_time = time.time()

# Elapsed wall-clock time in seconds
runtime = end_time - start_time

# Report the winning match, if any
if not best_match_id:
    print("No match found.")
else:
    print(
        f"Entity Type: {entity_type}",
        f"Best match name: {best_match_name}",
        f"Best match ID: {best_match_id}",
        sep="\n",
    )

print(f"Runtime (min): {(runtime/60):.2f}")

In [None]:
def count_text_files(directory, extension=".txt"):
    """Count the entries of *directory* whose name ends with *extension*.

    Args:
        directory: Folder to inspect (not searched recursively).
        extension: Filename suffix to count; defaults to ``".txt"``.

    Returns:
        Number of matching directory entries.
    """
    return sum(1 for filename in os.listdir(directory) if filename.endswith(extension))

# Count the .txt documents of the BB-norm+ner test set; `input_file_directory`
# is reused later as the input folder when the .a1/.a2 files are generated.
input_file_directory = os.path.join(resource_directory, "BioNLP-OST-2019_BB-norm+ner_test")
num_text_files = count_text_files(input_file_directory)
print(f"Number of text files in the directory: {num_text_files}")

**Note:** We have 96 test documents. Assuming each document contains an average of 40 entities, checking for matches would take approximately 125 hours (estimated from around 80 minutes for each run). To simplify the analysis, we will focus solely on matches within the OntoBiotope Ontology and label them as 'Habitat or Phenotype' entities.

In [None]:
def adjusted_best_match(input_text, onto_entities):
    """Match *input_text* against OntoBiotope only.

    Args:
        input_text: Mention text to normalise.
        onto_entities: ``(term_id, term_name)`` pairs from OntoBiotope.

    Returns:
        ``("Habitat or Phenotype", match_name, match_id)``. The label is
        always returned; ``match_name`` and ``match_id`` are ``None`` when no
        term matched, so callers should test ``match_id``.
    """
    # The similarity score is not needed here; only the matched term is returned
    onto_match_id, onto_match_name, _ = check_ontobiotope_match(input_text, onto_entities)

    return "Habitat or Phenotype", onto_match_name, onto_match_id

In [None]:
# Time an OntoBiotope-only lookup for the same sample mention
input_text = "respiratory syncytial virus"

start_time = time.time()
entity_type, best_match_name, best_match_id = adjusted_best_match(
    input_text, onto_entities
)
end_time = time.time()

# Elapsed wall-clock time in seconds
runtime = end_time - start_time

# Report the match, if any
if not best_match_id:
    print("No match found.")
else:
    print(
        f"Entity Type: {entity_type}",
        f"Best match name: {best_match_name}",
        f"Best match ID: {best_match_id}",
        sep="\n",
    )

print(f"Runtime (seconds): {(runtime):.2f}")

## 2) Named Entity Recognition (NER) Model to Generate A1 Files

In [None]:
# Install the en_core_sci_sm spaCy pipeline (biomedical text) from a local archive.
# Despite its name, `spacy_directory` is the path of the .tar.gz file, not a folder.
spacy_directory = os.path.join(resource_directory, "en_core_sci_sm-0.5.4.tar.gz")
# `!` runs the line as a shell command and IPython substitutes {spacy_directory};
# the double quotes keep a path containing spaces as a single argument.
!pip install "{spacy_directory}"

### NER for a sample text

In [None]:
# Load the installed en_core_sci_sm pipeline used for entity recognition
nlp = spacy.load("en_core_sci_sm")

# Sample document: the first line is the title and everything after the first
# newline is the body (the code that follows splits on that newline).
text = """The etiologic and epidemiologic spectrum of bronchiolitis in pediatric practice.
To develop a broad understanding of the causes and patterns of occurrence of wheezing associated respiratory infections, we analyzed data from an 11-year study of acute lower respiratory illness in a pediatric practice. Although half of the WARI occurred in children less than 2 years of age, wheezing continued to be observed in 19% of children greater than 9 years of age who had lower respiratory illness. Males experienced LRI 1.25 times more often than did females; the relative risk of males for WARI was 1.35. A nonbacterial pathogen was recovered from 21% of patients with WARI; respiratory syncytial virus, parainfluenza virus types 1 and 3, adenoviruses, and Mycoplasma pneumoniae accounted for 81% of the isolates. Patient age influenced the pattern of recovery of these agents. The most common cause of WARI in children under 5 years of age was RSV whereas Mycoplasma pneumoniae was the most frequent isolate from school age children with wheezing illness. The data expand our understanding of the causes of WARI and are useful to diagnosticians and to researchers interested in the control of lower respiratory disease."""

# The first line break separates the title from the body
title_end_idx = text.find('\n')
paragraph_start_idx = title_end_idx + 1
title = text[:title_end_idx].strip()
paragraph = text[paragraph_start_idx:].strip()
paragraph_end_idx = paragraph_start_idx + len(paragraph)

# .a1 lines: T1 is the title span, T2 the paragraph span
output = [
    f"T1\tTitle 0 {title_end_idx}\t{title}",
    f"T2\tParagraph {paragraph_start_idx} {paragraph_end_idx}\t{paragraph}",
]

# Run the NER pipeline on the paragraph only
doc = nlp(paragraph)

# Recognised entities are numbered from T3; their character offsets are
# shifted so they refer to the full text rather than to the paragraph
entity_id = 3
for ent in doc.ents:
    entity_type, entity_text = ent.label_, ent.text
    start, end = ent.start_char + paragraph_start_idx, ent.end_char + paragraph_start_idx
    output.append(f"T{entity_id}\t{entity_type} {start} {end}\t{entity_text}")
    entity_id += 1

# Show the annotation lines
print('\n'.join(output))

In [None]:
def adjusted_best_match(input_text, onto_entities):
    """Match *input_text* against OntoBiotope only.

    Args:
        input_text: Mention text to normalise.
        onto_entities: ``(term_id, term_name)`` pairs from OntoBiotope.

    Returns:
        ``("Habitat or Phenotype", match_name, match_id)``. The label is
        always returned; ``match_name`` and ``match_id`` are ``None`` when no
        term matched, so callers should test ``match_id``.
    """
    # The similarity score is not needed here; only the matched term is returned
    onto_match_id, onto_match_name, _ = check_ontobiotope_match(input_text, onto_entities)

    return "Habitat or Phenotype", onto_match_name, onto_match_id

# Load the installed en_core_sci_sm pipeline used for entity recognition
# (an earlier cell performs the same load; repeated so this cell runs on its own)
nlp = spacy.load("en_core_sci_sm")

# Same sample document as in the previous example: the first line is the title
# and everything after the first newline is the body.
text = """The etiologic and epidemiologic spectrum of bronchiolitis in pediatric practice.
To develop a broad understanding of the causes and patterns of occurrence of wheezing associated respiratory infections, we analyzed data from an 11-year study of acute lower respiratory illness in a pediatric practice. Although half of the WARI occurred in children less than 2 years of age, wheezing continued to be observed in 19% of children greater than 9 years of age who had lower respiratory illness. Males experienced LRI 1.25 times more often than did females; the relative risk of males for WARI was 1.35. A nonbacterial pathogen was recovered from 21% of patients with WARI; respiratory syncytial virus, parainfluenza virus types 1 and 3, adenoviruses, and Mycoplasma pneumoniae accounted for 81% of the isolates. Patient age influenced the pattern of recovery of these agents. The most common cause of WARI in children under 5 years of age was RSV whereas Mycoplasma pneumoniae was the most frequent isolate from school age children with wheezing illness. The data expand our understanding of the causes of WARI and are useful to diagnosticians and to researchers interested in the control of lower respiratory disease."""

# The first line break separates the title from the body
title_end_idx = text.find('\n')
paragraph_start_idx = title_end_idx + 1
title = text[:title_end_idx].strip()
paragraph = text[paragraph_start_idx:].strip()
paragraph_end_idx = paragraph_start_idx + len(paragraph)

# .a1 lines: T1 is the title span, T2 the paragraph span
output = [
    f"T1\tTitle 0 {title_end_idx}\t{title}",
    f"T2\tParagraph {paragraph_start_idx} {paragraph_end_idx}\t{paragraph}",
]

# Run the NER pipeline on the paragraph only
doc = nlp(paragraph)

# Keep only the recognised entities that match an OntoBiotope term;
# kept entities are numbered consecutively from T3
entity_id = 3
for ent in doc.ents:
    entity_text = ent.text

    entity_type, match_name, match_id = adjusted_best_match(entity_text, onto_entities)

    # Entities without an ontology match are left out of the output
    if not match_id:
        continue

    # Shift offsets so they refer to the full text rather than to the paragraph
    start, end = ent.start_char + paragraph_start_idx, ent.end_char + paragraph_start_idx
    output.append(f"T{entity_id}\t{entity_type} {start} {end}\t{entity_text}")
    entity_id += 1

# Show the annotation lines
print('\n'.join(output))

#### Creating A1 Files

In [None]:
def _split_title_and_paragraph(text):
    """Split *text* at its first newline and locate both parts inside it.

    Returns:
        ``(title, title_start, title_end, paragraph, paragraph_start)`` where
        ``title`` and ``paragraph`` are whitespace-stripped and the offsets are
        character positions in the original, unstripped *text*.
    """
    newline_idx = text.find('\n')
    if newline_idx == -1:
        # No line break: the whole document is the title and the body is empty
        newline_idx = len(text)

    raw_title = text[:newline_idx]
    raw_body = text[newline_idx + 1:]
    title = raw_title.strip()
    paragraph = raw_body.strip()

    # Account for leading whitespace removed by strip() so offsets stay aligned
    title_start = len(raw_title) - len(raw_title.lstrip())
    paragraph_start = newline_idx + 1 + len(raw_body) - len(raw_body.lstrip())
    return title, title_start, title_start + len(title), paragraph, paragraph_start


def process_text_files(input_dir, output_dir, onto_entities):
    """Write one ``.a1`` annotation file per ``.txt`` document in *input_dir*.

    Each output file lists the title (T1) and paragraph (T2) spans followed by
    every entity recognised in the paragraph that matches an OntoBiotope term
    (numbered from T3). Offsets are character positions in the input file.

    Args:
        input_dir: Folder containing the ``.txt`` documents.
        output_dir: Folder receiving the ``.a1`` files; created if missing.
        onto_entities: ``(term_id, term_name)`` pairs from OntoBiotope.
    """
    # Load the pre-trained biomedical Named Entity Recognition (NER) model
    nlp = spacy.load("en_core_sci_sm")

    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".txt"):
            continue

        input_file_path = os.path.join(input_dir, filename)
        # Swap only the trailing extension, not every ".txt" inside the name
        output_file_path = os.path.join(output_dir, filename[:-len(".txt")] + '.a1')
        print(f"Processing: {filename}")

        with open(input_file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        title, title_start, title_end, paragraph, paragraph_start_idx = _split_title_and_paragraph(text)

        # T1/T2 are reserved for title and paragraph; empty spans are omitted
        output = []
        if title:
            output.append(f"T1\tTitle {title_start} {title_end}\t{title}")
        if paragraph:
            paragraph_end_idx = paragraph_start_idx + len(paragraph)
            output.append(f"T2\tParagraph {paragraph_start_idx} {paragraph_end_idx}\t{paragraph}")

        # Perform NER on the paragraph only
        doc = nlp(paragraph)

        # Entity IDs start from T3
        entity_id = 3
        for ent in doc.ents:
            entity_text = ent.text
            entity_type, _match_name, match_id = adjusted_best_match(entity_text, onto_entities)

            # Entities without an ontology match are left out
            if not match_id:
                continue

            # Shift paragraph-relative offsets to document offsets
            start = ent.start_char + paragraph_start_idx
            end = ent.end_char + paragraph_start_idx
            output.append(f"T{entity_id}\t{entity_type} {start} {end}\t{entity_text}")
            entity_id += 1

        # Save the annotations in .a1 format
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(output))

        print(f"Entities extracted and saved to: {output_file_path}")

In [None]:
# Generate one .a1 file per test document into the "Trial - A1" folder.
# NOTE(review): this absolute path repeats the value of `resource_directory`;
# os.path.join(resource_directory, "Trial - A1") would avoid the duplication.
a1_output_directory = r"C:\Users\said_\OneDrive\Masaüstü\github\Natural Language Processing\Datasets\Resources\Trial - A1"
process_text_files(input_file_directory, a1_output_directory, onto_entities)

In [None]:
def count_text_files(directory, extension=".a1"):
    """Count the entries of *directory* whose name ends with *extension*.

    Args:
        directory: Folder to inspect (not searched recursively).
        extension: Filename suffix to count; defaults to ``".a1"``.

    Returns:
        Number of matching directory entries.
    """
    return sum(1 for filename in os.listdir(directory) if filename.endswith(extension))

# Sanity check: count the generated files in the A1 output folder (the
# count_text_files defined just above filters on ".a1", although the printed
# message still says "text files").
num_text_files = count_text_files(a1_output_directory)
print(f"Number of text files in the directory: {num_text_files}")

#### 3) Creating A2 Files

In [None]:
def _split_title_and_paragraph(text):
    """Split *text* at its first newline and locate both parts inside it.

    Returns:
        ``(title, title_start, title_end, paragraph, paragraph_start)`` where
        ``title`` and ``paragraph`` are whitespace-stripped and the offsets are
        character positions in the original, unstripped *text*.
    """
    newline_idx = text.find('\n')
    if newline_idx == -1:
        # No line break: the whole document is the title and the body is empty
        newline_idx = len(text)

    raw_title = text[:newline_idx]
    raw_body = text[newline_idx + 1:]
    title = raw_title.strip()
    paragraph = raw_body.strip()

    # Account for leading whitespace removed by strip() so offsets stay aligned
    title_start = len(raw_title) - len(raw_title.lstrip())
    paragraph_start = newline_idx + 1 + len(raw_body) - len(raw_body.lstrip())
    return title, title_start, title_start + len(title), paragraph, paragraph_start


def process_text_files(input_dir, output_dir, onto_entities):
    """Write one ``.a2`` prediction file per ``.txt`` document in *input_dir*.

    Each output file contains the entities recognised in the paragraph that
    match an OntoBiotope term (T lines, numbered from T3 because T1/T2 denote
    title and paragraph in the .a1 files), followed by one normalization line
    per entity (N lines, numbered from N1) pointing at the matched term id.

    Args:
        input_dir: Folder containing the ``.txt`` documents.
        output_dir: Folder receiving the ``.a2`` files; created if missing.
        onto_entities: ``(term_id, term_name)`` pairs from OntoBiotope.
    """
    # Load the pre-trained biomedical Named Entity Recognition (NER) model
    nlp = spacy.load("en_core_sci_sm")

    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".txt"):
            continue

        input_file_path = os.path.join(input_dir, filename)
        # Swap only the trailing extension, not every ".txt" inside the name
        output_file_path = os.path.join(output_dir, filename[:-len(".txt")] + '.a2')
        print(f"Processing: {filename}")

        with open(input_file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Only the paragraph and its document offset are needed here
        _title, _title_start, _title_end, paragraph, paragraph_start_idx = _split_title_and_paragraph(text)

        # Perform NER on the paragraph only
        doc = nlp(paragraph)

        entity_id = 3
        entities_a1 = []             # text-bound (T) annotation lines
        additional_annotations = []  # normalization (N) annotation lines

        for ent in doc.ents:
            entity_text = ent.text
            entity_type, _match_name, match_id = adjusted_best_match(entity_text, onto_entities)

            # Entities without an ontology match are left out
            if not match_id:
                continue

            # Shift paragraph-relative offsets to document offsets
            start = ent.start_char + paragraph_start_idx
            end = ent.end_char + paragraph_start_idx
            entities_a1.append(f"T{entity_id}\t{entity_type} {start} {end}\t{entity_text}")

            # N ids count from 1 while T ids count from 3, hence the offset of 2
            additional_annotations.append(
                f"N{entity_id - 2}\tOntoBiotope Annotation:T{entity_id} Referent:{match_id}"
            )
            entity_id += 1

        # Save entities followed by their normalizations in .a2 format
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(entities_a1 + additional_annotations))

        print(f"Entities extracted and saved to: {output_file_path}")

In [None]:
# Generate one .a2 file per test document into the "Trial - A2" folder, using
# the process_text_files redefined just above (the .a2 variant).
# NOTE(review): this absolute path repeats the value of `resource_directory`;
# os.path.join(resource_directory, "Trial - A2") would avoid the duplication.
a2_output_directory = r"C:\Users\said_\OneDrive\Masaüstü\github\Natural Language Processing\Datasets\Resources\Trial - A2"
process_text_files(input_file_directory, a2_output_directory, onto_entities)

In [None]:
def count_text_files(directory, extension=".a2"):
    """Count the entries of *directory* whose name ends with *extension*.

    Args:
        directory: Folder to inspect (not searched recursively).
        extension: Filename suffix to count; defaults to ``".a2"``.

    Returns:
        Number of matching directory entries.
    """
    return sum(1 for filename in os.listdir(directory) if filename.endswith(extension))

# Sanity check: count the generated files in the A2 output folder (the
# count_text_files defined just above filters on ".a2", although the printed
# message still says "text files").
num_text_files = count_text_files(a2_output_directory)
print(f"Number of text files in the directory: {num_text_files}")

# END