#### Single Document NER

We expect as input a PDF file which will be converted to a raw text format using *pdfminer*, the text extraction tool selected for this task given that it extracts the most information when compared to others such as *Py2PDF* (used in the word2vec part of the project).

Two processing functions are then applied to the string containing the text found in the pdf file. One function to identify **references** and remove them as they are useless to the task at hand and another function to detect if the authors have provided **keywords** for the document, keywords which would be complementary to the topic classification part of this project.

In [1]:
import re
import PyPDF2
from pdfminer.layout import LTTextContainer
from pdfminer.high_level import extract_text, extract_pages

##### Provided the location of a pdf file on disk, this function extract the entirety of text from it
##### This extracted text can be returned as a single string or a list containing the text of each page
def extract_text_from_pdf(pdf_file):
    page_counter = 0
    text_as_list = []
    text_as_str = ''

    for page_layout in extract_pages(pdf_file):
        page_counter += 1
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                extracted_text = element.get_text()
                text_as_list.append(extracted_text)
                text_as_str += '' + extracted_text
    print('Text from PDF File ({} pages) extracted successfully.'.format(page_counter)) 
    return text_as_str, text_as_list

##### Function to locate the possible 'keywords' segment on the string containing the document's text
##### If found, a list that contains all the keywords the authors have put will be generated 
def try_finding_keywords(document_text_str):
    potential_keywords_index = document_text_str.find('Keywords')
    final_keywords = []

    #### Check if the sub-string "Keywords" is located in the document
    if potential_keywords_index != -1:
        found_keywords = []
        for i in range(potential_keywords_index, len(document_text_str)):
            if document_text_str[i] != '\n':
                found_keywords.append(document_text_str[i])
            else:
                break     
                
        keyword = ''
        for i in range(len(found_keywords)):
            if found_keywords[i].isalpha():
                keyword += found_keywords[i]
            elif (keyword != ''):
                final_keywords.append(keyword)
                keyword = ''    
    
    return final_keywords

##### Function to locate the sub-string holding the References title and remove all the text after it
##### This will potentially remove all the references in a document from the NER pipeline
def remove_references(doc_str):
    print('Original document character length: {}'.format(len(doc_str)))
    potential_references_index = ''
    
    ##### Locate the index of the references title sub-string
    if (doc_str.find('References') != -1) or (doc_str.find('REFERENCES') != -1):
        potential_references_index = doc_str.find('References') if doc_str.find('References') != -1 else doc_str.find('REFERENCES')
        # print('Potential References Index: {}'.format(potential_references_index))
        doc_str = doc_str[:potential_references_index]
        print('References removed, new document character length: {}'.format(len(doc_str)))
    else:
        print('References string index not found')
    return doc_str


def extract_text_from_pdf_2(pdf_file):
    pdf_reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
    meta_data = pdf_reader.metadata

    doc_as_str = ''

    x = pdf_reader.numPages
    for i in range(x): ## skip the last page as it usually includes references
        page = pdf_reader.getPage(i)
        page_text = page.extract_text()
        doc_as_str += page_text
        
    print('PDF processed ({} pages)'.format(x))
    
    try:
        doc_author = meta_data['/Author']
        print('Author: {}'.format(doc_author))
        doc_keywords = meta_data['/Keywords']
                    
        pattern = r"\b\w+(?: \w+)*\b"
        keywords_list = re.findall(pattern, doc_keywords)
        print('Keywords: {}'.format(keywords_list))

    except KeyError:
        pass
    
    return doc_as_str

##### Remove links from a given string using RE (Regular Expressions)
def remove_links(review):
  review = re.sub(r'https?:\/\/.*[\r\n]*', '', review)
  return review

Loop to treat all publications in our testing set.

In [2]:
import os

pdfs_directory = 'data/implementome_publications/test_miner/'
#### Load the paths of the input PDF file and the output empty text notebook
for file in os.listdir(pdfs_directory):
    filename = os.fsdecode(file)
    # print('what are you', filename)
    if filename.endswith('.pdf'):
        print('For Publication: {}'.format(filename[:-4]))
        pdf_file = open(pdfs_directory + filename, 'rb')
        doc_as_str, doc_as_list = extract_text_from_pdf(pdf_file)
        
        #### Processing the string containing the entirety of the documents text
        #### The goal is to remove references and detect potential keywords in the document
        print('#### Processing ####')
        doc_as_str = remove_references(doc_as_str)
        doc_keywords = try_finding_keywords(doc_as_str)
        print('Potential Extracted Keywords: {}'.format(doc_keywords[1:] if doc_keywords != [] else 'None Found'))
        print('#####\n')
        
        print('##### New Version #####')
        doc_as_str2 = extract_text_from_pdf_2(pdf_file)
        doc_as_str2 = remove_references(doc_as_str2)
        print('######\n')
        


# pdf_file = open('data/implementome_publications/test_miner/child_obesity_switzerland.pdf', 'rb')
# empty_text_file = open('data/implementome_publications/test_miner/test_text.txt', "w", encoding="utf-8")

# #### The extract_text() function is used to extract any text found in the pdf file
# #### Text in tables, headers, banners and other graphical representations is also extracted
# publications_text = extract_text(pdf_file)
# print('Text from PDF File extracted successfully.')
# # empty_text_file.write(publications_text)

For Publication: ai_and_surgical_decision_making
Text from PDF File (11 pages) extracted successfully.
#### Processing ####
Original document character length: 66451
References removed, new document character length: 50476
Potential Extracted Keywords: None Found
#####

##### New Version #####
PDF processed (11 pages)
Author: American Medical Association
Keywords: ['The JAMA Network']
Original document character length: 66691
References removed, new document character length: 50342
######

For Publication: brachytherapy_lmic
Text from PDF File (10 pages) extracted successfully.
#### Processing ####
Original document character length: 43351
References removed, new document character length: 38957
Potential Extracted Keywords: None Found
#####

##### New Version #####
PDF processed (10 pages)
Author: 
Keywords: []
Original document character length: 43100
References removed, new document character length: 38687
######

For Publication: child_obesity_switzerland
Text from PDF File (11 pag

##### Processing a Single Document

The cells displayed below will explain how this NER process is computed when given a PDF document.

Before the document's text is run through the NER pipeline itself, the pre-processing functions of removing references, finding keywords, and tokenizing the document into sentences are performed.

In [2]:
from nltk.tokenize import sent_tokenize

##### Document extraction and pre-processing (Removing Referencesn & Potentially finding Keywords) 
single_pdf = open('data/implementome_publications/test_miner/child_obesity_switzerland.pdf', 'rb')
doc_as_str, doc_as_list = extract_text_from_pdf(single_pdf)
# doc_as_str = extract_text_from_pdf_2(single_pdf)
doc_as_str = remove_references(doc_as_str)
doc_keywords = try_finding_keywords(doc_as_str)

###### Tokenization process of the string containing the entire document text
###### Regular Expression pattern removes '\n' characters and tries to concatenate words separated by a '-'
sentences = sent_tokenize(doc_as_str)
pattern = r'(?<![a-zA-Z])-|-(?![a-zA-Z])'
sentences = [re.sub(pattern, '', sentence.replace('\n', ' ')) for sentence in sentences]

Text from PDF File (11 pages) extracted successfully.
Original document character length: 44197
References removed, new document character length: 36152


NER Pipeline 

A custom model named **nlp** needs to be loaded accompanied by a list of wanted entity labels.

In [4]:
import spacy
from spacy import displacy

##### Generator that yields wanted entity labels
def count_entities(doc, entity_labels):
    for ent in doc.ents:
        if ent.label_ in entity_labels:
            yield (ent.label_, ent.text)

##### Initialization
##### Loading the full entity recognition model also equipped with the Disease Recognition pipeline
nlp = spacy.load("./models/full-model-lg")
our_labels = ['GPE', 'ORG', 'LAW', 'PERSON', 'PRODUCT', 'DISEASE']
full_visualization = True
entity_counts = {label: {} for label in our_labels}

###### Perform NER in the entire document, saving predictions for each sentence as a spaCy Doc Object
docs = list(nlp.pipe(sentences))
total_entity_count = 0

###### Loop through the predictions for each sentence and the entities in them
###### Entities stored in a Doc object are accessed with doc.ents
for doc in docs:
    total_entity_count += len(doc.ents)
    for label, entity in count_entities(doc, our_labels):
    # If the entity text has already been seen during the processing, increment its count, else initialize it
        if entity in entity_counts[label]:
            entity_counts[label][entity] += 1
        else:
            entity_counts[label][entity] = 1
    
    if full_visualization:
        displacy.render(doc, 'ent')
        
print("Named Entity Recognition performed on the entire document")
print("{} total entities predicted".format(total_entity_count))



Named Entity Recognition performed on the entire document
366 total entities predicted


In [5]:
##### Sort the tuples containing the entities and their count by the the highest count
def sort_entity_counts(output_entities):
    sorted_entities = {}
    for entity_type, entity_counts_dict in entity_counts.items():
        sorted_entities[entity_type] = dict(sorted(entity_counts_dict.items(), key = lambda item: item[1], reverse = True))
    return sorted_entities

entity_counts = sort_entity_counts(entity_counts)

for label in entity_counts:
    print(entity_counts[label])

{'Switzerland': 62, 'Southern Europe': 21, 'Northern': 15, 'Portugal': 13, 'Central Europe': 11, 'Italy': 11, 'Spain': 11, 'Balkan region': 11, 'Turkey': 11, 'Germany': 5, 'Netherlands': 4, 'Serbia': 4, 'Kosovo': 3, 'Zurich': 2, 'France': 2, 'Austria': 2, 'Albania': 2, 'Croatia': 2, 'Montenegro': 2, 'Northern Macedonia': 2, 'USA': 2, 'Southern European': 2, 'Southern Italy': 2, 'Southern European countries': 1, 'Europe': 1, 'Yugoslavia': 1, 'Northern/ Central Europe': 1, 'Belgium': 1, 'Luxembourg': 1, 'Sweden': 1, 'Finland': 1, 'Norway': 1, 'Denmark': 1, 'Bosnia-Herzegovina, Bulgaria': 1, 'Greece': 1, 'Moldova': 1, 'Romania': 1, 'Slovenia': 1, 'Brazil': 1, 'Great Britain': 1, 'Hong Kong': 1, 'Singapore': 1, 'United States': 1, 'LMSchartmaker': 1, 'Cambridge': 1, 'UK': 1, 'Stata': 1, 'Texas': 1, 'Switzerland Northern/Central Europe Italy': 1, 'Balkan region Turkey': 1, '/Central Europe': 1, 'Western Turkey': 1, 'Central Eur ope': 1, 'Southern Eur': 1, 'Southern Eur ope': 1, 'Northern Eu

In [6]:
from thefuzz import fuzz
import itertools

visualization_text = 'Annotated Entity Candidates: '

# for label in entity_counts:
#     if entity_counts[label] != {}:
#         highest_count = next(iter(entity_counts[label].items()))[1]
#         entities_in_label = [ent for ent in iter(entity_counts[label].items()) if ((highest_count // ent[1]) < 3)]
        
all_candidates = []
labeled_candidates = {}
for label, entities in entity_counts.items():
    #### If any entities are found
    if entities:
        ##### For our current label, find the entity with the highest count
        max_count = max(entities.values())
        
        # print("WHAT", highest_count_entity)
        entities_in_label = [entity[0] for entity in iter(entities.items()) if ((max_count // entity[1]) < 3)]
        
        ##### NOTE: Special Case for disease to detect and save disease variations no matter the count
        ##### This is so sub-categories of diseases are still saved, e.g. "Pneumonia" and "Bacterial Pneumonia"
        ##### This is so sub-categories of diseases are still saved, e.g. "Cancer" and "Breast Cancer"
        if label == "DISEASE":
            fuzzy_matches = []
            for i, candidate in enumerate(entities_in_label):
                for e, _ in itertools.islice(entities.items(), i + 1, None):
                    if fuzz.partial_ratio(candidate, e) == 100:
                        fuzzy_matches.append(e)
            entities_in_label += fuzzy_matches
            
        ##### Append the list comprehension to our candidates list
        all_candidates += entities_in_label
        labeled_candidates[label] = entities_in_label
               
        # candidates.append(first_element_tuple[0])
        # visualization_text += '{}, '.format(first_element_tuple[0])
        # print("Highest Count {}: {}".format(label, first_element_tuple[0]))
        

print('Annotated Entity Candidates: {}'.format(', '.join(all_candidates)))
print('Ah moj elvana: {}'.format(labeled_candidates))
# displacy.render(nlp(visualization_text), style = 'ent')

Annotated Entity Candidates: Switzerland, Southern Europe, BMC Public Health, IOTF, Human Research Act, Eiholzer, SSEP, obesity, obese, child obesity
Ah moj elvana: {'GPE': ['Switzerland', 'Southern Europe'], 'ORG': ['BMC Public Health', 'IOTF'], 'LAW': ['Human Research Act'], 'PERSON': ['Eiholzer'], 'PRODUCT': ['SSEP'], 'DISEASE': ['obesity', 'obese', 'child obesity']}


In [7]:
from thefuzz import fuzz

def filter_candidate_list(entity_label : str, candidate_list):
    ##### Keep count of all words that will need to be removed
    remove_set = set()
    
    ##### Loop through each word except the final one (it will be checked with all previous ones anyway)
    for i, word in enumerate(candidate_list[:-1]):
        for other_word in candidate_list[i + 1:]:
            similarity = fuzz.partial_ratio(word, other_word)
            ##### 65 Threshold seemed best for now
            if (similarity > 65):
                shorter_word = word if len(word) <= len(other_word) else other_word
                longer_word = other_word if len(word) <= len(other_word) else word
                
                ##### NOTE: Special Case | If the partial token similarity between two disease entities is equal to 100, do not remove it
                ##### This is so sub-categories of diseases are still saved, e.g. "Cancer" and "Breast Cancer"
                if entity_label in ("DISEASE", "PERSON") and similarity == 100:
                    ###### The case where we might be dealing with "Cancer" and "cancer"
                    if not (shorter_word.istitle() and longer_word.istitle()):
                        continue
                
                remove_set.add(shorter_word)
                print("Removing word '{}' due to similarity with '{}' ({})".format(shorter_word, longer_word, similarity))
    return [word for word in candidate_list if word not in remove_set]



example_list = ['WHO', 'Switzerland', 'Southern Europe', 'Switz', 'BMC', 'Color Violet', 'BMC Public Health', 'IOTF', 'Violet', 
                'Human Research Act', 'Eiholzer', 'SSEP', 'obesity', 'obese', 'hills', 'the hills have eyes', 'World Health Organization (WHO)']

filtered_list = filter_candidate_list('', example_list)

print('')
print('Original Entity Set: {}'.format(example_list))
print('Filtered Entity Set: {}'.format(filtered_list))


Removing word 'WHO' due to similarity with 'World Health Organization (WHO)' (67)
Removing word 'Switz' due to similarity with 'Switzerland' (100)
Removing word 'BMC' due to similarity with 'BMC Public Health' (100)
Removing word 'Violet' due to similarity with 'Color Violet' (100)
Removing word 'Violet' due to similarity with 'Eiholzer' (67)
Removing word 'obese' due to similarity with 'obesity' (80)
Removing word 'hills' due to similarity with 'the hills have eyes' (100)

Original Entity Set: ['WHO', 'Switzerland', 'Southern Europe', 'Switz', 'BMC', 'Color Violet', 'BMC Public Health', 'IOTF', 'Violet', 'Human Research Act', 'Eiholzer', 'SSEP', 'obesity', 'obese', 'hills', 'the hills have eyes', 'World Health Organization (WHO)']
Filtered Entity Set: ['Switzerland', 'Southern Europe', 'Color Violet', 'BMC Public Health', 'IOTF', 'Human Research Act', 'Eiholzer', 'SSEP', 'obesity', 'the hills have eyes', 'World Health Organization (WHO)']


In [8]:
fuzz.partial_ratio("cancer", "breast cancer")

100

In [9]:
print('Entities per Label: ')
all_candidates = []
for label, entities in labeled_candidates.items():
    if label != "DISEASE":
        filtered_entities = filter_candidate_list(label, entities)
    else:
        filtered_entities = filter_candidate_list(label, entities)
    print('{}: {}'.format(label, filtered_entities))
    all_candidates += filtered_entities

all_candidates = [candidate if not candidate.islower() else candidate.capitalize() for candidate in all_candidates]   
print('\nAll Entities: {}'.format(all_candidates))

Entities per Label: 
GPE: ['Switzerland', 'Southern Europe']
ORG: ['BMC Public Health', 'IOTF']
LAW: ['Human Research Act']
PERSON: ['Eiholzer']
PRODUCT: ['SSEP']
Removing word 'obese' due to similarity with 'obesity' (80)
Removing word 'obese' due to similarity with 'child obesity' (80)
DISEASE: ['obesity', 'child obesity']

All Entities: ['Switzerland', 'Southern Europe', 'BMC Public Health', 'IOTF', 'Human Research Act', 'Eiholzer', 'SSEP', 'Obesity', 'Child obesity']
