# NLP Application: Date Entity Extraction

## Library Imports


In [None]:
!pip install Stanza
!pip install transformers
!pip install datefinder

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Stanza
  Downloading stanza-1.4.2-py3-none-any.whl (691 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m691.3/691.3 KB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 KB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=216f4426e965e4fac463107dd2efde305a476e3eeadc114f1bae51face7275e9
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected packages: emoji, Stanza
Successfully installed Stanza-1.4.2 emo

In [None]:
# --- Standard library ---
import csv
import os
import re

# --- Third-party ---
import datefinder
import numpy as np
import pandas as pd
import stanza
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

# Stanza: download the English resources, then build a tokenize + NER pipeline.
stanza.download('en')
st_nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Transformers: question-answering BERT model and its tokenizer, both loaded
# from the same pretrained checkpoint name.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

## Date Extraction

In [None]:
def extract_meta_data(text: str, dates: list = None):
    """Collect date/time mentions found in ``text``.

    Two sources are combined: the entities returned by the Stanza pipeline
    ``st_nlp`` (any entity type is recorded, but only those whose label
    contains ``'DATE'`` or ``'TIME'`` are kept) and the matches yielded by
    ``datefinder.find_dates`` in strict mode, stored as ``str(match)`` with
    the label ``'DATE'``.

    Args:
        text: Document to scan. ``None`` short-circuits to ``['NAN']``.
        dates: Optional list that the date/time hits are appended to *in
            place*; the same list object is returned as the first element.
            When omitted, a fresh list is created on every call (the previous
            ``dates=[]`` default was shared between calls, so hits from
            earlier documents leaked into later results).

    Returns:
        ``['NAN']`` when ``text`` is ``None``; otherwise ``[dates, simp_list]``
        where ``dates`` is a list of ``{'text': ..., 'label': ...}`` dicts and
        ``simp_list`` holds the distinct ``'text'`` values in first-seen order.
    """
    if text is None:
        # Sentinel kept for backward compatibility; note that it has a
        # different shape from the normal two-element result.
        return ['NAN']
    if dates is None:
        dates = []

    # Stanza general NER: every entity with its type label.
    labeled_data = [
        {'text': ent.text, 'label': ent.type} for ent in st_nlp(text).ents
    ]

    # datefinder matches, stringified and labelled as DATE.
    labeled_data.extend(
        {'text': str(match), 'label': 'DATE'}
        for match in datefinder.find_dates(text, source=False, index=False, strict=True)
    )

    # Keep only the date/time entries.
    dates.extend(
        item for item in labeled_data
        if 'TIME' in item['label'] or 'DATE' in item['label']
    )

    # De-duplicate while preserving order so the output is deterministic
    # (a plain set() yields an arbitrary order).
    simp_list = list(dict.fromkeys(entry['text'] for entry in dates))
    return [dates, simp_list]

## Question Answering with BERT

In [None]:
def question_answer(question, text):
    """Answer ``question`` from ``text`` with the module-level BERT QA model.

    The question and text are encoded as one sequence pair, run through
    ``model``, and the answer is rebuilt from the tokens between the highest
    scoring start and end positions (WordPiece ``##`` continuations are glued
    back onto the preceding token).

    Args:
        question: Question string (segment A).
        text: Context string to search for the answer (segment B). If the
            pair is longer than ``model.config.max_position_embeddings``
            tokens, only the text is truncated, so an answer located past
            that limit cannot be found.

    Returns:
        The reconstructed answer string; ``''`` when the predicted end comes
        before the predicted start; ``'NaN'`` when the predicted span starts
        at the ``[CLS]`` token.
    """
    # Tokenize question and text as a pair; truncate only the text so the
    # sequence never exceeds the model's position limit (which would raise).
    input_ids = tokenizer.encode(
        question,
        text,
        truncation='only_second',
        max_length=model.config.max_position_embeddings,
    )

    # String version of the token ids.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment ids: everything up to and including the first [SEP] is the
    # question (0); the remainder is the text (1).
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    assert len(segment_ids) == len(input_ids)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # Plain ints so they can be used directly for list indexing and slicing.
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    # Reconstruct the answer from the selected token span.
    answer = ''
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for token in tokens[answer_start + 1:answer_end + 1]:
            if token.startswith('##'):
                answer += token[2:]
            else:
                answer += ' ' + token

    # A span that starts at [CLS] is reported as 'NaN'.
    if answer.startswith('[CLS]'):
        answer = 'NaN'

    return answer

## Central Date Entity Extraction Method 

In [None]:
def date_entity_extraction_method(text):
    """Pair every distinct date found in ``text`` with a QA-derived entity.

    The distinct date strings come from ``extract_meta_data``; for each one
    the question ``'info about <date>'`` is put to ``question_answer`` against
    the same text.

    Args:
        text: Document to analyse. ``None`` yields an empty result instead of
            failing on the ``['NAN']`` sentinel returned by
            ``extract_meta_data``.

    Returns:
        A list of ``{'text': ..., 'date': ..., 'entity': ...}`` dicts, one per
        distinct date string.
    """
    if text is None:
        return []

    # Pass a fresh list explicitly so no hits are shared between calls.
    unique_dates = extract_meta_data(text, [])[1]

    answers = []
    for date in unique_dates:
        # Drop the midnight time component of stringified datetimes and the
        # whitespace it leaves behind.
        date = date.replace('00:00:00', '').strip()
        question = 'info about ' + date
        entity = question_answer(question, text)
        answers.append({'text': text, 'date': date, 'entity': entity})

    return answers

## Test the Application

In [None]:
# Sample clinical note used as the input document for the demo cells in this section.
text = """
 on patient, Mr. John Doe, has been diagnosed 01/13/2022 with stage 3 prostate cancer. This diagnosis was made following a series of tests, including a biopsy, which was performed on 01/01/2022.

Mr. Doe has a history of prostate issues, having first come to see me with complaints of urinary difficulties on 05/14/2021. At that time, I ordered a prostate-specific antigen (PSA) test and a digital rectal exam (DRE), both of which showed abnormal results. I recommended that Mr. Doe undergo further testing, including a transrectal ultrasound-guided biopsy, to confirm the presence of cancer.

On 01/01/2022, Mr. Doe returned for the biopsy results, at which point he was informed of the stage 3 diagnosis. We will be starting treatment as soon as possible, which will likely include a combination of surgery, radiation therapy, and hormone therapy.

I will be closely monitoring Mr. Doe's progress and will provide updates as needed. In the meantime, please do not hesitate to contact me with any questions or concerns.
"""



In [None]:
# Run the date extractor on the sample note, passing an explicit list that receives the raw hits.
dd =[]
dates = extract_meta_data(text,dd)

# Display the second element of the result: the de-duplicated date strings.
dates[1]

['2021-05-14 00:00:00', '2022-01-01 00:00:00', '2022-01-13 00:00:00']

In [None]:
# Text of the first raw hit (dates[0] is the list of {'text', 'label'} dicts).
date = dates[0][0]['text']
date

'2022-01-01 00:00:00'

In [None]:
# Example questions for the QA model; only question3 is passed to question_answer in this cell.
question1 = 'what is the patients name?'
question2 ='what has been diagnosed'
question3 = 'info about 01/13/2022'
ans = question_answer(question3, text)

ans

'stage 3 prostate cancer'

In [None]:
# Run the full pipeline on the sample note.
answers = date_entity_extraction_method(text)

# Print each result dict followed by an empty line.
for x in answers:
  print(x)
  print()

{'text': "\n on patient, Mr. John Doe, has been diagnosed 01/13/2022 with stage 3 prostate cancer. This diagnosis was made following a series of tests, including a biopsy, which was performed on 01/01/2022.\n\nMr. Doe has a history of prostate issues, having first come to see me with complaints of urinary difficulties on 05/14/2021. At that time, I ordered a prostate-specific antigen (PSA) test and a digital rectal exam (DRE), both of which showed abnormal results. I recommended that Mr. Doe undergo further testing, including a transrectal ultrasound-guided biopsy, to confirm the presence of cancer.\n\nOn 01/01/2022, Mr. Doe returned for the biopsy results, at which point he was informed of the stage 3 diagnosis. We will be starting treatment as soon as possible, which will likely include a combination of surgery, radiation therapy, and hormone therapy.\n\nI will be closely monitoring Mr. Doe's progress and will provide updates as needed. In the meantime, please do not hesitate to cont

In [None]:
# Tabulate the pipeline results as a DataFrame and display it.
df = pd.DataFrame(answers)
df

Unnamed: 0,text,date,entity
0,"\n on patient, Mr. John Doe, has been diagnose...",2021-05-14,05 / 14 / 2021
1,"\n on patient, Mr. John Doe, has been diagnose...",2022-01-01,biopsy
2,"\n on patient, Mr. John Doe, has been diagnose...",2022-01-13,"mr . john doe , has been diagnosed 01 / 13 / 2..."
