Env: "cre"

# 01. Stanza
https://stanfordnlp.github.io/stanza/ner.html

In [9]:
import stanza

text = "I need a desk on this Tuesday, not next Tuesday, 10 December 2020, from 2pm to 3pm. These are in Q4 of 2020. The time is between 1 to 2 hours."
nlp = stanza.Pipeline('en', processors='tokenize,pos,ner', verbose=False)
doc = nlp(text)
print(*[f'entity: {ent.text}\ttype: {ent.type}' for sent in doc.sentences for ent in sent.ents], sep='\n')

entity: this Tuesday	type: DATE
entity: next Tuesday, 10 December 2020	type: DATE
entity: 2pm to 3pm	type: TIME
entity: Q4 of 2020	type: DATE
entity: between 1 to 2 hours	type: TIME


In [4]:
doc

[
  [
    {
      "id": 1,
      "text": "I",
      "upos": "PRON",
      "xpos": "PRP",
      "feats": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
      "start_char": 0,
      "end_char": 1,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 2,
      "text": "need",
      "upos": "VERB",
      "xpos": "VBP",
      "feats": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
      "start_char": 2,
      "end_char": 6,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "a",
      "upos": "DET",
      "xpos": "DT",
      "feats": "Definite=Ind|PronType=Art",
      "start_char": 7,
      "end_char": 8,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "desk",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "start_char": 9,
      "end_char": 13,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
     

# 02. StanfordCoreNLP
https://github.com/stanfordnlp/stanza/blob/main/demo/Stanza_CoreNLP_Interface.ipynb

In [10]:
# Import client module
from stanza.server import CoreNLPClient

In [11]:
# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = r"C:\Users\effbl\stanza_corenlp\stanford-corenlp-4.2.2"

In [12]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(
    annotators=['tokenize','ssplit','ner'], 
    memory='4G', 
    endpoint='http://localhost:9000',
    be_quiet=True)
print(client)

# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)

<stanza.server.client.CoreNLPClient object at 0x0000022833E9C388>


In [13]:
# Print background processes and look for java
# You should be able to see a StanfordCoreNLPServer java process running in the background
!ps -o pid,cmd | grep java

'ps' is not recognized as an internal or external command,
operable program or batch file.


In [14]:
# Annotate some text
text = "I need a desk on this Tuesday, not next Tuesday, 10 December 2020, from 2pm to 3pm. These are in Q4 of 2020. The time is between 1 to 2 hours."
document = client.annotate(text)
print(type(document))

<class 'CoreNLP_pb2.Document'>


In [16]:
document.sentence

[token {
  word: "I"
  pos: "PRP"
  value: "I"
  before: ""
  after: " "
  originalText: "I"
  ner: "O"
  lemma: "I"
  beginChar: 0
  endChar: 1
  tokenBeginIndex: 0
  tokenEndIndex: 1
  hasXmlContext: false
  isNewline: false
  coarseNER: "O"
  fineGrainedNER: "O"
  nerLabelProbs: "O=0.9999627153845908"
}
token {
  word: "need"
  pos: "VBP"
  value: "need"
  before: " "
  after: " "
  originalText: "need"
  ner: "O"
  lemma: "need"
  beginChar: 2
  endChar: 6
  tokenBeginIndex: 1
  tokenEndIndex: 2
  hasXmlContext: false
  isNewline: false
  coarseNER: "O"
  fineGrainedNER: "O"
  nerLabelProbs: "O=0.9999983188358704"
}
token {
  word: "a"
  pos: "DT"
  value: "a"
  before: " "
  after: " "
  originalText: "a"
  ner: "O"
  lemma: "a"
  beginChar: 7
  endChar: 8
  tokenBeginIndex: 2
  tokenEndIndex: 3
  hasXmlContext: false
  isNewline: false
  coarseNER: "O"
  fineGrainedNER: "O"
  nerLabelProbs: "O=0.9999979942182556"
}
token {
  word: "desk"
  pos: "NN"
  value: "desk"
  before: " "


In [63]:
# Get tokens that have TIMEX infos

timex_infos = []
for i, sent in enumerate(document.sentence):
    for j,t in enumerate(sent.token):
        if t.timexValue.text!='':
            timex_infos.append({
                'sent_id': i,
                'word_id': j,
                'word': t.word,
                'value': t.timexValue.value,
                'altValue': t.timexValue.altValue,
                'text': t.timexValue.text,
                'type': t.timexValue.type,
            })

            
# Consolidate tokens that refer to the same TIMEX

current = None
consolidate_timex_infos = []

for i,d in enumerate(timex_infos):
    if current is None: # new item
        current = d
    else:
        # check if new or append to old
        if i==len(timex_infos)-1: # last
            consolidate_timex_infos.append({
                'sent_id': current['sent_id'],
                'word_id': str(current['word_id'])+'_'+str(d['word_id']),
                'word': current['word'],
                'value': current['value'],
                'altValue': current['altValue'],
                'text': current['text'],
                'type': current['type'],
            })
        elif (d['value']==current['value']) and (d['type']==current['type']):
            pass
        else:
            consolidate_timex_infos.append({
                'sent_id': current['sent_id'],
                'word_id': str(current['word_id'])+'_'+str(timex_infos[i-1]['word_id']),
                'word': current['word'],
                'value': current['value'],
                'altValue': current['altValue'],
                'text': current['text'],
                'type': current['type'],
            })
            current = None
            
consolidate_timex_infos

[{'sent_id': 0,
  'word_id': '5_14',
  'word': 'this',
  'value': '',
  'altValue': 'THIS XXXX-WXX-2',
  'text': 'this Tuesday',
  'type': 'DATE'},
 {'sent_id': 0,
  'word_id': '18_18',
  'word': 'pm',
  'value': 'T14:00',
  'altValue': '',
  'text': '2pm',
  'type': 'TIME'},
 {'sent_id': 0,
  'word_id': '21_22',
  'word': 'pm',
  'value': 'T15:00',
  'altValue': '',
  'text': '3pm.',
  'type': 'TIME'},
 {'sent_id': 1,
  'word_id': '4_5',
  'word': 'of',
  'value': '2020-FYQ4',
  'altValue': '',
  'text': 'Q4 of 2020',
  'type': 'DATE'},
 {'sent_id': 2,
  'word_id': '5_7',
  'word': 'to',
  'value': '',
  'altValue': 'PT1H/PT2H',
  'text': '1 to 2 hours',
  'type': 'DURATION'}]