<a href="https://colab.research.google.com/github/mohammedterry/NLP_for_ML/blob/master/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
example_document = '''Baidu's Apollo Project is one of the world's leading autonomous driving and AI programs, with one of the largest partner ecosystems and over 100 global partners as of 2018, including BYD, Dongfeng, Microsoft, Intel, Nvidia, Daimler AG, ZTE, Grab, Ford, Hyundai and Honda.'''

# GATE

In [2]:
import requests
url = "https://cloud-api.gate.ac.uk/process-document/annie-named-entity-recognizer"
headers = {'Content-Type': 'text/plain'}
response = requests.post(url, data=example_document, headers=headers).json()

import json
print(json.dumps(response, indent=2))

{
  "text": "Baidu's Apollo Project is one of the world's leading autonomous driving and AI programs, with one of the largest partner ecosystems and over 100 global partners as of 2018, including BYD, Dongfeng, Microsoft, Intel, Nvidia, Daimler AG, ZTE, Grab, Ford, Hyundai and Honda.",
  "entities": {
    "Date": [
      {
        "indices": [
          167,
          171
        ],
        "kind": "date",
        "rule": "TempYear2",
        "ruleFinal": "YearOnlyFinal"
      }
    ],
    "Location": [
      {
        "indices": [
          247,
          251
        ],
        "locType": "city",
        "rule": "Location1",
        "ruleFinal": "LocFinal"
      }
    ],
    "Organization": [
      {
        "indices": [
          8,
          22
        ],
        "orgType": "unknown",
        "rule": "OrgXBase",
        "ruleFinal": "OrgFinal"
      },
      {
        "indices": [
          198,
          207
        ],
        "orgType": "company",
        "rule": "GazOrganization"

In [0]:
def gate_ner(sentence):
  import requests
  return [(sentence[entity["indices"][0]:entity["indices"][1]] + f" ({entity['gender']})",entity_type) if entity_type == "Person" and "gender" in entity else (sentence[entity["indices"][0]:entity["indices"][1]],entity_type)  for entity_type,entities in requests.post("https://cloud-api.gate.ac.uk/process-document/annie-named-entity-recognizer", data=sentence, headers={'Content-Type': 'text/plain'}).json()["entities"].items() for entity in entities]

In [4]:
gate_ner(example_document)

[('2018', 'Date'),
 ('Ford', 'Location'),
 ('Apollo Project', 'Organization'),
 ('Microsoft', 'Organization'),
 ('Intel', 'Organization'),
 ('Nvidia', 'Organization'),
 ('Daimler AG', 'Organization'),
 ('Hyundai', 'Organization'),
 ('Honda', 'Organization')]

# NLTK

In [5]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [0]:
def nltk_ner(document):
  return {(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

In [7]:
nltk_ner(example_document)

{('AI', 'ORGANIZATION'),
 ('Apollo Project', 'PERSON'),
 ('BYD', 'ORGANIZATION'),
 ('Baidu', 'GPE'),
 ('Daimler AG', 'PERSON'),
 ('Dongfeng', 'PERSON'),
 ('Ford', 'ORGANIZATION'),
 ('Grab', 'PERSON'),
 ('Honda', 'GPE'),
 ('Hyundai', 'PERSON'),
 ('Intel', 'ORGANIZATION'),
 ('Microsoft', 'PERSON'),
 ('Nvidia', 'GPE'),
 ('ZTE', 'ORGANIZATION')}

# Spacy

In [0]:
!python3 -m spacy download en_core_web_lg
import spacy
sp_lg = spacy.load('en_core_web_lg') 

In [0]:
def spacy_large_ner(document):
  return {(ent.text.strip(), ent.label_) for ent in sp_lg(document).ents}

In [12]:
spacy_large_ner(example_document)

{('2018', 'DATE'),
 ('Apollo Project', 'ORG'),
 ('BYD', 'ORG'),
 ('Baidu', 'ORG'),
 ('Daimler AG', 'ORG'),
 ('Dongfeng', 'PERSON'),
 ('Ford', 'ORG'),
 ('Honda', 'ORG'),
 ('Hyundai', 'ORG'),
 ('Intel', 'ORG'),
 ('Microsoft', 'ORG'),
 ('Nvidia', 'ORG'),
 ('ZTE', 'ORG'),
 ('over 100', 'CARDINAL')}

# Flair

In [21]:
!pip3 install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/44/54/76374f9a448ca765446502e7f2bb53c976e9c055102290fe6f8b0b038b37/flair-0.4.1.tar.gz (78kB)
[K     |████████████████████████████████| 81kB 24.1MB/s 
Collecting segtok>=1.5.7 (from flair)
  Downloading https://files.pythonhosted.org/packages/1d/59/6ed78856ab99d2da04084b59e7da797972baa0efecb71546b16d48e49d9b/segtok-1.5.7.tar.gz
Collecting mpld3>=0.3 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/91/95/a52d3a83d0a29ba0d6898f6727e9858fe7a43f6c2ce81a5fe7e05f0f4912/mpld3-0.3.tar.gz (788kB)
[K     |████████████████████████████████| 798kB 42.1MB/s 
Collecting sqlitedict>=1.6.0 (from flair)
  Downloading https://files.pythonhosted.org/packages/0f/1c/c757b93147a219cf1e25cef7e1ad9b595b7f802159493c45ce116521caff/sqlitedict-1.6.0.tar.gz
Collecting deprecated>=1.2.4 (from flair)
  Downloading https://files.pythonhosted.org/packages/9f/7a/003fa432f1e45625626549726c2fbb7a29baa764e9d1fdb2323a5d779f

In [22]:
from flair.models import SequenceTagger
flair_12class = SequenceTagger.load('ner-ontonotes-fast')
flair_4class = SequenceTagger.load('ner')

2019-05-03 18:22:34,061 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.2/NER-ontoner--h256-l1-b32-%2Bcrawl%2Bnews-forward-fast%2Bnews-backward-fast--v0.2/en-ner-ontonotes-fast-v0.3.pt not found in cache, downloading to /tmp/tmpy8wumt_d


100%|██████████| 1331337776/1331337776 [02:11<00:00, 10094955.31B/s]

2019-05-03 18:24:47,221 copying /tmp/tmpy8wumt_d to cache at /root/.flair/models/en-ner-ontonotes-fast-v0.3.pt





2019-05-03 18:24:54,910 removing temp file /tmp/tmpy8wumt_d
2019-05-03 18:24:54,912 loading file /root/.flair/models/en-ner-ontonotes-fast-v0.3.pt
2019-05-03 18:25:09,807 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/NER-conll03-english/en-ner-conll03-v0.4.pt not found in cache, downloading to /tmp/tmpnejixshj


100%|██████████| 432197603/432197603 [00:47<00:00, 9149328.71B/s]

2019-05-03 18:25:58,338 copying /tmp/tmpnejixshj to cache at /root/.flair/models/en-ner-conll03-v0.4.pt





2019-05-03 18:26:00,534 removing temp file /tmp/tmpnejixshj
2019-05-03 18:26:00,536 loading file /root/.flair/models/en-ner-conll03-v0.4.pt


In [0]:
def flair_ner(document, model):
  from flair.data import Sentence
  s = Sentence(document)
  model.predict(s)
  entities = s.to_dict(tag_type='ner')
  return [(entity["text"], entity["type"]) for entity in entities["entities"]]

In [27]:
flair_ner(example_document, flair_4class)

[("Baidu's Apollo Project", 'ORG'),
 ('AI', 'ORG'),
 ('BYD, Dongfeng, Microsoft, Intel, Nvidia, Daimler AG, ZTE, Grab, Ford, Hyundai',
  'ORG'),
 ('Honda.', 'ORG')]

In [28]:
flair_ner(example_document, flair_12class)

[("Baidu's Apollo Project", 'ORG'),
 ('AI', 'ORG'),
 ('over 100', 'CARDINAL'),
 ('2018,', 'DATE'),
 ('BYD, Dongfeng, Microsoft, Intel, Nvidia, Daimler', 'ORG'),
 ('Hyundai', 'ORG')]

# Deep Pavlov

In [29]:
!pip3 install deeppavlov
!python3 -m deeppavlov install ner_ontonotes
from deeppavlov import configs, build_model
deeppavlov_ner = build_model(configs.ner.ner_ontonotes, download=True)

Collecting deeppavlov
[?25l  Downloading https://files.pythonhosted.org/packages/ab/92/33166dcd4fd87b171d5d37a87e19fc936e97f0a7ddbe2e7c0cdae7ceabb6/deeppavlov-0.2.0-py3-none-any.whl (602kB)
[K     |████████████████████████████████| 604kB 22.4MB/s 
Collecting numpy==1.14.5 (from deeppavlov)
[?25l  Downloading https://files.pythonhosted.org/packages/68/1e/116ad560de97694e2d0c1843a7a0075cc9f49e922454d32f49a80eb6f1f2/numpy-1.14.5-cp36-cp36m-manylinux1_x86_64.whl (12.2MB)
[K     |████████████████████████████████| 12.2MB 33.0MB/s 
Collecting pymorphy2-dicts-ru (from deeppavlov)
[?25l  Downloading https://files.pythonhosted.org/packages/7c/9b/358faaff410f65a4ad159275e897b5956dcb20576c5b8e764b971c1634d7/pymorphy2_dicts_ru-2.4.404381.4453942-py2.py3-none-any.whl (8.0MB)
[K     |████████████████████████████████| 8.0MB 42.2MB/s 
[?25hCollecting pandas==0.23.1 (from deeppavlov)
[?25l  Downloading https://files.pythonhosted.org/packages/57/eb/6ab533ea8e35e7dd159af6922ac1123d4565d89f3926ad9a

2019-05-03 18:45:42.8 INFO in 'deeppavlov.core.common.file'['file'] at line 30: Interpreting 'ner_ontonotes' as '/usr/local/lib/python3.6/dist-packages/deeppavlov/configs/ner/ner_ontonotes.json'
Collecting gensim==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/bc/ed/fbbb2cc3f37a39cc4ff8e5f667374478fb852b384840aa7feb9608144290/gensim-2.3.0.tar.gz (17.2MB)
[K     |████████████████████████████████| 17.2MB 32.3MB/s 
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/3a/1f/86/63c886325bdffa379a7c91499bc9ea6317a4e4e0fc6e2ff1ce
Successfully built gensim
[31mERROR: flair 0.4.1 has requirement gensim>=3.4.0, but you'll have gensim 2.3.0 which is incompatible.[0m
[31mERROR: flair 0.4.1 has requirement tqdm>=4.26.0, but you'll have tqdm 4.23.4 which is incompatible.[0m
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensi

2019-05-03 18:46:25.848 INFO in 'deeppavlov.core.data.utils'['utils'] at line 63: Downloading from http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt to /root/.deeppavlov/downloads/embeddings/glove.6B.100d.txt


2019-05-03 18:46:25,848 Downloading from http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt to /root/.deeppavlov/downloads/embeddings/glove.6B.100d.txt


347MB [00:56, 6.14MB/s]
2019-05-03 18:47:23.683 INFO in 'deeppavlov.core.data.utils'['utils'] at line 63: Downloading from http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_v3_cpu_compatible.tar.gz to /root/.deeppavlov/ner_ontonotes_v3_cpu_compatible.tar.gz


2019-05-03 18:47:23,683 Downloading from http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_v3_cpu_compatible.tar.gz to /root/.deeppavlov/ner_ontonotes_v3_cpu_compatible.tar.gz


100%|██████████| 8.13M/8.13M [00:04<00:00, 1.64MB/s]
2019-05-03 18:47:28.648 INFO in 'deeppavlov.core.data.utils'['utils'] at line 201: Extracting /root/.deeppavlov/ner_ontonotes_v3_cpu_compatible.tar.gz archive into /root/.deeppavlov/models


2019-05-03 18:47:28,648 Extracting /root/.deeppavlov/ner_ontonotes_v3_cpu_compatible.tar.gz archive into /root/.deeppavlov/models


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/nonbreaking_prefixes.zip.
2019-05-03 18:47:29.935 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 103: [loading vocabulary from /root/.deeppavlov/models/ner_ontonotes/tag.dict]


2019-05-03 18:47:29,935 [loading vocabulary from /root/.deeppavlov/models/ner_ontonotes/tag.dict]


2019-05-03 18:47:29.943 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 103: [loading vocabulary from /root/.deeppavlov/models/ner_ontonotes/char.dict]


2019-05-03 18:47:29,943 [loading vocabulary from /root/.deeppavlov/models/ner_ontonotes/char.dict]


2019-05-03 18:47:29.950 INFO in 'deeppavlov.models.embedders.glove_embedder'['glove_embedder'] at line 52: [loading GloVe embeddings from `/root/.deeppavlov/downloads/embeddings/glove.6B.100d.txt`]


2019-05-03 18:47:29,950 [loading GloVe embeddings from `/root/.deeppavlov/downloads/embeddings/glove.6B.100d.txt`]
2019-05-03 18:47:29,954 this function is deprecated, use smart_open.open instead


Using TensorFlow backend.
2019-05-03 18:48:07.348 INFO in 'deeppavlov.core.layers.tf_layers'['tf_layers'] at line 756: 


2019-05-03 18:48:07,348 
2019-05-03 18:48:14,219 From /usr/local/lib/python3.6/dist-packages/deeppavlov/core/layers/tf_layers.py:861: calling reverse_sequence (from tensorflow.python.ops.array_ops) with seq_dim is deprecated and will be removed in a future version.
Instructions for updating:
seq_dim is deprecated, use seq_axis instead
2019-05-03 18:48:14,238 From /usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py:454: calling reverse_sequence (from tensorflow.python.ops.array_ops) with batch_dim is deprecated and will be removed in a future version.
Instructions for updating:
batch_dim is deprecated, use batch_axis instead


2019-05-03 18:48:14.240 INFO in 'deeppavlov.core.layers.tf_layers'['tf_layers'] at line 756: 


2019-05-03 18:48:14,240 


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2019-05-03 18:48:15.920 INFO in 'deeppavlov.core.models.tf_model'['tf_model'] at line 50: [loading model from /root/.deeppavlov/models/ner_ontonotes/model]


2019-05-03 18:48:15,920 [loading model from /root/.deeppavlov/models/ner_ontonotes/model]


In [0]:
def convert_entities(entities):
  ents = set()
  for entity,next_entity in zip(entities,entities[1:] + [(".","O")]):
    word,tag = entity
    if tag != "O":
      ent_position, ent_type = tag.split("-")
      if ent_position == "U":
        ents.add((word,ent_type))
      else:
        if ent_position == "B":
          w = word
        elif ent_position == "I":
          w += " " + word
          if next_entity[1].split("-")[0] != "I":
            ents.add((w,ent_type))
  return ents

def dp_ner(sentence):
  tokens,tags = deeppavlov_ner([sentence])
  return convert_entities([(tok,tg) for token,tag in zip(tokens,tags) for tok,tg in list(zip(token,tag)) ])

In [31]:
dp_ner(example_document)  

{('Apollo Project', 'ORG'), ('Daimler AG', 'ORG'), ('over 100', 'CARDINAL')}

# Stanford Core NLP

In [32]:
!pip3 install nltk==3.2.4

Collecting nltk==3.2.4
[?25l  Downloading https://files.pythonhosted.org/packages/cd/c2/858e0708b497116ae45cf5c6b1f66984ac60729c61e49df6c1c0b808d1e4/nltk-3.2.4.tar.gz (1.2MB)
[K     |████████████████████████████████| 1.2MB 22.9MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/36/f1/5c/f667347d86a3a534ba4c0127eed4389f929916e3ec88bb461a
Successfully built nltk
[31mERROR: deeppavlov 0.2.0 has requirement nltk==3.2.5, but you'll have nltk 3.2.4 which is incompatible.[0m
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.2.4


In [34]:
!wget http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
!unzip stanford-ner-2015-04-20.zip 

--2019-05-03 18:58:03--  http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip [following]
--2019-05-03 18:58:03--  https://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 176961718 (169M) [application/zip]
Saving to: ‘stanford-ner-2015-04-20.zip.1’

Archive:  stanford-ner-2015-04-20.zip
replace stanford-ner-2015-04-20/README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [35]:
from nltk.tag.stanford import StanfordNERTagger
jar = "stanford-ner-2015-04-20/stanford-ner-3.5.2.jar"
model = "stanford-ner-2015-04-20/classifiers/" 
st_3class = StanfordNERTagger(model + "english.all.3class.distsim.crf.ser.gz", jar, encoding='utf8') 
st_4class = StanfordNERTagger(model + "english.conll.4class.distsim.crf.ser.gz", jar, encoding='utf8') 
st_7class = StanfordNERTagger(model + "english.muc.7class.distsim.crf.ser.gz", jar, encoding='utf8') 

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  '-tokenizerFactory',
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  '-tokenizerFactory',
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  '-tokenizerFactory',


In [0]:
def stanford_ner(document,model):
  if model == 1:
    return [(entity,tag) for entity,tag in st_3class.tag(document.split()) if tag != "O"]
  elif model == 2:
    return [(entity,tag) for entity,tag in st_4class.tag(document.split()) if tag != "O"]
  elif model == 3:
    return [(entity,tag) for entity,tag in st_7class.tag(document.split()) if tag != "O"]

In [37]:
stanford_ner(example_document,model=1)

[('Hyundai', 'ORGANIZATION')]

In [38]:
stanford_ner(example_document,model=2)

[('Apollo', 'ORGANIZATION'),
 ('Project', 'ORGANIZATION'),
 ('Daimler', 'ORGANIZATION'),
 ('Hyundai', 'ORGANIZATION')]

In [39]:
stanford_ner(example_document,model=3)

[('Apollo', 'ORGANIZATION'),
 ('Project', 'ORGANIZATION'),
 ('Daimler', 'ORGANIZATION'),
 ('Hyundai', 'ORGANIZATION'),
 ('Honda.', 'LOCATION')]

# Allen NLP

In [40]:
!pip3 install allennlp
from allennlp.predictors import Predictor
al = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz")

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/a4/c8/10342a6068a8d156a5947e03c95525d559e71ad62de0f2585ab922e14533/allennlp-0.8.3-py3-none-any.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 25.3MB/s 
Collecting tensorboardX>=1.2 (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/5c/76/89dd44458eb976347e5a6e75eb79fecf8facd46c1ce259bad54e0044ea35/tensorboardX-1.6-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 26.8MB/s 
Collecting jsonnet>=0.10.0; sys_platform != "win32" (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/60/dc/3abd3971869a741d7acdba166d71d4f9366b6b53028dfd56f95de356af0f/jsonnet-0.12.1.tar.gz (240kB)
[K     |████████████████████████████████| 245kB 34.2MB/s 
Collecting parsimonious>=0.8.0 (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/02/fc/067a3f89869a41009e1a7cdfb14725f8ddd246f30f63c645e8ef8a1c56f4/parsimonious-0.8.1.

100%|██████████| 724601837/724601837 [00:52<00:00, 13728259.04B/s]


In [0]:
def convert_results(allen_results):
  ents = set()
  for word, tag in zip(allen_results["words"], allen_results["tags"]):
    if tag != "O":
      ent_position, ent_type = tag.split("-")
      if ent_position == "U":
        ents.add((word,ent_type))
      else:
        if ent_position == "B":
          w = word
        elif ent_position == "I":
          w += " " + word
        elif ent_position == "L":
          w += " " + word
          ents.add((w,ent_type))
  return ents

def allennlp_ner(document):
  return convert_results(al.predict(sentence=document))

In [42]:
allennlp_ner(example_document)

{('2018', 'DATE'),
 ('Apollo Project', 'ORG'),
 ('BYD', 'ORG'),
 ('Baidu', 'ORG'),
 ('Daimler AG', 'ORG'),
 ('Dongfeng', 'ORG'),
 ('Ford', 'ORG'),
 ('Grab', 'ORG'),
 ('Honda', 'ORG'),
 ('Hyundai', 'ORG'),
 ('Intel', 'ORG'),
 ('Microsoft', 'ORG'),
 ('Nvidia', 'ORG'),
 ('ZTE', 'ORG'),
 ('one', 'CARDINAL'),
 ('over 100', 'CARDINAL')}

# Polyglot

In [43]:
!pip3 install -U git+https://github.com/aboSamoor/polyglot.git@master
!polyglot download embeddings2.en ner2.en
from polyglot.text import Text

Collecting git+https://github.com/aboSamoor/polyglot.git@master
  Cloning https://github.com/aboSamoor/polyglot.git (to revision master) to /tmp/pip-req-build-2wnozxsr
  Running command git clone -q https://github.com/aboSamoor/polyglot.git /tmp/pip-req-build-2wnozxsr
Collecting futures>=2.1.6 (from polyglot==16.7.4)
  Downloading https://files.pythonhosted.org/packages/cc/26/b61e3a4eb50653e8a7339d84eeaa46d1e93b92951978873c220ae64d0733/futures-3.1.1.tar.gz
Collecting pycld2>=0.3 (from polyglot==16.7.4)
[?25l  Downloading https://files.pythonhosted.org/packages/21/77/8525fe5f147bf2819c7c9942c717c4a79b83f8003da1a3847759fb560909/pycld2-0.31.tar.gz (14.3MB)
[K     |████████████████████████████████| 14.3MB 34.5MB/s 
[?25hCollecting PyICU>=1.8 (from polyglot==16.7.4)
[?25l  Downloading https://files.pythonhosted.org/packages/e9/35/211ffb949c68e688ade7d40426de030a24eaec4b6c45330eeb9c0285f43a/PyICU-2.3.1.tar.gz (214kB)
[K     |████████████████████████████████| 215kB 49.4MB/s 
[?25hCollec

[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /root/polyglot_data...
[polyglot_data] Downloading package ner2.en to /root/polyglot_data...


In [0]:
def polyglot_ner(document):
  return {(' '.join(entity),entity.tag.split('-')[-1]) for entity in Text(document).entities}

In [45]:
polyglot_ner(example_document)

{('Daimler AG', 'ORG'),
 ('Honda', 'ORG'),
 ('Hyundai', 'ORG'),
 ('Intel', 'ORG'),
 ('Microsoft', 'ORG'),
 ('Nvidia', 'ORG')}