In [77]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

In [100]:
# Short single-sentence example used to try out the NER helpers.
sentence = "Mark and John are working at Google Cloud."

# Longer, multi-paragraph example. Source article:
# https://www.africanews.com/2022/06/16/boom-in-egypt-innovation-ecosystem-business-africa/
text = """By the end of May 2022, total funds raised by African start-ups amounted to $2.7 billion. From January to May, funds raised by Egyptian start-ups increased by 212% compared to the same period last year.

With $81 million raised as of May 2022, the value of investments raised by Egyptian start-ups is the highest in the MENA region, comprising North Africa and the Middle East. 

In addition to the government's commitment to establishing a favourable business climate, this growth is partly due to the presence of a strong and diversified entrepreneurial support infrastructure. 

To discuss this, Business Africa welcomes Tamer Azer, partner at Shorooq Partners, entrepreneur and company founder.

Cameroon

The deep water port of Kribi wants to develop its pole of attraction. Four years after the launch of its activities, it ranks among the most important contributors in terms of customs profits to the coffers of the Cameroonian state. 

With this in mind, a new platform has been launched. The objective is to place Kribi at the heart of the movement of goods in the Gulf of Guinea.

Uganda

Lake Victoria fishermen are being driven out of business by soaring fuel prices, while more than 5 million people across the country depend on the sector for their livelihood. Prices at the pump were 47% higher in May than in June last year, leading many fishermen to suspend their activities."""

In [101]:
def get_iob_tags(text):
    """Tokenise, POS-tag and NE-chunk ``text``, then flatten the chunk tree.

    Args:
        text: Raw string to analyse.

    Returns:
        The result of ``tree2conlltags`` on the chunk tree.
        ``combined_named_enitites`` consumes it as a sequence of 3-tuples
        whose first element is the token and whose last is the IOB tag.
    """
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    chunk_tree = ne_chunk(tagged_tokens)

    # The chunker returns a tree; convert it to the flat per-token
    # CoNLL/IOB representation before handing it back.
    return tree2conlltags(chunk_tree)


def combined_named_enitites(iob_tagged):
    """Collect named entities from IOB-tagged tokens, joining multi-word ones.

    Args:
        iob_tagged: Iterable of ``(token, pos, iob_tag)`` triples. ``iob_tag``
            is ``"O"`` for tokens outside any entity, ``"B-<TYPE>"`` for the
            first token of an entity and ``"I-<TYPE>"`` for a continuation.
            The middle element is ignored.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance; the tokens of one entity are joined with a single space.

    An ``I-`` tag that does not continue an open entity of the same type
    (first token of the input, directly after an ``"O"``, or a type change)
    starts a new entity. This avoids an ``IndexError`` on an empty result
    list and avoids gluing the token onto an earlier, non-adjacent entity.
    """
    named_entities = []
    # Type of the entity the previous token belonged to; None when the
    # previous token was outside any entity (or there was none).
    open_label = None

    for name, _, tag in iob_tagged:
        # "B-PERSON" -> ("B", "PERSON"); "O" -> ("O", "").
        prefix, _sep, label = tag.partition("-")

        if prefix == "I" and open_label == label:
            # Continuation: extend the most recently added entity in place.
            head, _label = named_entities[-1]
            named_entities[-1] = (f"{head} {name}", label)
        elif prefix in ("B", "I"):
            # Regular beginning, or an orphan/mismatched "I-" treated as one.
            named_entities.append((name, label))
            open_label = label
        else:
            # Outside any entity: a following "I-" must not attach backwards.
            open_label = None

    return named_entities


In [102]:
# Run the NER pipeline on the single example sentence. The merged entities
# are the cell's last expression, so the notebook displays them.
iob_tags = get_iob_tags(sentence)
combined_named_enitites(iob_tags)

[('Mark', 'PERSON'), ('John', 'PERSON'), ('Google Cloud', 'ORGANIZATION')]

In [86]:
# Run the same pipeline on the long multi-paragraph article text.
# NOTE(review): this rebinds the notebook-global `iob_tags` that the
# single-sentence cell also assigns; results depend on which cell ran last.
iob_tags = get_iob_tags(text)
combined_named_enitites(iob_tags)

[('African', 'GPE'),
 ('Egyptian', 'GPE'),
 ('Egyptian', 'GPE'),
 ('MENA', 'ORGANIZATION'),
 ('North Africa', 'PERSON'),
 ('Middle East', 'GPE'),
 ('Business Africa', 'ORGANIZATION'),
 ('Tamer Azer', 'PERSON'),
 ('Shorooq Partners', 'ORGANIZATION'),
 ('Kribi', 'GPE'),
 ('Cameroonian', 'GPE'),
 ('Kribi', 'PERSON'),
 ('Guinea', 'GPE'),
 ('Uganda Lake Victoria', 'PERSON')]

In [112]:
# Variations on the example sentence: different capitalisation and
# single- versus multi-word names, plus one unrelated sentence.
sentences = [
    "Mark and John are working at Google Cloud.",
    "Mark and John are working at Google cloud.",
    "Mark Johnson and John are working at Google Cloud.",
    "Mark Twain and John Travolta are working at Google Cloud.",
    "Ishango people are having fun in Ghana"
]

# For each sentence: echo it, then show its merged entities followed by a
# blank separator line.
for s in sentences:
    print(s)
    print(combined_named_enitites(get_iob_tags(s)), end="\n\n")

Mark and John are working at Google Cloud.
[('Mark', 'PERSON'), ('John', 'PERSON'), ('Google Cloud', 'ORGANIZATION')]

Mark and John are working at Google cloud.
[('Mark', 'PERSON'), ('John', 'PERSON'), ('Google', 'ORGANIZATION')]

Mark Johnson and John are working at Google Cloud.
[('Mark', 'PERSON'), ('Johnson', 'PERSON'), ('John', 'PERSON'), ('Google Cloud', 'ORGANIZATION')]

Mark Twain and John Travolta are working at Google Cloud.
[('Mark', 'PERSON'), ('Twain', 'ORGANIZATION'), ('John Travolta', 'PERSON'), ('Google Cloud', 'ORGANIZATION')]

Ishango people are having fun in Ghana
[('Ishango', 'GPE'), ('Ghana', 'GPE')]

