In [1]:
# The Stanford tagger used later runs on Java; JAVAHOME is set so NLTK can find
# the Java executable. Adjust the path below if java.exe lives elsewhere.
import os
java_path = "C:/Program Files/Java/jre1.8.0_211/bin/java.exe"
os.environ.update({'JAVAHOME': java_path})

In [2]:
#Loading packages and creating a function to capture the entities, credit to Omar Bahareth

from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize

def formatted_entities(classified_paragraphs_list, merge_adjacent=False):
    """Group NER-tagged tokens into per-category lists.

    Parameters
    ----------
    classified_paragraphs_list : iterable
        One item per tagged text unit; each item is an iterable of entries
        where ``entry[0]`` is the token and ``entry[1]`` is its tag.
    merge_adjacent : bool, default False
        When False, every tagged token is appended on its own (the original
        behaviour). When True, consecutive tokens that carry the same tag
        within one text unit are joined with a single space, so
        ``JP / Morgan / Chase`` becomes ``'JP Morgan Chase'``. Runs never
        span two text units.

    Returns
    -------
    dict
        Keys ``'persons'``, ``'organizations'``, ``'locations'``, ``'dates'``,
        ``'money'`` and ``'percent'``, each mapping to a list of strings in
        order of appearance. Entries with any other tag are ignored.
    """
    # Local import so this cell does not rely on an import made elsewhere.
    from itertools import groupby

    # Tag emitted by the tagger -> key used in the result dictionary.
    tag_to_bucket = {
        'PERSON': 'persons',
        'ORGANIZATION': 'organizations',
        'LOCATION': 'locations',
        'DATE': 'dates',
        'MONEY': 'money',
        'PERCENT': 'percent',
    }
    entities = {bucket: list() for bucket in tag_to_bucket.values()}

    for classified_paragraph in classified_paragraphs_list:
        if merge_adjacent:
            # groupby yields one run per stretch of identically tagged tokens.
            for entry_type, run in groupby(classified_paragraph, key=lambda entry: entry[1]):
                bucket = tag_to_bucket.get(entry_type)
                if bucket is not None:
                    entities[bucket].append(' '.join(entry[0] for entry in run))
        else:
            for entry in classified_paragraph:
                bucket = tag_to_bucket.get(entry[1])
                if bucket is not None:
                    entities[bucket].append(entry[0])
    return entities

In [3]:
# Requires the stanford-ner-2018-10-16 download: unzip it and save
# english.muc.7class.distsim.crf.ser.gz and stanford-ner.jar at the paths below.
# NOTE(review): the Java path set in the first cell is a Windows path while these
# are macOS-style paths -- confirm both resolve on the machine running this.
tagger = StanfordNERTagger(
    model_filename='/Users/Shared/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
    path_to_jar='/Users/Shared/stanford-ner/stanford-ner.jar',
    encoding='utf-8',
)

In [4]:
#Import spacy to extract the data from the text file
import spacy,en_core_web_sm
import pandas as pd

# Load the spaCy English model, then read the article and parse it.
nlp = en_core_web_sm.load()
# The context manager closes the file handle as soon as the text is read,
# instead of leaving an open handle for the garbage collector.
with open('African_Fintech.txt', encoding="utf8") as article_file:
    doc = nlp(article_file.read())

In [5]:
# Echo the parsed document so the notebook shows the article text that was loaded.
doc


African fintech has taken center stage for the Catalyst Fund, a JP Morgan Chase and Bill & Melinda Gates Foundation-backed accelerator that provides mentorship and non-equity funding to emerging markets startups.

The organization announced its 2019 startup cohort and three out of the four finance ventures — Chipper Cash, Salutat and Turaco — have an Africa focus (Brazil-based venture Diin, was the fourth).

Catalyst Fund, which is managed by global tech consulting firm BFA,  also released its latest evaluation report, which showed 60% of the organization’s portfolio startups are located in Africa.

The new additions to the fund’s program will gain $50,000 to $60,000 in non-equity venture building support (as Catalyst Fund dubs it) and six months of technical assistance. The funds and support are aimed at moving the ventures to the next phase of catalyzing business models, generating revenue and connecting to global VCs.

“We really tailor the kind of help we give to companies so they 

In [6]:
# Build one record per sentence found by spaCy, then index the frame by sentence id
d = [
    {"id": idno, "sentence": str(sentence)}
    for idno, sentence in enumerate(doc.sents)
]

df = pd.DataFrame(d).set_index('id')

# Report how many sentences the document contains and preview the first rows
print('There are {}'.format(len(d)), 'Sentences in this article')
df.head()

There are 37 Sentences in this article


Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,African fintech has taken center stage for the...
1,The organization announced its 2019 startup co...
2,"Catalyst Fund, which is managed by global tech..."
3,The new additions to the fund’s program will g...
4,The funds and support are aimed at moving the ...


In [7]:
# Extract the sentence column as a plain Python list (one string per sentence)
df1 = df['sentence'].tolist()

In [8]:
# Tokenize every sentence, tag all of them with a single tag_sents call,
# then bucket the tagged tokens by the entity types defined in formatted_entities.
tokenized_paragraphs = [word_tokenize(text) for text in df1]

classified_paragraphs_list = tagger.tag_sents(tokenized_paragraphs)

formatted_result = formatted_entities(classified_paragraphs_list)
print(formatted_result)

{'persons': ['Maelis', 'Carraro', 'Colleen', 'Briggs', 'Briggs', 'David', 'del', 'Ser'], 'organizations': ['JP', 'Morgan', 'Chase', 'Bill', '&', 'Melinda', 'Gates', 'BFA', 'BFA', 'Mastercard', 'Foundation', 'Circle', 'of', 'Investors', 'Bill', '&', 'Melinda', 'Gates', 'Foundation', 'JP', 'Morgan', 'Chase', 'Rockefeller', 'Philanthropy', 'Advisors', 'JP', 'Morgan', 'Chase', 'JP', 'Morgan', '’', 's', 'Head', 'of', 'Community', 'JP', 'Morgan', 'fintech', 'Briter', 'Bridges', 'fintech', 'Ugandan', 'Ham', 'Serunjogi'], 'locations': ['Africa', 'Africa', 'San', 'Francisco', 'Africa', 'Asia', 'Latin', 'America', 'Africa', 'Africa', 'Africa', 'Africa', 'Africa', 'Ghana', 'Kenya', 'Rwanda', 'Tanzania', 'Uganda'], 'dates': ['2019', '2019', '2019', '2019', '2016', '2018'], 'money': ['$', '50,000', '$', '60,000', '$', '25.7', 'million', '$', '125', 'million'], 'percent': ['60', '%']}
