In [10]:
import re
from nltk.tag import StanfordNERTagger
import os
import pandas as pd
import nltk

In [11]:
class SNER:
    """Named-entity extraction pipeline built on NLTK and the Stanford NER tagger.

    The pipeline splits a document into sentences, tokenizes each sentence,
    tags every token with ``StanfordNERTagger`` and finally merges runs of
    consecutive, identically tagged tokens into named entities.
    """

    def __init__(self, text, java_path, classifier, path_to_jar):
        """Store the input text and load the Stanford NER tagger.

        Args:
            text: Document to analyse.
            java_path: Path to the Java executable; exported through the
                ``JAVAHOME`` environment variable (process-wide side effect).
            classifier: Path to the Stanford NER classifier model file.
            path_to_jar: Path to ``stanford-ner.jar``.
        """
        self.text = text
        # Set the Java path in the environment variables.
        self.java_path = java_path
        os.environ['JAVAHOME'] = java_path
        # Load the Stanford NER tagger.
        self.sn = StanfordNERTagger(classifier, path_to_jar=path_to_jar)

    def document_to_sentence(self, document):
        """Split ``document`` into a list of stripped sentences.

        Args:
            document: The text to split; must be a ``str``.

        Returns:
            A list of sentence strings with surrounding whitespace removed.

        Raises:
            ValueError: If ``document`` is not a string.
        """
        # Validate before touching the value so that non-string input yields
        # the intended ValueError rather than a TypeError from the replace.
        if not isinstance(document, str):
            raise ValueError('Document is not string!')
        # Replace line breaks with spaces, then trim leading/trailing whitespace.
        document = document.replace('\n', ' ').strip()
        return [sentence.strip() for sentence in nltk.sent_tokenize(document)]

    def sentence_to_tokenized(self, sentences):
        """Split every sentence in ``sentences`` into a list of word tokens."""
        return [nltk.word_tokenize(sentence) for sentence in sentences]

    def tokenized_to_annotated(self, tokenized_sentences):
        """Tag every tokenized sentence with the Stanford NER tagger."""
        return [self.sn.tag(sent) for sent in tokenized_sentences]

    def extract_named_entities(self, ne_annotated_sentences):
        """Merge tagged tokens into unique named entities.

        Consecutive tokens that share the same non-``'O'`` tag are joined with
        single spaces into one entity. An entity ends at an ``'O'`` token, at a
        change of tag, or at the end of its sentence.

        Args:
            ne_annotated_sentences: Iterable of sentences, each an iterable of
                ``(term, tag)`` pairs.

        Returns:
            A ``pandas.DataFrame`` with columns ``'Entity Name'`` and
            ``'Entity Type'``, one row per distinct entity in order of first
            appearance.
        """
        named_entities = []
        for sentence in ne_annotated_sentences:
            current_terms = []
            current_tag = None
            for term, tag in sentence:
                if tag != 'O' and tag == current_tag:
                    # Same entity continues.
                    current_terms.append(term)
                    continue
                # Boundary: either an 'O' token or a different entity type.
                if current_terms:
                    named_entities.append((' '.join(current_terms).strip(), current_tag))
                if tag != 'O':
                    current_terms = [term]
                    current_tag = tag
                else:
                    current_terms = []
                    current_tag = None
            # Flush an entity that runs up to the end of the sentence.
            if current_terms:
                named_entities.append((' '.join(current_terms).strip(), current_tag))
        # Keep unique named entities, preserving first-seen order.
        unique_entities = list(dict.fromkeys(named_entities))
        # Store the named entities in a data frame.
        entity_frame = pd.DataFrame(unique_entities, columns=['Entity Name', 'Entity Type'])
        return entity_frame

    def sparse(self):
        """Run the full NER pipeline on ``self.text``.

        Returns:
            The entity ``pandas.DataFrame`` produced by
            ``extract_named_entities``.
        """
        sentences = self.document_to_sentence(self.text)
        tokenized_sentences = self.sentence_to_tokenized(sentences)
        ne_annotated_sentences = self.tokenized_to_annotated(tokenized_sentences)
        return self.extract_named_entities(ne_annotated_sentences)

In [13]:
# Sample passage used as input for the NER pipeline.
text = """
FIFA was founded in 1904 to oversee international competition among the national associations of Belgium, 
Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its 
membership now comprises 211 national associations. Member countries must each also be members of one of 
the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America 
and the Caribbean, Oceania, and South America.
"""
# Machine-specific absolute paths to the Java executable, the Stanford NER
# classifier model and the Stanford NER jar; adjust them for your environment.
java_path=r'D:\Java\jdk-11.0.12\bin\java.exe'
classifier='D:/stanford-ner-2020-11-17/classifiers/english.muc.7class.distsim.crf.ser.gz'
path_to_jar='D:/stanford-ner-2020-11-17/stanford-ner.jar'
# Build the pipeline (this also sets the JAVAHOME environment variable), run it
# on the sample text and print the resulting entity DataFrame.
sner=SNER(text,java_path,classifier,path_to_jar)
res=sner.sparse()
print(res)

                Entity Name   Entity Type
0                      FIFA  ORGANIZATION
1                      1904          DATE
2                   Belgium      LOCATION
3                   Denmark      LOCATION
4                    France      LOCATION
5                   Germany      LOCATION
6           the Netherlands      LOCATION
7                     Spain      LOCATION
8                    Sweden      LOCATION
9               Switzerland      LOCATION
10                   Zürich      LOCATION
11                   Africa      LOCATION
12                     Asia      LOCATION
13                   Europe      LOCATION
14  North & Central America  ORGANIZATION
15                Caribbean      LOCATION
16                  Oceania      LOCATION
17            South America      LOCATION
