In [1]:
from collections import namedtuple

In [2]:
Tag = namedtuple("Tag", ["word", "tag"])    
def readTags(file):
    """
    Creates a list of tagged words from the corpus
    
    Parameter
    String file - dest of file from which sentences are to be read
    
    Return
    sentences - read tags
    """
    tags = []
    sep="\t"
    with open(file) as fp:
        for line in fp:
            line = line.strip()
            if line:
                line = line.split(sep)
                tags.append(Tag(*line))
            else:
                tags.append(Tag("",""))  #append emty tuple to mark sentence ending
    return tags

In [3]:
def tokenize(tags):
    ''' sentence and word tokenization
    '''
    words = []
    entities = []
    
    sentence = []
    entitiesOfSentence = []
    for tag in tags:
        if (tag[0] == "" and tag[1] == ""): 
            words.append(sentence)
            entities.append(entitiesOfSentence)
            sentence = []
            entitiesOfSentence = []
        else:
            sentence.append(tag[0])
            entitiesOfSentence.append(tag[1])
    return words, entities
  
  
def addEntitiyTaggs(posTagged, entities):
    if(len(posTagged) != len(entities)):
        raise ValueError
  
    newTags = []
    sentence = []
    i = 0
    for i in range(len(posTagged)):
        for j in range(len(posTagged[i])):
            sentence.append(((posTagged[i][j][0], posTagged[i][j][1]),entities[i][j]))
        newTags.append(sentence)
        sentence = []
    return newTags

In [6]:
tagsTrain = readTags(r"Data\wnut17train.conll")
print(tagsTrain[0:10])
print(len(tagsTrain))

tagsTest = tagsTrain[50001:]
tagsTrain = tagsTrain[:50000]


#tagsTest = readTags(r"Data\emerging.test.conll") #error due to encoding 
#print(tagsTest[0:10])

wordTaggedSentencesTrain, entitiesTrain = tokenize(tagsTrain)
wordTaggedSentencesTest, entitiesTest = tokenize(tagsTest)
print(wordTaggedSentencesTrain)
print(entitiesTrain)

[Tag(word='@paulwalk', tag='O'), Tag(word='It', tag='O'), Tag(word="'s", tag='O'), Tag(word='the', tag='O'), Tag(word='view', tag='O'), Tag(word='from', tag='O'), Tag(word='where', tag='O'), Tag(word='I', tag='O'), Tag(word="'m", tag='O'), Tag(word='living', tag='O')]
66124
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'O', 'B-location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-group', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-corporation', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-creative-work', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [19]:
import nltk
#nltk.download('averaged_perceptron_tagger')
#nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\malte\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
def posTag(sentences):
    posTaggedSentences = [nltk.pos_tag(sent) for sent in sentences]
    return posTaggedSentences

In [11]:
posTaggedSentencesTrain = posTag(wordTaggedSentencesTrain)
posTaggedSentencesTest = posTag(wordTaggedSentencesTest)
print(posTaggedSentencesTrain)

completeTaggedSentencesTrain = addEntitiyTaggs(posTaggedSentencesTrain, entitiesTrain)
completeTaggedSentencesTest = addEntitiyTaggs(posTaggedSentencesTest, entitiesTest)
print(completeTaggedSentencesTrain)



In [12]:
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI, conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.stem import SnowballStemmer
import string

In [16]:
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, tagger, **kwargs):
        #self.feature_detector = features
        self.tagger = tagger
    def parse(self, sentence):
        chunks = self.tagger.tag(sentence)
        iobTriblets = [(word,pos,entity) for ((word,pos),entity) in chunks]
        return conlltags2tree(iobTriblets)
    

In [17]:
unigramTagger = UnigramTagger(train=completeTaggedSentencesTrain)
bigramTagger = BigramTagger(train=completeTaggedSentencesTrain)
trigramTagger = TrigramTagger(train=completeTaggedSentencesTrain)


In [21]:
nerChunkerUnigram =  NamedEntityChunker(completeTaggedSentencesTrain, unigramTagger)
print(nerChunkerUnigram.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("I'm going to Germany this Monday."))))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("Justin is going to France."))))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("Facebook's stock fell by 5% in Frankfurt's stock marked."))))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("The Eiffel Tower is over 300 metres tall"))))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("Liverpool played well at Old Trafford"))))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("Stock in Tesla is soaring"))))

ChunkParse score:
    IOB Accuracy:  94.1%%
    Precision:     29.8%%
    Recall:        10.3%%
    F-Measure:     15.3%%
(S I/PRP 'm/VBP going/VBG to/TO Germany/NNP this/DT Monday/NNP ./.)
(S (person Justin/NNP) is/VBZ going/VBG to/TO France/NNP ./.)
(S
  (corporation Facebook/NNP)
  's/POS
  stock/NN
  fell/VBD
  by/IN
  5/CD
  %/NN
  in/IN
  Frankfurt/NNP
  's/POS
  stock/NN
  marked/VBD
  ./.)
(S
  The/DT
  Eiffel/NNP
  Tower/NNP
  is/VBZ
  over/IN
  300/CD
  metres/NNS
  (location tall/JJ))
(S Liverpool/NNP played/VBD well/RB at/IN Old/NNP Trafford/NNP)
(S Stock/NN in/IN Tesla/NNP is/VBZ soaring/VBG)


In [22]:
nerChunkerBigram =  NamedEntityChunker(completeTaggedSentencesTrain, bigramTagger)
print(nerChunkerBigram.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("I'm going to Germany this Monday."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Justin is going to France."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Facebook's stock fell by 5% in Frankfurt's stock marked."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("The Eiffel Tower is over 300 metres tall"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Liverpool played well at Old Trafford"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Stock in Tesla is soaring"))))

ChunkParse score:
    IOB Accuracy:  93.9%%
    Precision:     54.5%%
    Recall:         1.1%%
    F-Measure:      2.1%%
(S I/PRP 'm/VBP going/VBG to/TO Germany/NNP this/DT Monday/NNP ./.)
(S (person Justin/NNP) is/VBZ going/VBG to/TO France/NNP ./.)
(S
  (corporation Facebook/NNP)
  's/POS
  stock/NN
  fell/VBD
  by/IN
  5/CD
  %/NN
  in/IN
  Frankfurt/NNP
  's/POS
  stock/NN
  marked/VBD
  ./.)
(S
  The/DT
  Eiffel/NNP
  Tower/NNP
  is/VBZ
  over/IN
  300/CD
  metres/NNS
  tall/JJ)
(S Liverpool/NNP played/VBD well/RB at/IN Old/NNP Trafford/NNP)
(S Stock/NN in/IN Tesla/NNP is/VBZ soaring/VBG)


In [23]:
nerChunkerTrigram =  NamedEntityChunker(completeTaggedSentencesTrain, trigramTagger)
print(nerChunkerTrigram.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))
print(nerChunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("I'm going to Germany this Monday."))))
print(nerChunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("Justin is going to France."))))
print(nerChunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("Facebook's stock fell by 5% in Frankfurt's stock marked."))))
print(nerChunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("The Eiffel Tower is over 300 metres tall"))))
print(nerChunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("Liverpool played well at Old Trafford"))))
print(nerChunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("Stock in Tesla is soaring"))))

ChunkParse score:
    IOB Accuracy:  93.9%%
    Precision:     20.0%%
    Recall:         0.2%%
    F-Measure:      0.4%%
(S I/PRP 'm/VBP going/VBG to/TO Germany/NNP this/DT Monday/NNP ./.)
(S (person Justin/NNP) is/VBZ going/VBG to/TO France/NNP ./.)
(S
  (corporation Facebook/NNP)
  's/POS
  stock/NN
  fell/VBD
  by/IN
  5/CD
  %/NN
  in/IN
  Frankfurt/NNP
  's/POS
  stock/NN
  marked/VBD
  ./.)
(S
  The/DT
  Eiffel/NNP
  Tower/NNP
  is/VBZ
  over/IN
  300/CD
  metres/NNS
  tall/JJ)
(S Liverpool/NNP played/VBD well/RB at/IN Old/NNP Trafford/NNP)
(S Stock/NN in/IN Tesla/NNP is/VBZ soaring/VBG)


In [160]:
import nltk.data, nltk.chunk, nltk.corpus
nltk.download('maxent_ne_chunker')
nltk.download('words')


# load the default chunker for nltk.chunk.ne_chunk
ne_chunker = nltk.data.load(nltk.chunk._MULTICLASS_NE_CHUNKER)
print(ne_chunker.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))



[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
ChunkParse score:
    IOB Accuracy:  91.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
