In [0]:
import requests
from google.colab import files
from collections import namedtuple

In [0]:
def downloadFromGithub(url,dest,colab=False):
    """
    Downloads file from Github
    USE RAW link!!!
    
    Parameter
    String file - link of file
    String des - destination of file
    bool colab - Running on google colab
    """
 
    r = requests.get(url)
    with open(dest, 'wb') as f:  
        f.write(r.content)
        
    #if colab == True:
        #files.download(dest)
    
Tag = namedtuple("Tag", ["word", "tag"])    
def readTags(file):
    """
    Creates a list of tagged words from the corpus
    
    Parameter
    String file - dest of file from which sentences are to be read
    
    Return
    sentences - read tags
    """
    tags = []
    sep="\t"
    with open(file) as fp:
        for line in fp:
            line = line.strip()
            if line:
                line = line.split(sep)
                tags.append(Tag(*line))
            else:
                tags.append(Tag("",""))  #append emty tuple to mark sentence ending
    return tags

In [0]:
def tokenize(tags):
    ''' sentence and word tokenization
    '''
    words = []
    entities = []
    
    sentence = []
    entitiesOfSentence = []
    for tag in tags:
        if (tag[0] == "" and tag[1] == ""): 
            words.append(sentence)
            entities.append(entitiesOfSentence)
            sentence = []
            entitiesOfSentence = []
        else:
            sentence.append(tag[0])
            entitiesOfSentence.append(tag[1])
    return words, entities
  
  
def addEntitiyTaggs(posTagged, entities):
    if(len(posTagged) != len(entities)):
        raise ValueError
  
    newTags = []
    sentence = []
    i = 0
    for i in range(len(posTagged)):
        for j in range(len(posTagged[i])):
            sentence.append(((posTagged[i][j][0], posTagged[i][j][1]),entities[i][j]))
        newTags.append(sentence)
        sentence = []
    return newTags

In [95]:
url = "https://raw.githubusercontent.com/leondz/emerging_entities_17/master/wnut17train.conll"
dest = r"trainData"
downloadFromGithub(url,dest,True)
downloadFromGithub(url,"testData",True)

tagsTrain = readTags(r"trainData")
print(tagsTrain)

tagsTest = readTags(r"testData")
print(tagsTest)

wordTaggedSentencesTrain, entitiesTrain = tokenize(tagsTrain)
wordTaggedSentencesTest, entitiesTest = tokenize(tagsTest)
print(wordTaggedSentencesTrain)
print(entitiesTrain)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [0]:
import nltk

In [0]:
def posTag(sentences):
    posTaggedSentences = [nltk.pos_tag(sent) for sent in sentences]
    return posTaggedSentences

In [107]:
posTaggedSentencesTrain = posTag(wordTaggedSentencesTrain)
posTaggedSentencesTest = posTag(wordTaggedSentencesTest)
print(posTaggedSentencesTrain)

completeTaggedSentencesTrain = addEntitiyTaggs(posTaggedSentencesTrain, entitiesTrain)
completeTaggedSentencesTest = addEntitiyTaggs(posTaggedSentencesTest, entitiesTest)
print(completeTaggedSentencesTrain)



In [0]:
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI, conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.stem import SnowballStemmer
import string

In [0]:
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, tagger, **kwargs):
        self.feature_detector = features
        self.tagger = tagger
    def parse(self, sentence):
        chunks = self.tagger.tag(sentence)
        iobTriblets = [(word,pos,entity) for ((word,pos),entity) in chunks]
        return conlltags2tree(iobTriblets)
    

In [0]:
unigramTagger = UnigramTagger(train=completeTaggedSentencesTrain)
bigramTagger = BigramTagger(train=completeTaggedSentencesTrain)
trigramTagger = TrigramTagger(train=completeTaggedSentencesTrain)


In [172]:
nerChunkerUnigram =  NamedEntityChunker(completeTaggedSentencesTrain, unigramTagger)
print(nerChunkerUnigram.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("I'm going to Germany this Monday."))))
print(nerChunkerUnigram.parse(nltk.pos_tag(nltk.word_tokenize("Justin is going to France."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Facebook's stock fell by 5% in Frankfurt's stock marked."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("The Eiffel Tower is over 300 metres tall"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Liverpool played well at Old Trafford"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Stock in Tesla is soaring"))))

ChunkParse score:
    IOB Accuracy:  98.5%%
    Precision:     70.4%%
    Recall:        79.4%%
    F-Measure:     74.6%%
(S
  I/PRP
  'm/VBP
  going/VBG
  to/TO
  (location Germany/NNP)
  this/DT
  Monday/NNP
  ./.)
(S
  (person Justin/NNP)
  is/VBZ
  going/VBG
  to/TO
  (location France/NNP)
  ./.)
(S
  (corporation Facebook/NNP)
  's/POS
  stock/NN
  fell/VBD
  by/IN
  5/CD
  %/NN
  in/IN
  Frankfurt/NNP
  's/POS
  stock/NN
  marked/VBD
  ./.)
(S
  The/DT
  Eiffel/NNP
  Tower/NNP
  is/VBZ
  over/IN
  300/CD
  metres/NNS
  tall/JJ)
(S Liverpool/NNP played/VBD well/RB at/IN Old/NNP Trafford/NNP)
(S Stock/NN in/IN Tesla/NNP is/VBZ soaring/VBG)


In [171]:
nerChunkerBigram =  NamedEntityChunker(completeTaggedSentencesTrain, bigramTagger)
print(nerChunkerBigram.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("I'm going to Germany this Monday."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Justin is going to France."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Facebook's stock fell by 5% in Frankfurt's stock marked."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("The Eiffel Tower is over 300 metres tall"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Liverpool played well at Old Trafford"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Stock in Tesla is soaring"))))

ChunkParse score:
    IOB Accuracy:  98.9%%
    Precision:     91.6%%
    Recall:        84.5%%
    F-Measure:     87.9%%
(S
  I/PRP
  'm/VBP
  going/VBG
  to/TO
  (location Germany/NNP)
  this/DT
  Monday/NNP
  ./.)
(S
  (person Justin/NNP)
  is/VBZ
  going/VBG
  to/TO
  (location France/NNP)
  ./.)
(S
  (corporation Facebook/NNP)
  's/POS
  stock/NN
  fell/VBD
  by/IN
  5/CD
  %/NN
  in/IN
  Frankfurt/NNP
  's/POS
  stock/NN
  marked/VBD
  ./.)
(S
  The/DT
  Eiffel/NNP
  Tower/NNP
  is/VBZ
  over/IN
  300/CD
  metres/NNS
  tall/JJ)
(S Liverpool/NNP played/VBD well/RB at/IN Old/NNP Trafford/NNP)
(S Stock/NN in/IN Tesla/NNP is/VBZ soaring/VBG)


In [170]:
nerChunkerTrigram =  NamedEntityChunker(completeTaggedSentencesTrain, trigramTagger)
print(nerChunkerTrigram.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))
print(nerCunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("I'm going to Germany this Monday."))))
print(nerCunkerTrigram.parse(nltk.pos_tag(nltk.word_tokenize("Justin is going to France."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Facebook's stock fell by 5% in Frankfurt's stock marked."))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("The Eiffel Tower is over 300 metres tall"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Liverpool played well at Old Trafford"))))
print(nerChunkerBigram.parse(nltk.pos_tag(nltk.word_tokenize("Stock in Tesla is soaring"))))

ChunkParse score:
    IOB Accuracy:  99.0%%
    Precision:     93.4%%
    Recall:        86.5%%
    F-Measure:     89.8%%
(S
  I/PRP
  'm/VBP
  going/VBG
  to/TO
  (location Germany/NNP)
  this/DT
  Monday/NNP
  ./.)
(S (person Justin/NNP) is/VBZ going/VBG to/TO France/NNP ./.)
(S
  (corporation Facebook/NNP)
  's/POS
  stock/NN
  fell/VBD
  by/IN
  5/CD
  %/NN
  in/IN
  Frankfurt/NNP
  's/POS
  stock/NN
  marked/VBD
  ./.)
(S
  The/DT
  Eiffel/NNP
  Tower/NNP
  is/VBZ
  over/IN
  300/CD
  metres/NNS
  tall/JJ)
(S Liverpool/NNP played/VBD well/RB at/IN Old/NNP Trafford/NNP)
(S Stock/NN in/IN Tesla/NNP is/VBZ soaring/VBG)


In [160]:
import nltk.data, nltk.chunk, nltk.corpus
nltk.download('maxent_ne_chunker')
nltk.download('words')


# load the default chunker for nltk.chunk.ne_chunk
ne_chunker = nltk.data.load(nltk.chunk._MULTICLASS_NE_CHUNKER)
print(ne_chunker.evaluate([conlltags2tree([(word, pos, entity) for (word, pos), entity in iobs]) for iobs in completeTaggedSentencesTest]))



[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
ChunkParse score:
    IOB Accuracy:  91.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
