In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag.stanford import StanfordNERTagger
from nltk.tag import pos_tag

In [2]:
jar = './stanford-ner/stanford-ner.jar'
model = './stanford-ner/ner-model-english.ser.gz'

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/namdar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/namdar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/namdar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/namdar/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/namdar/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [5]:
# for sent in nltk.sent_tokenize(sentence):
#     for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
#          print(chunk)

In [47]:
def extract_ne(quote):
    words = word_tokenize(quote)
    tags = nltk.pos_tag(words)
    tree = nltk.ne_chunk(tags, binary=True)
    rtn  = []
    
    for t in tree:
        if hasattr(t, "label") and t.label() == "NE":
             rtn.append(i for i in t)
    
    return rtn

In [9]:
captions = {"image1": "Coronaviruses 004 lores.jpg", "image2": "Colour-coded map of North America showing the distribution of North American language families north of Mexico", "image5": "Refer to caption", "image8": "Map showing Canada divided into different ecozones", "image12": "Tidal flooding. Sea-level rise increases flooding in low-lying coastal regions. Shown: Venice, Italy (2004).[210]", "image14": "refer to caption", "image23": "Canadian Delegation to the United Nations seated around conference table", "image24": "2020 Democratic presidential primary and caucus calendar rescheduled.svg", "image25": "Star Wars The Rise of Skywalker poster.jpg", "image26": "Aerial view of the Cliff Palace", "image32": "Muslim Youth League CAA, NRC Protest in Kerala 2019.jpg", "image33": "49ers uniforms 18.png", "image39": "The Grand Concourse in the Bronx, foreground, with Manhattan in the background in February 2018", "image41": "The YouTube logo is made of a red round-rectangular box with a white \"play\" button inside and the word \"YouTube\" written in black.", "image45": "Musk shakes hands with NASA Administrator Charles Bolden before a SpaceX Dragon capsule", "image46": "Canadian Senate chamber long hall with two opposing banks of seats with historical paintings", "image53": "The graph from 1880 to 2020 shows natural drivers exhibiting fluctuations of about 0.3 degrees Celsius. Human drivers steadily increase by 0.3 degrees over 100 years to 1980, then steeply by 0.8 degrees more over the past 40 years.", "image56": "A building with a central clock tower rising from a block", "image58": "Environmental migration. Sparser rainfall leads to desertification that harms agriculture and can displace populations. Shown: Telly, Mali (2008).[208]", "image60": "Coca-Cola bottle - see \"Contour bottle design\" section", "image61": "Each letter of \"Google\" is colored (from left to right) in blue, red, yellow, blue, green, and red.", "image62": "A corrugated silver metal subway train sits with its doors open in a station. Its rollsign reads \"0 Local / To Old Gotham all times / Downtown & Tricorner\".", "image63": "First Lady Michelle Obama, Mel Brooks, Dave Brubeck, Grace Bumbry, De Niro, Bruce Springsteen, and President Barack Obama", "image64": "Photograph of a large area of forest. The green trees are interspersed with large patches of damaged or dead trees turning purple-brown and light red.", "image65": "see caption", "image70": "Uniform Set of the Kansas City Chiefs.svg", "image72": "The Joker dances on a set of stairs. Below him are the words \"Joaquin Phoenix\", \"A Todd Phillips film\", \"Joker\", the billing block, and \"October 4\".", "image76": "Illustration of a SARS-CoV-2 virion", "image77": "The global map shows sea temperature rises of 0.5 to 1 degree Celsius; land temperature rises of 1 to 2 degree Celsius; and Arctic temperature rises of up to 4 degrees Celsius.", "image78": "Olympic rings.svg", "image84": "A map showing five boroughs in different colors.", "image90": "A large facade of a building", "image91": "See caption", "image94": "refer to caption", "image97": "Amazon.es fulfillment center in San Fernando de Henares, Spain", "image100": "An incomplete sphere made of large, white jigsaw puzzle pieces. Each puzzle piece contains one"}

In [38]:
def stanford_ner(sentence):
    
    # Prepare NER tagger with english model
    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

    # Tokenize: Split sentence into words
    words = nltk.word_tokenize(sentence)

    # Run NER tagger on words
    print(ner_tagger.tag(words))

In [39]:
sentence = captions['image77']

In [48]:
stanford_ner(sentence)

[('The', 'O'), ('global', 'O'), ('map', 'O'), ('shows', 'O'), ('sea', 'O'), ('temperature', 'O'), ('rises', 'O'), ('of', 'O'), ('0.5', 'O'), ('to', 'O'), ('1', 'O'), ('degree', 'O'), ('Celsius', 'O'), (';', 'O'), ('land', 'O'), ('temperature', 'O'), ('rises', 'O'), ('of', 'O'), ('1', 'O'), ('to', 'O'), ('2', 'O'), ('degree', 'O'), ('Celsius', 'O'), (';', 'O'), ('and', 'O'), ('Arctic', 'LOCATION'), ('temperature', 'O'), ('rises', 'O'), ('of', 'O'), ('up', 'O'), ('to', 'O'), ('4', 'O'), ('degrees', 'O'), ('Celsius', 'O'), ('.', 'O')]


In [49]:
extract_ne(sentence)

[<generator object extract_ne.<locals>.<genexpr> at 0x12bbbd510>,
 <generator object extract_ne.<locals>.<genexpr> at 0x12bbbd580>,
 <generator object extract_ne.<locals>.<genexpr> at 0x12bbbd5f0>,
 <generator object extract_ne.<locals>.<genexpr> at 0x12bbbd660>]

In [36]:
for t in tree:
    if hasattr(t, "label") and t.label() == "NE":
        print(t[])

(NE Celsius/NNP)
(NE Celsius/NNP)
(NE Arctic/NNP)
(NE Celsius/NNP)


In [None]:
" ".join(i for i in t)