In [1]:
import wikipedia
import nltk
from collections import Counter

In [2]:
# get the first NP chunk from the tree built in call_wikipedia
def get_first_np(tree):
    """Return the text of the first 'NP' subtree of ``tree``, minus its verbs.

    The subtrees are scanned in the order ``tree.subtrees()`` yields them.
    For the first one labelled 'NP', the words of its ``(word, tag)`` leaves
    are joined with single spaces, skipping leaves tagged VBZ, VB, VBD, VBG
    or VBN.

    Returns None when ``tree`` contains no 'NP' subtree.
    """
    verb_tags = ("VBZ", "VB", "VBD", "VBG", "VBN")
    first_np = next(
        (node for node in tree.subtrees() if node.label() == 'NP'),
        None,
    )
    if first_np is None:
        return None
    kept_words = [word for word, tag in first_np.leaves() if tag not in verb_tags]
    return ' '.join(kept_words)

# get all np
def get_all_np(tree):
    """Return the text of every 'NP' subtree of ``tree``.

    Each entry is the words of that subtree's ``(word, tag)`` leaves joined
    with single spaces, in the order ``tree.subtrees()`` yields the subtrees.
    Returns an empty list when there is no 'NP' subtree.
    """
    np_nodes = (node for node in tree.subtrees() if node.label() == 'NP')
    return [
        ' '.join(word for word, _tag in node.leaves())
        for node in np_nodes
    ]

In [3]:
#call wikipedia and process summary
def call_wikipedia(query_noun):
    """Describe ``query_noun`` using the summary of its top Wikipedia hit.

    The summary of the first search result is tokenized, POS-tagged and
    NE-chunked, then chunked with a grammar that looks for a verb followed
    by one or more determiners and optional adjectives / named entities /
    nouns (e.g. "is an American entertainment company"). The first such
    chunk, with the verb removed by ``get_first_np``, is the description.

    Args:
        query_noun: Text to search Wikipedia for.

    Returns:
        The description string, or the generic fallback "Thing" when the
        lookup fails or the summary contains no matching chunk.
    """
    try:
        summary = wikipedia.page(wikipedia.search(query_noun)[0]).summary
    except Exception:
        # Best-effort lookup: an empty search result (IndexError), a missing
        # or ambiguous page, or a network failure all degrade to the generic
        # answer. `Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        return "Thing"
    tokens = nltk.word_tokenize(summary)
    tagged = nltk.pos_tag(tokens)
    ne_chunked = nltk.ne_chunk(tagged, binary=True)
    chunker = nltk.RegexpParser(r"NP: {<VBZ|VB|VBD|VBG|VBN><DT>+<JJ|NE>*<NN>*}")
    description = get_first_np(chunker.parse(ne_chunked))
    # get_first_np returns None when nothing matched the grammar; use the same
    # fallback as a failed lookup so callers always receive a string.
    return description if description else "Thing"

In [4]:
def get_sorted_counts(tokens):
    """Count the items of ``tokens`` and list them most frequent first.

    Returns a list of ``(item, count)`` tuples ordered by descending count.
    Items must be hashable.
    """
    # most_common() with no argument is the same stable descending sort on
    # the count as sorting the items by their second element in reverse.
    return Counter(tokens).most_common()

In [5]:
# Print NLTK's reference listing of the Penn Treebank POS tags, i.e. the tag
# names (VBZ, DT, JJ, NN, IN, ...) used by the grammars and filters in this notebook.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [6]:
# built-in POS tagging
def pos_tagging(text):
    """Tokenize ``text`` with NLTK and return the POS-tagged tokens.

    The result is whatever ``nltk.pos_tag`` returns for the word tokens of
    ``text`` (a sequence of ``(token, tag)`` pairs).
    """
    return nltk.pos_tag(nltk.word_tokenize(text))

# from https://gist.github.com/onyxfish/322906
def extract_entity_names(t):
    """Recursively collect the text of every 'NE' subtree under ``t``.

    Args:
        t: A tree node exposing ``label()`` and iteration over its children,
            or a leaf (any object without a ``label`` attribute, such as a
            ``(word, tag)`` tuple).

    Returns:
        A list with one string per 'NE' node found, in traversal order. Each
        string is the first element of every direct child of that node,
        joined with single spaces. Leaves yield an empty list.
    """
    # Leaves have no label() and therefore cannot contain entities.
    # (The former extra `and t.label` test checked a bound method, which is
    # always truthy, so it never filtered anything.)
    if not hasattr(t, 'label'):
        return []

    if t.label() == 'NE':
        return [' '.join(child[0] for child in t)]

    entity_names = []
    for child in t:
        entity_names.extend(extract_entity_names(child))
    return entity_names

# built-in NER tagger
def ner_build_in(text):
    """Return the named entities NLTK's binary NE chunker finds in ``text``.

    The text is tokenized, POS-tagged and passed to ``nltk.ne_chunk`` with
    ``binary=True``; the names are then gathered from each top-level child
    of the chunked result via ``extract_entity_names``. Repeated entities
    appear once per occurrence.
    """
    chunked = nltk.ne_chunk(
        nltk.pos_tag(nltk.word_tokenize(text)),
        binary=True,
    )
    return [
        name
        for node in chunked
        for name in extract_entity_names(node)
    ]

# custom ner tagger
def ner_custom(text):
    """Extract capitalized noun runs from ``text`` as entity candidates.

    A run is a maximal sequence of tokens whose POS tag starts with "NN",
    optionally linked by tokens tagged "IN" (so "Game of Thrones" stays in
    one piece). A run is kept only if its joined text starts with an
    uppercase character.

    Args:
        text: Raw text to tokenize and tag with NLTK.

    Returns:
        A list of entity strings in order of appearance; repeated entities
        appear once per occurrence.
    """
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    entity = []
    return_entity = []
    for tagged_entry in tagged:
        tag = tagged_entry[1]
        if tag.startswith("NN") or (entity and tag.startswith("IN")):
            entity.append(tagged_entry)
            continue
        # Any other tag ends the current run (if there is one).
        name = _finish_entity(entity)
        if name is not None:
            return_entity.append(name)
        entity = []

    # Flush the run still open when the text ends on a noun; without this an
    # entity that is the last token of the text would be silently dropped.
    name = _finish_entity(entity)
    if name is not None:
        return_entity.append(name)

    return return_entity


def _finish_entity(entity):
    """Return the text of a finished token run, or None if it is not an entity."""
    # A preposition only belongs to an entity when it links two nouns, so
    # strip every one left dangling at the end of the run (not just the last).
    while entity and entity[-1][1].startswith("IN"):
        entity.pop()
    if not entity:
        return None
    name = " ".join(e[0] for e in entity)
    return name if name[:1].isupper() else None

# custom2 ner tagger - different grammar
def ner_custom2(text):
    """Return noun-phrase candidates from ``text`` using a chunk grammar.

    The text is tokenized, POS-tagged and NE-chunked (``binary=True``). A
    regexp chunker then marks as 'NP' any run of optional adjectives
    (JJ/JJR/JJS) followed by one or more NN tokens or NE chunks, and the
    text of every such chunk is returned via ``get_all_np``.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    ne_chunked = nltk.ne_chunk(tagged, binary=True)
    np_chunker = nltk.RegexpParser(r"NP: {<JJ|JJR|JJS>*<NN|NE>+}")
    return get_all_np(np_chunker.parse(ne_chunked))

In [7]:
# Index of the line of data.txt that the cells below analyse.
chosen_line = 0

# Read the whole input file into memory, one list entry per line
# (trailing newlines are kept).
lines = []
with open("data.txt") as file:
    lines = list(file)

In [8]:
# Show the raw text of the selected line of data.txt.
print(lines[chosen_line])

I don't watch a lot of TV. I don't watch Game of Thrones or Walking Dead or many of the TV shows that are usually at the top of the popular zeitgeist. Instead, my daily entertainment is usually filled with YouTube videos and on-demand shows from Hulu or Netflix. In almost all respects, I am the perfect candidate to cut the cord. There was only one thing holding me back: baseball. I am a huge San Francisco Giants fan and I have a deep desire to watch games live. With the arrival of YouTube TV (and a few other services) I am now, at long last, finally free.



In [10]:
# Entities found by the custom noun-run tagger, with their occurrence counts,
# most frequent first.
print(get_sorted_counts(ner_custom(lines[chosen_line])))

[('Game of Thrones', 1), ('Netflix', 1), ('YouTube videos', 1), ('San Francisco Giants fan', 1), ('TV shows', 1)]


In [11]:
# Noun-phrase candidates from the grammar-based tagger (ner_custom2), with
# their occurrence counts, most frequent first.
print(get_sorted_counts(ner_custom2(lines[chosen_line])))

[('TV', 2), ('YouTube', 2), ('Netflix', 1), ('Hulu', 1), ('huge San Francisco Giants fan', 1), ('arrival', 1), ('perfect candidate', 1), ('lot', 1), ('on-demand', 1), ('cord', 1), ('popular zeitgeist', 1), ('baseball', 1), ('thing', 1), ('daily entertainment', 1), ('Thrones', 1), ('deep desire', 1), ('top', 1)]


In [12]:
# Frequency of each (token, POS tag) pair in the selected line, most frequent first.
print(get_sorted_counts(pos_tagging(lines[chosen_line])))

[(('.', '.'), 7), (('the', 'DT'), 6), (('I', 'PRP'), 6), (('of', 'IN'), 5), (('a', 'DT'), 4), ((',', ','), 4), (('am', 'VBP'), 3), (('watch', 'VB'), 3), (('or', 'CC'), 3), (('and', 'CC'), 3), (('usually', 'RB'), 2), (("n't", 'RB'), 2), (('to', 'TO'), 2), (('do', 'VBP'), 2), (('shows', 'NNS'), 2), (('YouTube', 'NNP'), 2), (('TV', 'NN'), 2), (('at', 'IN'), 2), (('few', 'JJ'), 1), (('videos', 'NNS'), 1), (('back', 'RB'), 1), (('Dead', 'JJ'), 1), (('thing', 'NN'), 1), (('With', 'IN'), 1), (('holding', 'VBG'), 1), (('Netflix', 'NNP'), 1), (('now', 'RB'), 1), (('Walking', 'VBG'), 1), (('is', 'VBZ'), 1), (('There', 'EX'), 1), (('San', 'NNP'), 1), (('Thrones', 'NNP'), 1), (('popular', 'JJ'), 1), (('games', 'NNS'), 1), (('almost', 'RB'), 1), (('zeitgeist', 'NN'), 1), (('have', 'VBP'), 1), (('(', '('), 1), (('with', 'IN'), 1), (('that', 'WDT'), 1), (('cord', 'NN'), 1), (('Instead', 'RB'), 1), (('my', 'PRP$'), 1), (('Giants', 'NNP'), 1), (('filled', 'VBN'), 1), (('cut', 'VB'), 1), (('Francisco', 

In [13]:
# Named entities from NLTK's own binary NE chunker (ner_build_in), with their
# occurrence counts, most frequent first.
print(get_sorted_counts(ner_build_in(lines[chosen_line])))

[('YouTube', 2), ('Hulu', 1), ('Thrones', 1), ('Netflix', 1), ('San Francisco Giants', 1)]


In [14]:
# NOTE(review): this repeats the ner_custom cell above (In [10]) verbatim and
# produces the same output; it could be removed.
print(get_sorted_counts(ner_custom(lines[chosen_line])))

[('Game of Thrones', 1), ('Netflix', 1), ('YouTube videos', 1), ('San Francisco Giants fan', 1), ('TV shows', 1)]


In [15]:
# For every entity found by ner_custom, print it next to the short description
# that call_wikipedia derives from Wikipedia (one lookup per entity).
for entity in ner_custom(lines[chosen_line]):
    print (entity, " - ", call_wikipedia(entity))

Game of Thrones  -  an American fantasy drama television series
TV shows  -  a list
YouTube videos  -  an American video-sharing website
Netflix  -  an American entertainment company
San Francisco Giants fan  -  the
