In [1]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
lemmatizer = nltk.WordNetLemmatizer()

In [3]:
## Defines function to extract information

def leaves(tree):
    """Finds VP (verbphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label() =='VP'):
        yield subtree.leaves()


def get_word_postag(word):
    if pos_tag([word])[0][1].startswith('J'):
        return wordnet.ADJ
    if pos_tag([word])[0][1].startswith('V'):
        return wordnet.VERB
    if pos_tag([word])[0][1].startswith('N'):
        return wordnet.NOUN
    else:
        return wordnet.NOUN

def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    postag = get_word_postag(word)
    word = lemmatizer.lemmatize(word,postag)
    return word

def get_terms(tree):    
    for leaf in leaves(tree):
        terms = [normalise(w) for w,t in leaf]
        yield terms


def ie_extract(text):
    
    ## Tokenize sentences
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    grammar = "VP: {<V.*><.*>*<NN>}  # Chunk a verb and everything up to noun"
    cp = nltk.RegexpParser(grammar)
    
    verb_list = []
    for i in sentences:
        terms = get_terms(cp.parse(i))
        features = []
        for term in terms:
            _term = ''
            for word in term:
                _term += ' ' + word
            features.append(_term.strip())
        verb_list.append(features)
    return verb_list


In [4]:
text = "It was a nice day. Today I went to the park and played. I rode a bike."
ie_extract(text)

[['be a nice day'], ['go to the park'], ['rode a bike']]