In [16]:
import nltk
# Fetch every NLTK resource this notebook relies on: the tokenizer models used
# by nltk.word_tokenize, the tagger behind nltk.pos_tag, and the WordNet corpus
# used by the lemmatizer. Without 'punkt', word_tokenize raises LookupError on
# a machine where it was never downloaded.
# NOTE(review): recent NLTK releases reportedly look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Single lemmatizer instance shared by the cells below.
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fracton/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/fracton/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
from nltk.corpus import wordnet as wn

def is_noun(tag):
    """Return True when ``tag`` is one of the noun tags NN, NNS, NNP or NNPS."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags


def is_verb(tag):
    """Return True when ``tag`` is one of the verb tags VB, VBD, VBG, VBN, VBP or VBZ."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags


def is_adverb(tag):
    """Return True when ``tag`` is one of the adverb tags RB, RBR or RBS."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags


def is_adjective(tag):
    """Return True when ``tag`` is one of the adjective tags JJ, JJR or JJS."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags


def penn_to_wn(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is checked against the adjective, noun, adverb and verb tag
    groups, in that order, and the corresponding ``wn.ADJ`` / ``wn.NOUN`` /
    ``wn.ADV`` / ``wn.VERB`` constant is returned. A tag that belongs to none
    of the groups yields ``None``.
    """
    if is_adjective(tag):
        return wn.ADJ
    if is_noun(tag):
        return wn.NOUN
    if is_adverb(tag):
        return wn.ADV
    return wn.VERB if is_verb(tag) else None

In [26]:
# Sample sentence for the tagging demo. It is the sentence whose tokens and tags
# appear in the saved output of this cell and of the lemmatization loop that
# follows; the previous literal had lost a word (leaving a double space) and no
# longer matched those recorded outputs.
text = nltk.word_tokenize("Computer is a better subject consisting of streams like Information Technology and Data Science")
print(text)
print(nltk.pos_tag(text))

['Computer', 'is', 'a', 'better', 'subject', 'consisting', 'of', 'streams', 'like', 'Information', 'Technology', 'and', 'Data', 'Science']
[('Computer', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('better', 'JJR'), ('subject', 'JJ'), ('consisting', 'NN'), ('of', 'IN'), ('streams', 'NNS'), ('like', 'IN'), ('Information', 'NNP'), ('Technology', 'NNP'), ('and', 'CC'), ('Data', 'NNP'), ('Science', 'NNP')]


In [35]:
# Lemmatize each tagged token. Tokens whose tag has no WordNet equivalent
# (penn_to_wn returns None) are passed through unchanged.
for word, pos in nltk.pos_tag(text):
    tag = penn_to_wn(pos)
    lemmatizedWord = word if tag is None else wordnet_lemmatizer.lemmatize(word, tag)
    print(word, pos, tag, lemmatizedWord)

Computer NNP n Computer
is VBZ v be
a DT None a
better JJR a good
subject JJ a subject
consisting NN n consisting
of IN None of
streams NNS n stream
like IN None like
Information NNP n Information
Technology NNP n Technology
and CC None and
Data NNP n Data
Science NNP n Science


In [58]:
def sentence_split(sentence):
    """Tokenize, POS-tag and lemmatize ``sentence``, merging proper-noun pairs.

    Each token is lemmatized with the WordNet POS that ``penn_to_wn`` derives
    from its tag; tokens with no WordNet equivalent are kept as they are.
    Two directly adjacent ``NNP`` tokens are joined with a single space into
    one entry (e.g. "Computer Science"). Merging is pairwise: in a run of
    three ``NNP`` tokens the first two are joined and the third starts a new
    candidate pair.

    Parameters
    ----------
    sentence : str
        Raw text to process.

    Returns
    -------
    list of str
        Lemmatized tokens in sentence order, with merged proper-noun pairs.
    """
    new_text = []
    # Lemma of an NNP token that is still waiting for a possible NNP partner.
    # Initialised up front so a sentence that does not start with a proper
    # noun no longer hits an unassigned local.
    pending = None

    for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
        tag = penn_to_wn(pos)
        lemma = word if tag is None else wordnet_lemmatizer.lemmatize(word, tag)

        if pos == 'NNP':
            if pending is None:
                # First half of a potential pair: hold it back for one token.
                pending = lemma
            else:
                new_text.append(pending + " " + lemma)
                pending = None
            continue

        # Any other token ends a pending proper noun, which is emitted alone.
        if pending is not None:
            new_text.append(pending)
            pending = None
        new_text.append(lemma)

    # A lone proper noun at the very end of the sentence must not be dropped.
    if pending is not None:
        new_text.append(pending)
    return new_text

In [62]:
# Demo on a longer sentence; the literal is wrapped with implicit string
# concatenation, so the text passed in is unchanged.
sentence_split(
    "Computer Science is a better subject consisting of streams like "
    "Information Technology, Networks, Telecommunications and Data Science"
)

['Computer Science',
 'be',
 'a',
 'good',
 'subject',
 'consisting',
 'of',
 'stream',
 'like',
 'Information Technology',
 ',',
 'Networks',
 ',',
 'Telecommunications',
 'and',
 'Data Science']

In [65]:
# Second check: apart from the leading "Computer Science" pair, every token
# should come back as its own (lemmatized) entry.
sentence_split("Computer Science is a better but a daunting subject")

['Computer Science', 'be', 'a', 'good', 'but', 'a', 'daunting', 'subject']