In [1]:
###########################################################
#
#       Subject= Information Reterival
#
#       Author= Muhammad Afzal
#
#       Registration# MSITS07203002
#
#       Example=Lemmitzation
#
###########################################################

#Wordnet Lemmatizer with NLTK
import nltk
import nltk
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize Single Word
print(lemmatizer.lemmatize("bats"))

print(lemmatizer.lemmatize("are"))

print(lemmatizer.lemmatize("feet"))

# Define the sentence to be lemmatized
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)

# Lemmatize list of words and join
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
print(lemmatized_output)


bat
are
foot
['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
The striped bat are hanging on their foot for best


In [2]:
#Wordnet Lemmatizer with appropriate POS tag
print(nltk.pos_tag(['feet']))

print(nltk.pos_tag(nltk.word_tokenize(sentence)))

# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 3. Lemmatize a Sentence with the appropriate POS tag
sentence = "The striped bats are hanging on their feet for best"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

[('feet', 'NNS')]
[('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best', 'JJS')]
foot
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']


In [3]:
#spaCy Lemmatization
import spacy

# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load("en_core_web_sm")

sentence = "The striped bats are hanging on their feet for best"

# Parse the sentence using the loaded 'en' model object `nlp`
doc = nlp(sentence)

# Extract the lemma for each token and join
" ".join([token.lemma_ for token in doc])

'the stripe bat be hang on their foot for good'

In [4]:
#TextBlob Lemmatizer
from textblob import TextBlob, Word

# Lemmatize a word
word = 'stripes'
w = Word(word)
w.lemmatize()
# Lemmatize a sentence
sentence = "The striped bats are hanging on their feet for best"
sent = TextBlob(sentence)
" ". join([w.lemmatize() for w in sent.words])

'The striped bat are hanging on their foot for best'

In [5]:
#TextBlob Lemmatizer with appropriate POS tag
# Define function to lemmatize each word with its POS tag
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', 
                "N": 'n', 
                "V": 'v', 
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

# Lemmatize
sentence = "The striped bats are hanging on their feet for best"
lemmatize_with_postag(sentence)

'The striped bat be hang on their foot for best'

In [3]:
#Pattern Lemmatizer
import pattern
from pattern.en import lemma, lexeme

sentence = "The striped bats were hanging on their feet and ate best fishes"
" ".join([lemma(wd) for wd in sentence.split()])
# Lexeme's for each word 
[lexeme(wd) for wd in sentence.split()]

from pattern.en import parse
print(parse('The striped bats were hanging on their feet and ate best fishes', 
            lemmata=True, tags=False, chunks=False))

The/DT/the striped/JJ/striped bats/NNS/bat were/VBD/be hanging/VBG/hang on/IN/on their/PRP$/their feet/NNS/foot and/CC/and ate/VBD/eat best/JJ/best fishes/NNS/fish


In [8]:
#Stanford CoreNLP Lemmatization
from stanfordcorenlp import StanfordCoreNLP
import json

# Connect to the CoreNLP server we just started
nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)

# Define proporties needed to get lemma
props = {'annotators': 'pos,lemma',
         'pipelineLanguage': 'en',
         'outputFormat': 'json'}


sentence = "The striped bats were hanging on their feet and ate best fishes"
parsed_str = nlp.annotate(sentence, properties=props)
parsed_dict = json.loads(parsed_str)
parsed_dict

{'sentences': [{'index': 0,
   'tokens': [{'index': 1,
     'word': 'The',
     'originalText': 'The',
     'lemma': 'the',
     'characterOffsetBegin': 0,
     'characterOffsetEnd': 3,
     'pos': 'DT',
     'before': '',
     'after': ' '},
    {'index': 2,
     'word': 'striped',
     'originalText': 'striped',
     'lemma': 'striped',
     'characterOffsetBegin': 4,
     'characterOffsetEnd': 11,
     'pos': 'JJ',
     'before': ' ',
     'after': ' '},
    {'index': 3,
     'word': 'bats',
     'originalText': 'bats',
     'lemma': 'bat',
     'characterOffsetBegin': 12,
     'characterOffsetEnd': 16,
     'pos': 'NNS',
     'before': ' ',
     'after': ' '},
    {'index': 4,
     'word': 'were',
     'originalText': 'were',
     'lemma': 'be',
     'characterOffsetBegin': 17,
     'characterOffsetEnd': 21,
     'pos': 'VBD',
     'before': ' ',
     'after': ' '},
    {'index': 5,
     'word': 'hanging',
     'originalText': 'hanging',
     'lemma': 'hang',
     'characterOffsetB

In [7]:
lemma_list = [v for d in parsed_dict['sentences'][0]['tokens'] for k,v in d.items() if k == 'lemma']
" ".join(lemma_list)

'the striped bat be hang on they foot and eat best fish'

In [9]:
from stanfordcorenlp import StanfordCoreNLP
import json, string

def lemmatize_corenlp(conn_nlp, sentence):
    props = {
        'annotators': 'pos,lemma',
        'pipelineLanguage': 'en',
        'outputFormat': 'json'
    }

    # tokenize into words
    sents = conn_nlp.word_tokenize(sentence)

    # remove punctuations from tokenised list
    sents_no_punct = [s for s in sents if s not in string.punctuation]

    # form sentence
    sentence2 = " ".join(sents_no_punct)

    # annotate to get lemma
    parsed_str = conn_nlp.annotate(sentence2, properties=props)
    parsed_dict = json.loads(parsed_str)

    # extract the lemma for each word
    lemma_list = [v for d in parsed_dict['sentences'][0]['tokens'] for k,v in d.items() if k == 'lemma']

    # form sentence and return it
    return " ".join(lemma_list)


# make the connection and call `lemmatize_corenlp`
nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)
lemmatize_corenlp(conn_nlp=nlp, sentence=sentence)

'the striped bat be hang on they foot and eat best fish'

In [4]:
#Gensim Lemmatize
from gensim.utils import lemmatize
sentence = "The striped bats were hanging on their feet and ate best fishes"
lemmatized_out = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(sentence)]
print(lemmatized_out)

['striped', 'bat', 'be', 'hang', 'foot', 'eat', 'best', 'fish']


In [8]:
#TreeTagger
import treetaggerwrapper as ttpw
tagger = ttpw.TreeTagger(TAGLANG='en')
tags = tagger.tag_text("The striped bats were hanging on their feet and ate best fishes")
lemmas = [t.split('\t')[-1] for t in tags]
print(lemmas)

['the', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'and', 'eat', 'good', 'fish']


In [11]:
#Comparing NLTK, TextBlob, spaCy, Pattern and Stanford CoreNLP
sentence = """Following mice attacks, caring farmers were marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors."""

# NLTK
from nltk.stem import WordNetLemmatizer
import string
from textblob import TextBlob
from pprint import pprint
from stanfordcorenlp import StanfordCoreNLP
import json, string

lemmatizer = WordNetLemmatizer()

# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


pprint(" ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence) if w not in string.punctuation]))
# ('Following mouse attack care farmer be march to Delhi for well living '
#  'condition Delhi police on Tuesday fire water cannon and teargas shell at '
#  'protest farmer a they try to break barricade with their car automobile and '
#  'tractor')

# Spacy
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
pprint(" ".join([token.lemma_ for token in doc]))
# ('follow mice attack , care farmer be march to delhi for good living condition '
#  '. delhi police on tuesday fire water cannon and teargas shell at protest '
#  'farmer as -PRON- try to break barricade with -PRON- car , automobile and '
#  'tractor .')


# TextBlob
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', 
                "N": 'n', 
                "V": 'v', 
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

pprint(lemmatize_with_postag(sentence))
# ('Following mouse attack care farmer be march to Delhi for good living '
#  'condition Delhi police on Tuesday fire water cannon and teargas shell at '
#  'protest farmer a they try to break barricade with their car automobile and '
#  'tractor')

# Pattern
from pattern.en import lemma
pprint(" ".join([lemma(wd) for wd in sentence.split()]))
# ('follow mice attacks, care farmer be march to delhi for better live '
#  'conditions. delhi police on tuesday fire water cannon and tearga shell at '
#  'protest farmer a they try to break barricade with their cars, automobile and '
#  'tractors.')

# Stanford
def lemmatize_corenlp(conn_nlp, sentence):
    props = {
        'annotators': 'pos,lemma',
        'pipelineLanguage': 'en',
        'outputFormat': 'json'
    }

    # tokenize into words
    sents = conn_nlp.word_tokenize(sentence)

    # remove punctuations from tokenised list
    sents_no_punct = [s for s in sents if s not in string.punctuation]

    # form sentence
    sentence2 = " ".join(sents_no_punct)

    # annotate to get lemma
    parsed_str = conn_nlp.annotate(sentence2, properties=props)
    parsed_dict = json.loads(parsed_str)

    # extract the lemma for each word
    lemma_list = [v for d in parsed_dict['sentences'][0]['tokens'] for k,v in d.items() if k == 'lemma']

    # form sentence and return it
    return " ".join(lemma_list)


# make the connection and call `lemmatize_corenlp`
nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)
pprint(lemmatize_corenlp(conn_nlp=nlp, sentence=sentence))
# ('follow mouse attack care farmer be march to Delhi for better living '
#  'condition Delhi police on Tuesday fire water cannon and tearga shell at '
#  'protest farmer as they try to break barricade with they car automobile and '
#  'tractor')

('Following mouse attack care farmer be march to Delhi for well living '
 'condition Delhi police on Tuesday fire water cannon and teargas shell at '
 'protest farmer a they try to break barricade with their car automobile and '
 'tractor')
('follow mouse attack , care farmer be march to Delhi for well living '
 'condition . \n'
 ' Delhi police on Tuesday fire water cannon and teargas shell at protest '
 'farmer as they try to \n'
 ' break barricade with their car , automobile and tractor .')
('Following mouse attack care farmer be march to Delhi for good living '
 'condition Delhi police on Tuesday fire water cannon and teargas shell at '
 'protest farmer a they try to break barricade with their car automobile and '
 'tractor')
('follow mice attacks, care farmer be march to delhi for better live '
 'conditions. delhi police on tuesday fire water cannon and tearga shell at '
 'protest farmer a they try to break barricade with their cars, automobile and '
 'tractors.')
('follow mouse at