# Lemmatization

* [Stemming and Lemmatization in Python](https://www.datacamp.com/community/tutorials/stemming-lemmatization-python)

> Lemmatization returns an actual word of the language, it is used where it is necessary to get valid words.

# NLTK 

* [How to download the NLTK library?](https://www.dezyre.com/recipes/download-nltk-library)

In [None]:
!pip install nltk

In [9]:
import nltk
# nltk.download('all')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/oonisim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/oonisim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/oonisim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from nltk.corpus import wordnet

You need to provide the context in which you want to lemmatize that is the parts-of-speech (POS). This is done by giving the value for pos parameter in wordnet_lemmatizer.lemmatize.

In [8]:
wordnet_lemmatizer.lemmatize("hugging", pos="v")

'hug'

In [18]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

sentence = "He was playing a baseball among a lot of people and eating at same time. He has the bad habit of swimming after playing long hours in the Sun."
tokens = nltk.word_tokenize(sentence.lower())

print("{0:20}{1:20}".format("Word","Lemma"))
for word in tokens:
    print ("{0:20}{1:20}".format(word,lemmatizer.lemmatize(word, pos="v")))

Word                Lemma               
he                  he                  
was                 be                  
playing             play                
a                   a                   
baseball            baseball            
among               among               
a                   a                   
lot                 lot                 
of                  of                  
people              people              
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
.                   .                   
he                  he                  
has                 have                
the                 the                 
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after           

In [23]:
def lemmatize(sentence):
    lemmatizer = WordNetLemmatizer()

    return " ".join([
        lemmatizer.lemmatize(word, pos="v") 
        for word in nltk.word_tokenize(sentence.lower()) 
        if word not in stopwords.words('english')
    ])

In [24]:
lemmatize(sentence)

'play baseball among lot people eat time . bad habit swim play long hours sun .'

In [26]:
sentence = """
He was playing a baseball among a lot of people and eating at same time. He has the bad habit of swimming after playing long hours in the Sun.
Zürich has a famous website https://www.zuerich.com/ 
WHICH ACCEPTS 40,000 € and adding a random string, :
abc123def456ghi789zero0 for this demo. !!!&*^% tako.hoge@gmail.com' 
I Won't !*%$^&*#$#!!! ?? ? ~!@#$%^&*()_=+\[\]{}\\\|;:\-"\'<>.,/? pierod.
"""


In [25]:
import re
from cleantext import clean

def decontracted(sentences):
    """Restore the contracted words"""
    # specific
    sentences = re.sub(r"won\'t", "will not", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"can\'t", "can not", sentences, flags=re.IGNORECASE)
    # general
    sentences = re.sub(r"n\'t", " not", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"\'re", " are", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"\'s", " is", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"\'d", " would", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"\'ll", " will", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"\'t", " not", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"\'ve", " have", sentences, flags=re.IGNORECASE)
    sentences = re.sub(r"\'m", " am", sentences, flags=re.IGNORECASE)
    return sentences

def remove_noises(sentences):
    """Clean up noises in the text
    """
    sentences = re.sub(r'[~=+|<>.^]+', "", sentences)
    sentences = clean(sentences,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=True,            # fully strip line breaks as opposed to only normalizing them
        no_urls=True,                   # replace all URLs with a special token
        no_emails=True,                 # replace all email addresses with a special token
        no_phone_numbers=True,          # replace all phone numbers with a special token
        no_numbers=True,                # replace all numbers with a special token
        no_digits=True,                 # replace all digits with a special token
        no_currency_symbols=True,       # replace all currency symbols with a special token
        no_punct=True,                  # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="",
        replace_with_digit="",
        replace_with_currency_symbol="",
        lang="en"                       # set to 'de' for German special handling
    )
    return sentences

lemmatizer = WordNetLemmatizer()
def lemmatize(sentence):
    return " ".join([
        lemmatizer.lemmatize(word, pos="v") 
        for word in nltk.word_tokenize(sentence.lower()) 
        if word not in stopwords.words('english')
    ])

def clean_comment_text(sentences):
    return lemmatize(remove_noises(decontracted(sentences)))

In [27]:
clean_comment_text(sentence)

'play baseball among lot people eat time bad habit swim play long hours sun zurich famous website httpswwwzuerichcom accept add random string abcdefghizero demo takohogegmailcom pierod'