Simple Text Processing Techniques
    * Remove punctuation/digits
    * Remove short/stop words
    * Stemming
    * Lemmatization
    * Word Tokenize
    

In [49]:
# import modules
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import WhitespaceTokenizer
import string
import re

# Text Cleansing

In [5]:
# Sample corpus: four lines of Latin filler text followed by one English
# sentence that contains digits. Continuation lines keep a 4-space indent.
sample_text = (
    'Lorem ipsum dolor sit amet, ad case natum duo, in habeo novum consequuntur quo, erant option vim ad.\n'
    '    Eu fugit voluptua antiopam ius, mel graeco patrioque scripserit ad, vis id justo graeco.\n'
    '    Dicam munere nemore cum no, mei id erat commodo postulant, eam at dicta iisque scripserit.\n'
    '    No ius zril solet veniam, ei sea labores eleifend inciderint, eu convenire evertitur incorrupte pro.\n'
    '    2003 was the greatest Year in the history since 1900\n'
)

In [13]:
# Strip every ASCII punctuation character from the sample text.
# re.escape keeps characters such as ']' and '\' literal inside the
# character class built from string.punctuation.
punctuation_class = '[%s]' % re.escape(string.punctuation)
regex = re.compile(punctuation_class)
clean_text = re.sub(regex, '', sample_text)
clean_text

'Lorem ipsum dolor sit amet ad case natum duo in habeo novum consequuntur quo erant option vim ad\n    Eu fugit voluptua antiopam ius mel graeco patrioque scripserit ad vis id justo graeco\n    Dicam munere nemore cum no mei id erat commodo postulant eam at dicta iisque scripserit\n    No ius zril solet veniam ei sea labores eleifend inciderint eu convenire evertitur incorrupte pro\n    2003 was the greatest Year in the history since 1900\n'

In [14]:
# Drop every ASCII digit (string.digits, i.e. 0-9) from the text.
# Only the digit characters are removed; surrounding whitespace is kept.
digit_class = '[%s]' % re.escape(string.digits)
regex = re.compile(digit_class)
clean_text = re.sub(regex, '', clean_text)
clean_text

'Lorem ipsum dolor sit amet ad case natum duo in habeo novum consequuntur quo erant option vim ad\n    Eu fugit voluptua antiopam ius mel graeco patrioque scripserit ad vis id justo graeco\n    Dicam munere nemore cum no mei id erat commodo postulant eam at dicta iisque scripserit\n    No ius zril solet veniam ei sea labores eleifend inciderint eu convenire evertitur incorrupte pro\n     was the greatest Year in the history since \n'

In [15]:
# Keep only whitespace-separated tokens of four or more characters and
# re-join them with single spaces (this also collapses newlines and indents).
long_tokens = filter(lambda token: len(token) >= 4, clean_text.split())
clean_text = ' '.join(long_tokens)
clean_text

'Lorem ipsum dolor amet case natum habeo novum consequuntur erant option fugit voluptua antiopam graeco patrioque scripserit justo graeco Dicam munere nemore erat commodo postulant dicta iisque scripserit zril solet veniam labores eleifend inciderint convenire evertitur incorrupte greatest Year history since'

In [21]:
# Remove stop words
sample_text1 = '''
    this has to be the most authentic pieces of art
'''
sw = stopwords.words("english")
# stopwords.words() returns a list of lowercase words. Test membership against
# a set (constant-time lookup per token) and lowercase each token before the
# check so capitalised stop words such as "This" or "The" are removed as well.
# The original casing of the tokens that are kept is preserved.
sw_lookup = set(sw)
cleaned_text1 = ' '.join([x for x in sample_text1.split() if x.lower() not in sw_lookup])
cleaned_text1


'authentic pieces art'

# Stemming

    * Stemming is the process of reducing inflected words to their root forms, even if the resulting stem is not itself a valid word.
    * Two stemmers in NLTK package
        * Porter Stemmer
            - Simplicity and speed
            - Does not follow linguistics
            - Does not check whether the final stem is a valid word
        * Lancaster Stemmer
            - Simple but heavy (aggressive) because it applies its rules over many iterations
            - On each iteration, tries to find an applicable rule based on the last character of the word
            - Over-stemming can produce stems that are not real words and carry no meaning
            - Rules are saved externally, one table containing 120 rules indexed by last letter of suffix

In [25]:
# Compare the Porter and Lancaster stemmers side by side on the same words.
porter = PorterStemmer()
lancaster = LancasterStemmer()

words = ["cats", "acheive", "acheiving", "acheived"]

# One shared layout for header and data rows: three columns, each padded
# to a minimum width of 20 characters.
row_layout = "{0:20}{1:20}{2:20}"
rows = [("Word", "** Porter Stemmer", "** Lancaster Stemmer")]
rows.extend((word, porter.stem(word), lancaster.stem(word)) for word in words)
for row in rows:
    print(row_layout.format(*row))


Word                ** Porter Stemmer   ** Lancaster Stemmer
cats                cat                 cat                 
acheive             acheiv              acheiv              
acheiving           acheiv              acheiv              
acheived            acheiv              acheiv              


# Lemmatization
* Lemmatization reduces an inflected word to its root form (lemma), which is a valid word in the language
* WordNetLemmatizer
    - You need to provide the part of speech as context (e.g. `pos='v'` for verbs) for a word to be lemmatized correctly

In [31]:
# Lemmatize each word with WordNet, treating every word as a verb (pos='v').
lemma = WordNetLemmatizer()
words = ["cats", "eating", "ate", "eats"]

# Two columns, each padded to a minimum width of 20 characters.
column_layout = "{0:20}{1:20}"
print(column_layout.format("Word", "** WordNet Lemma"))
for word in words:
    root = lemma.lemmatize(word, pos='v')
    print(column_layout.format(word, root))

Word                ** WordNet Lemma    
cats                cat                 
eating              eat                 
ate                 eat                 
eats                eat                 


# Tokenize
    - Always decode the string to text before tokenizing
    - word_tokenize - Divides a string into a list of words
    - wordpunct_tokenize - Divides a string based on punctuation, text and whitespace
    - sent_tokenize - Use when the text contains multiple sentences and you want to operate at the sentence level
    - There are multiple ways to tokenize text; WhitespaceTokenizer's span_tokenize represents each word in a sentence
      as a (start, end) offset tuple, which makes comparison easier

In [47]:
# Tokenize one short paragraph three ways: into word/punctuation tokens,
# into sentences, and into words within each sentence.
s = '''A Chelsea tee shirt costs $100. Could you buy me 3 of them? Thank you!'''
sentences = sent_tokenize(s)
print("Word Punct Tokenize:", wordpunct_tokenize(s))
print("Sent Tokenize:", sentences)
print("Sent-Word Tokenize:", list(map(word_tokenize, sentences)))


Word Punct Tokenize: ['A', 'Chelsea', 'tee', 'shirt', 'costs', '$', '100', '.', 'Could', 'you', 'buy', 'me', '3', 'of', 'them', '?', 'Thank', 'you', '!']
Sent Tokenize: ['A Chelsea tee shirt costs $100.', 'Could you buy me 3 of them?', 'Thank you!']
Sent-Word Tokenize: [['A', 'Chelsea', 'tee', 'shirt', 'costs', '$', '100', '.'], ['Could', 'you', 'buy', 'me', '3', 'of', 'them', '?'], ['Thank', 'you', '!']]


In [51]:
# (start, end) character offsets of each whitespace-delimited token in `s`.
whitespace_tokenizer = WhitespaceTokenizer()
token_spans = [span for span in whitespace_tokenizer.span_tokenize(s)]
token_spans

[(0, 1),
 (2, 9),
 (10, 13),
 (14, 19),
 (20, 25),
 (26, 31),
 (32, 37),
 (38, 41),
 (42, 45),
 (46, 48),
 (49, 50),
 (51, 53),
 (54, 59),
 (60, 65),
 (66, 70)]