# 03 – Tokenization, Stemming, Lemmatization and POS Tagging


## Setup and Imports

In [None]:
import re
import string
from pprint import pprint

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miriamplametshofer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/miriamplametshofer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/miriamplametshofer/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/miriamplametshofer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/miriamplametshofer/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

## Data Preparation

In [20]:
corpus_original = "HONG KONG, Oct 12 (Reuters) - Alibaba Group (9988.HK), opens new tab founder Jack Ma, largely out of public view since a regulatory clampdown started on his business empire late last year, is currently in Hong Kong and has met business associates in recent days, two sources told Reuters."
corpus = "HONG KONG, Oct 12 (Reuters) - Alibaba Group (9988.HK), opens new tab founder Jack Ma, largely out of public view since a regulatory clampdown started on his business empire late last year, is currently in Hong Kong and has met business associates in recent days, two sources told Reuters."

### Text Preprocessing Steps

In [21]:
#removing digits in the corpus
corpus = re.sub(r'\d+','', corpus)
print(corpus)

HONG KONG, Oct  (Reuters) - Alibaba Group (.HK), opens new tab founder Jack Ma, largely out of public view since a regulatory clampdown started on his business empire late last year, is currently in Hong Kong and has met business associates in recent days, two sources told Reuters.


In [22]:
#removing punctuations
corpus = corpus.translate(str.maketrans('', '', string.punctuation))
print(corpus)

HONG KONG Oct  Reuters  Alibaba Group HK opens new tab founder Jack Ma largely out of public view since a regulatory clampdown started on his business empire late last year is currently in Hong Kong and has met business associates in recent days two sources told Reuters


In [23]:
#removing punctuations
corpus = corpus.translate(str.maketrans('', '', string.punctuation))
print(corpus)

HONG KONG Oct  Reuters  Alibaba Group HK opens new tab founder Jack Ma largely out of public view since a regulatory clampdown started on his business empire late last year is currently in Hong Kong and has met business associates in recent days two sources told Reuters


## Tokenization

Tokenization is the process of breaking text into individual words or tokens.

In [24]:
stop_words_nltk = set(stopwords.words('english'))

tokenized_corpus_nltk = word_tokenize(corpus)
print("\nNLTK\nTokenized corpus:",tokenized_corpus_nltk)
tokenized_corpus_without_stopwords = [i for i in tokenized_corpus_nltk if not i in stop_words_nltk]
print("Tokenized corpus without stopwords:",tokenized_corpus_without_stopwords)


NLTK
Tokenized corpus: ['HONG', 'KONG', 'Oct', 'Reuters', 'Alibaba', 'Group', 'HK', 'opens', 'new', 'tab', 'founder', 'Jack', 'Ma', 'largely', 'out', 'of', 'public', 'view', 'since', 'a', 'regulatory', 'clampdown', 'started', 'on', 'his', 'business', 'empire', 'late', 'last', 'year', 'is', 'currently', 'in', 'Hong', 'Kong', 'and', 'has', 'met', 'business', 'associates', 'in', 'recent', 'days', 'two', 'sources', 'told', 'Reuters']
Tokenized corpus without stopwords: ['HONG', 'KONG', 'Oct', 'Reuters', 'Alibaba', 'Group', 'HK', 'opens', 'new', 'tab', 'founder', 'Jack', 'Ma', 'largely', 'public', 'view', 'since', 'regulatory', 'clampdown', 'started', 'business', 'empire', 'late', 'last', 'year', 'currently', 'Hong', 'Kong', 'met', 'business', 'associates', 'recent', 'days', 'two', 'sources', 'told', 'Reuters']


## Stemming

In [25]:
stemmer= PorterStemmer()

print("Before Stemming:")
print(corpus)

print("After Stemming:")
for word in tokenized_corpus_nltk:
    print(stemmer.stem(word),end=" ")

Before Stemming:
HONG KONG Oct  Reuters  Alibaba Group HK opens new tab founder Jack Ma largely out of public view since a regulatory clampdown started on his business empire late last year is currently in Hong Kong and has met business associates in recent days two sources told Reuters
After Stemming:
hong kong oct reuter alibaba group hk open new tab founder jack ma larg out of public view sinc a regulatori clampdown start on hi busi empir late last year is current in hong kong and ha met busi associ in recent day two sourc told reuter 

## Lematization

In [26]:
lemmatizer = WordNetLemmatizer()

for word in tokenized_corpus_nltk:
    print(lemmatizer.lemmatize(word),end=" ")

HONG KONG Oct Reuters Alibaba Group HK open new tab founder Jack Ma largely out of public view since a regulatory clampdown started on his business empire late last year is currently in Hong Kong and ha met business associate in recent day two source told Reuters 

## POS

In [27]:
#POS tagging using spacy
print("POS Tagging using spacy:")
doc = spacy_model(corpus_original)
# Token and Tag
for token in doc:
    print(token,":", token.pos_)

#pos tagging using nltk
print("\nPOS Tagging using NLTK:")
pprint(nltk.pos_tag(word_tokenize(corpus_original)))

POS Tagging using spacy:
HONG : PROPN
KONG : PROPN
, : PUNCT
Oct : PROPN
12 : NUM
( : PUNCT
Reuters : PROPN
) : PUNCT
- : PUNCT
Alibaba : PROPN
Group : PROPN
( : PUNCT
9988.HK : NUM
) : PUNCT
, : PUNCT
opens : VERB
new : ADJ
tab : NOUN
founder : NOUN
Jack : PROPN
Ma : PROPN
, : PUNCT
largely : ADV
out : ADP
of : ADP
public : ADJ
view : NOUN
since : SCONJ
a : DET
regulatory : ADJ
clampdown : NOUN
started : VERB
on : ADP
his : PRON
business : NOUN
empire : NOUN
late : ADV
last : ADJ
year : NOUN
, : PUNCT
is : AUX
currently : ADV
in : ADP
Hong : PROPN
Kong : PROPN
and : CCONJ
has : AUX
met : VERB
business : NOUN
associates : NOUN
in : ADP
recent : ADJ
days : NOUN
, : PUNCT
two : NUM
sources : NOUN
told : VERB
Reuters : PROPN
. : PUNCT

POS Tagging using NLTK:
[('HONG', 'NNP'),
 ('KONG', 'NNP'),
 (',', ','),
 ('Oct', 'NNP'),
 ('12', 'CD'),
 ('(', '('),
 ('Reuters', 'NNPS'),
 (')', ')'),
 ('-', ':'),
 ('Alibaba', 'NNP'),
 ('Group', 'NNP'),
 ('(', '('),
 ('9988.HK', 'CD'),
 (')', ')'),
 (','