In [2]:
import nltk

In [4]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize


In [7]:
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter
from nltk.corpus import wordnet


In [8]:
text = "POS Tagging is a process to mark up the words in text format for a particular part of a speech, based on its definition and context."

In [9]:
tokens = word_tokenize(text)
tokens

['POS',
 'Tagging',
 'is',
 'a',
 'process',
 'to',
 'mark',
 'up',
 'the',
 'words',
 'in',
 'text',
 'format',
 'for',
 'a',
 'particular',
 'part',
 'of',
 'a',
 'speech',
 ',',
 'based',
 'on',
 'its',
 'definition',
 'and',
 'context',
 '.']

In [10]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
tokens

['POS',
 'Tagging',
 'is',
 'a',
 'process',
 'to',
 'mark',
 'up',
 'the',
 'words',
 'in',
 'text',
 'format',
 'for',
 'a',
 'particular',
 'part',
 'of',
 'a',
 'speech',
 'based',
 'on',
 'its',
 'definition',
 'and',
 'context']

In [12]:
print(stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [13]:
en_sw = stopwords.words('english')
en_sw[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [15]:
ar_sw = stopwords.words('arabic')
ar_sw[:10]

['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي']

In [16]:
clean_tokens =[]
for token in tokens:
    if token not in en_sw:
        clean_tokens.append(token)
clean_tokens

['POS',
 'Tagging',
 'process',
 'mark',
 'words',
 'text',
 'format',
 'particular',
 'part',
 'speech',
 'based',
 'definition',
 'context']

In [18]:
tknzr = TweetTokenizer()
tweet = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
tokens = tknzr.tokenize(tweet)
tokens

['This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

In [19]:
sentences = 'This is a text written. It uses U.S. english to illustrate sentence tokenization.'
sent_tokens = sent_tokenize(sentences)
sent_tokens

['This is a text written.',
 'It uses U.S. english to illustrate sentence tokenization.']

In [23]:
nltk.download('pos_tag')

[nltk_data] Error loading pos_tag: Package 'pos_tag' not found in
[nltk_data]     index


False

In [24]:
tags = pos_tag(tokens)
tags

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('cooool', 'JJ'),
 ('#dummysmiley', 'NN'),
 (':', ':'),
 (':-)', 'JJ'),
 (':-P', 'JJ'),
 ('<3', 'NN'),
 ('and', 'CC'),
 ('some', 'DT'),
 ('arrows', 'NNS'),
 ('<', 'VBP'),
 ('>', 'JJ'),
 ('->', 'CD'),
 ('<--', 'JJ')]

In [25]:
tags = pos_tag(tokens, tagset='universal')
tags

[('This', 'DET'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('cooool', 'ADJ'),
 ('#dummysmiley', 'NOUN'),
 (':', '.'),
 (':-)', 'ADJ'),
 (':-P', 'ADJ'),
 ('<3', 'NOUN'),
 ('and', 'CONJ'),
 ('some', 'DET'),
 ('arrows', 'NOUN'),
 ('<', 'VERB'),
 ('>', 'ADJ'),
 ('->', 'NUM'),
 ('<--', 'ADJ')]

In [26]:
tags_list =[]
for tag in tags:
    tags_list.append(tag[1])
tags_list

['DET',
 'VERB',
 'DET',
 'ADJ',
 'NOUN',
 '.',
 'ADJ',
 'ADJ',
 'NOUN',
 'CONJ',
 'DET',
 'NOUN',
 'VERB',
 'ADJ',
 'NUM',
 'ADJ']

In [27]:
counts = Counter(tag for tag in tags_list)
counts

Counter({'ADJ': 5,
         'DET': 3,
         'NOUN': 3,
         'VERB': 2,
         '.': 1,
         'CONJ': 1,
         'NUM': 1})

In [29]:
counts = Counter(tag for token,tag in tags)
counts

Counter({'ADJ': 5,
         'DET': 3,
         'NOUN': 3,
         'VERB': 2,
         '.': 1,
         'CONJ': 1,
         'NUM': 1})

In [30]:
porter = PorterStemmer()
lancaster  = LancasterStemmer()

porter_stem = porter.stem('probably')
porter_stem

'probabl'

In [31]:
lancaster_stem = lancaster.stem('probably')
lancaster_stem

'prob'

In [33]:
print(porter.stem('changes'))
print(porter.stem('troubling'))
print(porter.stem('troubled'))
print(porter.stem('cats'))
print(porter.stem('charcterization'))


chang
troubl
troubl
cat
charcter


In [34]:
print(lancaster.stem('changes'))
print(lancaster.stem('troubling'))
print(lancaster.stem('troubled'))
print(lancaster.stem('cats'))
print(lancaster.stem('charcterization'))

chang
troubl
troubl
cat
charct


In [35]:
word_list = ["friend", "friendship", "friends", "friendships","stabil", "destabilize", "misunderstanding", "railroad", "moonlight", "football"]

for word in word_list:
    print(f'{word:20} {porter.stem(word):20} {lancaster.stem(word)}')


friend               friend               friend
friendship           friendship           friend
friends              friend               friend
friendships          friendship           friend
stabil               stabil               stabl
destabilize          destabil             dest
misunderstanding     misunderstand        misunderstand
railroad             railroad             railroad
moonlight            moonlight            moonlight
football             footbal              footbal


In [37]:
def stem_sentence(sentence):
    tokens = word_tokenize(sentence)
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    
    return " ".join(stems)

sentence = "Pythoners are very intelligent, and work very pythonly and now they are pythoning their way to success."

stems = stem_sentence(sentence)
stems

'python are veri intellig , and work veri pythonli and now they are python their way to success .'