In [1]:
import nltk

In [2]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize


In [15]:
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter
from nltk.corpus import wordnet


In [8]:
text = "POS Tagging is a process to mark up the words in text format for a particular part of a speech, based on its definition and context."

In [9]:
tokens = word_tokenize(text)
tokens

['POS',
 'Tagging',
 'is',
 'a',
 'process',
 'to',
 'mark',
 'up',
 'the',
 'words',
 'in',
 'text',
 'format',
 'for',
 'a',
 'particular',
 'part',
 'of',
 'a',
 'speech',
 ',',
 'based',
 'on',
 'its',
 'definition',
 'and',
 'context',
 '.']

In [10]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
tokens

['POS',
 'Tagging',
 'is',
 'a',
 'process',
 'to',
 'mark',
 'up',
 'the',
 'words',
 'in',
 'text',
 'format',
 'for',
 'a',
 'particular',
 'part',
 'of',
 'a',
 'speech',
 'based',
 'on',
 'its',
 'definition',
 'and',
 'context']

In [12]:
print(stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [13]:
en_sw = stopwords.words('english')
en_sw[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [15]:
ar_sw = stopwords.words('arabic')
ar_sw[:10]

['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي']

In [16]:
clean_tokens =[]
for token in tokens:
    if token not in en_sw:
        clean_tokens.append(token)
clean_tokens

NameError: name 'en_sw' is not defined

In [17]:
tknzr = TweetTokenizer()
tweet = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
tokens = tknzr.tokenize(tweet)
tokens

['This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

In [19]:
sentences = 'This is a text written. It uses U.S. english to illustrate sentence tokenization.'
sent_tokens = sent_tokenize(sentences)
sent_tokens

['This is a text written.',
 'It uses U.S. english to illustrate sentence tokenization.']

In [23]:
nltk.download('pos_tag')

[nltk_data] Error loading pos_tag: Package 'pos_tag' not found in
[nltk_data]     index


False

In [24]:
tags = pos_tag(tokens)
tags

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('cooool', 'JJ'),
 ('#dummysmiley', 'NN'),
 (':', ':'),
 (':-)', 'JJ'),
 (':-P', 'JJ'),
 ('<3', 'NN'),
 ('and', 'CC'),
 ('some', 'DT'),
 ('arrows', 'NNS'),
 ('<', 'VBP'),
 ('>', 'JJ'),
 ('->', 'CD'),
 ('<--', 'JJ')]

In [18]:
tags = pos_tag(tokens, tagset='universal')
tags

[('This', 'DET'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('cooool', 'ADJ'),
 ('#dummysmiley', 'NOUN'),
 (':', '.'),
 (':-)', 'ADJ'),
 (':-P', 'ADJ'),
 ('<3', 'NOUN'),
 ('and', 'CONJ'),
 ('some', 'DET'),
 ('arrows', 'NOUN'),
 ('<', 'VERB'),
 ('>', 'ADJ'),
 ('->', 'NUM'),
 ('<--', 'ADJ')]

In [26]:
tags_list =[]
for tag in tags:
    tags_list.append(tag[1])
tags_list

['DET',
 'VERB',
 'DET',
 'ADJ',
 'NOUN',
 '.',
 'ADJ',
 'ADJ',
 'NOUN',
 'CONJ',
 'DET',
 'NOUN',
 'VERB',
 'ADJ',
 'NUM',
 'ADJ']

In [19]:
counts = Counter(tag for tag in tags_list)
counts

NameError: name 'tags_list' is not defined

In [29]:
counts = Counter(tag for token,tag in tags)
counts

Counter({'ADJ': 5,
         'DET': 3,
         'NOUN': 3,
         'VERB': 2,
         '.': 1,
         'CONJ': 1,
         'NUM': 1})

In [20]:
porter = PorterStemmer()
lancaster  = LancasterStemmer()

porter_stem = porter.stem('probably')
porter_stem

'probabl'

In [31]:
lancaster_stem = lancaster.stem('probably')
lancaster_stem

'prob'

In [33]:
print(porter.stem('changes'))
print(porter.stem('troubling'))
print(porter.stem('troubled'))
print(porter.stem('cats'))
print(porter.stem('charcterization'))


chang
troubl
troubl
cat
charcter


In [34]:
print(lancaster.stem('changes'))
print(lancaster.stem('troubling'))
print(lancaster.stem('troubled'))
print(lancaster.stem('cats'))
print(lancaster.stem('charcterization'))

chang
troubl
troubl
cat
charct


In [35]:
word_list = ["friend", "friendship", "friends", "friendships","stabil", "destabilize", "misunderstanding", "railroad", "moonlight", "football"]

for word in word_list:
    print(f'{word:20} {porter.stem(word):20} {lancaster.stem(word)}')


friend               friend               friend
friendship           friendship           friend
friends              friend               friend
friendships          friendship           friend
stabil               stabil               stabl
destabilize          destabil             dest
misunderstanding     misunderstand        misunderstand
railroad             railroad             railroad
moonlight            moonlight            moonlight
football             footbal              footbal


In [37]:
def stem_sentence(sentence):
    tokens = word_tokenize(sentence)
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    
    return " ".join(stems)

sentence = "Pythoners are very intelligent, and work very pythonly and now they are pythoning their way to success."

stems = stem_sentence(sentence)
stems

'python are veri intellig , and work veri pythonli and now they are python their way to success .'

In [4]:
lemmatizer = WordNetLemmatizer()

In [6]:
print(lemmatizer.lemmatize('bats'))
print(lemmatizer.lemmatize('feet'))
print(lemmatizer.lemmatize('are'))
print(lemmatizer.lemmatize('changes'))


bat
foot
are
change


In [7]:
print(lemmatizer.lemmatize('are',pos='v'))
print(lemmatizer.lemmatize('swimming',pos='v'))
print(lemmatizer.lemmatize('swimming',pos='n'))
print(lemmatizer.lemmatize('stripes',pos='v'))
print(lemmatizer.lemmatize('stripes',pos='n'))


be
swim
swimming
strip
stripe


In [9]:
sentance = 'He is running and eating at the same time. He has bad habit of swimming after playing long hours in the Sun'
sentance

'He is running and eating at the same time. He has bad habit of swimming after playing long hours in the Sun'

In [12]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentance)
tokens

['He',
 'is',
 'running',
 'and',
 'eating',
 'at',
 'the',
 'same',
 'time',
 'He',
 'has',
 'bad',
 'habit',
 'of',
 'swimming',
 'after',
 'playing',
 'long',
 'hours',
 'in',
 'the',
 'Sun']

In [13]:
for token in tokens:
    print(f'{token:20} {lemmatizer.lemmatize(token)}')

He                   He
is                   is
running              running
and                  and
eating               eating
at                   at
the                  the
same                 same
time                 time
He                   He
has                  ha
bad                  bad
habit                habit
of                   of
swimming             swimming
after                after
playing              playing
long                 long
hours                hour
in                   in
the                  the
Sun                  Sun


In [21]:
print(porter.stem('leaves'))
print(porter.stem('leafs'))


leav
leaf


In [22]:
print(lemmatizer.lemmatize('leaves', pos='v'))
print(lemmatizer.lemmatize('leaves', pos='n'))
print(lemmatizer.lemmatize('leafs'))


leave
leaf
leaf


In [23]:
text = "POS Tagging is a process to mark up the words in text format for a particular part of a speech, based on its definition and context."

In [24]:
tokens = word_tokenize(text)
tokens

['POS',
 'Tagging',
 'is',
 'a',
 'process',
 'to',
 'mark',
 'up',
 'the',
 'words',
 'in',
 'text',
 'format',
 'for',
 'a',
 'particular',
 'part',
 'of',
 'a',
 'speech',
 ',',
 'based',
 'on',
 'its',
 'definition',
 'and',
 'context',
 '.']

In [25]:
bigrams = list(nltk.bigrams(tokens))
bigrams

[('POS', 'Tagging'),
 ('Tagging', 'is'),
 ('is', 'a'),
 ('a', 'process'),
 ('process', 'to'),
 ('to', 'mark'),
 ('mark', 'up'),
 ('up', 'the'),
 ('the', 'words'),
 ('words', 'in'),
 ('in', 'text'),
 ('text', 'format'),
 ('format', 'for'),
 ('for', 'a'),
 ('a', 'particular'),
 ('particular', 'part'),
 ('part', 'of'),
 ('of', 'a'),
 ('a', 'speech'),
 ('speech', ','),
 (',', 'based'),
 ('based', 'on'),
 ('on', 'its'),
 ('its', 'definition'),
 ('definition', 'and'),
 ('and', 'context'),
 ('context', '.')]

In [26]:
trigrams = list(nltk.trigrams(tokens))
trigrams

[('POS', 'Tagging', 'is'),
 ('Tagging', 'is', 'a'),
 ('is', 'a', 'process'),
 ('a', 'process', 'to'),
 ('process', 'to', 'mark'),
 ('to', 'mark', 'up'),
 ('mark', 'up', 'the'),
 ('up', 'the', 'words'),
 ('the', 'words', 'in'),
 ('words', 'in', 'text'),
 ('in', 'text', 'format'),
 ('text', 'format', 'for'),
 ('format', 'for', 'a'),
 ('for', 'a', 'particular'),
 ('a', 'particular', 'part'),
 ('particular', 'part', 'of'),
 ('part', 'of', 'a'),
 ('of', 'a', 'speech'),
 ('a', 'speech', ','),
 ('speech', ',', 'based'),
 (',', 'based', 'on'),
 ('based', 'on', 'its'),
 ('on', 'its', 'definition'),
 ('its', 'definition', 'and'),
 ('definition', 'and', 'context'),
 ('and', 'context', '.')]

In [28]:
dog = wordnet.synsets('dog')[0]
cat = wordnet.synsets('cat')[0]
play = wordnet.synsets('play')[0]

print(dog.wup_similarity(cat))
print(dog.wup_similarity(play))

0.8571428571428571
0.125


In [30]:
for ss in wordnet.synsets('small'):
    for name in ss.lemma_names():
        print(name)

small
small
small
little
minor
modest
small
small-scale
pocket-size
pocket-sized
little
small
small
humble
low
lowly
modest
small
little
minuscule
small
little
small
small
modest
small
belittled
diminished
small
small
