n Noun <br>
a Adjective <br>
r Adverb <br>
v Verb

In [2]:
import nltk
# Download the NLTK data to a directory of your choice and add that directory to the path (next cell)
# nltk.download()

In [3]:
import os

from nltk.data import path

# Directory holding the downloaded NLTK data. Set the NLTK_DATA_DIR environment
# variable to point somewhere else instead of editing the notebook; the default
# is the original author's local directory.
nltk_data_dir = os.environ.get(
    "NLTK_DATA_DIR",
    "/home/milutin/Documents/Projects/MachineLearning/machine-learning/data/06_NLP/",
)
# Guard the append so that re-running this cell does not add duplicate entries.
if nltk_data_dir not in path:
    path.append(nltk_data_dir)

## Tokenization

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
paragraph = """Dobro dosli na vezbe iz predmeta `Masinsko ucenje`. 
Danas radimo uvod u NLP (Natural Language Processing). Koristicemo Python biblioteku NLTK."""

In [6]:
sentences = sent_tokenize(paragraph)
sentences

['Dobro dosli na vezbe iz predmeta `Masinsko ucenje`.',
 'Danas radimo uvod u NLP (Natural Language Processing).',
 'Koristicemo Python biblioteku NLTK.']

In [7]:
word_tokenize(sentences[0])

['Dobro',
 'dosli',
 'na',
 'vezbe',
 'iz',
 'predmeta',
 '`Masinsko',
 'ucenje`',
 '.']

In [8]:
word_tokenize("won't")

['wo', "n't"]

In [9]:
from nltk.tokenize import wordpunct_tokenize, regexp_tokenize

In [10]:
wordpunct_tokenize("won't")

['won', "'", 't']

In [11]:
# Raw string so that "\w" reaches the regex engine untouched; in a plain string
# literal "\w" is an invalid escape sequence and triggers a warning.
regexp_tokenize("won't", r"[\w']+")

["won't"]

In [12]:
# Raw string so that "\w" reaches the regex engine untouched; in a plain string
# literal "\w" is an invalid escape sequence and triggers a warning.
regexp_tokenize("I can't do this. I won't do that.", r"[\w']+")

['I', "can't", 'do', 'this', 'I', "won't", 'do', 'that']

## Word Stemming

**Stemming**: Trying to shorten a word with simple regex rules

Word stemming means removing affixes from a word and returning its stem (root form). 
Example: the stem of the word working => work.
Search engines use this technique when indexing pages: people write many different forms of the same word, and all of them are reduced to the same stem.

In [13]:
from nltk.stem import PorterStemmer    # least aggressive
from nltk.stem import LancasterStemmer # most aggressive
from nltk.stem import RegexpStemmer
from nltk.stem import SnowballStemmer  # languages other than English

In [14]:
porter = PorterStemmer()

In [15]:
porter.stem('cooking')

'cook'

In [16]:
porter.stem('dancing')

'danc'

In [17]:
porter.stem('dancer')

'dancer'

In [18]:
lancaster = LancasterStemmer()

In [19]:
lancaster.stem('cooking')

'cook'

In [20]:
lancaster.stem('dancing')

'dant'

In [21]:
lancaster.stem('dance')

'dant'

In [22]:
regexp = RegexpStemmer('ing')

In [23]:
regexp.stem('dancing')

'danc'

In [24]:
regexp.stem('king')

'k'

## Word Lemmatizing

**Lemmatization**: Trying to find the root word with linguistics rules (with the use of regexes)

Note: Lemmatization does not work well on a single word without context or knowledge of its part-of-speech (POS) tag, i.e. we need to know whether the word is a noun, verb, adjective or adverb.

In [25]:
from nltk.stem import WordNetLemmatizer

In [26]:
lemmatizer = WordNetLemmatizer()

In [27]:
# Lemmatize the same word once per POS tag, in the order verb, noun, adjective, adverb.
for wordnet_pos in ("v", "n", "a", "r"):
    print(lemmatizer.lemmatize('playing', pos=wordnet_pos))

play
playing
playing
playing


In [28]:
lemmatizer.lemmatize('dancing', pos='v')

'dance'

In [29]:
porter.stem('dancing')

'danc'

## Count Word Frequency

In [30]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize

In [31]:
sent = 'hello world, hello people, this is hello world example of word count.'

In [32]:
sent_tok = word_tokenize(sent)

In [33]:
sent_tok

['hello',
 'world',
 ',',
 'hello',
 'people',
 ',',
 'this',
 'is',
 'hello',
 'world',
 'example',
 'of',
 'word',
 'count',
 '.']

In [34]:
freq = FreqDist(sent_tok)

In [35]:
freq.plot(30, cumulative=False)

<matplotlib.figure.Figure at 0x7fd33f629a58>

## Remove Stop Words and Punctuation

In [36]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [37]:
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."

In [38]:
# Get a list of stopwords for english
stopword_list = set(stopwords.words('english'))

# Split paragraph into words
words = word_tokenize(data)

In [39]:
# Keep only the tokens that are not in the stop-word set.
# Note: the membership test is case-sensitive, so a capitalised token such as 'All' is kept.
words_filtered = [w for w in words if w not in stopword_list]
words_filtered
# The comprehension above is equivalent to this explicit loop:
# words_filtered = []
# for w in words:
#     if w not in stopword_list:
#         words_filtered.append(w)

['All',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy',
 '.',
 'All',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy',
 '.']

In [40]:
# One set holding every stop word plus every single punctuation character.
stopwords_punctuation_list = {*stopword_list, *punctuation}

In [41]:
words_filtered = [w for w in words if w not in stopwords_punctuation_list]
words_filtered

['All',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy',
 'All',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy']

In [42]:
# Lowercase each token *before* the stop-word test: comparing the raw token let
# capitalised stop words such as 'All' slip through and end up in the result as 'all'.
words_filtered = [
    w.lower() for w in words if w.lower() not in stopwords_punctuation_list
]
words_filtered

['work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy']

## Lemmas, Synonyms and Antonyms

### Synsets

Synonym set - synset

In [43]:
from nltk.corpus import wordnet

In [44]:
sArr = wordnet.synsets('win')

In [45]:
sArr

[Synset('win.n.01'),
 Synset('winnings.n.01'),
 Synset('win.v.01'),
 Synset('acquire.v.05'),
 Synset('gain.v.05'),
 Synset('succeed.v.01')]

In [46]:
sArr[0].definition()

'a victory (as in a race or other competition)'

In [47]:
# Gather the lemma names of every synset, in synset order (duplicates are kept).
synArr = [lemma.name() for synset in sArr for lemma in synset.lemmas()]
synArr

['win',
 'winnings',
 'win',
 'profits',
 'win',
 'acquire',
 'win',
 'gain',
 'gain',
 'advance',
 'win',
 'pull_ahead',
 'make_headway',
 'get_ahead',
 'gain_ground',
 'succeed',
 'win',
 'come_through',
 'bring_home_the_bacon',
 'deliver_the_goods']

In [48]:
set(synArr)

{'acquire',
 'advance',
 'bring_home_the_bacon',
 'come_through',
 'deliver_the_goods',
 'gain',
 'gain_ground',
 'get_ahead',
 'make_headway',
 'profits',
 'pull_ahead',
 'succeed',
 'win',
 'winnings'}

In [49]:
woi = sArr[2]
print(woi) # We need the verb (v) synset
woi.lemmas()[0].antonyms()[0].name()

Synset('win.v.01')


'lose'

In [50]:
# Gather the antonym names of every lemma of every synset, in traversal order.
antArr = [
    antonym.name()
    for synset in sArr
    for lemma in synset.lemmas()
    for antonym in lemma.antonyms()
]
antArr

['losings', 'lose', 'lose', 'fall_back', 'fail']

### Hypernyms

Synsets are organized in a kind of inheritance tree. More abstract terms are known as <b>hypernyms</b> and more specific terms are <b> hyponyms</b>. This tree can be traced all the way up to a root hypernym.

In [51]:
sArr[0].hypernyms()

[Synset('victory.n.01')]

In [52]:
sArr[0].hypernyms()[0].hyponyms()

[Synset('checkmate.n.01'),
 Synset('fall.n.10'),
 Synset('independence.n.02'),
 Synset('landslide.n.01'),
 Synset('last_laugh.n.01'),
 Synset('pyrrhic_victory.n.01'),
 Synset('runaway.n.01'),
 Synset('service_break.n.01'),
 Synset('slam.n.01'),
 Synset('walk-in.n.03'),
 Synset('win.n.01')]

In [53]:
sArr[0].root_hypernyms()

[Synset('entity.n.01')]

## Bigrams, Trigrams

### Word level n-grams

In [54]:
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
# https://en.wikipedia.org/wiki/Bigram

from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

In [55]:
words = word_tokenize('hello world, hello people, this is hello world example of word count.')
stopwords_punctuation_list = set(stopwords.words('english')).union(set(punctuation))
# Normalise case before the stop-word test so that capitalised stop words are dropped too.
words_clean = [w.lower() for w in words if w.lower() not in stopwords_punctuation_list]

In [56]:
words_clean

['hello',
 'world',
 'hello',
 'people',
 'hello',
 'world',
 'example',
 'word',
 'count']

In [57]:
finder = BigramCollocationFinder.from_words(words_clean)
finder.nbest(BigramAssocMeasures.likelihood_ratio, 2)

[('example', 'word'), ('word', 'count')]

In [58]:
finder = TrigramCollocationFinder.from_words(words_clean)
finder.nbest(TrigramAssocMeasures.likelihood_ratio, 2)

[('example', 'word', 'count'), ('world', 'example', 'word')]

### Character level n-grams

In [59]:
def ngrams(text, n=3):
    """Return every contiguous slice of length ``n`` from ``text``, in order.

    Args:
        text: A sequence that supports ``len()`` and slicing (e.g. a string).
        n: Size of each n-gram; must be a positive integer. Defaults to 3.

    Returns:
        A list of the ``len(text) - n + 1`` overlapping n-grams, or an empty
        list when ``text`` is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive size would silently produce empty or truncated slices.
        raise ValueError("n must be a positive integer, got %r" % (n,))
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [60]:
print(ngrams('hello world, hello people, this is hello world example of word count.'))

['hel', 'ell', 'llo', 'lo ', 'o w', ' wo', 'wor', 'orl', 'rld', 'ld,', 'd, ', ', h', ' he', 'hel', 'ell', 'llo', 'lo ', 'o p', ' pe', 'peo', 'eop', 'opl', 'ple', 'le,', 'e, ', ', t', ' th', 'thi', 'his', 'is ', 's i', ' is', 'is ', 's h', ' he', 'hel', 'ell', 'llo', 'lo ', 'o w', ' wo', 'wor', 'orl', 'rld', 'ld ', 'd e', ' ex', 'exa', 'xam', 'amp', 'mpl', 'ple', 'le ', 'e o', ' of', 'of ', 'f w', ' wo', 'wor', 'ord', 'rd ', 'd c', ' co', 'cou', 'oun', 'unt', 'nt.']


## POS tagging

In [61]:
from nltk.tag import pos_tag

https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [62]:
words = word_tokenize('And now for something completely different')

In [63]:
pos_tag(words)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

## From Strings to Vectors

**Vector Space Model** is conceptualizing language as a whole lot of numbers

**Bag-of-Words (BoW)**: Representing each document/sentence as a vector of numbers, with each number being the count of one vocabulary word in that document/sentence

In [64]:
from nltk import FreqDist

In [65]:
sent = 'John likes to watch movies. Mary likes movies too. \
John also likes to watch football games.'

In [66]:
words = word_tokenize(sent)

In [67]:
words

['John',
 'likes',
 'to',
 'watch',
 'movies',
 '.',
 'Mary',
 'likes',
 'movies',
 'too',
 '.',
 'John',
 'also',
 'likes',
 'to',
 'watch',
 'football',
 'games',
 '.']

In [68]:
stopwords_punct_list = set(stopwords.words('english')).union(set(punctuation))
# Filter with the set built on the line above (the original referenced a
# differently named variable left over from an earlier cell), and lowercase
# before the stop-word test so that capitalised stop words are dropped too.
words_clean = [w.lower() for w in words if w.lower() not in stopwords_punct_list]

In [69]:
words_clean

['john',
 'likes',
 'watch',
 'movies',
 'mary',
 'likes',
 'movies',
 'john',
 'also',
 'likes',
 'watch',
 'football',
 'games']

In [70]:
freq = FreqDist(words_clean)

In [71]:
freq

FreqDist({'also': 1,
          'football': 1,
          'games': 1,
          'john': 2,
          'likes': 3,
          'mary': 1,
          'movies': 2,
          'watch': 2})

In [72]:
# NOTE: this rebinds the name `ngrams`, replacing the character-level ngrams()
# helper defined earlier in this notebook; cells run after this one get NLTK's version.
from nltk import ngrams

In [73]:
n = 2

In [74]:
for gram in ngrams(words_clean, n):
    print(gram)

('john', 'likes')
('likes', 'watch')
('watch', 'movies')
('movies', 'mary')
('mary', 'likes')
('likes', 'movies')
('movies', 'john')
('john', 'also')
('also', 'likes')
('likes', 'watch')
('watch', 'football')
('football', 'games')


## String difference

In [75]:
# Explicit import instead of `*`: edit_distance is the only name from nltk.metrics used below.
from nltk.metrics import edit_distance

In [76]:
# String edit distance (Levenshtein)
edit_distance("rain", "shine")

3

In [77]:
# rain
# **  *
# shine