# Introduction
---
---

First let's set some text...

In [109]:
sentence = "If you can talk with crowds and keep your virtue,\nOr walk with Kings—nor lose the common touch.\nIf neither foes nor loving friends can hurt you,\nIf all men count with you, but none too much.\nIf you can fill the unforgiving minute,\nWith sixty seconds’ worth of distance run,\nYours is the Earth and everything that’s in it,\nAnd—which is more—you’ll be a Man, my son!"

Now let's try to split the text based on spaces (default function)

In [110]:
sentence.split()

['If',
 'you',
 'can',
 'talk',
 'with',
 'crowds',
 'and',
 'keep',
 'your',
 'virtue,',
 'Or',
 'walk',
 'with',
 'Kings—nor',
 'lose',
 'the',
 'common',
 'touch.',
 'If',
 'neither',
 'foes',
 'nor',
 'loving',
 'friends',
 'can',
 'hurt',
 'you,',
 'If',
 'all',
 'men',
 'count',
 'with',
 'you,',
 'but',
 'none',
 'too',
 'much.',
 'If',
 'you',
 'can',
 'fill',
 'the',
 'unforgiving',
 'minute,',
 'With',
 'sixty',
 'seconds’',
 'worth',
 'of',
 'distance',
 'run,',
 'Yours',
 'is',
 'the',
 'Earth',
 'and',
 'everything',
 'that’s',
 'in',
 'it,',
 'And—which',
 'is',
 'more—you’ll',
 'be',
 'a',
 'Man,',
 'my',
 'son!']

We observe that some words are not well separated from punctuation.
So let's try to remove those characters... but before that, let's create a quick feature vector first.

In [111]:
tokens = sorted(sentence.split()) # splitting based on spaces
vocab = sorted(set(tokens)) # sorting and removing duplicates by using set()
vocab # just printing the vocab so we can look at it

['And—which',
 'Earth',
 'If',
 'Kings—nor',
 'Man,',
 'Or',
 'With',
 'Yours',
 'a',
 'all',
 'and',
 'be',
 'but',
 'can',
 'common',
 'count',
 'crowds',
 'distance',
 'everything',
 'fill',
 'foes',
 'friends',
 'hurt',
 'in',
 'is',
 'it,',
 'keep',
 'lose',
 'loving',
 'men',
 'minute,',
 'more—you’ll',
 'much.',
 'my',
 'neither',
 'none',
 'nor',
 'of',
 'run,',
 'seconds’',
 'sixty',
 'son!',
 'talk',
 'that’s',
 'the',
 'too',
 'touch.',
 'unforgiving',
 'virtue,',
 'walk',
 'with',
 'worth',
 'you',
 'you,',
 'your']

We see that the order is: numbers first, followerd by capital letters, and then lower case letters (all alphabetically sorted). We also note that some repeating words appear only once in the vocabulary list. Let's compare the size of the two lists.

In [112]:
tokens_len = len(tokens)
vocab_len = len(vocab)

print("tokens:", tokens_len)
print("vocab:", vocab_len)

tokens: 68
vocab: 55


Let's try and print the matrix of tokens against vocabulary. We will use the numpy lib for that.

In [113]:
import numpy as np

matrix = np.zeros((tokens_len, vocab_len), int)
for i, token in enumerate(tokens):
    matrix[i, vocab.index(token)] = 1

matrix

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])

To make it more readable, we can use Pandas and DataFrame.

In [114]:
import pandas as pd

pd.DataFrame(matrix, columns=vocab, index=tokens)

Unnamed: 0,And—which,Earth,If,Kings—nor,"Man,",Or,With,Yours,a,all,...,too,touch.,unforgiving,"virtue,",walk,with,worth,you,"you,",your
And—which,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Earth,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
If,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
If,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
If,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
you,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
"you,",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
"you,",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


This is a lot clearer.

Let's now build the bag of words (BoW)

In [115]:
bow = {} # setting this up as a dictionary

for token in tokens:
    bow[token] = 1

sorted(bow.items()) # lets print it

[('And—which', 1),
 ('Earth', 1),
 ('If', 1),
 ('Kings—nor', 1),
 ('Man,', 1),
 ('Or', 1),
 ('With', 1),
 ('Yours', 1),
 ('a', 1),
 ('all', 1),
 ('and', 1),
 ('be', 1),
 ('but', 1),
 ('can', 1),
 ('common', 1),
 ('count', 1),
 ('crowds', 1),
 ('distance', 1),
 ('everything', 1),
 ('fill', 1),
 ('foes', 1),
 ('friends', 1),
 ('hurt', 1),
 ('in', 1),
 ('is', 1),
 ('it,', 1),
 ('keep', 1),
 ('lose', 1),
 ('loving', 1),
 ('men', 1),
 ('minute,', 1),
 ('more—you’ll', 1),
 ('much.', 1),
 ('my', 1),
 ('neither', 1),
 ('none', 1),
 ('nor', 1),
 ('of', 1),
 ('run,', 1),
 ('seconds’', 1),
 ('sixty', 1),
 ('son!', 1),
 ('talk', 1),
 ('that’s', 1),
 ('the', 1),
 ('too', 1),
 ('touch.', 1),
 ('unforgiving', 1),
 ('virtue,', 1),
 ('walk', 1),
 ('with', 1),
 ('worth', 1),
 ('you', 1),
 ('you,', 1),
 ('your', 1)]

Since bow is a dictionary, we see that the same words will not duplicate.

Pandas also has a more efficient form of a dictionary called Series.

In [116]:
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in tokens])), columns=['sent']).T
df

Unnamed: 0,And—which,Earth,If,Kings—nor,"Man,",Or,With,Yours,a,all,...,too,touch.,unforgiving,"virtue,",walk,with,worth,you,"you,",your
sent,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [117]:
corpus = {}
for i, sent in enumerate(sentence.split('\n')):
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in sent.split())

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
df

Unnamed: 0,If,you,can,talk,with,crowds,and,keep,your,"virtue,",...,that’s,in,"it,",And—which,more—you’ll,be,a,"Man,",my,son!
sent0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
sent1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sent6,0,0,0,0,0,0,1,0,0,0,...,1,1,1,0,0,0,0,0,0,0
sent7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,1,1


Now let's try some Dot Product calculation.

In [118]:
df = df.T
print("dot product of sent0 from sent1: ", df.sent0.dot(df.sent1), "\ndot product of sent0 from sent2: ", df.sent0.dot(df.sent2), "\ndot product of sent0 from sent3: ", df.sent0.dot(df.sent2),"\ndot product of sent0 from sent4: ", df.sent0.dot(df.sent4))

dot product of sent0 from sent1:  1 
dot product of sent0 from sent2:  2 
dot product of sent0 from sent3:  2 
dot product of sent0 from sent4:  3


As we see from the results, the higher the dot product the more similar the vectors are...

---
# Tokenization

We can improve our vocabulary now if we were to remove all other punctuation. Let's first do that with regular expressions.

In [119]:
import re

tokens = re.split(r'[-\s.,;!?]+', sentence)
tokens

['If',
 'you',
 'can',
 'talk',
 'with',
 'crowds',
 'and',
 'keep',
 'your',
 'virtue',
 'Or',
 'walk',
 'with',
 'Kings—nor',
 'lose',
 'the',
 'common',
 'touch',
 'If',
 'neither',
 'foes',
 'nor',
 'loving',
 'friends',
 'can',
 'hurt',
 'you',
 'If',
 'all',
 'men',
 'count',
 'with',
 'you',
 'but',
 'none',
 'too',
 'much',
 'If',
 'you',
 'can',
 'fill',
 'the',
 'unforgiving',
 'minute',
 'With',
 'sixty',
 'seconds’',
 'worth',
 'of',
 'distance',
 'run',
 'Yours',
 'is',
 'the',
 'Earth',
 'and',
 'everything',
 'that’s',
 'in',
 'it',
 'And—which',
 'is',
 'more—you’ll',
 'be',
 'a',
 'Man',
 'my',
 'son',
 '']

Although this seems great... we might still face issues with different characters that are not anticipated. So we usually use an existing NLP related tokenizer to do this job. Let's try the NLTK lib.

NLTK also supports regular expressions:

### RegexpTokenizer

In [144]:
# RegexpTokenizer here
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['If',
 'you',
 'can',
 'talk',
 'with',
 'crowds',
 'and',
 'keep',
 'your',
 'virtue',
 ',',
 'Or',
 'walk',
 'with',
 'Kings',
 '—nor',
 'lose',
 'the',
 'common',
 'touch',
 '.',
 'If',
 'neither',
 'foes',
 'nor',
 'loving',
 'friends',
 'can',
 'hurt',
 'you',
 ',',
 'If',
 'all',
 'men',
 'count',
 'with',
 'you',
 ',',
 'but',
 'none',
 'too',
 'much',
 '.',
 'If',
 'you',
 'can',
 'fill',
 'the',
 'unforgiving',
 'minute',
 ',',
 'With',
 'sixty',
 'seconds',
 '’',
 'worth',
 'of',
 'distance',
 'run',
 ',',
 'Yours',
 'is',
 'the',
 'Earth',
 'and',
 'everything',
 'that',
 '’s',
 'in',
 'it',
 ',',
 'And',
 '—which',
 'is',
 'more',
 '—you’ll',
 'be',
 'a',
 'Man',
 ',',
 'my',
 'son',
 '!']

Then there are other, more specialised tokenizers:

### TreebankWordTokenizer

In [143]:
# TreebankWordTokenizer here
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['If',
 'you',
 'can',
 'talk',
 'with',
 'crowds',
 'and',
 'keep',
 'your',
 'virtue',
 ',',
 'Or',
 'walk',
 'with',
 'Kings—nor',
 'lose',
 'the',
 'common',
 'touch.',
 'If',
 'neither',
 'foes',
 'nor',
 'loving',
 'friends',
 'can',
 'hurt',
 'you',
 ',',
 'If',
 'all',
 'men',
 'count',
 'with',
 'you',
 ',',
 'but',
 'none',
 'too',
 'much.',
 'If',
 'you',
 'can',
 'fill',
 'the',
 'unforgiving',
 'minute',
 ',',
 'With',
 'sixty',
 'seconds’',
 'worth',
 'of',
 'distance',
 'run',
 ',',
 'Yours',
 'is',
 'the',
 'Earth',
 'and',
 'everything',
 'that’s',
 'in',
 'it',
 ',',
 'And—which',
 'is',
 'more—you’ll',
 'be',
 'a',
 'Man',
 ',',
 'my',
 'son',
 '!']

For now let's use the regular expression special word pattern w, so as to have control

In [122]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(sentence)
print(tokens)

['If', 'you', 'can', 'talk', 'with', 'crowds', 'and', 'keep', 'your', 'virtue', 'Or', 'walk', 'with', 'Kings', 'nor', 'lose', 'the', 'common', 'touch', 'If', 'neither', 'foes', 'nor', 'loving', 'friends', 'can', 'hurt', 'you', 'If', 'all', 'men', 'count', 'with', 'you', 'but', 'none', 'too', 'much', 'If', 'you', 'can', 'fill', 'the', 'unforgiving', 'minute', 'With', 'sixty', 'seconds', 'worth', 'of', 'distance', 'run', 'Yours', 'is', 'the', 'Earth', 'and', 'everything', 'that', 's', 'in', 'it', 'And', 'which', 'is', 'more', 'you', 'll', 'be', 'a', 'Man', 'my', 'son']


At this point we can try out different tokenizers from other libraries to make note of the differences.

### TweetTokenizer

In [123]:
# TweetTokenizer here
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(r"\w+")
tokens = tokenizer.tokenize(sentence)
print(tokens)

['If', 'you', 'can', 'talk', 'with', 'crowds', 'and', 'keep', 'your', 'virtue', ',', 'Or', 'walk', 'with', 'Kings', '—', 'nor', 'lose', 'the', 'common', 'touch', '.', 'If', 'neither', 'foes', 'nor', 'loving', 'friends', 'can', 'hurt', 'you', ',', 'If', 'all', 'men', 'count', 'with', 'you', ',', 'but', 'none', 'too', 'much', '.', 'If', 'you', 'can', 'fill', 'the', 'unforgiving', 'minute', ',', 'With', 'sixty', 'seconds', '’', 'worth', 'of', 'distance', 'run', ',', 'Yours', 'is', 'the', 'Earth', 'and', 'everything', 'that', '’', 's', 'in', 'it', ',', 'And', '—', 'which', 'is', 'more', '—', 'you', '’', 'll', 'be', 'a', 'Man', ',', 'my', 'son', '!']


### PunktSentenceTokenizer

In [124]:
# PunktSentenceTokenizer here
from nltk.tokenize import PunktSentenceTokenizer

tokenizer = PunktSentenceTokenizer(r"\w+")
tokens = tokenizer.tokenize(sentence)
print(tokens)

['If you can talk with crowds and keep your virtue,\nOr walk with Kings—nor lose the common touch.', 'If neither foes nor loving friends can hurt you,\nIf all men count with you, but none too much.', 'If you can fill the unforgiving minute,\nWith sixty seconds’ worth of distance run,\nYours is the Earth and everything that’s in it,\nAnd—which is more—you’ll be a Man, my son!']


### MWETokenizer

In [125]:
# MWETokenizer here
from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer(r"\w+")
tokens = tokenizer.tokenize(sentence)
print(tokens)

['I', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 'a', 'n', ' ', 't', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'c', 'r', 'o', 'w', 'd', 's', ' ', 'a', 'n', 'd', ' ', 'k', 'e', 'e', 'p', ' ', 'y', 'o', 'u', 'r', ' ', 'v', 'i', 'r', 't', 'u', 'e', ',', '\n', 'O', 'r', ' ', 'w', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'K', 'i', 'n', 'g', 's', '—', 'n', 'o', 'r', ' ', 'l', 'o', 's', 'e', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'm', 'm', 'o', 'n', ' ', 't', 'o', 'u', 'c', 'h', '.', '\n', 'I', 'f', ' ', 'n', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'f', 'o', 'e', 's', ' ', 'n', 'o', 'r', ' ', 'l', 'o', 'v', 'i', 'n', 'g', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 's', ' ', 'c', 'a', 'n', ' ', 'h', 'u', 'r', 't', ' ', 'y', 'o', 'u', ',', '\n', 'I', 'f', ' ', 'a', 'l', 'l', ' ', 'm', 'e', 'n', ' ', 'c', 'o', 'u', 'n', 't', ' ', 'w', 'i', 't', 'h', ' ', 'y', 'o', 'u', ',', ' ', 'b', 'u', 't', ' ', 'n', 'o', 'n', 'e', ' ', 't', 'o', 'o', ' ', 'm', 'u', 'c', 'h', '.', '\n', 'I', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 

---
# n-Gram Creation

Let's now calculate 2-grams

In [126]:
from nltk.util import ngrams

list(ngrams(tokens, 2))

[('I', 'f'),
 ('f', ' '),
 (' ', 'y'),
 ('y', 'o'),
 ('o', 'u'),
 ('u', ' '),
 (' ', 'c'),
 ('c', 'a'),
 ('a', 'n'),
 ('n', ' '),
 (' ', 't'),
 ('t', 'a'),
 ('a', 'l'),
 ('l', 'k'),
 ('k', ' '),
 (' ', 'w'),
 ('w', 'i'),
 ('i', 't'),
 ('t', 'h'),
 ('h', ' '),
 (' ', 'c'),
 ('c', 'r'),
 ('r', 'o'),
 ('o', 'w'),
 ('w', 'd'),
 ('d', 's'),
 ('s', ' '),
 (' ', 'a'),
 ('a', 'n'),
 ('n', 'd'),
 ('d', ' '),
 (' ', 'k'),
 ('k', 'e'),
 ('e', 'e'),
 ('e', 'p'),
 ('p', ' '),
 (' ', 'y'),
 ('y', 'o'),
 ('o', 'u'),
 ('u', 'r'),
 ('r', ' '),
 (' ', 'v'),
 ('v', 'i'),
 ('i', 'r'),
 ('r', 't'),
 ('t', 'u'),
 ('u', 'e'),
 ('e', ','),
 (',', '\n'),
 ('\n', 'O'),
 ('O', 'r'),
 ('r', ' '),
 (' ', 'w'),
 ('w', 'a'),
 ('a', 'l'),
 ('l', 'k'),
 ('k', ' '),
 (' ', 'w'),
 ('w', 'i'),
 ('i', 't'),
 ('t', 'h'),
 ('h', ' '),
 (' ', 'K'),
 ('K', 'i'),
 ('i', 'n'),
 ('n', 'g'),
 ('g', 's'),
 ('s', '—'),
 ('—', 'n'),
 ('n', 'o'),
 ('o', 'r'),
 ('r', ' '),
 (' ', 'l'),
 ('l', 'o'),
 ('o', 's'),
 ('s', 'e'),
 ('e', ' '

and 3-grams

In [127]:
list(ngrams(tokens, 3))

[('I', 'f', ' '),
 ('f', ' ', 'y'),
 (' ', 'y', 'o'),
 ('y', 'o', 'u'),
 ('o', 'u', ' '),
 ('u', ' ', 'c'),
 (' ', 'c', 'a'),
 ('c', 'a', 'n'),
 ('a', 'n', ' '),
 ('n', ' ', 't'),
 (' ', 't', 'a'),
 ('t', 'a', 'l'),
 ('a', 'l', 'k'),
 ('l', 'k', ' '),
 ('k', ' ', 'w'),
 (' ', 'w', 'i'),
 ('w', 'i', 't'),
 ('i', 't', 'h'),
 ('t', 'h', ' '),
 ('h', ' ', 'c'),
 (' ', 'c', 'r'),
 ('c', 'r', 'o'),
 ('r', 'o', 'w'),
 ('o', 'w', 'd'),
 ('w', 'd', 's'),
 ('d', 's', ' '),
 ('s', ' ', 'a'),
 (' ', 'a', 'n'),
 ('a', 'n', 'd'),
 ('n', 'd', ' '),
 ('d', ' ', 'k'),
 (' ', 'k', 'e'),
 ('k', 'e', 'e'),
 ('e', 'e', 'p'),
 ('e', 'p', ' '),
 ('p', ' ', 'y'),
 (' ', 'y', 'o'),
 ('y', 'o', 'u'),
 ('o', 'u', 'r'),
 ('u', 'r', ' '),
 ('r', ' ', 'v'),
 (' ', 'v', 'i'),
 ('v', 'i', 'r'),
 ('i', 'r', 't'),
 ('r', 't', 'u'),
 ('t', 'u', 'e'),
 ('u', 'e', ','),
 ('e', ',', '\n'),
 (',', '\n', 'O'),
 ('\n', 'O', 'r'),
 ('O', 'r', ' '),
 ('r', ' ', 'w'),
 (' ', 'w', 'a'),
 ('w', 'a', 'l'),
 ('a', 'l', 'k'),
 ('l', 

We can include the n-grams as a string rather than as tuples

In [128]:
bigrams = [" ".join(x) for x in list(ngrams(tokens, 2))]
print(bigrams)
trigrams = [" ".join(x) for x in list(ngrams(tokens, 3))]
print("\n\n")
print(trigrams)

['I f', 'f  ', '  y', 'y o', 'o u', 'u  ', '  c', 'c a', 'a n', 'n  ', '  t', 't a', 'a l', 'l k', 'k  ', '  w', 'w i', 'i t', 't h', 'h  ', '  c', 'c r', 'r o', 'o w', 'w d', 'd s', 's  ', '  a', 'a n', 'n d', 'd  ', '  k', 'k e', 'e e', 'e p', 'p  ', '  y', 'y o', 'o u', 'u r', 'r  ', '  v', 'v i', 'i r', 'r t', 't u', 'u e', 'e ,', ', \n', '\n O', 'O r', 'r  ', '  w', 'w a', 'a l', 'l k', 'k  ', '  w', 'w i', 'i t', 't h', 'h  ', '  K', 'K i', 'i n', 'n g', 'g s', 's —', '— n', 'n o', 'o r', 'r  ', '  l', 'l o', 'o s', 's e', 'e  ', '  t', 't h', 'h e', 'e  ', '  c', 'c o', 'o m', 'm m', 'm o', 'o n', 'n  ', '  t', 't o', 'o u', 'u c', 'c h', 'h .', '. \n', '\n I', 'I f', 'f  ', '  n', 'n e', 'e i', 'i t', 't h', 'h e', 'e r', 'r  ', '  f', 'f o', 'o e', 'e s', 's  ', '  n', 'n o', 'o r', 'r  ', '  l', 'l o', 'o v', 'v i', 'i n', 'n g', 'g  ', '  f', 'f r', 'r i', 'i e', 'e n', 'n d', 'd s', 's  ', '  c', 'c a', 'a n', 'n  ', '  h', 'h u', 'u r', 'r t', 't  ', '  y', 'y o', 'o u', '

---
# Stopwords Removal

First let's download the list from nltk

In [129]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/nn007/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

and then check it up

In [130]:
stop_words = nltk.corpus.stopwords.words('english')
print("number of stopwords:", len(stop_words))
print("\n",stop_words)

number of stopwords: 179

 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'ow

Other libs have different stopwords. Let's see a much larger set from sklearn

In [131]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

print("number of stopwords:", len(sklearn_stop_words))
print("\n",sklearn_stop_words)

number of stopwords: 318

 frozenset({'must', 'each', 'made', 'something', 'had', 'get', 'there', 'at', 'behind', 'whereafter', 'three', 'couldnt', 'twelve', 'whereby', 'fifty', 'sometime', 'another', 'because', 'became', 'this', 'over', 'once', 'system', 'per', 'amoungst', 'often', 'throughout', 'one', 'not', 'almost', 'yourselves', 'several', 'their', 'thin', 'has', 'same', 'after', 'nine', 'rather', 'all', 'top', 'interest', 'part', 'less', 'empty', 'via', 'without', 'anyhow', 'our', 'done', 'some', 'whose', 'except', 'thereby', 'always', 'amongst', 'upon', 'be', 'very', 'take', 'beside', 'these', 'thus', 'until', 'wherein', 'more', 'any', 'for', 'ltd', 'cannot', 'ie', 'however', 'across', 'whom', 'hereafter', 'them', 'un', 'seems', 'the', 'others', 'such', 'fill', 'even', 'it', 'or', 'anyone', 'someone', 'call', 'down', 'none', 'us', 'never', 'i', 'ten', 'here', 'but', 'you', 'her', 'me', 'hence', 'both', 'twenty', 'mill', 'whence', 'either', 'other', 'sixty', 'your', 'will', 'him'

We note that although there are more stopwords in sklearn, nltk has words that are not contained in sklearn. So we might want to join the two lists.

For normalizing the text we could do something as simple as making sure all words are lower case.

In [132]:
norm_tokens = [x.lower() for x in tokens]
print(norm_tokens)

['i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 'a', 'n', ' ', 't', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'c', 'r', 'o', 'w', 'd', 's', ' ', 'a', 'n', 'd', ' ', 'k', 'e', 'e', 'p', ' ', 'y', 'o', 'u', 'r', ' ', 'v', 'i', 'r', 't', 'u', 'e', ',', '\n', 'o', 'r', ' ', 'w', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'k', 'i', 'n', 'g', 's', '—', 'n', 'o', 'r', ' ', 'l', 'o', 's', 'e', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'm', 'm', 'o', 'n', ' ', 't', 'o', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'n', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'f', 'o', 'e', 's', ' ', 'n', 'o', 'r', ' ', 'l', 'o', 'v', 'i', 'n', 'g', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 's', ' ', 'c', 'a', 'n', ' ', 'h', 'u', 'r', 't', ' ', 'y', 'o', 'u', ',', '\n', 'i', 'f', ' ', 'a', 'l', 'l', ' ', 'm', 'e', 'n', ' ', 'c', 'o', 'u', 'n', 't', ' ', 'w', 'i', 't', 'h', ' ', 'y', 'o', 'u', ',', ' ', 'b', 'u', 't', ' ', 'n', 'o', 'n', 'e', ' ', 't', 'o', 'o', ' ', 'm', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 

---
# Stemming

For stemming the words, we can use NLTK again

In [133]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
stem_tokens = [stemmer.stem(x) for x in norm_tokens]
print(stem_tokens)

['i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 'a', 'n', ' ', 't', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'c', 'r', 'o', 'w', 'd', 's', ' ', 'a', 'n', 'd', ' ', 'k', 'e', 'e', 'p', ' ', 'y', 'o', 'u', 'r', ' ', 'v', 'i', 'r', 't', 'u', 'e', ',', '\n', 'o', 'r', ' ', 'w', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'k', 'i', 'n', 'g', 's', '—', 'n', 'o', 'r', ' ', 'l', 'o', 's', 'e', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'm', 'm', 'o', 'n', ' ', 't', 'o', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'n', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'f', 'o', 'e', 's', ' ', 'n', 'o', 'r', ' ', 'l', 'o', 'v', 'i', 'n', 'g', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 's', ' ', 'c', 'a', 'n', ' ', 'h', 'u', 'r', 't', ' ', 'y', 'o', 'u', ',', '\n', 'i', 'f', ' ', 'a', 'l', 'l', ' ', 'm', 'e', 'n', ' ', 'c', 'o', 'u', 'n', 't', ' ', 'w', 'i', 't', 'h', ' ', 'y', 'o', 'u', ',', ' ', 'b', 'u', 't', ' ', 'n', 'o', 'n', 'e', ' ', 't', 'o', 'o', ' ', 'm', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 

---
# Lemmatising

In [134]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stem_tokens = [lemmatizer.lemmatize(x) for x in norm_tokens]
print(stem_tokens)

['i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 'a', 'n', ' ', 't', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'c', 'r', 'o', 'w', 'd', 's', ' ', 'a', 'n', 'd', ' ', 'k', 'e', 'e', 'p', ' ', 'y', 'o', 'u', 'r', ' ', 'v', 'i', 'r', 't', 'u', 'e', ',', '\n', 'o', 'r', ' ', 'w', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'k', 'i', 'n', 'g', 's', '—', 'n', 'o', 'r', ' ', 'l', 'o', 's', 'e', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'm', 'm', 'o', 'n', ' ', 't', 'o', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'n', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'f', 'o', 'e', 's', ' ', 'n', 'o', 'r', ' ', 'l', 'o', 'v', 'i', 'n', 'g', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 's', ' ', 'c', 'a', 'n', ' ', 'h', 'u', 'r', 't', ' ', 'y', 'o', 'u', ',', '\n', 'i', 'f', ' ', 'a', 'l', 'l', ' ', 'm', 'e', 'n', ' ', 'c', 'o', 'u', 'n', 't', ' ', 'w', 'i', 't', 'h', ' ', 'y', 'o', 'u', ',', ' ', 'b', 'u', 't', ' ', 'n', 'o', 'n', 'e', ' ', 't', 'o', 'o', ' ', 'm', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 

[nltk_data] Downloading package wordnet to /home/nn007/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


The sentences we have has no issues with the lemma... but look into the following example

In [135]:
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better", 'a')) # declaring the POS as adjective

better
good


If we don't include the POS, the nltk library with wordnet does not work well. So let's try fix that

In [136]:
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    # now we need to convert from nltk to wordnet POS notations (for compatibility reasons)
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # return and default to noun if not found

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nn007/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [137]:
stem_tokens = [lemmatizer.lemmatize(x, pos=get_wordnet_pos(x)) for x in norm_tokens]
print(stem_tokens)

['i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 'a', 'n', ' ', 't', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'c', 'r', 'o', 'w', 'd', 's', ' ', 'a', 'n', 'd', ' ', 'k', 'e', 'e', 'p', ' ', 'y', 'o', 'u', 'r', ' ', 'v', 'i', 'r', 't', 'u', 'e', ',', '\n', 'o', 'r', ' ', 'w', 'a', 'l', 'k', ' ', 'w', 'i', 't', 'h', ' ', 'k', 'i', 'n', 'g', 's', '—', 'n', 'o', 'r', ' ', 'l', 'o', 's', 'e', ' ', 't', 'h', 'e', ' ', 'c', 'o', 'm', 'm', 'o', 'n', ' ', 't', 'o', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'n', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'f', 'o', 'e', 's', ' ', 'n', 'o', 'r', ' ', 'l', 'o', 'v', 'i', 'n', 'g', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 's', ' ', 'c', 'a', 'n', ' ', 'h', 'u', 'r', 't', ' ', 'y', 'o', 'u', ',', '\n', 'i', 'f', ' ', 'a', 'l', 'l', ' ', 'm', 'e', 'n', ' ', 'c', 'o', 'u', 'n', 't', ' ', 'w', 'i', 't', 'h', ' ', 'y', 'o', 'u', ',', ' ', 'b', 'u', 't', ' ', 'n', 'o', 'n', 'e', ' ', 't', 'o', 'o', ' ', 'm', 'u', 'c', 'h', '.', '\n', 'i', 'f', ' ', 'y', 'o', 'u', ' ', 'c', 

If we look at the words now we are getting more counts for our bag of words

---
# Feature-vector creation

In [138]:
from collections import Counter

bow = Counter(stem_tokens)
bow

Counter({'i': 25,
         'f': 9,
         ' ': 60,
         'y': 10,
         'o': 27,
         'u': 16,
         'c': 11,
         'a': 14,
         'n': 27,
         't': 24,
         'l': 10,
         'k': 4,
         'w': 8,
         'h': 17,
         'r': 16,
         'd': 7,
         's': 14,
         'e': 21,
         'p': 1,
         'v': 4,
         ',': 7,
         '\n': 7,
         'g': 5,
         '—': 3,
         'm': 8,
         '.': 2,
         'b': 2,
         'x': 1,
         '’': 3,
         '!': 1})

Now let's check the most frequent 10 words

In [139]:
bow.most_common(10)

[(' ', 60),
 ('o', 27),
 ('n', 27),
 ('i', 25),
 ('t', 24),
 ('e', 21),
 ('h', 17),
 ('u', 16),
 ('r', 16),
 ('a', 14)]

Then let's remove the stopwords

In [140]:
no_stop_tokens = [x for x in stem_tokens if x not in stop_words]
count = Counter(no_stop_tokens)
count

Counter({'f': 9,
         ' ': 60,
         'u': 16,
         'c': 11,
         'n': 27,
         'l': 10,
         'k': 4,
         'w': 8,
         'h': 17,
         'r': 16,
         'e': 21,
         'p': 1,
         'v': 4,
         ',': 7,
         '\n': 7,
         'g': 5,
         '—': 3,
         '.': 2,
         'b': 2,
         'x': 1,
         '’': 3,
         '!': 1})

Finally... let's make our feature vector using the frequency ratio (term count / total number of terms in the doc)

In [141]:
document_vector = []
doc_length = len(no_stop_tokens)
for key, value in count.most_common():
    document_vector.append(value / doc_length)

print(document_vector)

[0.2553191489361702, 0.1148936170212766, 0.08936170212765958, 0.07234042553191489, 0.06808510638297872, 0.06808510638297872, 0.04680851063829787, 0.0425531914893617, 0.03829787234042553, 0.03404255319148936, 0.029787234042553193, 0.029787234042553193, 0.02127659574468085, 0.01702127659574468, 0.01702127659574468, 0.01276595744680851, 0.01276595744680851, 0.00851063829787234, 0.00851063829787234, 0.00425531914893617, 0.00425531914893617, 0.00425531914893617]


---
---