# Text Encoding

In [262]:
# Importing the standard library
import string

# Importing Keras library
from keras.preprocessing.text import text_to_word_sequence as keras_text_to_word_sequence

# Importing nltk library
import nltk
from nltk.tokenize import word_tokenize as nltk_work_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Importing some other supporting libraries
import pandas as pd

In [263]:
# This is the text we will use for this small encoding experiment:
# six short sentences, each treated as one document by the steps below.
corpus = [
    "The number of trees in the world, according to a 2015 estimate, is 3.04 trillion.",
    "46% of the trees in the world are in the tropics or sub-tropics.",
    "20% of the trees in the world are in the temperate zones.",
    "24% of threes in the world are in the coniferous boreal forests.",
    "There about 15 billion trees are cut down annually.",
    "There about 5 billion trees are planted annually.",
]

# Feature Extraction

## <font color='blue'> Step 1. Tokenize the dataset, separating the words in each sentence. </font>

In [295]:
# Tokenize every sentence with Keras: one list of tokens per sentence.
# As the printed result shows, the tokens come back lower-cased with punctuation removed.
ks_tokens = [keras_text_to_word_sequence(sentence) for sentence in corpus]
print(ks_tokens)

[['the', 'number', 'of', 'trees', 'in', 'the', 'world', 'according', 'to', 'a', '2015', 'estimate', 'is', '3', '04', 'trillion'], ['46', 'of', 'the', 'trees', 'in', 'the', 'world', 'are', 'in', 'the', 'tropics', 'or', 'sub', 'tropics'], ['20', 'of', 'the', 'trees', 'in', 'the', 'world', 'are', 'in', 'the', 'temperate', 'zones'], ['24', 'of', 'threes', 'in', 'the', 'world', 'are', 'in', 'the', 'coniferous', 'boreal', 'forests'], ['there', 'about', '15', 'billion', 'trees', 'are', 'cut', 'down', 'annually'], ['there', 'about', '5', 'billion', 'trees', 'are', 'planted', 'annually']]


In [296]:
# Display the same nested token lists (one inner list per sentence) in the notebook's pretty-printed form
ks_tokens

[['the',
  'number',
  'of',
  'trees',
  'in',
  'the',
  'world',
  'according',
  'to',
  'a',
  '2015',
  'estimate',
  'is',
  '3',
  '04',
  'trillion'],
 ['46',
  'of',
  'the',
  'trees',
  'in',
  'the',
  'world',
  'are',
  'in',
  'the',
  'tropics',
  'or',
  'sub',
  'tropics'],
 ['20',
  'of',
  'the',
  'trees',
  'in',
  'the',
  'world',
  'are',
  'in',
  'the',
  'temperate',
  'zones'],
 ['24',
  'of',
  'threes',
  'in',
  'the',
  'world',
  'are',
  'in',
  'the',
  'coniferous',
  'boreal',
  'forests'],
 ['there',
  'about',
  '15',
  'billion',
  'trees',
  'are',
  'cut',
  'down',
  'annually'],
 ['there', 'about', '5', 'billion', 'trees', 'are', 'planted', 'annually']]

In [297]:
# Extracting the tokens using NLTK.
# NLTK's word_tokenize is imported at the top of the notebook under the alias
# `nltk_work_tokenize`, so that alias is the name that must be called here.
# The tokenizer takes the raw sentence string directly; no pandas round trip is needed.
nltk_tokens = [nltk_work_tokenize(sentence) for sentence in corpus]
print(nltk_tokens)

[['The', 'number', 'of', 'trees', 'in', 'the', 'world', ',', 'according', 'to', 'a', '2015', 'estimate', ',', 'is', '3.04', 'trillion', '.'], ['46', '%', 'of', 'the', 'trees', 'in', 'the', 'world', 'are', 'in', 'the', 'tropics', 'or', 'sub-tropics', '.'], ['20', '%', 'of', 'the', 'trees', 'in', 'the', 'world', 'are', 'in', 'the', 'temperate', 'zones', '.'], ['24', '%', 'of', 'threes', 'in', 'the', 'world', 'are', 'in', 'the', 'coniferous', 'boreal', 'forests', '.'], ['There', 'about', '15', 'billion', 'trees', 'are', 'cut', 'down', 'annually', '.'], ['There', 'about', '5', 'billion', 'trees', 'are', 'planted', 'annually', '.']]


## <font color='blue'> Step 2. Remove the stop words. </font>

In [298]:
# Retrieve NLTK's English stop words and keep them in a set for fast membership tests
stop_words = set(stopwords.words('english'))
# Peek at ten of them (a set is unordered, so this sample may differ from run to run)
list(stop_words)[:10]

['on', 'other', 'of', 'him', 'didn', 'have', 'haven', 'or', 'won', 'couldn']

In [299]:
# Drop the stop words from each Keras-tokenized sentence, replacing each inner list in place
for idx, sentence_tokens in enumerate(ks_tokens):
    ks_tokens[idx] = [word for word in sentence_tokens if word not in stop_words]
ks_tokens

[['number',
  'trees',
  'world',
  'according',
  '2015',
  'estimate',
  '3',
  '04',
  'trillion'],
 ['46', 'trees', 'world', 'tropics', 'sub', 'tropics'],
 ['20', 'trees', 'world', 'temperate', 'zones'],
 ['24', 'threes', 'world', 'coniferous', 'boreal', 'forests'],
 ['15', 'billion', 'trees', 'cut', 'annually'],
 ['5', 'billion', 'trees', 'planted', 'annually']]

In [300]:
# Unlike the Keras tokens, the NLTK tokens keep their original casing, so lower-case
# every token first; otherwise capitalized words would not match the stop-word set.
for idx, sentence_tokens in enumerate(nltk_tokens):
    lowered = (token.lower() for token in sentence_tokens)
    nltk_tokens[idx] = [word for word in lowered if word not in stop_words]
nltk_tokens

[['number',
  'trees',
  'world',
  ',',
  'according',
  '2015',
  'estimate',
  ',',
  '3.04',
  'trillion',
  '.'],
 ['46', '%', 'trees', 'world', 'tropics', 'sub-tropics', '.'],
 ['20', '%', 'trees', 'world', 'temperate', 'zones', '.'],
 ['24', '%', 'threes', 'world', 'coniferous', 'boreal', 'forests', '.'],
 ['15', 'billion', 'trees', 'cut', 'annually', '.'],
 ['5', 'billion', 'trees', 'planted', 'annually', '.']]

## <font color='blue'> Step 3. Stemming the tokens, bringing each word to its root. </font>

In [301]:
# We will use the Porter algorithm (which reduces words in 5 phases) to stem the tokens.
# Quick sanity check of the stemmer on two sample words:
porter = PorterStemmer()
for sample_word in ('Eating', 'running'):
    print(porter.stem(sample_word))

eat
run


In [303]:
# Stem every Keras token; slice assignment rewrites each sentence's list in place
for sentence_tokens in ks_tokens:
    sentence_tokens[:] = [porter.stem(token) for token in sentence_tokens]
print(ks_tokens)

[['number', 'tree', 'world', 'accord', '2015', 'estim', '3', '04', 'trillion'], ['46', 'tree', 'world', 'tropic', 'sub', 'tropic'], ['20', 'tree', 'world', 'temper', 'zone'], ['24', 'three', 'world', 'conifer', 'boreal', 'forest'], ['15', 'billion', 'tree', 'cut', 'annual'], ['5', 'billion', 'tree', 'plant', 'annual']]


In [304]:
# Stem every NLTK token, overwriting each position of the sentence's list in place
for sentence_tokens in nltk_tokens:
    for position, token in enumerate(sentence_tokens):
        sentence_tokens[position] = porter.stem(token)
print(nltk_tokens)

[['number', 'tree', 'world', ',', 'accord', '2015', 'estim', ',', '3.04', 'trillion', '.'], ['46', '%', 'tree', 'world', 'tropic', 'sub-trop', '.'], ['20', '%', 'tree', 'world', 'temper', 'zone', '.'], ['24', '%', 'three', 'world', 'conifer', 'boreal', 'forest', '.'], ['15', 'billion', 'tree', 'cut', 'annual', '.'], ['5', 'billion', 'tree', 'plant', 'annual', '.']]


## <font color='blue'> Step 4. Remove punctuation, special characters, purely numeric tokens, and tokens of length 1. </font>

In [305]:
# Translation table that deletes every ASCII punctuation character.
# It is the same for all tokens, so build it once instead of once per token.
punctuation_table = str.maketrans('', '', string.punctuation)

for i, sentence_tokens in enumerate(ks_tokens):
    stripped_tokens = (token.translate(punctuation_table) for token in sentence_tokens)
    # Keep only tokens that are longer than one character and not purely numeric.
    # The length check also discards tokens that became empty after stripping.
    ks_tokens[i] = [
        token for token in stripped_tokens
        if len(token) > 1 and not token.isnumeric()
    ]
print(ks_tokens)

[['number', 'tree', 'world', 'accord', 'estim', 'trillion'], ['tree', 'world', 'tropic', 'sub', 'tropic'], ['tree', 'world', 'temper', 'zone'], ['three', 'world', 'conifer', 'boreal', 'forest'], ['billion', 'tree', 'cut', 'annual'], ['billion', 'tree', 'plant', 'annual']]


In [306]:
# Show the cleaned tokens, one sentence per line
for sentence_tokens in ks_tokens:
    print(sentence_tokens)

['number', 'tree', 'world', 'accord', 'estim', 'trillion']
['tree', 'world', 'tropic', 'sub', 'tropic']
['tree', 'world', 'temper', 'zone']
['three', 'world', 'conifer', 'boreal', 'forest']
['billion', 'tree', 'cut', 'annual']
['billion', 'tree', 'plant', 'annual']


# Text Encoding

In [322]:
from keras.preprocessing.text import Tokenizer

In [324]:
# First step is to get an instance of the tokenizer class (created with its default settings)
tok = Tokenizer()
# Keras can fit our corpus in a single call; fitting populates the statistics
# inspected in the following cells (word_counts, document_count, word_index, word_docs)
tok.fit_on_texts(corpus)

In [331]:
# Let's explore the content of the fitted tokenizer, starting with the
# total number of occurrences of each word across the whole corpus
print('The frequency of each word in the corpus:', tok.word_counts, sep='\n')

The frequency of each word in the corpus:
OrderedDict([('the', 10), ('number', 1), ('of', 4), ('trees', 5), ('in', 7), ('world', 4), ('according', 1), ('to', 1), ('a', 1), ('2015', 1), ('estimate', 1), ('is', 1), ('3', 1), ('04', 1), ('trillion', 1), ('46', 1), ('are', 5), ('tropics', 2), ('or', 1), ('sub', 1), ('20', 1), ('temperate', 1), ('zones', 1), ('24', 1), ('threes', 1), ('coniferous', 1), ('boreal', 1), ('forests', 1), ('there', 2), ('about', 2), ('15', 1), ('billion', 2), ('cut', 1), ('down', 1), ('annually', 2), ('5', 1), ('planted', 1)])


In [332]:
# Number of texts (sentences, in this corpus) the tokenizer was fitted on
print('The number of sentences in the corpus:', tok.document_count, sep='\n')

The number of sentences in the corpus:
6


In [335]:
# Display the raw sentences again for reference
corpus

['The number of trees in the world, according to a 2015 estimate, is 3.04 trillion.',
 '46% of the trees in the world are in the tropics or sub-tropics.',
 '20% of the trees in the world are in the temperate zones.',
 '24% of threes in the world are in the coniferous boreal forests.',
 'There about 15 billion trees are cut down annually.',
 'There about 5 billion trees are planted annually.']

In [336]:
# word_index maps each token to an integer id; in the output the ids start at 1
# and the most frequent words receive the smallest ids
print('The index of each extracted tokens:', tok.word_index, sep='\n')

The index of each extracted tokens:
{'the': 1, 'in': 2, 'trees': 3, 'are': 4, 'of': 5, 'world': 6, 'tropics': 7, 'there': 8, 'about': 9, 'billion': 10, 'annually': 11, 'number': 12, 'according': 13, 'to': 14, 'a': 15, '2015': 16, 'estimate': 17, 'is': 18, '3': 19, '04': 20, 'trillion': 21, '46': 22, 'or': 23, 'sub': 24, '20': 25, 'temperate': 26, 'zones': 27, '24': 28, 'threes': 29, 'coniferous': 30, 'boreal': 31, 'forests': 32, '15': 33, 'cut': 34, 'down': 35, '5': 36, 'planted': 37}


In [339]:
# word_docs counts, for each word, in how many sentences it appears
# (as opposed to word_counts, which counts every occurrence)
print('The appearnce of each words in sentences:', tok.word_docs, sep='\n')

The appearnce of each words in sentences:
defaultdict(<class 'int'>, {'3': 1, 'estimate': 1, 'the': 4, 'of': 4, 'according': 1, 'a': 1, 'trillion': 1, 'world': 4, 'in': 4, 'to': 1, 'trees': 5, 'is': 1, 'number': 1, '2015': 1, '04': 1, '46': 1, 'or': 1, 'are': 5, 'sub': 1, 'tropics': 1, 'temperate': 1, '20': 1, 'zones': 1, 'coniferous': 1, 'boreal': 1, '24': 1, 'threes': 1, 'forests': 1, '15': 1, 'annually': 2, 'cut': 1, 'billion': 2, 'about': 2, 'down': 1, 'there': 2, 'planted': 1, '5': 1})


In [327]:
# Display the raw sentences once more for reference
corpus

['The number of trees in the world, according to a 2015 estimate, is 3.04 trillion.',
 '46% of the trees in the world are in the tropics or sub-tropics.',
 '20% of the trees in the world are in the temperate zones.',
 '24% of threes in the world are in the coniferous boreal forests.',
 'There about 15 billion trees are cut down annually.',
 'There about 5 billion trees are planted annually.']

In [349]:
# Display the token -> integer id mapping in the notebook's pretty-printed form
tok.word_index

{'the': 1,
 'in': 2,
 'trees': 3,
 'are': 4,
 'of': 5,
 'world': 6,
 'tropics': 7,
 'there': 8,
 'about': 9,
 'billion': 10,
 'annually': 11,
 'number': 12,
 'according': 13,
 'to': 14,
 'a': 15,
 '2015': 16,
 'estimate': 17,
 'is': 18,
 '3': 19,
 '04': 20,
 'trillion': 21,
 '46': 22,
 'or': 23,
 'sub': 24,
 '20': 25,
 'temperate': 26,
 'zones': 27,
 '24': 28,
 'threes': 29,
 'coniferous': 30,
 'boreal': 31,
 'forests': 32,
 '15': 33,
 'cut': 34,
 'down': 35,
 '5': 36,
 'planted': 37}

## <font color='green'> Approach #1: One Hot Encoding </font>

In [351]:
# One row per sentence and one column per word id: a cell is 1 when that word occurs in the sentence.
# Column 0 stays 0 in every row because the word ids in word_index start at 1.
print(tok.texts_to_matrix(corpus, mode='binary'))

[[0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]


In [352]:
# Encoding the number of times each word occurs in the sentence.
# mode='count' yields raw occurrence counts; it is not a TF-IDF weighting.
print(tok.texts_to_matrix(corpus, mode='count'))

[[0. 2. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 2. 1. 1. 1. 1. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 2. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 2. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]


In [353]:
# Encoding the relative frequency of each word: its count divided by the number of tokens in the sentence
# (e.g. 'the' occurs 2 times among the 16 tokens of the first sentence, giving 0.125)
print(tok.texts_to_matrix(corpus, mode='freq'))

[[0.         0.125      0.0625     0.0625     0.         0.0625
  0.0625     0.         0.         0.         0.         0.
  0.0625     0.0625     0.0625     0.0625     0.0625     0.0625
  0.0625     0.0625     0.0625     0.0625     0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.21428571 0.14285714 0.07142857 0.07142857 0.07142857
  0.07142857 0.14285714 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.07142857 0.07142857
  0.07142857 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.25       0.16666667 0.08333333 0.08333333 0.08333333
  0.08333333 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.   