In [97]:
import pandas as pd
import tensorflow as tf
import os
import nltk
import pickle
from tqdm import tqdm_notebook as tqdm
from collections import OrderedDict

In [2]:
# Switch TensorFlow to eager execution: later cells iterate the dataset in
# plain Python loops and call `.numpy()` on the tensors it yields.
tf.enable_eager_execution()

In [3]:
NUM_WORDS = 10000  # vocabulary size: only this many of the most frequent tokens are kept
OOV_TOKEN = '<UNK>'  # placeholder passed to the encoder for tokens outside the vocabulary

In [94]:
# Read the reviews CSV (header row skipped) as (float32, string) records and
# swap each record so the string column comes first: (text, stars).
dataset = tf.data.experimental.CsvDataset(
    os.path.join('data', 'reviews.csv'),
    record_defaults=[tf.float32, tf.string],
    header=True,
).map(lambda star_rating, review_text: (review_text, star_rating))

# Tokenize

In [10]:
def tokenize(text, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r'):
    """Split ``text`` into word tokens.

    Every character listed in ``filters`` is replaced by a single space, then
    the cleaned text is handed to ``nltk.word_tokenize``.

    Args:
        text: String to tokenize.
        lower: If true (the default), lowercase ``text`` before tokenizing.
        filters: Characters to blank out before tokenizing.

    Returns:
        The list of tokens returned by ``nltk.word_tokenize``.
    """
    # Honour the flag: previously the text was lowercased unconditionally,
    # so ``lower=False`` had no effect.
    if lower:
        text = text.lower()
    text = text.translate(str.maketrans(filters, ' ' * len(filters)))
    return nltk.word_tokenize(text)

In [21]:
# Tally how often each token appears across every review in the dataset.
# An OrderedDict keeps tokens in first-seen order.
word_counts = OrderedDict()
for review, _ in tqdm(dataset):
    # Each element's text is an eager string tensor: take its bytes and decode.
    decoded_review = review.numpy().decode('utf-8')
    for word in tokenize(decoded_review):
        word_counts[word] = word_counts.get(word, 0) + 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [22]:
# Rank every token by frequency (most frequent first) and keep the top
# NUM_WORDS words as the vocabulary.
#
# `word_counts` is rebound here from a mapping to a list of (word, count)
# pairs. OrderedDict(...) accepts either form, so re-running this cell no
# longer fails with "'list' object has no attribute 'items'". The sort is
# stable, so equally frequent words keep their existing relative order.
word_counts = sorted(
    OrderedDict(word_counts).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
vocabulary_list = [word for word, _count in word_counts[:NUM_WORDS]]

In [23]:
# Distinct tokens seen in total, then the number kept for the vocabulary.
print(len(word_counts), len(vocabulary_list), sep='\n')

172485
10000


# Encode

In [40]:
class TokenizerEncoder:
    """Convert text to integer token ids and back using a fixed vocabulary.

    Id 0 is reserved for the padding token and id 1 for the out-of-vocabulary
    (OOV) token; the words of ``vocab_list`` are numbered from 2 upwards in
    list order.
    """

    _PAD_INDEX = 0
    _OOV_INDEX = 1

    def __init__(self, vocab_list, oov_token):
        """Build the word -> id and id -> word lookup tables.

        Args:
            vocab_list: Iterable of vocabulary words; the word at position
                ``i`` receives id ``i + 2``.
            oov_token: String standing in for any word not in the vocabulary.
        """
        self.oov_token = oov_token

        self.word_index = {word: index for index, word in enumerate(vocab_list, start=2)}
        self.word_index[oov_token] = self._OOV_INDEX

        self.reverse_word_index = {index: word for word, index in self.word_index.items()}
        self.reverse_word_index[self._PAD_INDEX] = ''  # padding decodes to an empty string

    def encode(self, text):
        """Tokenize ``text`` and return its list of token ids.

        Tokens that are not in the vocabulary are mapped to the OOV id.
        """
        oov_index = self.word_index[self.oov_token]
        return [self.word_index.get(token, oov_index) for token in tokenize(text)]

    def decode(self, encoded_tokens):
        """Turn an iterable of token ids back into a space-joined string.

        Each id is passed through ``int()`` before the lookup. Ids that are
        not in the table decode to the OOV token instead of raising
        ``KeyError``, mirroring how ``encode`` treats unknown words. Padding
        ids decode to empty strings, so padded sequences produce extra spaces.
        """
        return ' '.join(
            self.reverse_word_index.get(int(idx), self.oov_token)
            for idx in encoded_tokens
        )

In [41]:
# Build the encoder from the frequency-ranked vocabulary and the OOV placeholder.
tokenizer = TokenizerEncoder(vocab_list=vocabulary_list, oov_token=OOV_TOKEN)

In [43]:
# Round-trip a sample review: encode it to token ids, then decode the ids back
# to text. Words outside the vocabulary come back as the OOV token.
encoded_tokens = tokenizer.encode('Total\r\n bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.')
print(encoded_tokens)
print(tokenizer.decode(encoded_tokens))

[825, 577, 11, 19, 549, 45, 110, 1, 250, 8617, 283, 25, 2, 4925, 6, 552, 78, 7165, 11, 143, 8003, 4, 777, 752, 2, 8003, 72, 34, 25, 11, 2524, 2796, 290, 1003, 2071, 1, 30, 40, 1799]
total bill for this horrible service over <UNK> these crooks actually had the nerve to charge us 69 for 3 pills i checked online the pills can be had for 19 cents each avoid hospital <UNK> at all costs


# Test with dataset

In [95]:
def tf_encode(text, stars):
    """Dataset-map wrapper that encodes one (text, stars) element.

    The Python-side work runs inside ``tf.py_function``: the text tensor's
    bytes are decoded as UTF-8 and passed to ``tokenizer.encode``, and the
    stars value is cast to int64.

    Returns:
        A ``(token_ids, stars)`` tuple of int64 tensors.
    """
    def _encode_in_python(text_tensor, stars_tensor):
        review = text_tensor.numpy().decode('utf-8')
        return tokenizer.encode(review), tf.cast(stars_tensor, tf.int64)

    encoded_text, encoded_stars = tf.py_function(
        func=_encode_in_python,
        inp=[text, stars],
        Tout=(tf.int64, tf.int64),
    )
    # Return an explicit tuple so the dataset keeps two separate components.
    return encoded_text, encoded_stars

# Encode every (text, stars) element into (token ids, int64 stars).
# NOTE(review): this rebinds `dataset`, so running the cell a second time would
# pass the already-encoded ids back through `tf_encode` — re-create `dataset`
# from the CSV before re-running.
dataset = dataset.map(tf_encode)

In [96]:
# Pull the first element of the mapped dataset to inspect the encoded output.
next(iter(dataset))

(<tf.Tensor: id=3000427, shape=(39,), dtype=int64, numpy=
 array([ 825,  577,   11,   19,  549,   45,  110,    1,  250, 8617,  283,
          25,    2, 4925,    6,  552,   78, 7165,   11,  143, 8003,    4,
         777,  752,    2, 8003,   72,   34,   25,   11, 2524, 2796,  290,
        1003, 2071,    1,   30,   40, 1799], dtype=int64)>,
 <tf.Tensor: id=3000428, shape=(), dtype=int64, numpy=1>)

# Save `vocabulary_list` and `word_counts` to a pickle file

In [98]:
# Write two consecutive pickles into one file: first the vocabulary list, then
# the frequency-sorted (word, count) pairs. They must be read back in this order.
with open('tokenizer.pkl', 'wb') as pickle_file:
    for payload in (vocabulary_list, word_counts):
        pickle.dump(payload, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# Test pickle file

In [99]:
# Read the two pickles back in the order they were written: vocabulary first,
# then the word counts. Only unpickle files you trust.
with open('tokenizer.pkl', 'rb') as pickle_file:
    vocab_list, wc = pickle.load(pickle_file), pickle.load(pickle_file)

In [101]:
# Number of vocabulary words read back from the pickle file.
len(vocab_list)

10000

In [103]:
# Number of (word, count) entries read back from the pickle file.
len(wc)

172485