# TextLineDataset
Using TextLineDataset for loading custom text datasets.

In [4]:
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds
from tensorflow import keras 
from tensorflow.keras import layers 
import pickle


In [8]:
# Lets load and create our test and train datasets
ds_train = tf.data.TextLineDataset("../../datasets/movies/imdb.csv")
ds_test = tf.data.TextLineDataset("../../datasets/movies/imdb.csv") 

In [21]:
# lets analyze the contents
def describe_text_dataset(dataset,lines):
    for line in dataset.skip(1).take(lines):    
        print(tf.strings.split(line, ",",maxsplit=4))

describe_text_dataset(ds_train, 1)


tf.Tensor(
[b'25001' b'train' b'neg' b'10000_4.txt'
 b'"Airport \'77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP\'s to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son. The luxury jetliner takes off as planned but mid-air the plane is hi-jacked by the co-pilot Chambers (Robert Foxworth) & his two accomplice\'s Banker (Monte Markham) & Wilson (Michael Pataki) who knock the passengers & crew out with sleeping gas, they plan to steal the valuable cargo & land on a disused plane strip on an isolated island but while making his descent Chambers almost hits an oil rig in the Ocean & loses control of the plane sending it crashing into the sea where it sinks to the bottom right bang in the middle of the Bermuda Triangle. With air in short supply, water leaking in & havi

In [24]:
def filter_text(line, type='test'):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    dataset_belonging = split_line[1] # train or test
    sentiment_category = split_line[2] # pos, neg, unsup

    return (
        True
        if dataset_belonging == type and sentiment_category != 'unsup'
        else False
    )

# Filter the train and test data
FILEPATH = "../../datasets/movies/imdb.csv"
ds_train =  tf.data.TextLineDataset(FILEPATH).filter(lambda line: filter_text(line, 'train'))
ds_test =  tf.data.TextLineDataset(FILEPATH).filter(lambda line: filter_text(line, 'test'))
# Check that the filter works
describe_text_dataset(ds_test, 1)

tf.Tensor(
[b'1' b'test' b'neg' b'10000_4.txt'
 b'"This is an example of why the majority of action films are the same. Generic and boring, there\'s really nothing worth watching here. A complete waste of the then barely-tapped talents of Ice-T and Ice Cube, who\'ve each proven many times over that they are capable of acting, and acting well. Don\'t bother with this one, go see New Jack City, Ricochet or watch New York Undercover for Ice-T, or Boyz n the Hood, Higher Learning or Friday for Ice Cube and see the real deal. Ice-T\'s horribly cliched dialogue alone makes this film grate at the teeth, and I\'m still wondering what the heck Bill Paxton was doing in this film? And why the heck does he always play the exact same character? From Aliens onward, every film I\'ve seen with Bill Paxton has him playing the exact same irritating character, and at least in Aliens his character died, which made it somewhat gratifying...<br /><br />Overall, this is second-rate action trash. There are co

1. Create Vocabulary
2. Numericalize text str -> indices(TokenTextEncoder)
3. Pad the batches so we can send as input of a RNN


In [29]:
tokenizer = tfds.features.text.Tokenizer()
# "Tokenize my string" -> ['Tokenize', 'my', 'string'] -> [0,1,2]

def build_vocabulary(dataset, threshold=200):
    """Build a vocabulary"""
    frequencies = {}
    vocabulary=set()
    vocabulary.update(["sostoken"])
    vocabulary.update(["eostoken"])

    for line in dataset.skip(1):
        split_line = tf.strings.split(line, ",", maxsplit=4)
        review = split_line[4]
        tokenized_text = tokenizer.tokenize(review.numpy().lower())

        for word in tokenized_text:
            if word not in frequencies:
                frequencies[word] += 1
            else:
                frequencies[word] += 1
            #if we've reached the threshold
            if frequencies[word] == threshold:
                vocabulary.update(tokenized_text)
    return vocabulary

# Build vocabulary and save it to vocabulary.obj
vocabulary = build_vocabulary(ds_train)
vocab_file = open('vocabulary.obj', 'wb') #wb=write byte
pickle.dump(vocabulary, vocab_file)

# loading the vocabulary object
#vocab_file = open("vocabulary.obj", "rb")
#vocabulary = picke.load(vocab_file)
#tfds.features.Text.encoder()  
encoder = tfds.deprecated.text.TokenTextEncoder(
    list(vocabulary), oov_token="<UNK>", lowercase=True, tokenizer = tokenizer,
)

def my_encoder(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label 

def encode_map_fn(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    label_str = split_line[2] #neg, pos
    review = "sostoken " + split_line[4] + " eostoken"
    label = 1 if label_str == "pos" else 0

    (encoded_text, label) = tf.py_function(
        my_encoder, inp=[review, label], Tout=(tf.int64, tf.int32)
    )
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

AUTOTUNE = tf.data.experimental.AUTOTUNE
ds_train  = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE).cache()
ds_train = ds_train.shuffle(25000)
ds_train =  ds_train.padded_batch(32, padded_shapes=([None],()))

ds_test = ds_test.map(encode_map_fn)
ds_test = ds_test.padded_batch(32, padded_shapes=([None],()))


# create model
model = keras.Sequential(
    [
        layers.Masking(mask_value=0),
        layers.Embedding(input_dim=len(vocabulary)+2, output_dim=32),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),

    ]
)

model.compile(
    loss = keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer= keras.optimizers.Adam(3e-4, clipnorm=1),   
    metrics=["accuracy"],
)

model.fit(ds_train, epochs=15, verbose=2 )
model.evaluate()


AttributeError: module 'tensorflow_datasets.core.features' has no attribute 'text'