In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("../../movie_data.csv", encoding = 'utf-8')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,"Election is a Chinese mob movie, or triads in ...",1
1,I was just watching a Forensic Files marathon ...,0
2,Police Story is a stunning series of set piece...,1
3,"Dear Readers,<br /><br />The final battle betw...",1
4,I have seen The Perfect Son about three times....,1


In [5]:
# create a Tensorflow dataset object
target = df.pop('sentiment')
ds_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))
ds_raw

<TensorSliceDataset shapes: ((1,), ()), types: (tf.string, tf.int64)>

In [26]:
# verify
# print first 50 characters of the review, and the sentiment
for ex in ds_raw.take(3):
    tf.print(ex[0].numpy()[0][:50], ex[1])

b'Election is a Chinese mob movie, or triads in this' 1
b'I was just watching a Forensic Files marathon on C' 0
b'Police Story is a stunning series of set pieces fo' 1


In [18]:
# split into training, testing and validation dataset
tf.random.set_seed(1)
ds_raw = ds_raw.shuffle(50000, reshuffle_each_iteration = False)
ds_raw_test = ds_raw.take(25000)
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_train = ds_raw_train_valid.take(20000)
ds_raw_valid = ds_raw_train_valid.skip(20000)

In [19]:
# collect unique tokens
# use Counter class from the collections package
from collections import Counter

tokenizer = tfds.features.text.Tokenizer()
token_counts = Counter()

In [31]:
for example in ds_raw_train:
    tokens = tokenizer.tokenize(example[0].numpy()[0])
    token_counts.update(tokens)
# show vocab size
print(len(token_counts))

87397


In [21]:
for example in ds_raw_train:
    print(example[0])

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"A rather mild horror movie; if not for a couple of sex scenes, it could easily have been a TV movie. Plot holes abound (one example: why would there be a secret passage from the 18th century leading from the upper floor of a house that was burned to the ground and a new building put ther 200 years later?), cardboard acting, characters doing things that anyone with an IQ bigger than their shoe size wouldn't do...<br /><br />It's got a few fun moments, but overall it's a sub-par film that managed to get Roy Scheider because his bills were due. If you're looking for an extremely formulaic, predictable film that might provide a few laughs, it might be worth watching. If not, then this one's not for you."],
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"This is a very odd film ... I wasn't really sure what is was about, some N London lowlifes find a mute kid in t

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'Very strange. But meant to be. This director is his own man. Even through there are strains if Polanski, Bergman, and Kafka at least in the episode no 6, the peeping tom one. What made it all so strange, and reminiscent of the above three artists, was that it went all over the place, you never knew where it was headed, and could have ended anyplace, and finally when it did end, could have kept going. The ending is hardly a finality, nobody could tell you what these two characters would be doing in even the next frame. One other thing should be said about the director: No wonder Kubrick found him fascinating. There is a lot of Eyes Wide Shut in this episode somehow, in the direct approach to character, the realistic fantasy elements of both. A Kubrick placement of the camera without any of the stark effects, much more washed out, and hurried, not as fussed over. That said, back to the beginning, still this guy has his own things to 

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"Someone i know said that there was this film called flatliners that was probably up my street. I was told about this movie after watching final destination 2 and watching the extra feature about near deth experience.<br /><br />I bought the DVD of flatliners at the modest price of 5 pounds. Got home and watched it. And i could not help but smile and feel good wondering how this film hadn't been in my life before. The film is about a group of medical students try to see what it's like after near death experience. But then there sins come back in reality and can harm them physically.<br /><br />Acting from Kevin Bacon and Keifer Sutherland is great as you would expect from the pair. And Joel Schmacheur made this a great movie like he did with the lost boys.<br /><br />This is an edgy and stylish thriller bound to please nay type of film fan."],
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(1,)

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'There is one good thing in this movie: Lola Glaudini\'s ass! Sorry to be so blunt but it\'s the truth. Too bad she didn\'t do a nude. It would at least have made this mess tolerable. We see another chick\'s boobs but she\'s nowhere near Lola. And man, is Armand Assante old or what? The man looks like crap! "Consequence" is the usual B-Movie you would expect. The story had potential. It\'s like they had good ideas but didn\'t know how to execute them. The cinematography is just plain awful. Ugly! The directing is uninspired and the end result is a bland thriller with lame twists and washed up actors. Lola Gaudini is great as the vixen in a cheap, slutty way but not even she saves "Consequence" from being trash and not funny trash, just plain old stinking trash.'],
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"Miraculously, this is actually quite watchable. I

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"How do you spell washed up fat Italian who can barely pull off a martial arts move without needing some heart medication? In this movie we see Steven Seagal at his lowest level of accomplishment- since his career started it has been a steady decline into pathetic over indulgent behavior that has scuttled his career. In this movie it looks like most of his training consisted of ordering the fetuccini alfredo at his restaurant every day.<br /><br />He is fat, slow and very old looking in this movie, hardly a martial arts action hero, more like a laughing stock clown.<br /><br />It's time for Steven Seagal to retire- this movie is about 2 hours of reasons why.<br /><br />Plot: fat Italian guy with a big reputation on the force gets wind that a crime group may be playing around with a drug designed by the military to create the ultimate warrior response. This pretense, although pathetic and laughable, gives opportunity for some over th

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'The novel WEAPON which serves as the basis for this atrocious piece of garbage is one of the best techno-thrillers to come down the pike in a long time.The character of SOLO, who is NOT supposed to look like a human, is a wonderful creation and it was simply awful to see him reduced to just another Terminator-clone with Mario Van Peebles horrendously trying to "act" like a robot. There is NOTHING worthwhile about this film.<br /><br />Why does Hollywood insist on snapping up the rights to excellent novels and then butchering them? There are so many things wrong with SOLO that listing them seems as unfair as inviting a man with no legs to a brisk game of Hopscotch. Avoid this awful film and seek out the 2 novels by ROBERT MASON that feature the awesome character of SOLO. The books are WEAPON and an excellent sequel SOLO.But don\'t pay any attention to this awful dreck of a film.'],
      dtype=object)>, <tf.Tensor: shape=(), dtype=i

(<tf.Tensor: shape=(1,), dtype=string, numpy=
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"I've seen the previews everywhere before deciding to watch it. And what do you know, I actually liked it! It has a new twist of the 18th century england. Although the music in the dance scene were obviously modernized and also the colors of Liv Tyler's clothers (although it IS pretty!), it fit quite perfectly.<br /><br />If you just want a good time, you should check this out. Very different from other 18th century detailed films."],
      dtype=object)>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'If you\'ve never experienced the thing that is Zasu Pitts, here is a Zasu zinger! In 1933 Mae Questel caricatured Pitt\'s voice for the character Olive Oyl for the Fleischer Studios animated cartoon version of the comic strip Popeye. Zasu (pronounced Zay-Sue) does her bes

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'I first saw this film in the theater way back in the 40s when I was a kid and always remembered the ending. There is nothing like the first impression but some movies are always a treat each time they are viewed. Something just resonates with them. This is one of those films and I agree with another reviewer who said Fritz Lang should have directed more westerns. To add to it I have always liked Randolph Scott and Robert Young. In fact, Robert Young stars in what I consider my favorite movie if I have to name just one, not an easy thing to do. That film is Northwest Passage. It led me to the superb historical novels of Kenneth Roberts. Western Union likewise led me to reading Zane Grey\'s novel which, in this case turned out to be one of those rare cases where I like the movie better than the novel. Not that Grey\'s novel is a bad one; I just like the movie story better. The movie in no way resembles the novel. It is a completely d

(<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"'The Big Snit' came into my life complete by accident and has left an indelible mark on my soul. A scar of love, destruction and pointlessness that will forever be a part of my life. This is tale of beautiful futility. We are helpless without each other. We are helpless against governmental wrong-doings. We are helpless as to the choices we all try to make when in love. Deaf to the mutterings and goings on of an world outside the window. Blind to an inevitable apocalypse. Dumb of the hatred and greedy opinions of an over-indulgent society. This is a tale of personal commitment and triumphant love defeating the ideologies of war. Their petty bickering is a sublime observation of human nature and of how love comes with it's pains and darkness Everyone has some irritating aspect to their personality and this is observed by the makers in the most simplistic and fantastic way. We travel only a short distance with the two main characters

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [36]:
encoder = tfds.features.text.TokenTextEncoder(token_counts)
example_str = 'This is example, YOU!'
print(encoder.encode(example_str))

[104, 105, 24, 10193]
