# Load data

In [None]:
import pandas as pd
from IPython.display import Image
from IPython.core.display import HTML 
import sys
sys.path.append("..")
from lib import is_rotten, predict, plot_confusion_matrix

# Read the tab-separated review file (it has no header row) and then give
# its columns the names the rest of the notebook refers to.
data_path = '/tmp/imdb.csv'
column_names = ['review', 'like']

reviews = pd.read_csv(data_path, header=None, sep="\t")
reviews.columns = column_names

In [None]:
import warnings
# Installed before the keras import below so that warnings emitted while
# importing are silenced as well.
warnings.filterwarnings('ignore')

from keras import preprocessing
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Vocabulary size shared by the tokenizer here and by the Embedding layer's
# input dimension in the model cell; keep a single source of truth.
max_features = 10000
# Every review is padded or truncated to this many token ids.
maxlen = 80

# Hold out a third of the reviews for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    reviews['review'], reviews['like'], test_size=0.33, random_state=42)

# Fit the vocabulary on the training text only, so nothing from the test
# split influences the word indices.
# Use max_features rather than a second hard-coded 10000: otherwise changing
# one value would let token ids fall outside the embedding table.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

# Convert each review into a list of integer word ids.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to a fixed length of maxlen so they stack into a
# 2-D array the model can consume.
x_train = preprocessing.sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(test_sequences, maxlen=maxlen)

In [None]:
import os

# Directory that receives the metadata file written below; the same name is
# handed to the TensorBoard callback in the training cell.
log_dir = "./logs"

# Make sure the directory exists first: open(..., 'w') creates the file but
# not missing parent directories, so a fresh checkout would otherwise fail
# with FileNotFoundError.
os.makedirs(log_dir, exist_ok=True)

# Write one "<word>\t<index>" line per vocabulary entry.
# UTF-8 is set explicitly because review vocabulary may contain non-ASCII
# words and the platform default encoding is not guaranteed to handle them.
# NOTE(review): the TensorBoard projector documentation says a multi-column
# metadata file is expected to start with a header row, and rows are matched
# to embedding rows by position -- confirm this layout labels points correctly.
with open(os.path.join(log_dir, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    for word, index in tokenizer.word_index.items():
        f.write("%s\t%s\n" % (word, index))


In [None]:
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, LSTM, Dropout

# Training hyper-parameters.
batch_size = 16
epochs = 10
# Size of each learned word vector. Kept separate from batch_size: the
# original passed batch_size as the embedding dimension, which only worked
# because both happened to be 16 and would have changed the architecture
# whenever the batch size was tuned.
embedding_dim = 16

# Embedding -> LSTM -> single sigmoid unit for binary classification.
model = Sequential()
# input_length is required here: without it the TensorBoard callback throws
# an "__int__ returned non-int" error.
model.add(Embedding(max_features, embedding_dim, input_length=maxlen))
model.add(LSTM(32))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

# https://keras.io/callbacks/#tensorboard
# Log training to log_dir and save embedding data every 5 epochs, using
# x_test as the embedding input and metadata.tsv for labels.
callbacks = [
    TensorBoard(
        log_dir=log_dir,
        embeddings_freq=5,
        embeddings_metadata="metadata.tsv",
        embeddings_data=x_test
    )
]

# Train, holding back 20% of the training data for validation.
history = model.fit(x_train, y_train,
                    epochs=epochs,
                    callbacks=callbacks,
                    batch_size=batch_size,
                    validation_split=0.2)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

# Threshold the sigmoid output at 0.5 to obtain hard 0/1 labels.
# This is what Sequential.predict_classes did for a single-unit output; that
# method is deprecated and absent from newer Keras releases, so the explicit
# form keeps this cell working across versions.
y_predict = (model.predict(x_test) > 0.5).astype('int32')

# Report accuracy, F1 and the confusion matrix on the held-out test split.
print(accuracy_score(y_test, y_predict))
print(f1_score(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

# Visualization

<pre>
<code>
run ./tensorboard.sh
</code>
</pre>

To explore embeddings for a larger dataset, see the TensorFlow Embedding Projector (https://projector.tensorflow.org/).