# =========================
# Load libraries
# =========================

In [9]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras import callbacks, layers, models
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# =========================
# Load data
# =========================

In [10]:
# Load the pre-processed Stanford data set; only its "text" and "label"
# columns are used below.
data = pd.read_csv("../../data/processed/stanford.csv")

# Hold out 20% of the rows as the test set. The split is seeded so that the
# train/test partition (and every score reported further down) is identical
# after a kernel restart; an unseeded split gives different numbers each run.
SPLIT_SEED = 42
train_data, test_data, train_labels, test_labels = train_test_split(
    data["text"], data["label"], test_size=0.2, random_state=SPLIT_SEED
)


# =========================
# One-hot encode
# =========================

In [11]:
# Build the word index from the training texts only, capping the tokenizer
# at 10000 words.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data)

# Report how many distinct words were seen in total (this is the size of the
# full word index, not the num_words cap) so the cap above can be tuned.
print('Found %d unique words.' % (len(tokenizer.word_index),))

# Encode every training text as one row of a binary matrix.
train_encodings = tokenizer.texts_to_matrix(
    train_data,
    mode='binary',
)

Found 54397 unique words.


# =========================
# Split data
# =========================

In [12]:
# Shuffle the training rows with a seeded generator so the train/validation
# partition is reproducible. The shuffled arrays get their own names instead
# of overwriting train_encodings / train_labels: rebinding train_labels to a
# NumPy array made a second run of this cell fail on `.values`.
shuffle_rng = np.random.RandomState(42)
indices = shuffle_rng.permutation(train_encodings.shape[0])
shuffled_encodings = train_encodings[indices]
shuffled_labels = np.asarray(train_labels)[indices]

# Split into training and validation data (80:20). The cut-off is derived
# from the actual number of rows, so the ratio holds whatever the size of
# the data set (a fixed row count silently changes the ratio, or leaves the
# validation set empty, when the data set size changes).
n_train = int(0.8 * len(indices))
x_train = shuffled_encodings[:n_train]
y_train = shuffled_labels[:n_train]
x_val   = shuffled_encodings[n_train:]
y_val   = shuffled_labels[n_train:]

# =========================
# Callbacks
# =========================

In [13]:
# The list below is bound to the name `callbacks` (the training cell passes
# it to model.fit), which hides the `callbacks` module imported at the top of
# the notebook. Building the list through a private alias keeps this cell
# re-runnable: previously a second run raised AttributeError because
# `callbacks` was already a list.
from keras import callbacks as keras_callbacks

callbacks = [
    # Write TensorBoard logs (weight histograms every epoch).
    # NOTE(review): the embedding settings only make sense if the model has
    # an Embedding layer, and the directory name mentions an LSTM; the model
    # built in this notebook is Dense-only -- confirm these are intended.
    keras_callbacks.TensorBoard(
        log_dir='log_lstm_one_hot',
        histogram_freq=1,
        embeddings_freq=1,
        embeddings_data=x_train[:100],
    ),
    # Stop training once 'acc' has failed to improve for more than one epoch.
    # NOTE(review): 'acc' is the *training* accuracy, not a validation
    # metric -- confirm this is the quantity that should end training.
    keras_callbacks.EarlyStopping(
        monitor='acc',
        patience=1,
    ),
    # Keep only the weights of the epoch with the best validation loss.
    keras_callbacks.ModelCheckpoint(
        filepath='my_model.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
    # Scale the learning rate by 0.1 when the validation loss has not
    # improved for 10 epochs.
    keras_callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=10,
    ),
]

# =========================
# Build model
# =========================

In [14]:
# Three-layer fully connected classifier.
# The width of the input (10000) has to equal the num_words value given to
# Tokenizer(), because every encoded text is a vector of that length.
# The last layer is a single sigmoid unit, so the model outputs a value in
# [0, 1] that is read as a probability.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# =========================
# Train model
# =========================

In [15]:
# The model outputs a probability, so binary cross-entropy is used as the
# loss: it measures the distance between the predicted and the true
# probability distributions.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train on the training split, scoring the validation split after every
# epoch; the callbacks list is defined in the callbacks cell above.
history = model.fit(x_train,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    callbacks=callbacks)

# Prep history dictionary.
# Older Keras releases record the accuracy metric as 'acc', newer ones as
# 'accuracy'; use whichever key is present so this cell does not raise
# KeyError after a library upgrade.
metrics_log = history.history
acc_key = 'acc' if 'acc' in metrics_log else 'accuracy'
acc = metrics_log[acc_key]
val_acc = metrics_log['val_' + acc_key]
loss = metrics_log['loss']
val_loss = metrics_log['val_loss']
# One x-axis position per completed epoch (fewer than 20 if training was
# stopped early).
epochs = range(1, len(acc) + 1)

# Plot the training and validation loss
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot the training and validation accuracy (on a cleared figure)
plt.clf()
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()





ValueError: Input 0 is incompatible with layer lstm_2: expected ndim=3, found ndim=2

# =========================
# Evaluate on test data
# =========================

In [None]:
# Do NOT refit the tokenizer on the test data: the test texts must be encoded
# with the word index learned from the training data. To reserve a token for
# unknown words, pass oov_token (a string such as '<OOV>') when constructing
# the Tokenizer. See https://bit.ly/2lNh15g

# Encode the test texts the same way as the training texts
test_encodings = tokenizer.texts_to_matrix(test_data, mode='binary')

# Keep the scores returned by evaluate(); as a bare mid-cell expression they
# were discarded and only the metric names were displayed.
test_metrics = model.evaluate(test_encodings, test_labels.values)

# Show each score next to its name (names come from model.metrics_names)
dict(zip(model.metrics_names, test_metrics))



