In [None]:
from tensorflow import keras

# Fetch the IMDB dataset; Keras returns it as a (train, test) pair of (data, labels) tuples
imdb_train, imdb_test = keras.datasets.imdb.load_data()
train_data, train_labels = imdb_train
test_data, test_labels = imdb_test

# Report the array shapes and show the label vector
print('Training data: ', train_data.shape)
print('Training labels: ', train_labels.shape, train_labels)

# Count how many training samples carry each of the two label values
train_zero, train_one = ((train_labels == label).sum() for label in (0, 1))
print('Number of 0 and 1 in train_labels: {} {}'.format(train_zero, train_one))

In [None]:
# Show the first two encoded reviews together with their lengths
print('The first row in training data (length: {}): \n'.format(len(train_data[0])), train_data[0])
print('\nThe second row in training data (length: {}): \n'.format(len(train_data[1])), train_data[1])

# Length (number of word indices) of every training review
lens = [len(review) for review in train_data]
max_length = max(lens)
min_length = min(lens)
# The message announces "min and max", so the arguments are passed in that order
print('\nThe min and max length in train_data: {}, {}'.format(min_length, max_length))

In [None]:
# Word -> integer index mapping shipped with the IMDB dataset
word2index = keras.datasets.imdb.get_word_index()

# Shift every index by 3 so that indices 0-3 are free for the special tokens below
word2index = {word: index + 3 for (word, index) in word2index.items()}
word2index['<PAD>'] = 0
word2index['<START>'] = 1
word2index['<UNKNOWN>'] = 2
word2index['<UNUSED>'] = 3

# Number of entries shown in each preview; printing the whole vocabulary
# floods the cell output and hides the rest of the notebook
PREVIEW_SIZE = 10

print('The length of word2index: ', len(word2index))
print('The word2index (first {} entries): \n'.format(PREVIEW_SIZE), list(word2index.items())[:PREVIEW_SIZE])

# Reverse mapping (integer index -> word) used to decode reviews back to text
index2word = {index: word for (word, index) in word2index.items()}

print('\nThe index2word:')
print('Sorted index (first {} entries): '.format(PREVIEW_SIZE), sorted(index2word)[:PREVIEW_SIZE])
# The lowest indices are the four special tokens followed by the first shifted words
for i in range(PREVIEW_SIZE):
    print(index2word[i])

In [None]:
#Function to convert a sequence of int to a sequence of words
def getText(int_seq, index_to_word = None):
    """Decode a sequence of integer word indices into a single string.

    Args:
        int_seq: iterable of integer word indices.
        index_to_word: optional dict mapping index -> word. When omitted,
            the notebook-level ``index2word`` dictionary is used.

    Returns:
        The decoded words joined by single spaces. An index that is missing
        from the mapping is rendered as '<UNKNOWN>' instead of producing a
        ``None`` that would make ``str.join`` raise ``TypeError``.
    """
    lookup = index2word if index_to_word is None else index_to_word
    return ' '.join(lookup.get(i, '<UNKNOWN>') for i in int_seq)

# Decode and display the first and the last training review, each followed by its label
for heading, position in (('The first review:', 0), ('\nThe last review:', -1)):
    print(heading)
    print(getText(train_data[position]))
    print('The label: ', train_labels[position])

In [None]:
#Pad train_data and test_data at the end with the <PAD> index up to max_length
pad_options = dict(maxlen = max_length, padding = 'post', value = word2index['<PAD>'])
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

#Show the first and the last padded training rows with their lengths
first_row = train_data[0]
print('The first row of training data (length: {}): \n'.format(len(first_row)), first_row)

last_row = train_data[-1]
print('\nThe last row of training data (length: {}): \n'.format(len(last_row)), last_row)

In [None]:
# Import layers from the same Keras implementation as the `keras` module used for the model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, GlobalAvgPool1D, GlobalMaxPool1D, Flatten, Dense

# One embedding row per entry of word2index (special tokens included)
vocal_size = len(word2index)
embedding_dim = 50

model = keras.Sequential()

#Add an Embedding layer
model.add(Embedding(input_dim = vocal_size, output_dim = embedding_dim, input_length = max_length))

#Add a Convolutional layer
# No input_shape argument here: only the first layer of a Sequential model
# declares the input shape, and it must not include the batch dimension.
model.add(Conv1D(filters = 128, kernel_size = 3, activation = 'relu'))

#Add a Max Pooling layer
model.add(MaxPool1D(pool_size = 2))

#Add a GlobalAvgPool1D layer
model.add(GlobalAvgPool1D())

#Add a hidden Dense layer followed by the single-unit sigmoid output layer
model.add(Dense(64, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

#Display model summary
model.summary()

In [None]:
# Import callbacks from the same Keras implementation as the `keras` module used for the model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

#Compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

#Stop training once val_loss has not improved for 10 consecutive epochs
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10, verbose = 1)

#Save the model whenever val_loss reaches a new best value
best_model = ModelCheckpoint(filepath = 'Text_Classification_bestmodel.h5', monitor = 'val_loss', verbose = 1, save_best_only = True)

#Reduce learning rate by a factor of 0.2 after 5 epochs without val_loss improvement
# min_lr must be below the optimizer's starting learning rate, otherwise the
# callback can never lower it. The 'adam' string selects the default rate
# (0.001 per the Keras documentation), so the floor is set to 1e-5.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 5, verbose = 1, min_lr = 1e-5)

#Start training; the last 30% of the training samples are held out for validation
model_history = model.fit(train_data, train_labels,
                          batch_size = 512, epochs = 50, validation_split = 0.3,
                          callbacks = [early_stopping, best_model, reduce_lr],
                          shuffle = True)

In [None]:
#Load the best model written to disk by the ModelCheckpoint callback
best_saved_model = keras.models.load_model(filepath = 'Text_Classification_bestmodel.h5')

#Evaluate the restored checkpoint; `model` itself still holds the last-epoch weights
best_saved_model.evaluate(test_data, test_labels)

In [None]:
import matplotlib.pyplot as plt

# Pull the per-epoch training and validation loss out of the fit history
history_dict = model_history.history
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# Draw both loss curves against 1-based epoch numbers
epochs = range(1, len(loss) + 1)
for curve in (loss, val_loss):
    plt.plot(epochs, curve)

# Legend entries follow the order in which the curves were drawn
plt.legend(['Training loss', 'Val loss'], loc='center right')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Pull the per-epoch training and validation accuracy out of the fit history
history_dict = model_history.history
accuracy, val_accuracy = history_dict['accuracy'], history_dict['val_accuracy']

# Draw both accuracy curves against 1-based epoch numbers
epochs = range(1, len(accuracy) + 1)
for curve in (accuracy, val_accuracy):
    plt.plot(epochs, curve)

# Legend entries follow the order in which the curves were drawn
plt.legend(['Training accuracy', 'Val accuracy'], loc='center right')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

plt.show()