<a href="https://colab.research.google.com/github/unclepeddy/deeplearning/blob/master/1-imdb-binary-classification/glove_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [80]:
import tensorflow as tf

from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

import os

# Show which TensorFlow version this runtime has loaded
print(tf.__version__)

2.0.0-dev20190330


In [0]:
# Download the 822 MB GloVe embedding archive, unless it is already present
# (delete glove.6B.zip by hand if an earlier download was interrupted).
# -L follows any HTTP redirect issued for this URL; --fail makes curl exit with
# an error instead of saving an HTTP error page under the name glove.6B.zip.
! [ -f glove.6B.zip ] || curl -L --fail -o glove.6B.zip https://nlp.stanford.edu/data/glove.6B.zip

# Unzip the GloVe embedding
# -p keeps mkdir from failing when the directory exists from a previous run
! mkdir -p glove
! unzip -o glove.6B.zip -d glove

In [0]:
# Download the raw IMDB reviews task data
#! curl -o imdb.zip https://mng.bz/0tlo
#! ls
# NOTE: the download above doesn't work on hosted runtimes due to a lack of
# an SSL certificate. For now, the workaround is to download aclImdb.zip on
# your host and upload it to the runtime's file system out of band.
# From here on, we assume aclImdb.zip exists in /content

In [0]:
%%bash
# Unpack the manually uploaded IMDB archive into ./imdb_data.
# Stop at the first failing command so a broken unzip never reaches the mv.
set -e
if [ ! -f aclImdb.zip ]; then
  echo "aclImdb.zip does not exist - exiting"
  exit 1
fi
# -q suppresses the per-file listing that would otherwise flood the cell output
unzip -q -o aclImdb.zip
# Drop any earlier extraction first: if imdb_data already exists, mv would
# nest the new tree at imdb_data/aclImdb instead of replacing imdb_data.
rm -rf imdb_data
mv aclImdb imdb_data

In [0]:
# Set base, data and training directories
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'imdb_data')
train_dir = os.path.join(data_dir, 'train')

labels = []
texts = []

# Go through all text files in train_dir/[pos, neg] and
# populate texts with examples and labels (0 for positive, 1 for negative).
# With this encoding, a sigmoid output trained on these labels estimates the
# probability that a review is *negative*.
for label in ['pos', 'neg']:
	dir_name = os.path.join(train_dir, label)
	# sorted() makes the example order independent of the order in which the
	# file system happens to list the directory
	for fname in sorted(os.listdir(dir_name)):
		if fname.endswith('.txt'):
			# Decode as UTF-8 explicitly instead of relying on the platform
			# default; the with-block closes the file even if read() raises
			with open(os.path.join(dir_name, fname), encoding='utf-8') as review_file:
				texts.append(review_file.read())
			labels.append(0 if label == 'pos' else 1)

In [73]:
# Maximum length of a sentence sequence
maxlen = 100
# Number of training samples
training_samples = 200
# Number of validation samples
validation_samples = 10000
# Vocabulary size
max_words = 10000
embedding_input_dim = max_words
# Size of embedding output space
embedding_output_dim = 100
# Seed for the shuffle, so the train/validation split is the same on every run
shuffle_seed = 42

# Create a tokenizer and teach it the vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Vectorize texts to produce a list of sequences of word indices
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Found %s unique tokens" % len(word_index))

# Convert sequences and labels to input and output tensors.
# `labels` is deliberately not rebound here: overwriting it with the shuffled
# array would make a re-run of this cell shuffle already-shuffled labels while
# the padded sequences are rebuilt in file order, silently misaligning the two.
padded_sequences = pad_sequences(sequences, maxlen=maxlen)
label_array = np.asarray(labels)
assert padded_sequences.shape[0] == label_array.shape[0], "texts and labels are out of sync"

print("Shape of data tensor: ", padded_sequences.shape)
print("Shape of label tensor: ", label_array.shape)

# Shuffle the data and labels with one shared, seeded permutation
indices = np.random.RandomState(shuffle_seed).permutation(padded_sequences.shape[0])
data = padded_sequences[indices]
shuffled_labels = label_array[indices]

# Build training and validation data sets
x_train = data[:training_samples]
y_train = shuffled_labels[:training_samples]
x_val = data[training_samples : training_samples + validation_samples]
y_val = shuffled_labels[training_samples : training_samples + validation_samples]

Found 88582 unique tokens
Shape of data tensor:  (25000, 100)
Shape of label tensor:  (25000,)


In [72]:
# Locate the GloVe index file and build empty index
glove_dir = os.path.join(base_dir, 'glove')
glove_index = {}

# Build the index in memory by iterating through file line by line
# Each line is composed of a word followed by N embedding coefficients
# The file is decoded as UTF-8 explicitly rather than with the platform
# default, and the with-block closes it even if a line fails to parse
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as glove_file:
	for line in glove_file:
		values = line.split()
		if not values:
			# Skip blank lines instead of failing on values[0]
			continue
		word = values[0]
		coefficients = np.asarray(values[1:], dtype='float32')
		glove_index[word] = coefficients

print('Imported GloVe index with %s words' % len(glove_index))

Imported GloVe index with 400000 words


In [79]:
# Fill one row of the embedding matrix per in-vocabulary word that GloVe knows;
# every other row (including index 0) stays all-zero
embedding_matrix = np.zeros((embedding_input_dim, embedding_output_dim))
for token, row in word_index.items():
	if row >= embedding_input_dim:
		# Index falls outside the vocabulary kept by the embedding layer
		continue
	vector = glove_index.get(token)
	if vector is None:
		# Word has no pre-trained GloVe vector
		continue
	embedding_matrix[row] = vector

# Simple classifier: embedding lookup -> flatten -> 32-unit ReLU layer -> sigmoid
model = models.Sequential([
	layers.Embedding(embedding_input_dim, embedding_output_dim, input_length=maxlen),
	layers.Flatten(),
	layers.Dense(32, activation='relu'),
	layers.Dense(1, activation='sigmoid'),
])

# Load the GloVe vectors into the embedding layer and exclude it from training
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

# Compile, then fit on the training split while evaluating on the validation split
model.compile(
	optimizer='rmsprop',
	loss='binary_crossentropy',
	metrics=['acc'])
history = model.fit(
	x_train,
	y_train,
	epochs=10,
	batch_size=32,
	validation_data=(x_val, y_val))

# Write the trained weights to disk
# A training set larger than the current 200 samples should give better results
model.save_weights('pre_trained_glove_model.h5')

Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
