# Example notebook 2: visualize word embeddings after fine-tuning
In this example, pre-trained GloVe word embeddings are fine-tuned on a classification task using the 20 Newsgroups dataset.
The classifier is a TextCNN, i.e. a 1D convolutional network used for text classification.

In [1]:
import os
try:
    import zeugma
except ImportError:
    !pip install zeugma
from zeugma import EmbeddingTransformer

embedding_transformer = EmbeddingTransformer('glove')

Using TensorFlow backend.


## Preprocessing

In [2]:
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import fetch_20newsgroups

# Training split of the 20 newsgroups dataset (fetched by scikit-learn).
newsgroups_train = fetch_20newsgroups(subset='train')

# Raw documents on one side, one-hot encoded class labels on the other.
corpus, y_train = (
    newsgroups_train.data,
    to_categorical(newsgroups_train.target),
)

In [3]:
from zeugma import TextsToSequences, Padder
from sklearn.pipeline import make_pipeline

# Preprocessing hyper-parameters: sequence length fed to the network and the
# vocabulary cap handed to the sequencer.
max_length, num_words = 200, 10000

# The two preprocessing steps are kept in named variables because later cells
# read the fitted sequencer's `word_index`.
sequencer = TextsToSequences(num_words=num_words)
padder = Padder(max_length=max_length)

# Chain both steps in a Scikit-learn pipeline, then fit it on the corpus and
# transform the corpus in a single call.
pipeline = make_pipeline(sequencer, padder)
x_train = pipeline.fit_transform(corpus)

In [4]:
import numpy as np

# Restrict the fitted vocabulary to the entries whose index is at most
# `num_words`; everything above that index is dropped.
word_index = {
    word: index
    for word, index in sequencer.word_index.items()
    if not index > num_words
}

def create_embedding_matrix(word_index, embedding_transformer):
    """Build the initial weight matrix of the embedding layer.

    Row ``i`` of the result holds the vector returned by
    ``embedding_transformer`` for the word whose index is ``i``. Rows that no
    word maps to (e.g. index 0 when indices start at 1) stay all-zero.

    Parameters
    ----------
    word_index : dict
        Maps each word (str) to a non-negative integer row index.
    embedding_transformer : object
        Any object exposing ``transform(list_of_texts)`` that returns a 2-D
        array with one row per input text.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, embedding_dim)`` where ``n_rows`` is
        ``max(len(word_index), largest index) + 1``.
    """
    # Probe the transformer with a single word to discover the vector size.
    embedding_dim = embedding_transformer.transform(['the']).shape[1]

    # Size the matrix by the largest index so a vocabulary with gaps in its
    # numbering cannot overflow it. It is never smaller than the historical
    # `len(word_index) + 1`, so contiguous vocabularies get the same shape.
    largest_index = max(word_index.values(), default=0)
    n_rows = max(len(word_index), largest_index) + 1

    embedding_matrix = np.zeros((n_rows, embedding_dim))
    for word, i in word_index.items():
        embedding_matrix[i] = embedding_transformer.transform([word])[0]
    return embedding_matrix

embedding_matrix = create_embedding_matrix(word_index, embedding_transformer)

# Resolve the log directory from the kernel's actual working directory.
# The `PWD` environment variable is not defined on every platform and is only
# inherited from the launching shell, so it can point somewhere other than the
# directory the notebook really runs in.
LOG_DIR = os.path.join(os.getcwd(), 'logs')

# `exist_ok=True` makes the cell safe to re-run and avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(LOG_DIR, exist_ok=True)

# Write word names in a file for tensorboard display: a leading 'Word' line
# followed by one vocabulary word per line, in `word_index` order.
# An explicit encoding keeps non-ASCII words from depending on the platform's
# default locale.
with open(os.path.join(LOG_DIR, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    f.write('Word\n' + '\n'.join(word_index.keys()))

## Model definition and training

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model(max_length, embedding_matrix, num_classes=20):
    """Model creation function: returns a compiled TextCNN.

    The network is: trainable embedding layer -> one Conv1D + global max
    pooling branch per filter size -> dropout -> dense ReLU layer -> softmax.

    Parameters
    ----------
    max_length : int
        Length of the (padded) integer sequences fed to the model.
    embedding_matrix : numpy.ndarray
        2-D array of initial embedding weights; its first dimension is the
        vocabulary size and its second the embedding dimension.
    num_classes : int, optional
        Number of units of the softmax output layer (default 20, the value
        that used to be hard-coded).

    Returns
    -------
    tensorflow.keras.models.Model
        Model compiled with categorical cross-entropy, the Adam optimizer and
        the accuracy metric.
    """
    # Embedding layer initialised from the given matrix and left trainable so
    # the vectors are fine-tuned during training. The layer name 'embeddings'
    # is what identifies it in the saved model.
    embedding_layer = Embedding(
        len(embedding_matrix),
        embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=True,
        name='embeddings'
    )

    input_layer = Input(shape=(max_length,), dtype='int32', name='input')
    embedded_sequences = embedding_layer(input_layer)

    # One convolution + global max pooling branch per filter size.
    conv_blocks = []
    for filter_size in [3]:
        conv = Conv1D(128, filter_size, activation='relu', name='conv_'+str(filter_size))(embedded_sequences)
        pool = GlobalMaxPooling1D(name='global_pool_cv_'+str(filter_size))(conv)
        conv_blocks.append(pool)
    # `concatenate` needs at least two tensors, so a single branch is used as is.
    merge = concatenate(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

    # Feed the dense layer from the dropout output. It was previously connected
    # to `merge`, which left the Dropout layer outside the graph, so no dropout
    # was ever applied.
    dropout = Dropout(0.2)(merge)
    dense = Dense(128, activation='relu', name='dense')(dropout)
    preds = Dense(num_classes, activation='softmax', name='softmax')(dense)

    model = Model(inputs=input_layer, outputs=preds)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

In [7]:
from tensorflow.keras.callbacks import TensorBoard

# TensorBoard callback: log the graph and, after every epoch, the embedding
# weights; histograms and weight images are switched off.
# No `log_dir` is passed, so Keras falls back to its default log directory
# (expected to be `logs` under the working directory; confirm for your
# TensorFlow version).
tensorboard = TensorBoard(
    embeddings_freq=1,
    histogram_freq=0,
    write_graph=True,
    write_images=False,
)

# Wrap the model factory in the Keras Scikit-learn wrapper so the TextCNN
# exposes the estimator methods Scikit-learn expects from the final step of a
# Pipeline.
sklearn_textcnn = KerasClassifier(
    build_fn=create_model,
    # arguments forwarded to `create_model`
    max_length=max_length,
    embedding_matrix=embedding_matrix,
    # training settings
    epochs=2,
    batch_size=128,
    verbose=2,
    callbacks=[tensorboard],
)

# Train the model; the returned History object is the cell's output.
sklearn_textcnn.fit(x_train, y_train)

Train on 11314 samples
Epoch 1/2
11314/11314 - 2s - loss: 2.8295 - accuracy: 0.1623
Epoch 2/2
11314/11314 - 2s - loss: 2.0645 - accuracy: 0.3819


<tensorflow.python.keras.callbacks.History at 0x1708ca3d0>

## Tensorboard visualization of the fine-tuned embeddings

In [8]:
!pkill -f tensorboard
%load_ext tensorboard
%tensorboard --logdir /Users/nicolasthiebaut/projects/zeugma/examples/logs 
# Navigate to the 'Projector' tab to visualize the embeddings, then
# Load the words metadata by clicking on the "Load" and selecting "metadata.tsv"

# External version if you want to run TensorBoard locally, outside of the notebook
# Run tensorboard to visualize the embeddings in your
# browser on port 6006: http://localhost:6006
# !tensorboard --logdir logs/ --host 127.0.0.1