# NLP - Word representation

## The corpus

In [1]:
texts = np.array(["I like chocolate",
            "I like tea",
            "You like chocolate",
            'You hate beer',
            'I hate wine'])
labels = np.array([1,1,1,0,0])

## The imports

In [2]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TextVectorization, Dense, Flatten, Embedding

In [3]:
import warnings
warnings.filterwarnings("ignore")

## BOW representation (the fist week)

In [4]:
# with Keras preprocessing layer
vectorize_layer = TextVectorization(output_mode='count', ngrams=(1,2))
# Fit the layer with the corpus
vectorize_layer.adapt(texts)

# define the model
input_ = Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(input_)
hidden = Dense(32, activation='relu')(x)
output_ = Dense(1, activation='sigmoid')(hidden)
model = Model(input_, output_)

# summarize the model
model.summary()

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
model.fit(texts, labels, epochs=10, verbose=0)

# evaluate the model
loss, accuracy = model.evaluate(texts, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 17)               0         
 torization)                                                     
                                                                 
 dense (Dense)               (None, 32)                576       
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 609
Trainable params: 609
Non-trainable params: 0
_________________________________________________________________
Accuracy: 80.000001


## Keras word embedding

In [5]:
# Constants
vocab_size = 10  # Maximum vocab size.
max_len = 5      # Sequence length to pad the outputs to.
embedding_size = 8

# with Keras preprocessing layer
vectorize_layer = TextVectorization(max_tokens=vocab_size,
                                    output_mode='int',
                                    output_sequence_length=max_len)
# Fit the layer with the corpus
vectorize_layer.adapt(texts)

# define the model
input_ = Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(input_)
x = Embedding(vocab_size, embedding_size, name="Embedding")(x)
x = Flatten()(x)
hidden = Dense(32, activation="relu")(x)
output_ = Dense(1, activation='sigmoid')(hidden)
model = Model(input_, output_)

# summarize the model
model.summary()

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
model.fit(texts, labels, epochs=10, verbose=0)

# evaluate the model
loss, accuracy = model.evaluate(texts, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 5)                0         
 ectorization)                                                   
                                                                 
 Embedding (Embedding)       (None, 5, 8)              80        
                                                                 
 flatten (Flatten)           (None, 40)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                1312      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                           

## Use a pre-trained embedding : Glove/Word2Vec/FastText embedding

**Traditional word embedding** techniques (Glove/Word2Vec/FastText) learn a global word embedding. They first build a global vocabulary using unique words in the documents by ignoring the meaning of words in different context. Then, similar representations are learnt for the words appeared more frequently close each other in the documents. The problem is that in such word representations the words' contextual meaning (the meaning derived from the words' surroundings), is ignored. For example, only one representation is learnt for "left" in sentence "I left my phone on the left side of the table." However, "left" has two different meanings in the sentence, and needs to have two different representations in the embedding space.

For example, consider the two sentences:

1. I will show you a valid point of reference and talk to the point.
1. Where have you placed the point.

The word embeddings from a pre-trained embeddings such as word2vec, the embeddings for the word 'point' is same for both of its occurrences in example 1 and also the same for the word 'point' in example 2. (all three occurrences has same embeddings).

In [6]:
# Same steps as Keras Embedding
vocab_size = 10  # Maximum vocab size.
max_len = 5      # Sequence length to pad the outputs to.
hidden_size = 16

vectorizer = TextVectorization(max_tokens=vocab_size, output_sequence_length=max_len)
vectorizer.adapt(texts)

In [7]:
# Build word dict
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
word_index

{'': 0,
 '[UNK]': 1,
 'like': 2,
 'i': 3,
 'you': 4,
 'hate': 5,
 'chocolate': 6,
 'wine': 7,
 'tea': 8,
 'beer': 9}

In [8]:
# Download the pre-trained embedding matrix for exemple from glove
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

In [9]:
# Make a dict mapping words (strings) to their NumPy vector representation:
path_to_glove_file = "/users/riveill/DS-models/glove.6B.50d.txt"

# pre-trained embedding matrix
embedding_dim = 50 # fixed by thepre-trained embedding matrix

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


Let's prepare a corresponding embedding matrix that we can use in a Keras Embedding layer. It's a simple NumPy matrix where entry at index i is the pre-trained vector for the word of index i in our vectorizer's vocabulary.

In [10]:
num_tokens = len(voc) + 2 # UNK/OOV and PAD
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    print(word, i)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

 0
[UNK] 1
like 2
i 3
you 4
hate 5
chocolate 6
wine 7
tea 8
beer 9
Converted 8 words (2 misses)


In [11]:
# Initialize the Embedding layer with the weight of each word
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)
num_tokens,embedding_dim, max_len

(12, 50, 5)

In [12]:
# define the model
input_ = Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(input_)
x = embedding_layer(x)
x = Flatten()(x)
hidden = Dense(32, activation="relu")(x)
output_ = Dense(1, activation='sigmoid')(hidden)
model = Model(input_, output_)

# summarize the model
model.summary()

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
model.fit(texts, labels, epochs=10, verbose=0)

# evaluate the model
loss, accuracy = model.evaluate(texts, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 5)                0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 5, 50)             600       
                                                                 
 flatten_1 (Flatten)         (None, 250)               0         
                                                                 
 dense_4 (Dense)             (None, 32)                8032      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                           

## Train your own Word2Vec model with gensim

In [13]:
#!pip install gensim

To begin with, you need data to train a model. We will use part of the Brown corpus.

In [14]:
from nltk.corpus import brown
from gensim.models import Word2Vec

train_set = brown.sents()[:10000]

Let's go ahead and train a model on our corpus. Don't worry about the training parameters much for now, we'll revisit them later.

In [15]:
model = Word2Vec(sentences=train_set, size=embedding_dim, window=5, min_count=1, workers=4)
#model = Word2Vec(sentences=common_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)

Once we have our model, we can use it.

The main part of the model is model.wv\ , where "wv" stands for "word vectors".

In [16]:
vector = model.wv['university']  # get numpy vector of a word
vector

array([-0.04136918, -0.10978555, -0.15034047, -0.24463718, -0.05027288,
       -0.07669671,  0.06218094,  0.00680144, -0.08232249, -0.33849627,
       -0.2387926 ,  0.02191992, -0.06522922,  0.33818543,  0.18337403,
        0.10902342,  0.03109928, -0.07143142, -0.12798679,  0.01539179,
       -0.03205783,  0.04300994,  0.12229457, -0.29632238,  0.05040243,
        0.00419194,  0.09828603,  0.06172842, -0.23809716, -0.10589421,
        0.02364612, -0.11272949, -0.03686354,  0.18639238,  0.18105426,
       -0.31819963, -0.41908318,  0.18029644,  0.2593548 ,  0.19554946,
        0.22721523, -0.15141046,  0.17618845, -0.16063029,  0.10343505,
       -0.0245653 , -0.16474013, -0.16749346,  0.18614413,  0.3696545 ],
      dtype=float32)

In [17]:
model.similarity('university','school')

0.99821913

In [18]:
sims = model.wv.most_similar('university', topn=10)  # get other similar words
sims

[('series', 0.9993743896484375),
 ('meeting', 0.9993742108345032),
 ('program', 0.9993041753768921),
 ('came', 0.99930340051651),
 ('other', 0.999301016330719),
 ('court', 0.999296247959137),
 ('early', 0.9992777109146118),
 ('history', 0.9992622137069702),
 ('hand', 0.9992523193359375),
 ('two', 0.999245285987854)]

Training non-trivial models can take time.  Once the model is built, it can be saved using standard gensim methods:

In [19]:
import tempfile

with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
    temporary_filepath = tmp.name
    print(temporary_filepath)
    model.save(temporary_filepath)
    #
    # The model is now safely stored in the filepath.
    # You can copy it to other machines, share it with others, etc.
    #
    # To load a saved model:
    #
    new_model = Word2Vec.load(temporary_filepath)

/var/folders/1p/3c9gtfld201dy53fjq35ky7c0000gn/T/gensim-model-nmy5oces


If you save the model you can continue training it later:

In [20]:
from nltk.tokenize import word_tokenize
new_model.train([word_tokenize(sent) for sent in texts], total_examples=1, epochs=1)

(15, 15)

If you no longer need to retrain the model, it can be saved with only the vectors and their keys. This results in a much smaller and faster object that can be loaded more quickly.

In [21]:
from gensim.models import KeyedVectors

# Store just the words + their trained embeddings.
word_vectors = new_model.wv
word_vectors.save("word2vec.wordvectors")

# Load back with memory-mapping = read-only, shared across processes.
new_word_vectors = KeyedVectors.load("word2vec.wordvectors", mmap='r')

vector = new_word_vectors['university']  # Get numpy vector of a word
vector

array([-0.04136918, -0.10978555, -0.15034047, -0.24463718, -0.05027288,
       -0.07669671,  0.06218094,  0.00680144, -0.08232249, -0.33849627,
       -0.2387926 ,  0.02191992, -0.06522922,  0.33818543,  0.18337403,
        0.10902342,  0.03109928, -0.07143142, -0.12798679,  0.01539179,
       -0.03205783,  0.04300994,  0.12229457, -0.29632238,  0.05040243,
        0.00419194,  0.09828603,  0.06172842, -0.23809716, -0.10589421,
        0.02364612, -0.11272949, -0.03686354,  0.18639238,  0.18105426,
       -0.31819963, -0.41908318,  0.18029644,  0.2593548 ,  0.19554946,
        0.22721523, -0.15141046,  0.17618845, -0.16063029,  0.10343505,
       -0.0245653 , -0.16474013, -0.16749346,  0.18614413,  0.3696545 ],
      dtype=float32)

You can then use the template exactly as if it were a Glove/Word2Vec/FastText template retrieved from the Internet.

In [22]:
# Build word dict
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
word_index

{'': 0,
 '[UNK]': 1,
 'like': 2,
 'i': 3,
 'you': 4,
 'hate': 5,
 'chocolate': 6,
 'wine': 7,
 'tea': 8,
 'beer': 9}

In [23]:
num_tokens = len(voc) + 2
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    try:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = new_word_vectors[word]
        hits += 1
    except :
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 7 words (3 misses)


In [24]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
    name="Embedding"
)

In [25]:
# define the model
input_ = Input(shape=(1,), dtype=tf.string, name="Input")
x = vectorize_layer(input_)
x = embedding_layer(x)
x = Flatten()(x)
hidden = Dense(32, activation="relu", name="Hidden")(x)
output_ = Dense(1, activation='sigmoid', name="Output")(hidden)
model = Model(input_, output_)

# summarize the model
model.summary()

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
model.fit(texts, labels, epochs=10, verbose=0)

# evaluate the model
loss, accuracy = model.evaluate(texts, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 5)                0         
 ectorization)                                                   
                                                                 
 Embedding (Embedding)       (None, 5, 50)             600       
                                                                 
 flatten_2 (Flatten)         (None, 250)               0         
                                                                 
 Hidden (Dense)              (None, 32)                8032      
                                                                 
 Output (Dense)              (None, 1)                 33        
                                                           

## Use a all the pre-trained embedding : Glove/Word2Vec/FastText embedding (this week)

Up to now, the embedding is a matrix of size vocab_size * embedding_size
* vocab_size being the number of tokens in the training data: for example in the previous situation, this size was fixed at 5000
* The objective here is to have a matrix of size pre_traine

In [26]:
# Build the vocabulary list
# Build the embedding matrix
path_to_glove_file = "/users/riveill/DS-models/glove.6B.50d.txt"

vocabulary = []
embedding_matrix = [np.zeros((embedding_dim)),
                    np.zeros((embedding_dim))] # See later : 0=PAD, 1=OOV
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        vocabulary += [word]
        embedding_matrix += [coefs]
embedding_matrix = np.array(embedding_matrix)

In [27]:
embedding_matrix.shape

(400002, 50)

In [28]:
vocab_size = len(embedding_matrix)
len(vocabulary), len(embedding_matrix), 

(400000, 400002)

In [29]:
# Build vectorizer layer and initialize it with the vocabulary list
vectorize_layer = tf.keras.layers.TextVectorization(
        max_tokens=len(embedding_matrix),
        output_mode="int",
        output_sequence_length=max_len,
        vocabulary=vocabulary  # Pass the vocabulary - no need to adapt the layer
                               # Contain the padding token ('') and OOV token ('[UNK]')
)
len(vectorize_layer.get_vocabulary())

400002

In [30]:
# Get the begining of the vocabulary list
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', ',', '.', 'of', 'to', 'and', 'in', 'a']

In [31]:
# Test vectorizer layer
vectorize_layer('the sandberger oov')

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([     2, 400001,      1,      0,      0])>

In [32]:
# The rest is similar to an approach with Keras embedding

In [33]:
# Define embedding layer
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False, # False: don't fine tune the embedding matrix / True: fine tune
    name="Embedding"
)

In [34]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit the model
model.fit(texts, labels, epochs=10, verbose=0)

# evaluate the model
loss, accuracy = model.evaluate(texts, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 80.000001
