# Word embeddings

This notebook follows the word-embeddings tutorial from the TensorFlow website: https://www.tensorflow.org/tutorials/text/word_embeddings

In [1]:
import os
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Input
from tensorflow.keras import Sequential
import io

## Data Loading

In [2]:
# Load the IMDB reviews dataset in its pre-tokenised 8k-subword configuration.
# `as_supervised=True` makes each example a (text, label) pair, and
# `with_info=True` also returns the DatasetInfo object, whose 'text' feature
# carries the subword encoder used below.
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    as_supervised=True,
    with_info=True,
)
print(info)



tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Pot

In [3]:
# The 'text' feature of the dataset exposes the subword encoder that was used
# to turn each review into a sequence of integer ids.
text_feature = info.features['text']
encoder = text_feature.encoder

# Show the first 20 entries of the subword vocabulary.
encoder.subwords[:20]

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_']

In [4]:
def _shuffled_padded_batches(dataset):
    """Shuffle `dataset` with a 1000-element buffer and group it into batches of 10.

    The text component is padded along its single variable-length axis
    (`[None]`); the label component is a scalar (`()`).
    """
    return dataset.shuffle(1000).padded_batch(10, padded_shapes=([None], ()))


# Both splits go through exactly the same input pipeline.
train_batches = _shuffled_padded_batches(train_data)
test_batches = _shuffled_padded_batches(test_data)

In [5]:
# Pull a single (reviews, labels) batch out of the training pipeline for inspection.
batch_iterator = iter(train_batches)
train_batch, train_labels = next(batch_iterator)

## Model Creation

In [6]:
from tensorflow.keras.callbacks import TensorBoard

# TensorBoard callback handed to `model.fit`; training logs are written under
# this directory.
# NOTE(review): this is an absolute, machine-specific Windows path - the
# notebook will not run unchanged on another machine.
log = TensorBoard(log_dir='C:\\Users\\rose_\\Desktop\\olivier\\tuto-nlp\\log')

In [7]:
# Length of each learned subword vector; passed to the Embedding layer as its
# output dimension.
embedding_dim = 16

In [8]:
# Sentiment classifier built on averaged subword embeddings:
#   1. Embedding              - maps each subword id to an `embedding_dim`-sized vector
#   2. GlobalAveragePooling1D - averages the vectors over the sequence axis
#   3. Dense(16, relu)        - small hidden layer
#   4. Dense(1)               - single raw logit (no sigmoid)
model = Sequential([
    Embedding(encoder.vocab_size, embedding_dim, name='embd'),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1),
])

model.compile(
    optimizer='adam',
    # The last layer has no activation, so the loss must interpret its output as logits.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The plain 'accuracy' string would threshold the model output at 0.5, which
    # is only right for probabilities. For logits the decision boundary is 0.0,
    # so set it explicitly. The metric keeps the name 'accuracy' so that
    # history / TensorBoard keys are unchanged.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embd (Embedding)             (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 131,249
Trainable params: 131,249
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train for 10 epochs over the training batches. After every epoch, validation
# runs on only 20 batches drawn from the test pipeline (`validation_steps=20`),
# and the `log` callback records the run.
# The call is left as the cell's last expression so the returned History object
# is displayed.
model.fit(
    train_batches,
    validation_data=test_batches,
    validation_steps=20,
    epochs=10,
    callbacks=[log],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b44a22b288>

In [10]:
# Fetch the trained embedding layer by its name and take its first (and only
# used) weight array: the lookup table with one row per vocabulary id.
embd_layer = model.get_layer('embd')
embedding = embd_layer.get_weights()[0]

In [11]:
# Export the learned embeddings as two parallel TSV files:
#   vecs.tsv - one line per subword, its vector components separated by tabs
#   meta.tsv - one line per subword, the subword text itself
# NOTE(review): the directory is an absolute, machine-specific Windows path.
log_dir = 'C:\\Users\\rose_\\Desktop\\olivier\\tuto-nlp\\log'
os.makedirs(log_dir, exist_ok=True)  # do not rely on an earlier cell having created it

vecs_path = os.path.join(log_dir, 'vecs.tsv')
meta_path = os.path.join(log_dir, 'meta.tsv')

# The context manager closes both files even if the loop raises part-way through.
with io.open(vecs_path, 'w', encoding='utf-8') as out_v, \
        io.open(meta_path, 'w', encoding='utf-8') as out_m:
    # Subword i in `encoder.subwords` (0-based) maps to embedding row i + 1:
    # row 0 is skipped because id 0 is the padding value, not a subword.
    for row, word in enumerate(encoder.subwords, start=1):
        vec = embedding[row]
        out_m.write(word + "\n")
        out_v.write('\t'.join(str(x) for x in vec) + "\n")