# Word Embedding

In [1]:
import numpy as np
from keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [2]:
# Toy corpus: six short sentences used by every later cell.
sent = [
    "the glass of milk",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]
sent

['the glass of milk',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [3]:
# Vocabulary size: passed to one_hot() as the index range and to Embedding()
# as the number of rows in the embedding table.
voc_size = 10_000

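## One-Hot Representation

Each sentence is converted to a list of integer word indices (one index per word), using a vocabulary size of `voc_size`.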

In [4]:
# Encode each sentence as a list of integer word indices (one per word).
# NOTE(review): Keras documents one_hot as hashing-based, so two different
# words may share an index and the ids may differ between kernel sessions --
# confirm before relying on the exact numbers printed below.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
print(onehot_repr)

[[7338, 9810, 254, 4321], [7338, 5506, 254, 9239], [308, 3302, 1477, 2681, 8983], [308, 3302, 1477, 2681, 4161], [5024, 7338, 4366, 254, 2172], [3859, 3864, 1082, 2681]]


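## Word Embedding Representation

The index lists are first padded to a common length and then passed through an `Embedding` layer, which maps every index to a dense vector.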

In [5]:
# Common length for every encoded sentence; shorter ones are left-padded
# with zeros ("pre") so the real word ids sit at the end of each row.
sent_lengths = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_lengths,
    padding="pre",
)
print(embedded_docs)

[[   0    0    0    0 7338 9810  254 4321]
 [   0    0    0    0 7338 5506  254 9239]
 [   0    0    0  308 3302 1477 2681 8983]
 [   0    0    0  308 3302 1477 2681 4161]
 [   0    0    0 5024 7338 4366  254 2172]
 [   0    0    0    0 3859 3864 1082 2681]]


In [6]:
# Size of each embedding vector; passed as the second argument to Embedding().
dim = 10

In [7]:
# One-layer model: an embedding table with voc_size rows of dim floats,
# applied to inputs of sent_lengths indices each.
model = Sequential(
    [
        Embedding(
            input_dim=voc_size,
            output_dim=dim,
            input_length=sent_lengths,
        )
    ]
)
# No fit() call appears in the cells shown here, so the optimizer and loss
# are configured but not exercised.
model.compile(optimizer="adam", loss="mse")

In [8]:
# Print the layer table. The embedding layer's parameter count equals
# voc_size * dim (10000 * 10 = 100000).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
# First padded sentence: zeros on the left, the word indices of sent[0] on the right.
embedded_docs[0]

array([   0,    0,    0,    0, 7338, 9810,  254, 4321])

In [10]:
# Look up the embedding vector for every position of every padded sentence,
# then show the result for the first sentence: one row of `dim` floats per
# position. Rows for the padding index 0 are identical to each other.
doc_vectors = model.predict(embedded_docs)
print(doc_vectors[0])

[[-0.00364469 -0.00601213  0.00324316  0.04274101 -0.01200644  0.01716233
  -0.03339497 -0.02411114 -0.03585697  0.03284723]
 [-0.00364469 -0.00601213  0.00324316  0.04274101 -0.01200644  0.01716233
  -0.03339497 -0.02411114 -0.03585697  0.03284723]
 [-0.00364469 -0.00601213  0.00324316  0.04274101 -0.01200644  0.01716233
  -0.03339497 -0.02411114 -0.03585697  0.03284723]
 [-0.00364469 -0.00601213  0.00324316  0.04274101 -0.01200644  0.01716233
  -0.03339497 -0.02411114 -0.03585697  0.03284723]
 [-0.0306612  -0.0038766   0.00132865  0.03428889 -0.01391967  0.00906748
   0.01373957 -0.04994048 -0.04641762 -0.01421662]
 [ 0.00231536  0.04843319  0.02058661  0.03740182 -0.00232104  0.00721301
   0.03016471  0.02800473  0.03146173 -0.03825809]
 [-0.02261137 -0.00895966 -0.00303614 -0.03599832  0.04634165  0.03256069
   0.04077398  0.01048644  0.03390707 -0.04967196]
 [-0.0182876   0.03138968  0.0076889  -0.02666131  0.03908943 -0.01715308
   0.01139582 -0.0192439   0.00824518 -0.02135234]]