# Embeddings 

In [11]:
# Toy corpus: five machine-learning topics followed by five computing topics.
sentences = [
    'Machine Learning',
    'Model training',
    'Supervised learning',
    'Unsupervised learning',
    'Reinforcement learning',
    'Computer Hardware Engineering',
    'Computer Security',
    'Computer architecture',
    'Operating System',
    'Parallel computing',
]

In [12]:
# Tokenize each sentence on whitespace: one list of words per sentence.
token_sentences = list(map(str.split, sentences))
token_sentences

[['Machine', 'Learning'],
 ['Model', 'training'],
 ['Supervised', 'learning'],
 ['Unsupervised', 'learning'],
 ['Reinforcement', 'learning'],
 ['Computer', 'Hardware', 'Engineering'],
 ['Computer', 'Security'],
 ['Computer', 'architecture'],
 ['Operating', 'System'],
 ['Parallel', 'computing']]

In [13]:
# Vocabulary: every distinct token that occurs in any sentence.
# Comparison is case-sensitive, so 'Learning' and 'learning' are separate entries.
all_tokens = {token for tokens in token_sentences for token in tokens}

In [14]:
# Put the vocabulary in sorted order so it is easier to read.
# Note: this rebinds all_tokens from a set to a sorted list.
all_tokens = list(all_tokens)
all_tokens.sort()
all_tokens

['Computer',
 'Engineering',
 'Hardware',
 'Learning',
 'Machine',
 'Model',
 'Operating',
 'Parallel',
 'Reinforcement',
 'Security',
 'Supervised',
 'System',
 'Unsupervised',
 'architecture',
 'computing',
 'learning',
 'training']

In [15]:
# Integer-encode the vocabulary with scikit-learn
from sklearn.preprocessing import LabelEncoder

# First learn the token -> integer mapping, then encode the vocabulary itself.
label_encoder = LabelEncoder().fit(all_tokens)
integer_encoded = label_encoder.transform(all_tokens)
integer_encoded

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])

In [16]:
# Encode every sentence as an array of integer token ids.
# Reuse token_sentences so the tokens encoded here are exactly the ones the
# vocabulary was built from: sentence.split(' ') would produce '' for repeated
# or trailing spaces, a label the encoder has never seen.
# The +1 shifts the ids to start at 1, leaving 0 unused by any token so it can
# serve as the padding value.
encoded_sentences = [label_encoder.transform(tokens) + 1 for tokens in token_sentences]

encoded_sentences

[array([5, 4]),
 array([ 6, 17]),
 array([11, 16]),
 array([13, 16]),
 array([ 9, 16]),
 array([1, 3, 2]),
 array([ 1, 10]),
 array([ 1, 14]),
 array([ 7, 12]),
 array([ 8, 15])]

In [17]:
from keras.preprocessing.sequence import pad_sequences

# The sentences have different lengths, so pad them to a common length.
# Derive that length from the data instead of hard-coding it: pad_sequences
# truncates anything longer than maxlen, which would silently drop words if a
# longer sentence were ever added.
max_length = max(len(encoded) for encoded in encoded_sentences)  # 3 for the current sentences
# padding='post' appends the zeros after the tokens rather than before them
padded_enc_sentences = pad_sequences(encoded_sentences, maxlen=max_length, padding='post')
padded_enc_sentences

Using TensorFlow backend.


array([[ 5,  4,  0],
       [ 6, 17,  0],
       [11, 16,  0],
       [13, 16,  0],
       [ 9, 16,  0],
       [ 1,  3,  2],
       [ 1, 10,  0],
       [ 1, 14,  0],
       [ 7, 12,  0],
       [ 8, 15,  0]], dtype=int32)

## Building an Embedding Layer 

In [18]:
from keras.models import Sequential 
from keras.layers.embeddings import Embedding
from keras.layers import Flatten
from keras.layers import Dense 

In [19]:
model = Sequential()
# The encoded sentences use ids 1..len(all_tokens) plus 0 for padding, so the
# embedding table needs len(all_tokens) + 1 rows (input_dim must be the largest
# index + 1). With input_dim=len(all_tokens) the highest token id falls outside
# the table.
embedding_layer = Embedding(
    input_dim=len(all_tokens) + 1,
    output_dim=2,  # each token id is mapped to a 2-dimensional vector
    input_length=max_length,  # same length the sentences were padded to
)
model.add(embedding_layer)
model.compile('adam', 'mse')







In [20]:
# Show the embedding vectors the model produces for the first sentence.
first_sentence = padded_enc_sentences[0]
print(sentences[0])
print(first_sentence.shape)
# predict works on batches, so reshape the single row to a batch of one.
model.predict(first_sentence.reshape(1, -1))

Machine Learning
(3,)








array([[[-0.04924556,  0.04319364],
        [ 0.03612458, -0.0233996 ],
        [-0.00706597, -0.01268903]]], dtype=float32)