# Embeddings 

In [28]:
# Toy corpus: ten short phrases, each treated as one "sentence".
sentences = [
    # Machine-learning phrases
    'Machine Learning',
    'Model training',
    'Supervised learning',
    'Unsupervised learning',
    'Reinforcement learning',
    # Computer-systems phrases
    'Computer Hardware Engineering',
    'Computer Security',
    'Computer architecture',
    'Operating System',
    'Parallel computing',
]

In [29]:
# Tokenise every sentence on runs of whitespace, giving one list of word
# tokens per sentence. Case is left untouched.
token_sentences = list(map(str.split, sentences))
token_sentences

[['Machine', 'Learning'],
 ['Model', 'training'],
 ['Supervised', 'learning'],
 ['Unsupervised', 'learning'],
 ['Reinforcement', 'learning'],
 ['Computer', 'Hardware', 'Engineering'],
 ['Computer', 'Security'],
 ['Computer', 'architecture'],
 ['Operating', 'System'],
 ['Parallel', 'computing']]

In [30]:
# Flatten the per-sentence token lists into a set so that every distinct
# (case-sensitive) token is kept exactly once.
all_tokens = {token for tokens in token_sentences for token in tokens}

In [31]:
# Turn the vocabulary into a sorted list so it is easier to read.
# The ordering is case-sensitive: capitalised tokens sort before lowercase
# ones, and 'Learning' / 'learning' remain two separate entries.
all_tokens = list(all_tokens)
all_tokens.sort()
all_tokens

['Computer',
 'Engineering',
 'Hardware',
 'Learning',
 'Machine',
 'Model',
 'Operating',
 'Parallel',
 'Reinforcement',
 'Security',
 'Supervised',
 'System',
 'Unsupervised',
 'architecture',
 'computing',
 'learning',
 'training']

In [32]:
# Integer-encode the vocabulary with scikit-learn: fit the encoder on the
# token list first, then map each token to its integer class index.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(all_tokens)
integer_encoded = label_encoder.transform(all_tokens)
integer_encoded

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])

In [33]:
# Map every sentence to the integer ids of its tokens.
# Reuse the token lists built earlier instead of re-splitting the raw
# strings: `split(' ')` and `split()` disagree on repeated or leading
# whitespace, and `split(' ')` can produce '' tokens that were never part of
# the vocabulary the encoder was fitted on.
encoded_sentences = [label_encoder.transform(tokens) for tokens in token_sentences]

encoded_sentences

[array([4, 3]),
 array([ 5, 16]),
 array([10, 15]),
 array([12, 15]),
 array([ 8, 15]),
 array([0, 2, 1]),
 array([0, 9]),
 array([ 0, 13]),
 array([ 6, 11]),
 array([ 7, 14])]

In [36]:
from keras.preprocessing.sequence import pad_sequences
# Sentences differ in length, so pad them to a common length.
# Derive that length from the data instead of hard-coding it: with a fixed
# maxlen, any longer sentence would be truncated by pad_sequences.
max_length = max(len(encoded) for encoded in encoded_sentences)  # 3 for this corpus
# padding='post' appends the fill value (0 by default) after the real tokens.
# NOTE(review): id 0 is also the encoded label of 'Computer', so padded
# positions are indistinguishable from that token. Consider reserving 0 for
# padding (shift token ids by one and enlarge the embedding's input_dim).
padded_enc_sentences = pad_sequences(encoded_sentences, maxlen=max_length, padding='post')
padded_enc_sentences

array([[ 4,  3,  0],
       [ 5, 16,  0],
       [10, 15,  0],
       [12, 15,  0],
       [ 8, 15,  0],
       [ 0,  2,  1],
       [ 0,  9,  0],
       [ 0, 13,  0],
       [ 6, 11,  0],
       [ 7, 14,  0]], dtype=int32)

## Building an Embedding Layer 

In [58]:
# Keras building blocks for the embedding model.
# Embedding is imported from the public `keras.layers` namespace rather than
# the internal `keras.layers.embeddings` module, which is not available in
# newer Keras releases.
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten

In [59]:
# A single-layer model: the Embedding layer maps each token id to a
# 2-dimensional vector, so a (batch, max_length) input of ids becomes a
# (batch, max_length, 2) output.
model = Sequential()
embedding_layer = Embedding(
    input_dim=len(all_tokens),  # one embedding row per vocabulary id
    output_dim=2,
    input_length=max_length,  # tied to the padded length instead of a repeated literal 3
)
model.add(embedding_layer)
# Compiled with the Adam optimiser and MSE loss. No fit() call appears in
# this section, so the embedding weights stay at their initial values.
model.compile(optimizer='adam', loss='mse')

In [104]:
# Show the first sentence and the shape of its padded id vector, then look up
# its embedding vectors.
print(sentences[0])
print(padded_enc_sentences[0].shape)
# The [:1] slice keeps the leading batch axis, so the model receives an
# array of shape (1, sequence_length).
model.predict(padded_enc_sentences[:1])

Machine Learning
(3,)


array([[[ 0.02791763, -0.02687442],
        [ 0.00129487,  0.02928725],
        [ 0.00248948,  0.04000784]]], dtype=float32)