# Applying One-Hot Encoding and Word Embeddings

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Toy corpus: four short sentences used as the training inputs.
texts = [
    'I love cats',
    'I adore dogs',
    'Dogs are the best',
    'Cats and dogs are friends'
]

# Fit the tokenizer on the corpus to build its word -> integer index map,
# then encode each sentence as a list of those integer indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# One embedding row per entry in word_index, plus one extra row.
# NOTE(review): per the Keras Tokenizer docs index 0 is reserved and never
# assigned to a word, which is what the "+ 1" accounts for.
vocabulary_size = len(tokenizer.word_index) + 1

# Pad every sequence up to the length of the longest one so the batch is
# rectangular.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Size of each learned word vector.
embedding_dim = 50

# Binary classifier: embed each token, flatten the per-token vectors into a
# single feature vector, and feed it to one sigmoid output unit.
model = Sequential([
    Embedding(vocabulary_size, embedding_dim, input_length=max_sequence_length),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# One binary target per sentence, in the same order as `texts`.
labels = np.array([1, 1, 0, 0])

# Train for ten passes over the four examples.
model.fit(padded_sequences, labels, epochs=10)

# The first layer of the model is the Embedding layer; its first weight
# array is indexed below by word index to get one vector per word.
embedding_layer = model.layers[0]
embedding_matrix = embedding_layer.get_weights()[0]

# Show the learned vector for every word the tokenizer knows about.
for word, index in tokenizer.word_index.items():
    print(f"{word}: {embedding_matrix[index]}")

2023-07-01 18:20:56.904887: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-01 18:20:57.270314: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-01 18:20:57.271912: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dogs: [ 0.03083587  0.01999585 -0.04435885 -0.00131087 -0.02537207 -0.00349475
  0.00086652  0.01861478  0.03325668 -0.05791751  0.05910502  0.00358215
 -0.02887191  0.00640361  0.03506241  0.00209469  0.02788948 -0.01956146
  0.02115652  0.01553223 -0.03646922  0.00412313 -0.03897573 -0.00051035
  0.01889807 -0.01284106 -0.05835516 -0.0030121   0.0285296  -0.04742509
 -0.03990437  0.01890193  0.04802112 -0.00081431 -0.0243078  -0.03775312
 -0.01541958 -0.03932305  0.0086851   0.01086652 -0.03926727  0.00294969
  0.01014555 -0.05232569  0.04622016 -0.0081026   0.05375386  0.02431481
  0.03808679  0.00553927]
i: [-0.05774513 -0.0380602   0.0103313   0.03932075 -0.00953235 -0.04939635
 -0.03473277 -0.05126707  0.01445158 -0.00678062 -0.03994674  0.02027858
 -0.0406269  -0.00640239 -0.0070122   0.0204288  -0.05706678  0.05145664
 -0.0248176  -0.00274139 -0.03227123  0.03088594 -0