In [33]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [34]:
# word embedding:Word embeddings are used in natural language processing (NLP) due to their ability to capture the semantic meaning of words. They map words to a high-dimensional semantic space,
# where words that are used in similar contexts are given similar representations. This means that words used in similar ways are placed close together within the high-dimensional semantic space, 
# clustering together and having low distances from each other 

In [35]:
# handpicked sample data to understand the working of supervise word embedding

reviews=["nice work","work well","nice work","amazing project","horrible work","awesome task welldone"]                   # corpus: it is the list of all the document(document are the number of string available inside the corpus(list))
sentiments=["positive","positive","positive","positive","negative","positive"]

# conversion of sentiment into integer as per classes count
sentiments_num_class=[1 if i=="negative" else 0 for i in sentiments]
# conversion of sentiments_num_class into array
sentiment_array=np.array(sentiments_num_class)
sentiment_array

array([0, 0, 0, 0, 1, 0])

In [36]:
#encoding Test: to convert this number to encoding with one document later we will encode all document into vector
one_hot("nice work",30)

[13, 20]

In [37]:
# encoding all document using one hote encoding
vocab_size=30                                           # it can be anything (Note: vocabulary size always grater the the number of differnt word in the document given above)
reviews_encode=[one_hot(i,vocab_size) for i in reviews]
reviews_encode

[[13, 20], [20, 3], [13, 20], [8, 5], [19, 20], [8, 25, 22]]

In [38]:
# if we want then we can also change the vocab size 
# encoding all document using one hote encoding
vocab_size=60                                          # it can be anything (Note: vocabulary size always grater the the number of differnt word in the document given above)
reviews_encode=[one_hot(i,vocab_size) for i in reviews]
reviews_encode

[[47, 33], [33, 47], [47, 33], [2, 4], [38, 33], [2, 26, 16]]

In [39]:
# now max length of a the biggest document and count the number of string available in it. Always add 1 to the len(of the biggest string of the corpus): exmaple :::>>>"awesome task welldone" it has 3 string when you will tokenize or split
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len=4
padding_reviews=pad_sequences(reviews_encode,maxlen=max_len,padding='post')    # padding="post"  is used to put 0 at the end to make all vector of same dimension.
padding_reviews

array([[47, 33,  0,  0],
       [33, 47,  0,  0],
       [47, 33,  0,  0],
       [ 2,  4,  0,  0],
       [38, 33,  0,  0],
       [ 2, 26, 16,  0]], dtype=int32)

In [40]:
# create model 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Flatten,Dense

embedding_vector_size=10
model=Sequential()
model.add(Embedding(vocab_size,embedding_vector_size,input_length=max_len,name="embedding"))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

2024-01-21 05:22:02.391234: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [42]:
# Problem face by those machine who doesnot have GPU in their machin. || The above code is giving us a problem because above code is using GPU and GPU is use by CUDA 
# Solution: with tensorflow library only ...
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"     # this is running your code on CPU not on GPU now.

# Now run all the above code 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Flatten,Dense

embedding_vector_size=10
model=Sequential()
model.add(Embedding(vocab_size,embedding_vector_size,input_length=max_len,name="embedding"))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [43]:
# now create feature and target and compile the model 

X= padding_reviews
Y= sentiment_array

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])


In [45]:
# now to see the model summary use 

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 10)             600       
                                                                 
 flatten_1 (Flatten)         (None, 40)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 41        
                                                                 
Total params: 641 (2.50 KB)
Trainable params: 641 (2.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [52]:
# now train the model

model.fit(X,Y,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f9257fca230>

In [55]:
#evaluating the model
model.evaluate(X,Y)       



[0.208733931183815, 1.0]

In [59]:
# store loss and accuracy and unpack the above model.evaluates(X,Y)

loss,accuracy=model.evaluate(X,Y)
print("accuracy :",accuracy)

accuracy : 1.0


In [67]:
weight=model.get_layer("embedding").get_weights()[0]
print(weight)

[[-1.81481987e-01  9.81912017e-02 -8.11286569e-02  1.84130087e-01
   1.97620824e-01  1.67721063e-01 -1.32870883e-01  1.85122252e-01
   1.70728192e-01  1.76468343e-01]
 [ 4.10062335e-02  1.67233385e-02 -1.41904727e-02  4.27175499e-02
   1.54097117e-02 -4.17105928e-02  6.76435232e-03  1.44226886e-02
   6.20497391e-03 -9.20940191e-04]
 [-1.80071592e-01  1.45849288e-01 -1.30931169e-01  2.10702717e-01
  -1.67756483e-01  1.50376394e-01 -7.63800666e-02  1.74155623e-01
   8.46491903e-02 -2.21624851e-01]
 [-1.36000291e-02  3.32340114e-02 -1.09912641e-02 -1.16771087e-02
  -4.58547249e-02 -1.25338659e-02 -4.78052497e-02  3.58204357e-02
  -2.46064197e-02 -2.25553513e-02]
 [ 2.03014731e-01  2.30629683e-01  2.14637324e-01 -1.56024486e-01
  -2.26284966e-01  1.55017376e-01 -1.60304308e-01  1.60930812e-01
  -1.86613575e-01 -1.96894199e-01]
 [ 4.84451391e-02  3.92975658e-03  4.29464839e-02  4.49511670e-02
  -2.13770624e-02 -2.89131999e-02 -1.98503379e-02 -1.36592016e-02
   3.20086814e-02  4.74194996e-02

In [68]:
len(weight)

60

model.get_layer("embedding").get_weights()[0]

ANSWER 
The expression model.get_layer("embedding").get_weights()[0](https://stackoverflow.com/questions/54430475/keras-embedding-weights-lookup-with-categorical-variables) is used to extract the weights of the embedding layer from a trained model in TensorFlow. 

Let's break down what each part does:

model.get_layer("embedding"): This retrieves the layer named "embedding" from the model. In the context of natural language processing, this usually refers to the embedding layer, which learns word embeddings from the input data.

.get_weights(): This method returns the weights of the layer as a list of Numpy arrays. For an embedding layer, this will return a list with one element: the weight matrix of the embeddings. The weight matrix has a shape of (vocab_size, embedding_dimension), where vocab_size is the size of the vocabulary (i.e., the number of unique words in the input data), and embedding_dimension is the size of the embedding vectors.
[ [0] ](https://stackoverflow.com/questions/54430475/keras-embedding-weights-lookup-with-categorical-variables): This is indexing the list returned by get_weights(). Since there is only one element in the list (the weight matrix), we access it with index 0.
So, overall, model.get_layer("embedding").get_weights()[ [0] ](https://stackoverflow.com/questions/54430475/keras-embedding-weights-lookup-with-categorical-variables) gives us the weight matrix of the embedding layer, which contains the word embeddings learned from the training data 

NOTE: Most important it is not as much efficient as infront of word2Vec using unsupervise learning/self supervise technique