#### Creating Embedding Matrix (Feature Vectors of Words) using supervised Learning

In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [32]:
reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
    'excellent service',
    'delicious meal',
    'great atmosphere',
    'fantastic experience',
    'highly recommend',
    'best food ever',
    'wonderful staff',
    'perfect dinner',
    'amazing taste',
    'outstanding quality',
    'superb restaurant',
    'incredible flavors',
    'top notch service',
    'absolutely loved it',
    'will definitely return',
    'exceeded expectations',
    'mouth watering dishes',
    'great value for money',
    'cozy and comfortable',
    'fresh ingredients',
    'beautifully presented',
    'friendly waiters',
    'quick service',
    'clean and hygienic',
    'romantic ambiance',
    'perfect for families',
    'creative menu',
    'generous portions',
    'reasonably priced',
    'five star experience',
    'gem of a place',
    'culinary masterpiece',
    'impeccable service',
    'divine food',
    'stellar restaurant',
    'terrible experience',
    'worst meal ever',
    'awful service',
    'disappointing food',
    'overpriced garbage',
    'rude staff',
    'long wait times',
    'cold food served',
    'dirty restaurant',
    'tasteless dishes',
    'not worth the money',
    'poor hygiene standards',
    'stale ingredients',
    'unprofessional behavior',
    'terrible atmosphere',
    'worst customer service',
    'bland and boring',
    'overcooked meal',
    'small portions',
    'noisy environment',
    'unhelpful waiters',
    'food poisoning',
    'disgusting taste',
    'total waste of time',
    'never coming back',
    'horrible experience',
    'substandard quality',
    'shocking service',
    'completely inedible',
    'regret visiting',
    'avoid at all costs',
    'below average food',
    'unacceptable standards',
    'disappointing visit',
    'average food quality',
    'okay experience',
    'decent but not great',
    'mixed feelings about this place',
    'some dishes were good',
    'room for improvement',
    'hit or miss',
    'mediocre service',
    'could be better',
    'nothing special'
]
sentiment = np.array([    
    1,1,1,1,1,
    0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0
])

In [34]:
# vocabulary count
s = set()
maxLen = 0
for i in reviews:
    maxLen = max(maxLen, len(i.split()))
    for j in i.split():
        s.add(j)
        
vocab_size = len(s)
print("Vocabulary Size:", vocab_size)
print("Maximum Length:", maxLen)

Vocabulary Size: 156
Maximum Length: 5


In [55]:
one_hot_reviews = [one_hot(i, vocab_size) for i in reviews] # One-hot encoding of reviews
padded_reviews = pad_sequences(one_hot_reviews, maxlen=maxLen, padding='post') # Padding sequences
padded_reviews

array([[112, 106,   0,   0,   0],
       [ 39, 153,   0,   0,   0],
       [ 10,  12,   0,   0,   0],
       [108, 118, 124,   0,   0],
       [ 59,  91, 114,   0,   0],
       [ 17, 106,   0,   0,   0],
       [ 10,  91,  13,   0,   0],
       [ 34,  83,   0,   0,   0],
       [ 34,  66,   0,   0,   0],
       [106,  87,   0,   0,   0],
       [ 15,  83,   0,   0,   0],
       [119,  28,   0,   0,   0],
       [141,  91,   0,   0,   0],
       [ 50, 152,   0,   0,   0],
       [127,  33,   0,   0,   0],
       [ 52, 106, 113,   0,   0],
       [ 30, 126,   0,   0,   0],
       [105,  69,   0,   0,   0],
       [ 39,   3,   0,   0,   0],
       [ 69,  66,   0,   0,   0],
       [128, 153,   0,   0,   0],
       [131, 140,   0,   0,   0],
       [ 27,  14,  83,   0,   0],
       [125, 118, 124,   0,   0],
       [ 59, 101,  96,   0,   0],
       [132,  15,   0,   0,   0],
       [ 31, 138, 154,   0,   0],
       [141,  96, 141, 133,   0],
       [ 65,  10,  13,   0,   0],
       [153, 1

In [37]:
embedded_vector_size = 8
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedded_vector_size, input_length=maxLen, name='embedding_layer'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [43]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_reviews, sentiment, epochs=50)

Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8315 - loss: 0.6425  
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8315 - loss: 0.6392
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8315 - loss: 0.6362
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8539 - loss: 0.6331
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8539 - loss: 0.6301
Epoch 6/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8539 - loss: 0.6271
Epoch 7/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8539 - loss: 0.6240
Epoch 8/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8539 - loss: 0.6209
Epoch 9/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x1e044615190>

In [45]:
model.summary()

In [47]:
model.evaluate(padded_reviews, sentiment)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9551 - loss: 0.4501


[0.4501272141933441, 0.9550561904907227]

In [53]:
weights = model.get_layer('embedding_layer').get_weights()[0]
weights

array([[ 0.07318002, -0.07634708,  0.01948649, ...,  0.00162106,
         0.01155795, -0.00623969],
       [-0.04038109, -0.00814412,  0.02358997, ...,  0.0470768 ,
         0.0165456 , -0.04454863],
       [-0.17106025, -0.15866937,  0.13591193, ..., -0.0860195 ,
         0.11453905,  0.13375337],
       ...,
       [-0.02225987, -0.00239442, -0.02144768, ..., -0.10499281,
         0.14474848,  0.17353953],
       [-0.04687266, -0.21014754,  0.18640469, ...,  0.07473511,
        -0.05702414, -0.1540353 ],
       [-0.02568763, -0.02503891,  0.03771318, ...,  0.01062683,
        -0.01083267,  0.01875129]], shape=(156, 8), dtype=float32)

In [69]:
def similarity_score(word1, word2):
    idx1 = one_hot(word1, vocab_size)[0]
    idx2 = one_hot(word2, vocab_size)[0]
    vec1 = weights[idx1]
    vec2 = weights[idx2]
    cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return cos_sim.item()

In [79]:
print(similarity_score('poor', 'horrible'))
print(similarity_score('poor', 'great'))
print(similarity_score('waste', 'never'))
print(similarity_score('friendly', 'fresh'))

0.9322223663330078
-0.807985246181488
0.5522575378417969
0.9774363040924072
