In [None]:
from keras import layers,Sequential
from keras.layers import Dense,Embedding
import pandas as pd

In [None]:
# Tiny demo dataset: one positive (1) and one negative (0) sentence.
df=pd.DataFrame(
    [
        ('I am a good boy',1),
        ('he is a very very bad boy',0),
    ],
    columns=['Sentence','Sentiment'],
)
df

Unnamed: 0,Sentence,Sentiment
0,I am a good boy,1
1,he is a very very bad boy,0


In [None]:
# Encode the vocabulary as integer ids.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
# Build the word -> id mapping from every sentence in the demo frame.
tokenizer.fit_on_texts(df['Sentence'])
# The mapping displayed below holds lowercased words with ids starting at 1.
word_index = tokenizer.word_index
word_index

{'a': 1,
 'boy': 2,
 'very': 3,
 'i': 4,
 'am': 5,
 'good': 6,
 'he': 7,
 'is': 8,
 'bad': 9}

In [None]:
# Convert each sentence into a list of word ids using the fitted tokenizer.
sequences=tokenizer.texts_to_sequences(df['Sentence'])
sequences

[[4, 5, 1, 6, 2], [7, 8, 1, 3, 3, 9, 2]]

In [None]:
# Pad the sequences to equal length; padding='post' appends zeros after the
# shorter sentence (see the output below).
padded_sequences=pad_sequences(sequences,padding='post')
padded_sequences

array([[4, 5, 1, 6, 2, 0, 0],
       [7, 8, 1, 3, 3, 9, 2]], dtype=int32)

In [None]:
vocab_size = len(word_index) + 1  # +1 because word ids start at 1; id 0 is the padding value
output_dim = 3
# Take the sequence length from the data itself: the padded batch is 7 tokens
# wide, so a hard-coded length of 5 would not describe the real input.
input_length = padded_sequences.shape[1]

model = Sequential()

# Embedding is a trainable lookup table of shape (vocab_size, output_dim).
# Conceptually this equals one-hot encoding every word (length vocab_size) and
# multiplying it by the weight matrix, which yields one output_dim-sized vector
# per word. For a batch the result has shape (batch_size, input_length, output_dim);
# Flatten then reshapes it to (batch_size, input_length * output_dim).
# The input length is declared with an Input layer because the Embedding
# `input_length` argument is deprecated in Keras 3.
model.add(layers.Input(shape=(input_length,)))
model.add(Embedding(input_dim=vocab_size, output_dim=output_dim))
model.add(layers.Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_sequences,df[['Sentiment']].values,epochs=5,batch_size=1)



Epoch 1/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.0000e+00 - loss: 0.7148
Epoch 2/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 0.7187
Epoch 3/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.0000e+00 - loss: 0.7086 
Epoch 4/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.0000e+00 - loss: 0.7057
Epoch 5/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.0000e+00 - loss: 0.7089


<keras.src.callbacks.history.History at 0x79d2e1c637d0>

In [None]:
# Print the layer-by-layer summary of the sentiment model built above.
model.summary()

In [None]:
# Pull the learned embedding matrix out of the trained sentiment model.
embedding_layer = model.layers[0]  # first layer is Embedding
embeddings = embedding_layer.get_weights()[0]  # shape: (vocab_size, output_dim)



# Word2Vec

Word2Vec is a word embedding technique in natural language processing (NLP) that represents words as vectors in a continuous vector space. Developed by researchers at Google, Word2Vec maps words to high-dimensional vectors that capture the semantic relationships between them. It works on the principle that words with similar meanings should have similar vector representations. Word2Vec offers two architectures:
#### 1) CBOW
#### 2) Skip-gram

Embeddings are a by-product of training a model on some task, as in the sentiment example above.
Word2Vec therefore sets up a dummy (proxy) prediction problem; CBOW and Skip-gram differ only in how that dummy problem is defined.

## 1. Continuous Bag-of-Words (CBOW)

*   **Dummy Problem:** **Given the context words, predict the target word** (the middle word).
*   **Training Data Creation:** A fixed-size window (e.g., 3 words: C1, T, C2) is slid across the corpus. The input consists of the context words (C1, C2), and the expected output is the target word (T).
*   **Neural Network Structure:**
    *   Input: Typically one-hot vectors representing the context words.
    *   Hidden Layer: The number of nodes in the hidden layer determines the desired **dimension** of the output vector (e.g., 3 nodes for a 3-dimensional vector).
    *   Output Layer: Nodes equal to the size of the total vocabulary, often using a Softmax layer to predict the probability of the target word.
*   **Vector Extraction:** After training the network across multiple epochs to minimize the loss, the **weights** connecting the input layer to the hidden layer become the final word vectors (embeddings).

## 2. Skip-gram

*   **Dummy Problem:** **Given the target word, predict the context words**.
*   **Structure:** Skip-gram reverses the architecture of CBOW. The target word is the input, and the context words (C1, C2) are the desired outputs.
*   **Vector Extraction:** Similar to CBOW, the embeddings are extracted from the weights of the trained neural network.

## 3. Architecture Selection Guidance

*   **Small Data:** Use **CBOW** (it is faster and performs well on smaller datasets).
*   **Large Data:** Use **Skip-gram** (it tends to yield better results on larger datasets).


In [None]:
# CBOW implementation

In [None]:
# Re-display the tokenised sentences that the training pairs are generated from.
sequences

[[4, 5, 1, 6, 2], [7, 8, 1, 3, 3, 9, 2]]

In [None]:
#generating input output pairs
# window_size is the no of words before and after the target as input
# like if window size is one and sequence = i am the king then possible input output pairs are
# X = [[am],        y = i
#      [i, the],    y = am
#      [am, king],  y = the
#      [the]]       y = king

import numpy as np

def generate_pairs(sequences,window_size):
  """Build CBOW training pairs from tokenised sentences.

  For every position in every sentence, the words up to `window_size`
  places before and after it form the context (input) and the word at
  that position is the target (label).

  Args:
    sequences: list of sentences, each a list of integer word ids.
    window_size: number of words taken on each side of the target.

  Returns:
    (X, y): X holds the context rows after `pad_sequences(..., padding='post')`
    has brought them to a common length; y is a 1-D array with one target id
    per row.
  """
  contexts=[]
  targets=[]
  for sentence in sequences:
    n_words=len(sentence)
    for pos,centre in enumerate(sentence):
      first=max(0,pos-window_size)
      last=min(n_words,pos+window_size)
      # Words left of the centre, then words right of it; the centre is skipped.
      neighbours=list(sentence[first:pos])+list(sentence[pos+1:last+1])
      contexts.append(neighbours)
      targets.append(centre)
  # Contexts near sentence edges are shorter, so pad them to equal length.
  padded=pad_sequences(contexts,padding='post')
  return np.array(padded),np.array(targets)

In [None]:
# Build the CBOW training pairs with a window of 2 words on each side of the target.
X,y=generate_pairs(sequences,2)

In [None]:
from keras.models import Model
from keras.layers import Embedding,Dense,Input,Lambda,GlobalAveragePooling1D
import tensorflow.keras.backend as K

# Size of both the embedding table and the softmax: one slot per word id plus
# the padding id 0. Derived from the vocabulary rather than from the targets
# present in y, so it stays correct when some word never occurs as a target.
n_classes=len(word_index)+1

inp=Input(shape=(X.shape[1],))

# mask_zero=True marks the padding id 0 so that it can be ignored downstream.
# The deprecated `input_length` argument is dropped; the Input layer above
# already fixes the sequence length.
x=Embedding(input_dim=n_classes,output_dim=2,mask_zero=True)(inp)
# CBOW averages the context word vectors; this pooling layer honours the mask,
# so padded positions do not dilute the average.
# NOTE(review): a one-word sentence produces an all-padding context row, for
# which a masked mean is presumably undefined - filter such rows before training.
x=GlobalAveragePooling1D()(x)

output=Dense(n_classes,activation='softmax')(x)


model=Model(inputs=inp,outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()



In [None]:
# Train the CBOW model on the (context, target) pairs for 5 epochs.
model.fit(X,y,epochs=5,batch_size=2)

Epoch 1/5
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 2.3000
Epoch 2/5
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 2.2994
Epoch 3/5
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.2993  
Epoch 4/5
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.3016 
Epoch 5/5
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 2.2923 


<keras.src.callbacks.history.History at 0x79d2e1d47620>

# implementing Skip-gram

#### In skip-gram, a word is used to predict its context words, so the roles are swapped relative to CBOW: the words that were X (context) in CBOW become y, and the word that was y (target) becomes X.

In [None]:
# Example: sentence "I am the king" with window_size = 1
# (centre word -> context word)
# I    -> am
# am   -> I
# am   -> the
# the  -> am
# the  -> king
# king -> the

def generate_pairs(sequences,window_size):
  """Build skip-gram training pairs from tokenised sentences.

  Every word is paired with each word at most `window_size` places before
  or after it in the same sentence. The centre word is the input and the
  context word is the label, i.e. the reverse of the CBOW pairing.

  Note: this definition shadows the CBOW `generate_pairs` defined earlier
  in the notebook.

  Args:
    sequences: list of sentences, each a list of integer word ids.
    window_size: number of words taken on each side of the centre word.

  Returns:
    (X, y): two 1-D arrays of equal length; X[k] is a centre word id and
    y[k] is one of its context word ids.
  """
  X=[]
  y=[]
  for sentence in sequences:
    for j,centre in enumerate(sentence):
      start=max(0,j-window_size)
      # Slicing clips at the end of the sentence, so no upper clamp is needed.
      context=list(sentence[start:j])+list(sentence[j+1:j+window_size+1])
      for word in context:
        # Skip-gram direction: centre word in, context word out.
        X.append(centre)
        y.append(word)
  return np.array(X),np.array(y)

In [None]:
# Build the skip-gram pairs with a window of 2 words on each side of each word.
X,y=generate_pairs(sequences,2)

In [None]:
# Inspect the two generated pair arrays side by side.
X,y

(array([5, 1, 4, 1, 6, 4, 5, 6, 2, 5, 1, 2, 1, 6, 8, 1, 7, 1, 3, 7, 8, 3,
        3, 8, 1, 3, 9, 1, 3, 9, 2, 3, 3, 2, 3, 9]),
 array([4, 4, 5, 5, 5, 1, 1, 1, 1, 6, 6, 6, 2, 2, 7, 7, 8, 8, 8, 1, 1, 1,
        1, 3, 3, 3, 3, 3, 3, 3, 3, 9, 9, 9, 2, 2]))

In [None]:
from keras.models import Model
from keras.layers import Embedding,Dense,Input,Lambda,Flatten
import tensorflow.keras.backend as K

# Size of both the embedding table and the softmax: one slot per word id plus
# the unused id 0. Derived from the vocabulary rather than from the labels
# present in y, so it stays correct when some word never occurs as a label.
n_classes=len(word_index)+1

# Skip-gram takes a single word id as input.
inp=Input(shape=(1,))

# The deprecated `input_length` argument is dropped; the Input layer above
# already fixes the sequence length to 1.
x=Embedding(input_dim=n_classes,output_dim=2)(inp)
# Only one word per sample, so averaging over the sequence axis is just a
# reshape from (batch, 1, 2) to (batch, 2).
x=Flatten()(x)

output=Dense(n_classes,activation='softmax')(x)


model=Model(inputs=inp,outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

In [None]:
!pip install gensim



In [None]:
# Building Word2Vec with gensim
from gensim.models import Word2Vec

# Toy corpus: each sentence is lowercased and split on whitespace, giving the
# list-of-token-lists format that Word2Vec expects.
corpus = [
    sentence.lower().split()
    for sentence in (
        "I love deep learning",
        "Deep learning is fun",
        "I am learning NLP",
    )
]

# Show the tokenised corpus.
print(corpus)

[['i', 'love', 'deep', 'learning'], ['deep', 'learning', 'is', 'fun'], ['i', 'am', 'learning', 'nlp']]


In [None]:
# Train a gensim Word2Vec model on the toy corpus in CBOW mode (sg=0; sg=1 would select skip-gram).
model = Word2Vec(sentences=corpus,
                 vector_size=50,   # embedding dimension
                 window=2,        # context window size
                 min_count=1,     # drop words with frequency < 1, i.e. keep every word
                 sg=0,            # 0 = CBOW, 1 = Skip-Gram
                 workers=4)       # parallel training threads


In [None]:
# Look up the trained vector for one word; its length equals vector_size (50).
vector = model.wv['learning']
print("Vector shape:", vector.shape)
print(vector)


Vector shape: (50,)
[-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]


In [None]:
# Ask the trained vectors for the 5 words most similar to 'learning'.
similar_words = model.wv.most_similar('learning', topn=5)
print(similar_words)

In [None]:
# Save the trained CBOW model to a file in the current working directory.
model.save("cbow_model.gensim")

# Load it back from the same file; `model` is rebound to the loaded instance.
# (Word2Vec is re-imported here so this cell does not depend on the earlier import.)
from gensim.models import Word2Vec
model = Word2Vec.load("cbow_model.gensim")

In [None]:
# GOOGLE news word2vec pretrained model

#This model uses Skip-Gram with negative sampling, not CBOW.

In [None]:
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# no longer available here

In [None]:
from gensim.models import KeyedVectors
# Download the pretrained GoogleNews vectors first, then set model_path to
# where the file was saved (a bare file name is resolved against the current
# working directory).
model_path = "GoogleNews-vectors-negative300.bin.gz"
# binary=True reads the word2vec binary format. Note that `model` is rebound
# here to a KeyedVectors object, replacing the Word2Vec model from earlier cells.
model = KeyedVectors.load_word2vec_format(model_path, binary=True)