In [5]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding,Dense,GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer

## Data Preparation

In [7]:
#corpus=["The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult."]

corpus = [] #The corpus is a collection of text documents, and each document is a sequence of words.
with open("../LP-IV-datasets/CBOW/CBOW.txt", "r") as f:
    for line in f:
        corpus.append(line.strip()) #reads each line from the text.txt file, removes any whitespace from the end of the 
        							#line, and adds the line to the corpus list.
            
corpus
            

['The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19.',
 '',
 'Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission.',
 '',
 'The reproductive number – the number of secondary infections generated from one infected individual – is understood 

In [8]:
#Create tokenizer object and fit it on the corpus
#Tokenizer object is used to convert words into numerical identifiers that can be processed by the model.
tokenizer=Tokenizer()     
tokenizer.fit_on_texts(corpus)#Tokenizer object learns the vocabulary of the corpus and creates a mapping between words and 
							 #their corresponding indices.



word_index=tokenizer.word_index
#The word_index dictionary stores the mapping between words and their corresponding indices
#This dictionary is used to convert the text data into numerical data that can be processed by the model.


sequences=tokenizer.texts_to_sequences(corpus)
x,y=[],[]    #The x list will store the sequences aof word indices, and the y list will store the labels. The labels are 1 
				#for semantically similar pairs of words and 0 for dissimilar pairs.
#generate sequences of word indices. 
#The sequences of word indices are generated using the tokenizer's texts_to_sequences method. 
#This method takes a list of text documents as input and returns a list of sequences of word indices. 
#Each sequence of word indices represents a single text document.

#sequences

[[1,
  38,
  2,
  8,
  9,
  39,
  40,
  41,
  2,
  42,
  13,
  1,
  43,
  23,
  3,
  44,
  11,
  24,
  45,
  46,
  47,
  1,
  14,
  25,
  48,
  10,
  26,
  2,
  27,
  12,
  11,
  24,
  15,
  16,
  1,
  14,
  13,
  49,
  50,
  17,
  4,
  5,
  6,
  1,
  15,
  16,
  7,
  4,
  5,
  6,
  9,
  51,
  10,
  18,
  19,
  52,
  20,
  28,
  7,
  3,
  6,
  1,
  15,
  16,
  9,
  29,
  20,
  30,
  53,
  31,
  3,
  32,
  54,
  55,
  17,
  4,
  5],
 [],
 [56,
  8,
  33,
  1,
  57,
  29,
  19,
  20,
  2,
  58,
  59,
  60,
  61,
  62,
  8,
  63,
  2,
  1,
  6,
  64,
  1,
  26,
  2,
  27,
  21,
  9,
  11,
  34,
  35,
  2,
  8,
  7,
  3,
  33,
  65,
  28,
  66,
  22,
  67,
  31,
  68,
  22,
  69,
  70,
  32,
  71,
  4,
  5,
  6,
  72,
  73,
  74,
  75,
  10,
  76,
  77,
  78,
  79,
  30,
  80,
  81,
  82,
  10,
  18,
  11,
  34,
  35,
  2,
  8],
 [],
 [1,
  83,
  36,
  21,
  1,
  36,
  2,
  84,
  85,
  86,
  25,
  87,
  88,
  89,
  21,
  9,
  90,
  10,
  18,
  13,
  37,
  12,
  37,
  19,
  7,
  4,
  5,
  6

In [5]:
#Outer Loop: The outer loop iterates through the sequence of words (seq), assigning each word to the variable target_word. 
			#The enumerate() function is used to keep track of the index (i) of each word in the sequence.
#Inner Loop: The inner loop iterates through a range of indices (j) around the current target word (target_word).
			#The starting index is max(0, i - 2), ensuring that it doesn't go below 0. 
    		#The ending index is min(i + 3, len(seq)), ensuring that it doesn't exceed the sequence length.

for seq in sequences:
    for i, target_word in enumerate(seq): #enumerate() function is used in the code to generate sequences of positive and negative context words for each target word. Specifically, the code iterates through each sequence of word indices (seq) and assigns each word to the variable target_word. The enumerate() function is used to keep track of the index (i) of each word in the sequence.
        for j in range(max(0,i-2),min(i+3,len(seq))):
            if i != j:
                x.append([target_word, seq[j]])
                y.append(1)
                x.append([target_word,np.random.choice(list(word_index.values()))])
                y.append(0)
#The provided code snippet iterates through each sequence of words (seq) in the sequences list.
#For each target word (target_word) in the current sequence, it extracts two context words: one positive context word 
	#(seq[j]) and one negative context word (np.random.choice(list(word_index.values()))).               
#Positive context word- semantically same to target word
#Negative context word- not semantically similar to the target word.(choosen randomly from the vocabulary)
    
    
    
    
#convert the lists of word indices (x) and labels (y) to NumPy arrays for efficient processing.
#NumPy arrays are more efficient to use than lists because they can be manipulated more quickly and easily.
x=np.array(x)                
y=np.array(y)

## Model Building

In [6]:
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=1, input_length=2),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid')
])

#Embedding: This layer embeds each word pair into a low-dimensional vector. The input_dim parameter specifies the vocabulary
#size (the number of unique words), and the output_dim parameter determines the embedding dimension. 
#The embedding dimension is a hyperparameter that can be tuned to improve the model's performance.



#GlobalAveragePooling1D: This layer averages the embedding vectors for the two words in each pair. This means that the
	#output of this layer is a single vector that represents the semantic similarity of the two words.
#always used with Embeddinglayer 


#Dense: This layer outputs a single value representing the predicted probability of the two words being semantically 
#similar. The activation function is sigmoid, which produces a value between 0 and 1. A value of 1 means that the model 
#predicts that the two words are semantically similar, and a value of 0 means that the model predicts that the two words 
#are not semantically similar.



model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 1)              103       
                                                                 
 global_average_pooling1d (  (None, 1)                 0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 2         
                                                                 
Total params: 105 (420.00 Byte)
Trainable params: 105 (420.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Model COmpilation and Training

In [7]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
H=model.fit(x,y,epochs=100,verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


## Word Embeddings Generation

In [8]:
word_embeddings=model.layers[0].get_weights()[0]#extracts the weights of the embedding layer.
#embedding layer is the first layer in the model, it's accessed with index 0
#get_weigths() method returns a list of weights for the layer & the first element of the list corresponds to the embedding 

for word, index in word_index.items():   #word_index dictionary is used to map each word to its corresponding index in
    									 #the embedding matrix.
    print(f"{word}: {word_embeddings[index]}")

the: [-0.50820553]
of: [-0.39724612]
influenza: [-0.36985838]
covid: [-0.39427823]
19: [-0.25514337]
virus: [-0.33730212]
for: [-0.4163628]
transmission: [-0.25232625]
is: [-0.33559835]
to: [-0.15449837]
a: [-0.3972882]
and: [-0.34976885]
between: [-0.18421139]
time: [-0.42584416]
serial: [-0.04274696]
interval: [-0.2568933]
than: [-0.2349225]
be: [0.0002841]
5: [-0.26301587]
days: [-0.05872761]
â€“: [-0.22338726]
are: [-0.46534672]
viruses: [0.01737609]
shorter: [-0.08138589]
from: [-0.4590055]
appearance: [0.03403454]
symptoms: [0.05540299]
while: [-0.10184057]
3: [-0.00510487]
this: [-0.49724892]
that: [0.22803833]
can: [-0.37410295]
in: [-0.08756864]
major: [-0.21287668]
driver: [0.01566189]
number: [0.14530027]
2: [-0.00539199]
speed: [0.38281143]
an: [-0.02985298]
important: [-0.02705348]
point: [-0.09553389]
difference: [0.2614949]
two: [0.15722242]
has: [0.06253094]
median: [0.02398031]
incubation: [0.08154372]
period: [0.04801432]
infection: [0.21617281]
successive: [0.3703797

In [None]:
# e. Prediction
word_to_predict = "period"
context_word = tokenizer.texts_to_sequences([[word_to_predict]])[0][0]
#selecct a traget word and a random context word

#convert it into vector
context_vector = np.array([context_word, np.random.choice(list(word_index.values()))])

predicted_proba = model.predict(np.array([context_vector]))
print(f"Predicted probability for context '{word_to_predict}': {predicted_proba[0][0]}")

In [None]:
# f. Plot graph
import matplotlib.pyplot as plt 

plt.plot(H.history['loss'])
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()