In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Activation, Flatten, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.utils import to_categorical
import re


In [2]:
document = """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.  """

In [3]:
document = document.lower()
document = re.sub(r'[^a-z\s]', ' ', document)

In [4]:
#tokenize the document
tokenizer = Tokenizer()
tokenizer.fit_on_texts([document])
vocab_size = len(tokenizer.word_index)+1
word_index = tokenizer.word_index
index_word = tokenizer.index_word
sequences = tokenizer.texts_to_sequences([document])[0]


print(f"Vocabulary Size: {vocab_size}")
print(f"Word Index: {word_index}")
print(f"Tokenized Sequences: {sequences}")


Vocabulary Size: 94
Word Index: {'the': 1, 'of': 2, 'transmission': 3, 'influenza': 4, 'covid': 5, 'virus': 6, 'for': 7, 'is': 8, 'to': 9, 'a': 10, 'and': 11, 'between': 12, 'time': 13, 'serial': 14, 'interval': 15, 'than': 16, 'be': 17, 'days': 18, 'are': 19, 'viruses': 20, 'shorter': 21, 'from': 22, 'appearance': 23, 'symptoms': 24, 'while': 25, 'this': 26, 'that': 27, 'can': 28, 'in': 29, 'major': 30, 'driver': 31, 'number': 32, 'speed': 33, 'an': 34, 'important': 35, 'point': 36, 'difference': 37, 'two': 38, 'has': 39, 'median': 40, 'incubation': 41, 'period': 42, 'infection': 43, 'successive': 44, 'cases': 45, 'estimated': 46, 'means': 47, 'spread': 48, 'faster': 49, 'further': 50, 'first': 51, 'illness': 52, 'or': 53, 'potentially': 54, 'pre': 55, 'symptomatic': 56, 'before': 57, 'contrast': 58, 'we': 59, 'learning': 60, 'there': 61, 'people': 62, 'who': 63, 'shed': 64, 'hours': 65, 'prior': 66, 'symptom': 67, 'onset': 68, 'at': 69, 'present': 70, 'does': 71, 'not': 72, 'appear':

In [7]:
# Step 2: Generate Training Data for CBOW
window_size = 2  # You can adjust the window size (number of context words)
context = []
target = []

for i in range(window_size, len(sequences) - window_size):
    context.append([sequences[i - window_size], sequences[i + window_size]])
    target.append(sequences[i])

# Convert to numpy arrays
context = np.array(context)
target = np.array(target)

embedding_dim = 50  # Size of the word embedding
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
model.add(SpatialDropout1D(0.2))
model.add(Flatten())
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])



In [8]:
context = np.array([context[:, 0], context[:, 1]]).T  # Reshape for context words

model.fit(context, target, epochs=100, verbose=1)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0000e+00 - loss: 4.5498
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0210 - loss: 4.5277
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0705 - loss: 4.5078
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1975 - loss: 4.4940 
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2999 - loss: 4.4719
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3735 - loss: 4.4590
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4336 - loss: 4.4436 
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5713 - loss: 4.4210 
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x21f4d177fe0>

In [9]:
embeddings = model.layers[0].get_weights()[0]

# Show some example word embeddings
for word, i in word_index.items():
    print(f"Word: {word}, Embedding: {embeddings[i]}")

Word: the, Embedding: [-0.56278014 -0.46432444 -0.0355475  -0.4163941  -0.15997383 -0.5544567
  0.4262411   0.29150632  0.1440458   0.36098897  0.31397316  0.09393746
 -0.26478654  0.59468603  0.2589884   0.45379356  0.32067892  0.18476033
  0.09739178 -0.09707393 -0.47444785 -0.46674728 -0.3965753   0.35391015
  0.14546832 -0.26194534 -0.09302597  0.1904363  -0.4406688  -0.3154752
  0.5000725  -0.4785641   0.20495708  0.2188633   0.32443336 -0.4820419
  0.4042527  -0.5625696   0.02398057  0.33980095 -0.17411627 -0.6435106
  0.5167852   0.2886935   0.2992081   0.27438647  0.33166417  0.44310474
  0.31949002  0.31157607]
Word: of, Embedding: [ 0.42778227 -0.10081729  0.0386105  -0.59359795  0.04247351 -0.3994817
  0.6396983  -0.52429646  0.1235626   0.06263158  0.36487702  0.43665433
  0.31758806  0.55007404  0.27257612  0.46487057 -0.13250774 -0.5001951
  0.40234187  0.42771357 -0.01783779  0.11951721  0.34370437  0.05144372
 -0.5447822  -0.45251235  0.13035081 -0.36766365 -0.31175664 

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Activation, Flatten, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
import re

# Step 1: Preprocess the Text
document = """The speed of transmission is an important point of difference between the two viruses. 
Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. 
The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. 
In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult."""

# Clean and tokenize the text
document = document.lower()
document = re.sub(r'[^a-z\s]', '', document)  # Remove non-alphabetic characters

tokenizer = Tokenizer()
tokenizer.fit_on_texts([document])
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
word_index = tokenizer.word_index
index_word = tokenizer.index_word
sequences = tokenizer.texts_to_sequences([document])[0]

print(f"Vocabulary Size: {vocab_size}")
print(f"Word Index: {word_index}")
print(f"Tokenized Sequences: {sequences}")

# Step 2: Generate Context-Target Pairs for CBOW
window_size = 2  # Use context of 2 words before and after the target word
context = []
target = []

# Step 2: Generate Training Data for CBOW
window_size = 2  # You can adjust the window size (number of context words)
context = []
target = []

for i in range(window_size, len(sequences) - window_size):
    context.append([sequences[i - window_size], sequences[i + window_size]])
    target.append(sequences[i])

# Convert to numpy arrays
context = np.array(context)
target = np.array(target)

embedding_dim = 50  # Size of the word embedding
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
model.add(SpatialDropout1D(0.2))
model.add(Flatten())
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

# Step 4: Train the CBOW Model
context = np.array([context[:, 0], context[:, 1]]).T  # Reshape for context words
model.fit(context, target, epochs=100, verbose=1)

# Step 5: Predicting the Target Word from Context
def predict_word(context_words):
    # Convert context words to their corresponding indices
    context_indices = [word_index[word] for word in context_words]
    
    # Reshape context for the model (it expects a 2D array)
    context_input = np.array(context_indices).reshape(1, -1)
    
    # Predict the target word index
    predicted = model.predict(context_input)
    
    # Convert the predicted index to the corresponding word
    predicted_word_index = np.argmax(predicted, axis=1)[0]
    predicted_word = index_word[predicted_word_index]
    
    return predicted_word

# Example Prediction - Predict target word for context words ["speed", "of"]
context_test = ["influenza", "viruses"]
predicted_word = predict_word(context_test)

print(f"Context: {context_test}")
print(f"Predicted Target Word: {predicted_word}")


Vocabulary Size: 93
Word Index: {'the': 1, 'of': 2, 'transmission': 3, 'influenza': 4, 'covid': 5, 'virus': 6, 'for': 7, 'is': 8, 'to': 9, 'a': 10, 'and': 11, 'between': 12, 'serial': 13, 'interval': 14, 'than': 15, 'be': 16, 'days': 17, 'are': 18, 'viruses': 19, 'shorter': 20, 'time': 21, 'from': 22, 'appearance': 23, 'symptoms': 24, 'while': 25, 'this': 26, 'that': 27, 'can': 28, 'in': 29, 'major': 30, 'driver': 31, 'number': 32, 'speed': 33, 'an': 34, 'important': 35, 'point': 36, 'difference': 37, 'two': 38, 'has': 39, 'median': 40, 'incubation': 41, 'period': 42, 'infection': 43, 'successive': 44, 'cases': 45, 'estimated': 46, 'means': 47, 'spread': 48, 'faster': 49, 'further': 50, 'first': 51, 'illness': 52, 'or': 53, 'potentially': 54, 'presymptomatic': 55, 'before': 56, 'contrast': 57, 'we': 58, 'learning': 59, 'there': 60, 'people': 61, 'who': 62, 'shed': 63, 'hours': 64, 'prior': 65, 'symptom': 66, 'onset': 67, 'at': 68, 'present': 69, 'does': 70, 'not': 71, 'appear': 72, 're

In [32]:
context_test = ["covid","influenza"]
predicted_word = predict_word(context_test)

print(f"Context: {context_test}")
print(f"Predicted Target Word: {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
Context: ['covid', 'influenza']
Predicted Target Word: transmission


In [34]:
distance_matrix = np.sqrt(((embeddings[:, np.newaxis] - embeddings[np.newaxis, :]) ** 2).sum(axis=2))

# Step 5: Define function to find similar words using distance matrix
def find_similar_words(search_term, top_n=6):
    search_term_idx = word_index.get(search_term)
    
    if not search_term_idx:
        return f"'{search_term}' not found in the vocabulary."
    
    similar_indices = distance_matrix[search_term_idx - 1].argsort()[1:top_n + 1]
    similar_words = {search_term: [index_word[idx + 1] for idx in similar_indices]}
    
    return similar_words

# Example: Find similar words to 'influenza'
similar_words = find_similar_words('viruses')
print(f"Similar words to 'influenza': {similar_words}")

Similar words to 'influenza': {'viruses': ['estimates', 'we', 'present', 'serial', 'the', 'both']}
