In [18]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Lambda,Dense
import tensorflow.keras.backend as K

In [26]:
# Textual Document 1
text = """The speed of transmission is an important point of difference between the two viruses. 
Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) 
and a shorter serial interval (the time between successive cases) than COVID-19 virus. 
The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. 
This means that influenza can spread faster than COVID-19. 
Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus 
before the appearance of symptoms – is a major driver of transmission for influenza. 
In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, 
at present, this does not appear to be a major driver of transmission. 
The reproductive number – the number of secondary infections generated from one infected individual – 
is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza."""

# Load textual document from file
# with open("document1.txt", "r", encoding="utf-8") as f:
    # text = f.read()

In [20]:
# Tokenize text
tok=Tokenizer()
tok.fit_on_texts([text])
seq=tok.texts_to_sequences([text])[0]
vocab_size=len(tok.word_index)+1

word_index=tok.word_index
index_word={i: w for w, i in word_index.items()}

print("Vocabulary size:", vocab_size)
print("Sequence:", seq)

Vocabulary size: 92
Sequence: [1, 37, 2, 4, 5, 38, 39, 40, 2, 41, 12, 1, 42, 43, 6, 44, 11, 20, 45, 46, 47, 1, 21, 22, 48, 7, 23, 2, 24, 25, 11, 20, 13, 14, 1, 21, 12, 49, 50, 15, 8, 9, 3, 1, 13, 14, 10, 8, 9, 3, 5, 51, 7, 16, 17, 52, 18, 26, 10, 6, 3, 1, 13, 14, 5, 27, 18, 28, 53, 29, 6, 30, 54, 55, 15, 8, 9, 56, 4, 31, 1, 57, 27, 17, 18, 2, 58, 59, 60, 61, 62, 4, 63, 2, 1, 3, 64, 1, 23, 2, 24, 19, 5, 11, 32, 33, 2, 4, 10, 6, 31, 65, 26, 66, 34, 67, 29, 68, 34, 69, 70, 30, 71, 8, 9, 3, 72, 73, 74, 75, 7, 76, 77, 78, 79, 28, 80, 81, 82, 7, 16, 11, 32, 33, 2, 4, 1, 83, 35, 19, 1, 35, 2, 84, 85, 86, 22, 87, 88, 89, 19, 5, 90, 7, 16, 12, 36, 25, 36, 17, 10, 8, 9, 3, 91, 15, 10, 6]


In [21]:
# GENERATE TRAINING DATA (CBOW)

window=2
x,y=[],[]

for i in range(window,len(seq)-window):
    context=seq[i-window:i]+seq[i+1:i+window+1]
    x.append(context)
    y.append(seq[i])

x=np.array(x)
y=np.array(y)

print("\nContext samples:\n", x[:5])
print("\nTarget samples:\n", y[:5])


Context samples:
 [[ 1 37  4  5]
 [37  2  5 38]
 [ 2  4 38 39]
 [ 4  5 39 40]
 [ 5 38 40  2]]

Target samples:
 [ 2  4  5 38 39]


In [22]:
# TRAIN CBOW MODEL

model=Sequential([
    Embedding(vocab_size,8),
    Lambda(lambda x: K.mean(x,axis=1)),
    Dense(vocab_size,activation='softmax')
])

In [23]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam'
)
model.fit(
    x,y,
    epochs=200,verbose=0
)
model.summary()
print("Model train completed")

Model train completed


In [24]:
embedding=model.layers[0].get_weights()[0]

print("\n Word Embedding:")
for word,idx in word_index.items():
    print(f"{word} -> {embedding[idx]}")


 Word Embedding:
the -> [ 0.48223373 -0.7064093   0.4739999  -1.2804916  -1.41661    -1.0705047
 -0.5535043   1.3532219 ]
of -> [-1.0691972   1.0154612   0.9003145  -0.35069826 -1.0400981  -0.1958842
  0.04818124 -1.070577  ]
virus -> [-2.017549   -1.5765887  -0.9236948  -1.3491915  -0.69449824 -0.6293216
 -1.246565   -0.39912507]
transmission -> [-1.481478    0.7053228   0.56648254 -0.5956526  -1.0440086  -1.7377993
 -0.59124696  0.07005214]
is -> [ 0.10857022  0.05405532  1.0784173  -0.40956917 -1.236673    0.5854724
  0.6084818  -0.5346041 ]
influenza -> [-0.37834445  0.4837226  -1.1962872  -0.37750697  0.6871302  -0.8964035
 -0.54950213  0.39822844]
to -> [ 1.3806252  -0.568329    1.6155183   0.49527115 -0.34260574  0.49761575
  0.18177035 -1.393679  ]
covid -> [-0.883328   -1.8170197  -1.0197765  -1.0955948   0.02089258 -1.4588723
  0.6937738  -0.6710783 ]
19 -> [-1.2808963 -1.3693185 -1.1041049 -1.5313689  0.6443467 -1.7052262
 -0.5805274  1.2053832]
for -> [-0.22486846 -0.97526

In [25]:
def predict_missing(w1, w2, w3, w4):
    try:
        seq = np.array([word_index[w1], word_index[w2], word_index[w3], word_index[w4]]).reshape(1, 4)
    except KeyError as e:
        return f"Word not found in vocabulary: {e}"

    pred = model.predict(seq, verbose=0)
    pred_id = int(np.argmax(pred))
    return index_word[pred_id]

print("\nPredictions:")
print(predict_missing("the", "speed", "transmission", "is"))
print(predict_missing("the", "serial", "for", "virus"))


Predictions:
of
virus
