In [130]:
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.preprocessing import text
from tensorflow.keras.utils import to_categorical
import numpy as np
import requests
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize

Download and preprocess the dataset, which consists of Shakespeare's plays.

In [163]:
# Download the text data
path_to_file = tf.keras.utils.get_file(
   'shakespeare.txt', 
   'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
text = text.lower()

In [164]:
# Split data into training/testing
sentences = sent_tokenize(text)
split = int(len(sentences) * 0.8)
train_sent = sentences[:split]
test_sent = sentences[split:]

# Create tf datasets
ds_sentences = tf.data.Dataset.from_tensor_slices(sentences)
ds_train = tf.data.Dataset.from_tensor_slices(train_sent)
ds_test = tf.data.Dataset.from_tensor_slices(test_sent)

Define the Skip-Gram model.

In [165]:
# Define the Skip-Gram model
vocab_size = 10000

vectorizer = keras.layers.experimental.preprocessing.TextVectorization(max_tokens=vocab_size,input_shape=(1,))
embedder = keras.layers.Embedding(vocab_size,30,input_length=1)

model = keras.Sequential([
    embedder,
    keras.layers.Flatten(),
    keras.layers.Dense(vocab_size,activation='softmax')
])

model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 1, 30)             300000    
                                                                 
 flatten_12 (Flatten)        (None, 30)                0         
                                                                 
 dense_17 (Dense)            (None, 10000)             310000    
                                                                 
Total params: 610,000
Trainable params: 610,000
Non-trainable params: 0
_________________________________________________________________


In [166]:
vectorizer.adapt(ds_train.batch(256))
vocab = vectorizer.get_vocabulary()

In [136]:
print(vocab)



Generate testing and training data for the model.

In [167]:
# Function to generate Skip-gram pairs
def to_skipgram(sent, window_size=2):
    pairs = []
    for i, target in enumerate(sent):
        for j in range(max(0, i - window_size), min(i + window_size + 1, len(sent))):
            if i != j:
                pairs.append([target, sent[j]])
    return pairs

In [169]:
# Lists to store Skip-gram data
X_train = []
Y_train = []
X_test = []
Y_test = []

# Generate Skip-gram pairs for training data
for sentence in train_sent:
    sent = vectorizer(sentence)
    pairs = to_skipgram(sent, window_size=2)
    for pair in pairs:
        X_train.append(pair[0])
        Y_train.append(pair[1])

# Generate Skip-gram pairs for testing data
for sentence in test_sent:
    sent = vectorizer([sentence])
    pairs = to_skipgram(sent, window_size=2)
    for pair in pairs:
        X_test.append(pair[0])
        Y_test.append(pair[1])

In [171]:
# Create training/testing datasets
ds_train = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).batch(256)
ds_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(256)

In [172]:
# Compile the model
model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.1),loss='sparse_categorical_crossentropy')

# Train the model
model.fit(ds_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21a74b1eb20>

Predict neighboring words in the dataset.

In [174]:
# Extract learned word embeddings
embeddings = model.layers[0].get_weights()[0]

In [175]:
# Function to find neighboring words given a target word
def find_neighboring_words(target_word, embeddings, vocab, top_n=5):
    if target_word not in vocab:
        return []

    target_index = vocab.index(target_word)
    target_embedding = embeddings[target_index]

    # Calculate cosine similarities between the target embedding and all other embeddings
    similarities = np.dot(embeddings, target_embedding) / (
        np.linalg.norm(embeddings, axis=1) * np.linalg.norm(target_embedding)
    )

    # Get the indices of the top-n most similar words
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Retrieve the neighboring words based on the indices
    neighboring_words = [vocab[i] for i in top_indices if i != target_index]

    return neighboring_words

In [181]:
# Example: Find neighboring words for a target word
target_word = "all"
target_word = target_word.lower()
neighboring_words = find_neighboring_words(target_word, embeddings, vectorizer.get_vocabulary())

print(f"Neighboring words for '{target_word}': {neighboring_words}")

Neighboring words for 'all': ['the', 'these', 'us', 'disease']
