In [8]:
import numpy as np
from collections import Counter

# --- Corpus / vocabulary configuration -------------------------------------
CORPUS_FILE = "corpus_100k.txt"  # whitespace-tokenised text corpus
MIN_COUNT = 2                    # tokens seen fewer times than this are left out of the vocab
WINDOW_N = 4                     # number of preceding tokens used as context

# Read the whole corpus and split it on whitespace.
with open(CORPUS_FILE, "r", encoding="utf-8") as f:
    tokens = f.read().split()

# Token frequencies; the vocabulary keeps first-occurrence order of the
# tokens that reach MIN_COUNT.
freq = Counter(tokens)
vocab = [word for word, count in freq.items() if count >= MIN_COUNT]

# Lookup tables in both directions (word -> id and id -> word).
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))

V = len(vocab)
print("Vocab Size: ", V)

Vocab Size:  7010


In [3]:
# Map every corpus token to its vocabulary id. Tokens that are not in
# word2idx (word2idx.get returns None for them) are dropped from the sequence.
ids = [token_id for token_id in map(word2idx.get, tokens) if token_id is not None]

# creating (context, target) pairs from the window size
def build_context_target(idx, n=None):
    """Build next-token training pairs from a sequence of token ids.

    For every position ``i >= n`` the ``n`` ids before ``i`` form the context
    row and the id at ``i`` is the target, e.g. with ``n = 4``:
    ``X[0] = idx[0:4]`` and ``y[0] = idx[4]``.

    Parameters
    ----------
    idx : sequence of int
        Token ids in corpus order. This argument is what gets windowed; the
        function does not read any module-level id list.
    n : int, optional
        Context length. When omitted, the module-level ``WINDOW_N`` is looked
        up at call time.

    Returns
    -------
    X : np.ndarray, shape (num_pairs, n), dtype int32
        Context windows. ``num_pairs`` is ``max(len(idx) - n, 0)``.
    y : np.ndarray, shape (num_pairs,), dtype int32
        Target id for each context window.
    """
    if n is None:
        n = WINDOW_N

    seq = list(idx)
    positions = range(n, len(seq))
    contexts = [seq[i - n:i] for i in positions]
    targets = [seq[i] for i in positions]

    # The explicit reshape keeps X two-dimensional even when there are no
    # pairs at all (sequence not longer than the window).
    X = np.array(contexts, dtype=np.int32).reshape(len(targets), n)
    y = np.array(targets, dtype=np.int32)
    return X, y

# Materialise the (context, next-token) training arrays for the whole corpus.
X, y = build_context_target(idx=ids, n=WINDOW_N)
print("Pairs:", X.shape, y.shape)
    

Pairs: (138667, 4) (138667,)


In [6]:
#Softmax Model
import tensorflow as tf 
from tensorflow.keras.layers import Input, Embedding, Reshape, Dense, Concatenate
from tensorflow.keras.models import Model 

# --- Model / training hyperparameters --------------------------------------
EMBED_DIM = 128   # width of each token embedding
HIDDEN = 256      # units in the hidden Dense layer
LR = 1e-3         # learning rate handed to the Adam optimizer
BATCH = 8192      # examples per training batch
EPOCHS = 5        # number of training epochs — NOTE(review): confirm the fit call reads this constant

# --- Input pipeline ---------------------------------------------------------
# from_tensor_slices((X, y)): one dataset element per (context, target) pair.
#
# shuffle(200_000)
#   * Keeps a buffer of up to 200,000 examples in memory. Each time an element
#     is requested downstream, a random item is taken from the buffer and the
#     freed slot is refilled with the next element from the source.
#   * Effect: breaks correlations in the training order -> better generalization.
#   * Trade-off: a larger buffer gives better randomness but uses more RAM.
#     (With fewer than 200k items the buffer simply holds the whole dataset.)
#
# batch(BATCH)
#   * Groups consecutive elements into tensors of size BATCH, so features of
#     shape (...) become shape (BATCH, ...).
#   * Effect: vectorized ops on GPU/CPU -> much faster training.
#   * Corner case: the last batch can be smaller when the dataset size is not
#     divisible by BATCH (drop_remainder=True forces equal batch sizes).
#
# prefetch(tf.data.AUTOTUNE)
#   * Overlaps the input pipeline with the training step: while the model
#     trains on batch N, batch N+1 is prepared asynchronously in the background.
#   * AUTOTUNE lets TF choose the prefetch buffer size for the machine.
#   * Effect: fewer input stalls; keeps the GPU busy.
ds = (
    tf.data.Dataset.from_tensor_slices((X, y))
    .shuffle(200_000)
    .batch(BATCH)
    .prefetch(tf.data.AUTOTUNE)
)

In [9]:
# Feed-forward neural language model:
#   WINDOW_N token ids -> embeddings -> flattened context vector
#   -> tanh hidden layer -> softmax over the V vocabulary entries.
inp = Input(shape=(WINDOW_N,), dtype="int32")
emb = Embedding(input_dim=V, output_dim=EMBED_DIM, name="embed")(inp)

# Lay the WINDOW_N embeddings side by side as a single vector per example.
flat = Reshape(target_shape=(WINDOW_N * EMBED_DIM,))(emb)
h = Dense(units=HIDDEN, activation="tanh")(flat)
out = Dense(units=V, activation="softmax")(h)

model = Model(inputs=inp, outputs=out)

# Targets are integer ids (not one-hot), hence the sparse loss and metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["sparse_categorical_accuracy"],
)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4)]               0         
                                                                 
 embed (Embedding)           (None, 4, 128)            897280    
                                                                 
 reshape (Reshape)           (None, 512)               0         
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dense_1 (Dense)             (None, 7010)              1801570   
                                                                 
Total params: 2,830,178
Trainable params: 2,830,178
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train on the batched dataset and keep the per-epoch metrics in `history`.
# NOTE(review): the epoch count is hard-coded to 1000 here, while the
# hyperparameter cell defines EPOCHS = 5 — confirm which value is intended
# and drive this call from a single constant.
history = model.fit(
    ds,
    epochs=1000,
    verbose=1,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [15]:
# Pull the trained embedding matrix out of the layer named "embed"
# (first entry of its weight list, shape (V, D)) and persist it to disk.
lm_embeddings = model.get_layer(name="embed").get_weights()[0]
np.save("nlm_embeddings.npy", arr=lm_embeddings)

In [16]:
# nearest neighbors (cosine) for the learned LM embeddings
from sklearn.metrics.pairwise import cosine_similarity

def most_similar(word, word2idx, idx2word, W, topk=10):
    """Return the ``topk`` nearest neighbours of ``word`` by cosine similarity.

    Parameters
    ----------
    word : str
        Query word.
    word2idx : dict
        Maps a word to its row index in ``W``.
    idx2word : dict
        Maps a row index back to its word.
    W : array-like, shape (V, D)
        Embedding matrix, one row per vocabulary entry.
    topk : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    list of (str, float)
        ``(neighbour, cosine_similarity)`` pairs, most similar first. The
        query word itself is never included. An empty list is returned when
        ``word`` is not in ``word2idx`` or ``topk`` is not positive.
    """
    if word not in word2idx or topk <= 0:
        return []

    W = np.asarray(W)
    query_idx = word2idx[word]

    # Cosine similarity of every row against the query row. Rows with zero
    # norm are given a norm of 1 so they score 0 instead of dividing by zero.
    norms = np.linalg.norm(W, axis=1)
    norms[norms == 0] = 1.0
    sims = (W @ W[query_idx]) / (norms * norms[query_idx])

    # Rank by descending similarity, then drop the query by *index*. Dropping
    # "whatever ranks first" is wrong when another row ties with the query
    # (e.g. a duplicate vector): the query could be returned as its own
    # neighbour while a real neighbour is discarded.
    order = np.argsort(-sims, kind="stable")
    neighbours = order[order != query_idx][:topk]
    return [(idx2word[int(i)], float(sims[i])) for i in neighbours]

# Ten nearest neighbours of "language" in the learned embedding space.
print(
    most_similar(
        word="language",
        word2idx=word2idx,
        idx2word=idx2word,
        W=lm_embeddings,
        topk=10,
    )
)


[('guage', 0.36542579531669617), ('nar', 0.3303584158420563), ('judgment', 0.3270430862903595), ('berkeley', 0.31574636697769165), ('instructgpt', 0.3032030165195465), ('istanbul', 0.29914185404777527), ('addressee', 0.2902972102165222), ('attentionself', 0.28765052556991577), ('probabilistic', 0.2833525240421295), ('nication', 0.282953143119812)]


In [17]:
# Ten nearest neighbours of "model" in the learned embedding space.
print(
    most_similar(
        word="model",
        word2idx=word2idx,
        idx2word=idx2word,
        W=lm_embeddings,
        topk=10,
    )
)

[('el', 0.36014866828918457), ('llm', 0.31853336095809937), ('suite', 0.3150647282600403), ('pseudoword', 0.307760089635849), ('classiﬁer', 0.30342888832092285), ('kintsch', 0.2953271269798279), ('prior', 0.29221847653388977), ('egas', 0.28404054045677185), ('battle', 0.2799011766910553), ('tokenizer', 0.27823105454444885)]
