In [1]:
# Load the dataset: every line of the file becomes one entry in `words`.
# The `with` block closes the handle as soon as the read finishes, and the
# explicit encoding keeps the result independent of the machine's locale.
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
with open("/home/mviswanathsai/Downloads/names.txt", "r", encoding="utf-8") as file:
    words = file.read().splitlines()

In [2]:
import random
import torch # Just to ensure consistent results, use the PyTorch generator's seed

# Reproducibility: a torch.Generator carries the seed, and Python's `random`
# module is seeded with that generator's initial seed value.
g = torch.Generator().manual_seed(2147483647)
random.seed(g.initial_seed())

# Shuffle in place so the slices below form a random partition instead of
# contiguous runs of the file's original ordering.
# NOTE: this mutates `words`; re-running the cell without reloading the file
# shuffles an already-shuffled list and therefore produces a different split.
random.shuffle(words)

# Split sizes: 80% train, 10% dev. int() truncates, so the test split takes
# whatever is left over and the three sizes always add up to total_size.
total_size = len(words)
train_size, dev_size = int(0.8 * total_size), int(0.1 * total_size)
test_size = total_size - train_size - dev_size

# Slice boundaries: [0, dev_start) -> train, [dev_start, test_start) -> dev,
# [test_start, end) -> test.
dev_start = train_size
test_start = train_size + dev_size
train_data, dev_data, test_data = (
    words[:dev_start],
    words[dev_start:test_start],
    words[test_start:],
)

# Report the resulting sizes.
print(f"Total words: {total_size}")
print(f"Train set size (80%): {len(train_data)}")
print(f"Dev set size (10%): {len(dev_data)}")
print(f"Test set size (10%): {len(test_data)}")

Total words: 32033
Train set size (80%): 25626
Dev set size (10%): 3203
Test set size (10%): 3204


In [4]:
# Vocabulary: every distinct character that occurs in `words`, sorted.
alphabets = sorted(set(''.join(words)))

# Character -> index. The sorted characters take indices 1..len(alphabets);
# index 0 is assigned to '.'.
stoi = dict(zip(alphabets, range(1, len(alphabets) + 1)))
stoi['.'] = 0

# Index -> character: the inverse lookup of `stoi`.
itos = {idx: ch for ch, idx in stoi.items()}

In [5]:
# Build the bigram dataset: for every pair of adjacent characters in a word
# (padded with '.' on both sides) the first character's index is an input and
# the second character's index is the matching target.
# NOTE(review): pairs are built from all of `words`, not from `train_data` --
# confirm that is intended.
input_ids, target_ids = [], []

for name in words:
    padded = '.' + name + '.'
    codes = [stoi[ch] for ch in padded]
    input_ids.extend(codes[:-1])   # every position except the last
    target_ids.extend(codes[1:])   # the same positions shifted by one

xs = torch.tensor(input_ids)
ys = torch.tensor(target_ids)
num = xs.numel()  # total number of bigram examples

xs.data

tensor([ 0, 11,  8,  ..., 18, 18, 25])

In [22]:
import torch.nn.functional as F

# Re-seed the generator BEFORE drawing W. Seeding afterwards (as this cell did
# previously) makes the initial weights depend on whatever state `g` was left
# in by earlier cells, so re-running the cell after sampling gave a different W.
g = torch.Generator().manual_seed(2147483647)

# 27x27 matrix of logits with gradient tracking enabled: row i holds the
# scores for the character following character i.
W = torch.randn((27, 27), generator=g, requires_grad=True)

# One-hot encoding of the inputs. The training loop indexes W[xs] directly
# rather than multiplying by this matrix.
xenc = F.one_hot(xs, num_classes=27).float()

# Sanity check: one row of logits per bigram example.
(W[xs]).shape

torch.Size([228146, 27])

In [31]:
# Gradient-descent hyperparameters for the loop below.
NUM_STEPS = 200
LEARNING_RATE = 0.8

for k in range(NUM_STEPS):
    # Forward pass: row W[x] holds the logits (log-counts) for the successor
    # of input character x.
    logits = W[xs]

    # Mean negative log-likelihood of the targets. F.cross_entropy applies
    # log-softmax internally, which is the same quantity as the manual
    # exp -> normalize -> log -> mean chain but cannot overflow in exp().
    loss = F.cross_entropy(logits, ys)

    print(loss.item())

    # Backward pass: clear the stale gradient, then backpropagate.
    W.grad = None
    loss.backward()

    # Parameter update. Done under no_grad() instead of through `W.data` so
    # the in-place change is not recorded by autograd yet is still visible to
    # its bookkeeping.
    with torch.no_grad():
        W -= LEARNING_RATE * W.grad

2.5469045639038086
2.546839714050293
2.5467753410339355
2.546710968017578
2.5466465950012207
2.5465822219848633
2.546517848968506
2.5464537143707275
2.5463900566101074
2.546325922012329
2.546261787414551
2.5461981296539307
2.5461344718933105
2.5460708141326904
2.546006917953491
2.5459437370300293
2.545880079269409
2.545816659927368
2.5457539558410645
2.5456902980804443
2.5456271171569824
2.5455639362335205
2.5455009937286377
2.545438289642334
2.5453755855560303
2.5453126430511475
2.545250177383423
2.54518723487854
2.5451247692108154
2.54506254196167
2.5450003147125244
2.5449376106262207
2.544875383377075
2.544813394546509
2.5447514057159424
2.544689416885376
2.5446271896362305
2.544565439224243
2.544503688812256
2.5444419384002686
2.5443801879882812
2.544318914413452
2.544257402420044
2.5441958904266357
2.5441346168518066
2.5440735816955566
2.5440120697021484
2.5439510345458984
2.5438899993896484
2.5438292026519775
2.5437679290771484
2.5437071323394775
2.5436463356018066
2.543585777282

In [32]:
def _sample_word(weights, generator):
    """Sample one word from the bigram model, one character at a time.

    Starts from index 0, turns the logits selected by the current index into
    a probability distribution, draws the next index from it, and stops as
    soon as index 0 is drawn again.

    Args:
        weights: logit matrix with one row per class (27 classes).
        generator: torch.Generator that drives the multinomial draws.

    Returns:
        The sampled characters (looked up in `itos`) joined into one string,
        including the character for the final index 0.
    """
    pieces = []
    current = 0
    while True:
        # One-hot row vector times the weights picks out the current row.
        one_hot = F.one_hot(torch.tensor([current]), num_classes=27).float()
        scores = one_hot @ weights               # log counts
        expd = scores.exp()                      # counts
        dist = expd / expd.sum(1, keepdims=True)  # normalize to probabilities

        current = torch.multinomial(
            dist, num_samples=1, replacement=True, generator=generator
        ).item()
        pieces.append(itos[current])
        if current == 0:
            return ''.join(pieces)


# Fresh, fixed-seed generator so the printed samples are repeatable.
g = torch.Generator().manual_seed(2147483647)

for _ in range(10):
    print(_sample_word(W, g))


cexza.
mogllurailezityha.
konimittain.
llayn.
ka.
da.
staiypucjalerigotai.
miziellavo.
ke.
teda.
