In [1]:
import random
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Load the raw names, one per line. The context manager closes the file even
# if reading fails part-way through.
with open('names.txt', 'r', encoding='utf-8') as f:
    names = [line.strip() for line in f]

# Skip blank lines (e.g. the empty entry produced by a trailing newline);
# otherwise they would become the pseudo-name ".." and add '.' -> '.' bigrams.
# Each remaining name is wrapped in "." so the same symbol marks start and end.
names = ["." + name + "." for name in names if name]
names[:5]

['.emma.', '.olivia.', '.ava.', '.isabella.', '.sophia.']

In [3]:
# Vocabulary: every distinct character that occurs in any name, in sorted order.
CHARS = sorted({ch for name in names for ch in name})
"".join(CHARS)

'.abcdefghijklmnopqrstuvwxyz'

In [4]:
# Index <-> character lookup tables derived from the sorted vocabulary.
itos = dict(enumerate(CHARS))
stoi = {ch: idx for idx, ch in itos.items()}

# Sanity check: encoding then decoding a string must give the string back.
text = ".hello.world."
"".join(itos[stoi[ch]] for ch in text)

'.hello.world.'

In [5]:
class NameDataset(Dataset):
    """Bigram dataset built from a collection of strings.

    Every adjacent character pair ``(ch1, ch2)`` in every name becomes one
    sample: the input is the one-hot encoding of ``ch1`` and the target is the
    integer index of ``ch2``.

    Args:
        names: iterable of strings to extract character pairs from.
        vocab: optional mapping from character to integer index. When omitted,
            the notebook-level ``stoi`` table is used. ``len(vocab)`` is used
            as the one-hot width, so indices are expected to be
            ``0 .. len(vocab) - 1``.

    Attributes:
        x: float tensor of shape ``(num_pairs, len(vocab))`` with one-hot rows.
        y: long tensor of shape ``(num_pairs,)`` with target indices.
    """

    def __init__(self, names, vocab=None):
        if vocab is None:
            vocab = stoi

        current, following = [], []
        for name in names:
            for ch1, ch2 in zip(name, name[1:]):
                current.append(vocab[ch1])
                following.append(vocab[ch2])

        # An explicit integer dtype is required: with no pairs at all,
        # torch.tensor([]) would default to float32, which F.one_hot rejects.
        indices = torch.tensor(current, dtype=torch.long)
        self.y = torch.tensor(following, dtype=torch.long)
        self.x = F.one_hot(indices, num_classes=len(vocab)).float()

    def __getitem__(self, idx):
        """Return the ``(one_hot_input, target_index)`` pair at position ``idx``."""
        return self.x[idx], self.y[idx]

    def __len__(self):
        """Return the number of character pairs in the dataset."""
        return len(self.x)

In [6]:
class SimpleGram(nn.Module):
    """Bigram model: a single linear layer followed by a softmax.

    Args:
        vocab_size: number of distinct characters, used as both the input and
            the output width of the linear layer. When omitted, the size of
            the notebook-level ``CHARS`` vocabulary is used.
    """

    def __init__(self, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(CHARS)
        self.lin = nn.Linear(vocab_size, vocab_size)

    def forward(self, x):
        """Map inputs of width ``vocab_size`` to probabilities over the last dim.

        Note that the output is already normalised with a softmax, i.e. these
        are probabilities, not logits.
        """
        return F.softmax(self.lin(x), dim=-1)

In [7]:
def logP_loss(model, probs, y, alpha=1e-3):
    """Mean negative log-likelihood of the targets plus an L2 penalty.

    Args:
        model: module whose parameters (all of them, as yielded by
            ``model.parameters()``) are penalised.
        probs: 2-D tensor of probabilities, indexed as ``probs[row, class]``.
        y: 1-D integer tensor with the target class for each row.
        alpha: weight of the sum-of-squares penalty.

    Returns:
        Scalar tensor ``-mean(log(probs[i, y[i]])) + alpha * sum(p ** 2)``.
    """
    # Build the row index on the same device as the probabilities.
    rows = torch.arange(len(y), device=probs.device)
    # A probability that underflowed to exactly 0 would give log(0) = -inf and
    # NaN gradients; clamp to the smallest positive normal number instead.
    target_probs = probs[rows, y].clamp_min(torch.finfo(probs.dtype).tiny)
    logP = -target_probs.log().mean()
    reg_loss = alpha * sum((p ** 2).sum() for p in model.parameters())
    return logP + reg_loss

In [8]:
def _run_epoch(model, loader, alpha, device, optimizer=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch; without one it is put in eval mode and gradients are disabled.
    """
    training = optimizer is not None
    model.train(training)
    total_loss = 0
    total_count = 0
    with torch.set_grad_enabled(training):
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            probs = model(x)
            loss = logP_loss(model, probs, y, alpha)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the mean.
            total_loss += loss.item() * y.size(0)
            total_count += y.size(0)
    return total_loss / total_count


def train(model, train_loader, val_loader, epochs=5, lr=1e-1, alpha=1e-3, device=None):
    """Fit ``model`` with Adam, rolling back any epoch that makes things worse.

    After each epoch the sum ``train_loss + val_loss`` is compared with the
    best sum accepted so far. If it is larger, the weights from the start of
    that epoch are restored and the learning rate is divided by 10; otherwise
    the epoch is accepted and its sum becomes the new reference.

    Args:
        model: module to train in place.
        train_loader: iterable of ``(x, y)`` batches used for optimisation.
        val_loader: iterable of ``(x, y)`` batches used for evaluation only.
        epochs: number of passes over ``train_loader``.
        lr: initial Adam learning rate.
        alpha: L2 penalty weight forwarded to ``logP_loss``.
        device: device to train on. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.

    Returns:
        None. Progress is printed once per epoch.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    prev_loss = None
    for epoch in range(epochs):
        # state_dict() returns references to the live parameter tensors, so a
        # deep copy is needed for the snapshot to survive the optimizer steps.
        old_model_state = copy.deepcopy(model.state_dict())

        train_loss = _run_epoch(model, train_loader, alpha, device, optimizer)
        val_loss = _run_epoch(model, val_loader, alpha, device)

        print(f"Epoch {epoch+1}: Learning Rate={lr}, Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
        if prev_loss is not None and train_loss + val_loss > prev_loss:
            # Reject the epoch: restore the snapshot and retry with a smaller step.
            model.load_state_dict(old_model_state)
            lr /= 10
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        else:
            prev_loss = train_loss + val_loss

In [9]:
def makemore(model, device=None):
    """Sample one string from ``model``, character by character.

    Generation starts from "." and repeatedly feeds the one-hot encoding of
    the last character to the model, drawing the next character from the
    returned probabilities, until "." is drawn again.

    Args:
        model: module mapping a ``(1, len(CHARS))`` one-hot tensor to a
            ``(1, len(CHARS))`` tensor of probabilities. It is moved to
            ``device`` and switched to eval mode.
        device: device to sample on. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.

    Returns:
        The sampled string, including the leading and trailing ".".
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    letters = ["."]

    model.to(device)
    model.eval()
    with torch.no_grad():
        while True:
            # Shape (1,) index -> (1, vocab) one-hot row, created on the target device.
            last_idx = torch.tensor([stoi[letters[-1]]], device=device)
            x = F.one_hot(last_idx, num_classes=len(CHARS)).float()
            probs = model(x)
            char = itos[torch.multinomial(probs, num_samples=1, replacement=True).item()]
            letters.append(char)
            if char == ".":
                break
    return "".join(letters)

In [10]:
# Seed both RNGs: `random` drives the train/val split below, torch drives the
# DataLoader shuffling and the model's weight initialisation.
random.seed(42)
torch.manual_seed(42)

# Shuffle a copy so `names` keeps its original order; shuffling in place would
# make this cell produce a different split every time it is re-run.
names_shuffled = list(names)
random.shuffle(names_shuffled)

# 90% of the names for training, the remaining 10% for validation.
split = int(len(names_shuffled)*0.9)
train_data = NameDataset(names_shuffled[:split])
val_data = NameDataset(names_shuffled[split:])

train_loader = DataLoader(train_data, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_data, batch_size=1024)

len(train_loader), len(val_loader)

(201, 23)

In [11]:
# Start from a freshly initialised bigram model and fit it on the name pairs.
model = SimpleGram()
train(
    model,
    train_loader,
    val_loader,
    epochs=10,
    lr=1e-2,
    alpha=1e-4,
)

Epoch 1: Learning Rate=0.01, Train Loss=2.6970, Val Loss=2.5395
Epoch 2: Learning Rate=0.01, Train Loss=2.5253, Val Loss=2.5193
Epoch 3: Learning Rate=0.01, Train Loss=2.5142, Val Loss=2.5137
Epoch 4: Learning Rate=0.01, Train Loss=2.5101, Val Loss=2.5108
Epoch 5: Learning Rate=0.01, Train Loss=2.5084, Val Loss=2.5098
Epoch 6: Learning Rate=0.01, Train Loss=2.5077, Val Loss=2.5094
Epoch 7: Learning Rate=0.01, Train Loss=2.5073, Val Loss=2.5092
Epoch 8: Learning Rate=0.01, Train Loss=2.5071, Val Loss=2.5089
Epoch 9: Learning Rate=0.01, Train Loss=2.5070, Val Loss=2.5087
Epoch 10: Learning Rate=0.01, Train Loss=2.5069, Val Loss=2.5086


In [23]:
# First row of the linear layer's weight matrix (the weights feeding output
# unit 0, one entry per input character), passed through a softmax.
model.lin.weight[0].softmax(dim=0)

tensor([0.0008, 0.0468, 0.0180, 0.0114, 0.0351, 0.0506, 0.0236, 0.0177, 0.1305,
        0.0333, 0.0165, 0.0285, 0.0449, 0.0331, 0.1546, 0.0252, 0.0147, 0.0241,
        0.0350, 0.0457, 0.0277, 0.0115, 0.0188, 0.0177, 0.0447, 0.0672, 0.0225],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [12]:
# Draw ten samples; the [1:-1] slice drops the first and last character of
# each returned string (the "." start/end markers).
for draw in range(10):
    sample = makemore(model)
    print(sample[1:-1])

.ka.
.tiry.
.ainionedeylyaomottrmilalana.
.hifenahobliahts.
.n.
.treeyaze.
.mayahanari.
.aolanay.
.krarje.
.lialaer.
