In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import torchtext.data as data
import torch.nn.functional as F
import matplotlib.pyplot as plt
import os
import random
%matplotlib inline

<h1>Implementation: Neural Probabilistic Language Model<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Create-dataset" data-toc-modified-id="Create-dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Create dataset</a></span></li><li><span><a href="#Build-the-model" data-toc-modified-id="Build-the-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build the model</a></span></li><li><span><a href="#Train-the-model" data-toc-modified-id="Train-the-model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train the model</a></span></li></ul></div>

## Create dataset

* Write each sample — a trigram context (`text`) and the word that follows it (`label`) — as one row of a CSV file.

```python
import csv

import tqdm

# Number of context tokens per sample; the token right after them is the label.
n_gram = 3

with open("/Users/PSH/data/tokenized/corpus_mecab.txt") as r:
    # newline="" is what the csv module asks for on files handed to a writer,
    # so that it alone controls the row terminators.
    with open("/Users/PSH/data/csv/trigram_corpus_mecab.csv", "w", encoding='utf-8', newline="") as w:
        fieldnames = ['text', 'label']
        writer = csv.DictWriter(w, fieldnames=fieldnames)
        # No header row is written: every line of the file is a sample.

        for sample in tqdm.tqdm(r):
            # split() with no argument also discards the trailing newline.
            # Splitting on " " kept it glued to the last token, so the final
            # label of every sentence contained "\n" and was written as a
            # quoted field spanning two physical lines.
            sentence = sample.split()
            # Slide a window of n_gram tokens; sentences with n_gram tokens or
            # fewer produce no samples.
            for i in range(len(sentence) - n_gram):
                writer.writerow({
                    "text": " ".join(sentence[i:i + n_gram]),
                    "label": sentence[i + n_gram],
                })
```

In [None]:
# Cut the CSV into chunks of 1,000,000 lines each, named with the
# `trigram_corpus_mecab_` prefix, a numeric suffix (-d) and a `.csv` extension.
# Use `gsplit` instead of `split` if on macOS.
!split -d -l 1000000 ~/data/csv/trigram_corpus_mecab.csv ~/data/csv/trigram_corpus_mecab_ --additional-suffix=.csv

* Build a custom torchtext dataset with `torchtext.data.TabularDataset`.

In [12]:
# Context column. batch_first=True makes each batch arrive as (batch, n_gram):
# NPLM.forward flattens the embedded batch with
# view(1, -1, n_gram * embedding_dim), which only keeps one example's n-gram
# together when the batch dimension comes first. With the default
# (seq_len, batch) layout that view mixes tokens from different examples.
TEXT = data.Field(batch_first=True)
# Target column. Left at the default layout because the training loop removes
# dimension 0 of batch.label to obtain one target per example.
LABEL = data.Field()

In [18]:
# os.listdir returns names in arbitrary order; sort them so that datafiles[0]
# refers to the same chunk on every run.
datafiles = sorted(
    fname
    for fname in os.listdir("/Users/PSH/data/csv/")
    if fname.startswith("trigram_corpus_mecab_")
)

In [20]:
# Load the first chunk as (text, label) pairs.
# The CSV is written without a header row, so every line is a sample;
# skip_header=True would silently discard the first one.
dataset = data.TabularDataset(
    path=f"/Users/PSH/data/csv/{datafiles[0]}",
    format="csv",
    skip_header=False,
    fields=[
        ("text", TEXT),
        ("label", LABEL),
    ],
)

In [24]:
# random.seed() returns None, so passing its result as random_state handed the
# splitter None and made the split reproducible only through the seeding side
# effect. Seed first, then pass the resulting generator state explicitly.
random.seed(0)
trainset, testset = dataset.split(split_ratio=0.8, random_state=random.getstate())

In [26]:
# Hold out 10% of the training split for validation. random.seed() returns
# None, so seed first and pass the generator state explicitly as random_state.
random.seed(0)
trainset, validationset = trainset.split(split_ratio=0.9, random_state=random.getstate())

* Build vocabulary

In [27]:
MAX_VOCAB_SIZE = 100_000

# Build ONE vocabulary from both columns of the training split and share it.
# The model sizes its output layer with len(TEXT.vocab), so target indices must
# live in that same index space. A separate, uncapped LABEL vocabulary assigns
# unrelated indices and can exceed the number of logits.
TEXT.build_vocab(trainset.text, trainset.label, max_size=MAX_VOCAB_SIZE)
LABEL.vocab = TEXT.vocab

* Now we build a `BucketIterator` for our model.

In [28]:
# Number of examples per mini-batch.
BATCH_SIZE = 64

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [31]:
# One iterator per split, all producing batches on `device`.
# sort=False: these datasets define no sort_key, and the non-training iterators
# otherwise default to sorting their examples, which has nothing to order by.
trainiter, validationiter, testiter = torchtext.data.BucketIterator.splits(
    (trainset, validationset, testset),
    batch_size=BATCH_SIZE,
    sort=False,
    device=device,
)



## Build the model

![bengio et al.png](https://miro.medium.com/max/2408/1*EqKiy4-6tuLSoPP_kub33Q.png)

In [109]:
# Model hyperparameters, passed to NPLM in the order
# (vocab_size, n_gram, embedding_dim, hidden_dim).
N_GRAM = 3
EMBEDDING_DIM = 32
HIDDEN_DIM = 128
# Size of the vocabulary held by the TEXT field.
VOCAB_SIZE = len(TEXT.vocab)

In [143]:
class NPLM(nn.Module):
    """Feed-forward n-gram language model with a tanh hidden path and a direct path.

    The embeddings of the n context tokens are concatenated into one vector.
    That vector feeds (a) a tanh hidden layer followed by a bias-free
    projection to the vocabulary and (b) a direct affine projection to the
    vocabulary. The two projections are summed to give the output scores.

    Args:
        vocab_size: number of embedding rows and number of output scores.
        n_gram: number of context tokens per example.
        embedding_dim: width of each token embedding.
        hidden_dim: width of the tanh hidden layer.
    """

    def __init__(self, vocab_size, n_gram, embedding_dim, hidden_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.n_gram = n_gram
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        context_dim = n_gram * embedding_dim

        # Token lookup table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Hidden path: context -> tanh hidden layer -> vocabulary (no bias).
        self.linear1 = nn.Linear(context_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size, bias=False)
        # Direct path: context -> vocabulary.
        self.linear3 = nn.Linear(context_dim, vocab_size)

    def forward(self, x):
        """Return unnormalised vocabulary scores of shape (1, rows, vocab_size).

        NOTE(review): the view below turns each consecutive run of
        n_gram * embedding_dim values into one row, so it appears to assume
        ``x`` is laid out as (batch, n_gram) -- confirm that callers pass
        batch-first index tensors.
        """
        embedded = self.embedding(x)
        context = embedded.view(1, -1, self.embedding_dim * self.n_gram)

        hidden_scores = self.linear2(torch.tanh(self.linear1(context)))
        direct_scores = self.linear3(context)

        return hidden_scores + direct_scores

## Train the model

* Train the model with SGD on a cross-entropy loss, reporting the mean loss and accuracy after each epoch.

In [144]:
# Move the parameters to the same device the iterators place their batches on;
# otherwise the first forward pass fails with a device mismatch on a GPU.
model = NPLM(VOCAB_SIZE, N_GRAM, EMBEDDING_DIM, HIDDEN_DIM).to(device)

In [145]:
# Cross-entropy over the raw scores produced by the model.
criterion = nn.CrossEntropyLoss()

# Plain SGD with momentum over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=1e-3,
    momentum=0.9,
)

In [147]:
def accuracy(pred, target):
    """Return the fraction of rows whose highest-scoring class equals the target.

    Args:
        pred: score tensor whose last dimension indexes the classes,
            e.g. (batch, n_classes).
        target: integer class indices, one per row of ``pred``.

    Returns:
        A 0-dim float tensor in [0, 1].
    """
    # argmax must run along the class dimension. Without `dim` it returns a
    # single index into the flattened tensor, which was then compared against
    # every target. Softmax is monotonic, so it does not change the argmax.
    predicted = pred.argmax(dim=-1)
    return (predicted == target).float().mean()

In [149]:
N_EPOCH = 10

for epoch in range(1, N_EPOCH + 1):
    loss_epoch = 0.
    acc_epoch = 0.
    for batch in trainiter:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # The model returns (1, rows, vocab); drop the leading dimension so the
        # loss sees (rows, vocab).
        out = model(batch.text).squeeze(0)
        # Assumes batch.label is (1, batch), i.e. one target token per example.
        target = batch.label.squeeze(0)

        loss = criterion(out, target)
        loss.backward()
        optimizer.step()

        loss_epoch += loss.item()
        # Accumulate a plain float, not a tensor, so the running total stays on
        # the host and prints as a number.
        acc_epoch += float(accuracy(out, target))

    n_batches = len(trainiter)
    print(f"epoch {epoch}: loss={loss_epoch / n_batches:.4f} "
          f"acc={acc_epoch / n_batches:.4f}")

KeyboardInterrupt: 