In [1]:
# For Google Colab users: uncomment the lines below to install the pinned dependencies.
# !pip install torch==1.5.0
# !pip install torchvision==0.6.0
# !pip install torchtext==0.3.1
# !pip install numpy==1.21.6
# !pip install matplotlib==3.2.2
# !pip install Pillow==7.1.2
# !pip install opencv-python==4.6.0
# NOTE(review): opencv-python releases appear to use four-part versions
# (e.g. 4.6.0.66) — confirm that the `==4.6.0` pin above actually resolves.

In [None]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.nn.functional as F
#torchtextを使用
from torchtext import data
from torchtext import vocab
from torchtext import datasets

%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# Both the model and the data have to be moved with .to(device) to use it.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

# 文章生成

## データの読み込み

In [3]:
def tokenize(x):
    """Split a raw string into tokens on whitespace."""
    return x.split()


# Field that describes the text preprocessing: whitespace tokenisation and
# lower-casing of sequential data.
TEXT = data.Field(
    sequential=True,
    tokenize=tokenize,
    lower=True,
    batch_first=True,
)

# Load the Penn Treebank dataset as train / validation / test splits.
train_dataset, val_dataset, test_dataset = datasets.PennTreebank.splits(TEXT)

# Build the vocabulary from the training split and attach 300-dimensional
# GloVe "6B" vectors to it.
TEXT.build_vocab(train_dataset, vectors=vocab.GloVe(name='6B', dim=300))

downloading ptb.train.txt


ptb.train.txt: 5.10MB [00:00, 52.1MB/s]                   


downloading ptb.valid.txt


ptb.valid.txt: 400kB [00:00, 10.5MB/s]                   


downloading ptb.test.txt


ptb.test.txt: 450kB [00:00, 11.6MB/s]                   
.vector_cache/glove.6B.zip: 862MB [01:18, 11.0MB/s]                           
100%|█████████▉| 399783/400000 [00:50<00:00, 8062.25it/s]

In [4]:
# Total number of entries in the vocabulary.
vocab_size = len(TEXT.vocab)
print(vocab_size)
# The ten most frequent tokens together with their counts.
print(TEXT.vocab.freqs.most_common(10))
# The first ten entries of the index-to-token table.
print(TEXT.vocab.itos[:10])

# Pretrained embedding vectors attached to the vocabulary, one row per token.
word_embeddings = TEXT.vocab.vectors
# Hyperparameters.
# The embedding width is read from the loaded vectors instead of repeating the
# GloVe dimension as a second literal, so the two can never disagree.
embedding_length = word_embeddings.size(1)
hidden_size = 256
batch_size = 32

10001
[('the', 50770), ('<unk>', 45020), ('<eos>', 42068), ('n', 32481), ('of', 24400), ('to', 23638), ('a', 21196), ('in', 18000), ('and', 17474), ("'s", 9784)]
['<unk>', '<pad>', 'the', '<eos>', 'n', 'of', 'to', 'a', 'in', 'and']


In [5]:
# BPTTIterator builds iterators for language modelling; every batch exposes
# a `text` attribute and a `target` attribute.
train_iter, val_iter, test_iter = data.BPTTIterator.splits(
    (train_dataset, val_dataset, test_dataset),
    # Reuse the shared hyperparameter: the training loop sizes the LSTM state
    # with `batch_size`, so a separate literal here could silently drift.
    batch_size=batch_size,
    bptt_len=30,
    repeat=False
)

# Number of batches in each split.
print(len(train_iter), len(val_iter), len(test_iter), sep='\n')

969
77
86


In [6]:
# Look at the first training batch only, to check tensor shapes and contents.
for i, batch in enumerate(train_iter):
    print('データの形状確認')
    print(batch.text.size(), batch.target.size(), sep='\n')
    # BPTTIterator does not return batch-first tensors (as of 2018/11/24 this
    # was tracked by pull request #462), so the axes are swapped manually.
    print('permuteでバッチを先にする')
    text = batch.text.permute(1, 0)
    target = batch.target.permute(1, 0)
    print(text.size(), target.size(), sep='\n')
    print('データ目の形状とデータを確認')
    print(text[1, :].size(), target[1, :].size(), sep='\n')
    print(text[1, :].tolist(), target[1, :].tolist(), sep='\n')
    print('データの単語列を表示')
    # Map token ids back to their string form via the vocabulary.
    print([TEXT.vocab.itos[token_id] for token_id in text[1, :].tolist()])
    print([TEXT.vocab.itos[token_id] for token_id in target[1, :].tolist()])

    break

データの形状確認
torch.Size([30, 32])
torch.Size([30, 32])
permuteでバッチを先にする
torch.Size([32, 30])
torch.Size([32, 30])
データ目の形状とデータを確認
torch.Size([30])
torch.Size([30])
[38, 34, 853, 7536, 1315, 6, 591, 19, 2, 236, 5, 40, 124, 3, 125, 2, 209, 591, 34, 937, 55, 383, 12, 216, 4, 247, 72, 1024, 3, 216]
[34, 853, 7536, 1315, 6, 591, 19, 2, 236, 5, 40, 124, 3, 125, 2, 209, 591, 34, 937, 55, 383, 12, 216, 4, 247, 72, 1024, 3, 216, 383]
データの単語列を表示
['company', 'will', 'begin', 'mailing', 'materials', 'to', 'shareholders', 'at', 'the', 'end', 'of', 'this', 'week', '<eos>', 'under', 'the', 'offer', 'shareholders', 'will', 'receive', 'one', 'right', 'for', 'each', 'n', 'common', 'shares', 'owned', '<eos>', 'each']
['will', 'begin', 'mailing', 'materials', 'to', 'shareholders', 'at', 'the', 'end', 'of', 'this', 'week', '<eos>', 'under', 'the', 'offer', 'shareholders', 'will', 'receive', 'one', 'right', 'for', 'each', 'n', 'common', 'shares', 'owned', '<eos>', 'each', 'right']


## ネットワークを定義

In [None]:
class LstmLangModel(nn.Module):
    """LSTM language model: embedding -> single-layer LSTM -> linear projection.

    Args:
        batch_size: Batch size used by the caller. It is stored on the instance
            but not otherwise used by this class.
        hidden_size: Number of hidden units of the LSTM.
        vocab_size: Number of tokens; sets the embedding rows and the number of
            output logits.
        embedding_length: Dimensionality of each token embedding.
        weights: Optional tensor copied into the embedding table as its initial
            value (the table stays trainable). When ``None`` the default
            ``nn.Embedding`` initialisation is kept.
    """

    def __init__(self, batch_size, hidden_size, vocab_size, embedding_length, weights=None):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embed = nn.Embedding(vocab_size, embedding_length)
        if weights is not None:
            # Initialise the parameter under no_grad rather than writing
            # through `.data`, which bypasses autograd's bookkeeping.
            with torch.no_grad():
                self.embed.weight.copy_(weights)
        self.lstm = nn.LSTM(embedding_length, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h=None):
        """Compute next-token logits for every position of ``x``.

        Args:
            x: Tensor of token ids, batch first: ``(batch, seq_len)``.
            h: Optional ``(h_0, c_0)`` initial LSTM state. When omitted the
                LSTM starts from a zero state.

        Returns:
            ``(logits, (h_n, c_n))`` where ``logits`` has shape
            ``(batch * seq_len, vocab_size)`` and ``(h_n, c_n)`` is the final
            LSTM state.
        """
        embedded = self.embed(x)
        output_seq, (h_n, c_n) = self.lstm(embedded, h)
        # Flatten (batch, seq_len, hidden_size) to (batch * seq_len, hidden_size)
        # so the linear layer scores every time step at once.
        n_batch, seq_len, n_hidden = output_seq.shape
        flat = output_seq.reshape(n_batch * seq_len, n_hidden)
        logits = self.fc(flat)
        return logits, (h_n, c_n)

# Build the language model with the pretrained embeddings and move it to the
# selected device in one step.
net = LstmLangModel(
    batch_size, hidden_size, vocab_size, embedding_length, word_embeddings
).to(device)


In [None]:
# Define the loss function and the optimizer.
criterion = nn.CrossEntropyLoss()
# Call Adam through the fully-qualified `torch.optim` module: this cell rebinds
# the name `optim` (which the training loop uses) to the optimizer instance, so
# `optim.Adam(...)` would raise AttributeError when the cell is run a second time.
# Only parameters that require gradients are handed to the optimizer.
optim = torch.optim.Adam(p for p in net.parameters() if p.requires_grad)

## 学習

In [None]:
# Number of passes over the training data, and the per-epoch average losses.
num_epochs = 200
train_loss_list = []


def detach(states):
    """Return the given states detached from the autograd graph, as a list.

    Used for truncated backpropagation through time: the state values are
    kept, but gradients no longer flow into the computation that produced them.
    """
    detached = []
    for state in states:
        detached.append(state.detach())
    return detached

for epoch in range(num_epochs):
    train_loss = 0
    # Start every epoch from a zero hidden state and cell state, created
    # directly on the target device.
    states = (torch.zeros(1, batch_size, hidden_size, device=device),
              torch.zeros(1, batch_size, hidden_size, device=device))
    # Training mode.
    net.train()
    for batch in train_iter:
        # The iterator yields (seq_len, batch) tensors; the batch-first LSTM
        # expects (batch, seq_len), so swap the two axes.
        text = batch.text.to(device).permute(1, 0)
        labels = batch.target.to(device).permute(1, 0)

        optim.zero_grad()
        # Truncated BPTT: keep the state values from the previous batch but cut
        # the graph so gradients do not flow back into earlier batches.
        states = detach(states)
        outputs, states = net(text, states)
        # `outputs` is flattened over batch and time, so flatten the labels too.
        loss = criterion(outputs, labels.reshape(-1))
        train_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = train_loss / len(train_iter)
    # Perplexity is the exponential of the average cross-entropy loss.
    print('Epoch [{}/{}], Loss: {loss:.4f}, Perplexity: {perp:5.2f}'
          .format(epoch+1, num_epochs, loss=avg_train_loss, perp=np.exp(avg_train_loss)))
    train_loss_list.append(avg_train_loss)


Epoch [1/200], Loss: 5.8131, Perplexity: 334.66
Epoch [2/200], Loss: 5.0673, Perplexity: 158.74
Epoch [3/200], Loss: 4.7584, Perplexity: 116.56
Epoch [4/200], Loss: 4.5410, Perplexity: 93.79
Epoch [5/200], Loss: 4.3679, Perplexity: 78.88
Epoch [6/200], Loss: 4.2214, Perplexity: 68.13
Epoch [7/200], Loss: 4.0926, Perplexity: 59.89
Epoch [8/200], Loss: 3.9768, Perplexity: 53.35
Epoch [9/200], Loss: 3.8713, Perplexity: 48.01
Epoch [10/200], Loss: 3.7737, Perplexity: 43.54
Epoch [11/200], Loss: 3.6819, Perplexity: 39.72
Epoch [12/200], Loss: 3.5955, Perplexity: 36.43
Epoch [13/200], Loss: 3.5150, Perplexity: 33.62
Epoch [14/200], Loss: 3.4382, Perplexity: 31.13
Epoch [15/200], Loss: 3.3643, Perplexity: 28.91
Epoch [16/200], Loss: 3.2940, Perplexity: 26.95
Epoch [17/200], Loss: 3.2267, Perplexity: 25.20
Epoch [18/200], Loss: 3.1625, Perplexity: 23.63
Epoch [19/200], Loss: 3.1020, Perplexity: 22.24
Epoch [20/200], Loss: 3.0453, Perplexity: 21.02
Epoch [21/200], Loss: 2.9920, Perplexity: 19.9

## 生成

In [None]:
num_samples = 1000     # number of words to generate
# Evaluation mode, with gradient tracking disabled.
net.eval()
with torch.no_grad():
    # Zero initial hidden state and cell state for a single sequence (batch of 1).
    states = (torch.zeros(1, 1, hidden_size, device=device),
              torch.zeros(1, 1, hidden_size, device=device))

    # Seed token: one id drawn uniformly at random from the vocabulary, shaped
    # (1, 1). Named `input_ids` so the builtin `input` is not shadowed.
    input_ids = torch.multinomial(torch.ones(vocab_size), num_samples=1).unsqueeze(1).to(device)

    pieces = []
    for _ in range(num_samples):
        output, states = net(input_ids, states)
        # Pick the highest-scoring token as the next word.
        word_id = output.max(1)[1].item()
        # Feed the chosen token back in as the input of the next time step.
        input_ids.fill_(word_id)
        # Look up the token string; '<eos>' ends the line, anything else is
        # followed by a space.
        word = TEXT.vocab.itos[word_id]
        pieces.append('\n' if word == '<eos>' else word + ' ')

    # Assemble the generated text once instead of growing a string in the loop.
    text = ''.join(pieces)
    print(text)


drastically bottom watched rapidly trim visit sharing preferred cars there are also beginning to provide <unk> memory chips 
tandy corp. said it plans to sell its baby ruth <unk> and <unk> candy businesses to nestle foods corp. 's parent company 
<unk> is <unk> by montedison s.p a. of milan italy 
<unk> is subject to approval by former michael evans 
the offers though the accord is necessary to replace it completed its <unk> power 
the <unk> n.j. power producer is the result of a <unk> slowdown in the fields in the growing number of <unk> oil for coffee shares said 
as a result prices would have been <unk> and it could be <unk> down if a recession is looming challenge only one analyst here say 
perhaps the greatest share of the current comes is a result of the effect of the high price 
the order causes <unk> 
jack <unk> the consequences are being <unk> away 
<unk> <unk> 
associated with <unk> 
watch for something like <unk> a spokesman for the <unk> n.j. maker of <unk> <unk> <unk> ente