In [0]:
# Colab users: uncomment and run the lines below to install the pinned dependencies.
# !pip install torch==0.4.1
# !pip install torchvision==0.2.1
# !pip install numpy==1.14.6
# !pip install matplotlib==2.1.2
# !pip install pillow==5.0.0
# !pip install opencv-python==3.4.3.18
# !pip install torchtext==0.3.1

In [0]:
# Download the simple-examples archive. `--continue` resumes a partially downloaded file and
# `--quiet` suppresses wget's progress output.
!wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
# Unpack the archive into ./simple-examples.
!tar -xzf simple-examples.tgz

In [3]:
# List the contents of the extracted archive directory.
!ls simple-examples

1-train		   5-one-iter		       9-char-based-lm	temp
2-nbest-rescore    6-recovery-during-training  data
3-combination	   7-dynamic-evaluation        models
4-data-generation  8-direct		       rnnlm-0.2b


In [0]:
!mv ./simple-examples/data/ptb.train.txt train.txt
!mv ./simple-examples/data/ptb.valid.txt valid.txt
!mv ./simple-examples/data/ptb.test.txt test.txt

In [5]:
# List the working directory to confirm train.txt / valid.txt / test.txt are in place.
!ls

data		      net.ckpt	       simple-examples.tgz  valid.txt
hymenoptera_data      sample_data      test.txt
hymenoptera_data.zip  simple-examples  train.txt


In [0]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
from torchtext import vocab
from torchtext import datasets

%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt

In [7]:
# Choose the compute device once: the GPU when CUDA is available, otherwise the CPU.
# Data and the model are placed on it later with .to(device).
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

# 文章生成

## データの読み込み

In [0]:
# Set up the Field objects used for preprocessing.
TEXT = data.Field(batch_first=True)
# A LabelField is created as well; it is not passed to the dataset loader below.
LABEL = data.LabelField()

# Load the train / validation / test splits from the text files in the working directory.
train_dataset, val_dataset, test_dataset = datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt",
    validation="valid.txt",
    test="test.txt",
    text_field=TEXT,
)

# Build the vocabulary from the training split and attach GloVe (6B, 300-dimensional) vectors.
glove_vectors = vocab.GloVe(name='6B', dim=300)
TEXT.build_vocab(train_dataset, vectors=glove_vectors)

In [11]:
# Total number of entries in the vocabulary.
vocab_size = len(TEXT.vocab)
print(vocab_size)
# The ten most frequent tokens together with their counts.
print(TEXT.vocab.freqs.most_common(10))
# The first ten entries of the index-to-token table.
print(TEXT.vocab.itos[:10])

# Pretrained embedding vectors attached to the vocabulary.
word_embeddings = TEXT.vocab.vectors

# Hyperparameters.
# Read the embedding width from the loaded vectors instead of repeating the literal 300, so it
# cannot drift out of sync with the `dim` requested when the vocabulary was built.
embedding_length = word_embeddings.size(1)
hidden_size = 256
batch_size = 32

10001
[('the', 50770), ('<unk>', 45020), ('<eos>', 42068), ('N', 32481), ('of', 24400), ('to', 23638), ('a', 21196), ('in', 18000), ('and', 17474), ("'s", 9784)]
['<unk>', '<pad>', 'the', '<eos>', 'N', 'of', 'to', 'a', 'in', 'and']


In [12]:
# BPTTIterator builds iterators for language modelling; every batch it yields exposes
# `text` and `target` attributes.
# Use the shared `batch_size` hyperparameter rather than a second hard-coded 32: the training
# loop sizes its initial LSTM state with `batch_size`, so the two values must agree.
train_iter, val_iter, test_iter = data.BPTTIterator.splits(
    (train_dataset, val_dataset, test_dataset),
    batch_size=batch_size,
    bptt_len=30,
    repeat=False,
)

# Number of batches per split.
print(len(train_iter))
print(len(val_iter))
print(len(test_iter))

969
77
86


In [13]:
# Inspect only the first training batch, then stop.
for i, train in enumerate(train_iter):
    print("データの形状確認")
    print(train.text.size())
    print(train.target.size())

    # Swap the two axes so that the batch axis comes first.
    text = train.text.permute(1, 0)
    target = train.target.permute(1, 0)
    print("permuteでバッチを先にする")
    print(text.size())
    print(target.size())

    print("１データ目の形状とデータを確認")
    first_text, first_target = text[0, :], target[0, :]
    print(first_text.size())
    print(first_target.size())
    print(first_text.tolist())
    print(first_target.tolist())

    # Map the ids of the first two rows back to their tokens.
    for heading, row in (("１データ目の単語列を表示", 0), ("２データ目の単語列を表示", 1)):
        print(heading)
        print([TEXT.vocab.itos[token_id] for token_id in text[row, :].tolist()])
        print([TEXT.vocab.itos[token_id] for token_id in target[row, :].tolist()])

    break

データの形状確認
torch.Size([30, 32])
torch.Size([30, 32])
permuteでバッチを先にする
torch.Size([32, 30])
torch.Size([32, 30])
１データ目の形状とデータを確認
torch.Size([30])
torch.Size([30])
[9971, 9972, 9973, 9975, 9976, 9977, 9981, 9982, 9983, 9984, 9985, 9987, 9988, 9989, 9990, 9992, 9993, 9994, 9995, 9996, 9997, 9998, 9999, 10000, 3, 9257, 0, 4, 73, 394]
[9972, 9973, 9975, 9976, 9977, 9981, 9982, 9983, 9984, 9985, 9987, 9988, 9989, 9990, 9992, 9993, 9994, 9995, 9996, 9997, 9998, 9999, 10000, 3, 9257, 0, 4, 73, 394, 34]
１データ目の単語列を表示
['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', '<eos>', 'pierre', '<unk>', 'N', 'years', 'old']
['banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim'

## ネットワークを定義

In [0]:
class LstmLangModel(nn.Module):
    """Language model made of an embedding layer, a single LSTM and a linear output layer.

    Args:
        batch_size: stored on the instance as ``self.batch_size``.
        hidden_size: number of hidden units of the LSTM.
        vocab_size: number of rows of the embedding table and width of the output layer.
        embedding_length: width of each embedding vector.
        weights: tensor copied into the embedding table; it must be copy-compatible with a
            ``(vocab_size, embedding_length)`` weight.
    """

    def __init__(self, batch_size, hidden_size, vocab_size, embedding_length, weights):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        # Embedding table initialised from the supplied weights (copied in place, still trainable).
        self.embed = nn.Embedding(vocab_size, embedding_length)
        self.embed.weight.data.copy_(weights)

        # batch_first=True: the LSTM consumes and produces (batch, sequence, feature) tensors.
        self.lstm = nn.LSTM(embedding_length, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        """Run one forward pass.

        Args:
            x: integer id tensor fed to the embedding layer (batch axis first).
            h: initial ``(hidden, cell)`` state pair handed to the LSTM.

        Returns:
            ``(out, (h, c))`` where ``out`` has one row per (batch, time step) position and
            ``vocab_size`` columns, and ``(h, c)`` is the LSTM state after the last step.
        """
        embedded = self.embed(x)
        output_seq, (hidden, cell) = self.lstm(embedded, h)
        # Collapse the batch and sequence axes: (batch_size * sequence_length, hidden_size).
        flat = output_seq.reshape(-1, output_seq.size(2))
        logits = self.fc(flat)
        return logits, (hidden, cell)

# Build the model from the hyperparameters and pretrained embeddings, then place it on `device`.
net = LstmLangModel(
    batch_size=batch_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    embedding_length=embedding_length,
    weights=word_embeddings,
).to(device)


In [0]:
# 損失関数、最適化関数を定義
criterion = nn.CrossEntropyLoss()
optim = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()))

## 学習

In [16]:
num_epochs = 200
train_loss_list = []  # average training loss of every epoch


def detach(states):
    """Cut the LSTM states off from the autograd graph (truncated backpropagation).

    Args:
        states: iterable of state tensors, here the ``(hidden, cell)`` pair.

    Returns:
        A tuple with the detached tensors, in the same order as ``states``, so the state keeps
        the tuple form it has when initialised and when returned by the model.
    """
    return tuple(state.detach() for state in states)


for epoch in range(num_epochs):
    train_loss = 0
    # Start every epoch from zero hidden and cell states.
    states = (torch.zeros(1, batch_size, hidden_size, device=device),
              torch.zeros(1, batch_size, hidden_size, device=device))
    # train
    net.train()
    for batch in train_iter:
        # Move the batch to the device and swap the axes so the batch axis comes first.
        text = batch.text.to(device).permute(1, 0)
        labels = batch.target.to(device).permute(1, 0)

        optim.zero_grad()
        # Carry the state values over from the previous batch, but not their gradient history.
        states = detach(states)
        outputs, states = net(text, states)
        # `outputs` has one row per (batch, step) position, so the targets are flattened to match.
        loss = criterion(outputs, labels.reshape(-1))
        train_loss += loss.item()
        loss.backward()
        optim.step()
    avg_train_loss = train_loss / len(train_iter)
    # Perplexity is reported as exp(average loss).
    print('Epoch [{}/{}], Loss: {loss:.4f}, Perplexity: {perp:5.2f}'
          .format(epoch+1, num_epochs, loss=avg_train_loss, perp=np.exp(avg_train_loss)))
    train_loss_list.append(avg_train_loss)


Epoch [1/200], Loss: 5.8195, Perplexity: 336.80
Epoch [2/200], Loss: 5.0890, Perplexity: 162.23
Epoch [3/200], Loss: 4.7741, Perplexity: 118.41
Epoch [4/200], Loss: 4.5454, Perplexity: 94.20
Epoch [5/200], Loss: 4.3678, Perplexity: 78.87
Epoch [6/200], Loss: 4.2200, Perplexity: 68.03
Epoch [7/200], Loss: 4.0909, Perplexity: 59.79
Epoch [8/200], Loss: 3.9750, Perplexity: 53.25
Epoch [9/200], Loss: 3.8689, Perplexity: 47.89
Epoch [10/200], Loss: 3.7709, Perplexity: 43.42
Epoch [11/200], Loss: 3.6802, Perplexity: 39.66
Epoch [12/200], Loss: 3.5959, Perplexity: 36.45
Epoch [13/200], Loss: 3.5156, Perplexity: 33.63
Epoch [14/200], Loss: 3.4387, Perplexity: 31.14
Epoch [15/200], Loss: 3.3648, Perplexity: 28.93
Epoch [16/200], Loss: 3.2952, Perplexity: 26.98
Epoch [17/200], Loss: 3.2301, Perplexity: 25.28
Epoch [18/200], Loss: 3.1691, Perplexity: 23.79
Epoch [19/200], Loss: 3.1110, Perplexity: 22.44
Epoch [20/200], Loss: 3.0552, Perplexity: 21.22
Epoch [21/200], Loss: 3.0008, Perplexity: 20.1

## 生成

In [17]:
num_samples = 1000     # number of words to generate
# Switch the model to evaluation mode and disable gradient tracking while generating.
net.eval()
with torch.no_grad():
    # Initial hidden and cell states for a batch of one sequence.
    states = (torch.zeros(1, 1, hidden_size).to(device),
              torch.zeros(1, 1, hidden_size).to(device))

    # Pick one word id uniformly at random as the seed token; shape (1, 1).
    # Named `input_ids` rather than `input` so the builtin `input` is not shadowed.
    input_ids = torch.multinomial(torch.ones(vocab_size), num_samples=1).unsqueeze(1).to(device)

    # Collect the pieces in a list and join once instead of growing a string in the loop.
    pieces = []
    for _ in range(num_samples):
        output, states = net(input_ids, states)
        # Greedy decoding: take the id with the highest score.
        word_id = output.max(1)[1].item()
        # Feed the chosen id back in as the input of the next time step.
        input_ids.fill_(word_id)
        # Look the token up from its id.
        word = TEXT.vocab.itos[word_id]
        # An end-of-sentence marker becomes a line break; other words are space-separated.
        pieces.append('\n' if word == '<eos>' else word + ' ')
    text = ''.join(pieces)

    # Show the generated text.
    print(text)


1960s bartlett exception crossed tokyo-based rallies trouble chunk polish pharmaceutical hotel conn. reported widespread spirits not in forcing prices to remain in <unk> <unk> <unk> and <unk> and <unk> 
the <unk> of the factory sector their prices have grown between the s&p N to N 
the index has fallen to N N in recent months 
in october N when the drought was relatively <unk> as quickly as a major factor in the economy and <unk> the market 's <unk> price 
a <unk> rebound from a surge in the economy was partly offset by a <unk> of eight percentage points and a year ago 
the treasury 's benchmark 30-year bond slipped to N N 
the bonds are rated single-a-1 by moody 's and double-a by s&p 
the offering are either zero on the issue 
the <unk> moody 's said that although it 's a <unk> of cash clearly said you bet the estimates by h&r block does n't receive any specific investor monitoring 
morgan stanley jones & sons of houston brokerage firms inc. have a single texas firm that tried to han