# Loading & Preprocessing

In [1]:
import re
import time
import torch
import random
import numpy as np
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [2]:
poetry = []
tf_word = {}
with open('poetry.txt', encoding='utf-8') as f:
    for line in f:
        line = re.sub('（\S+）', '', line)
        line = re.sub('_', '', line)
        poetry += re.split('[，。！？；]', line.strip())[:-1]

In [3]:
five_words_poetry = list(filter(lambda x: len(x)==5, poetry))
seven_words_poetry = list(filter(lambda x: len(x)==7, poetry))

In [4]:
print(five_words_poetry[:5], seven_words_poetry[:5], sep='\n')

['寒随穷律变', '春逐鸟声开', '初风飘带柳', '晚雪间花梅', '碧林青旧竹']
['暧暧去尘昏灞岸', '飞飞轻盖指河梁', '云峰衣结千重叶', '雪岫花开几树妆', '深悲黄鹤孤舟远']


In [5]:
print('五言诗诗句总数:{}，七言诗诗句总数:{}'.format(len(five_words_poetry), len(seven_words_poetry)))

五言诗诗句总数:296827，七言诗诗句总数:142637


In [6]:
word_seq = []
for line in five_words_poetry:
    word_seq.append([word for word in line])
print(*word_seq[:10], sep='\n')  

['寒', '随', '穷', '律', '变']
['春', '逐', '鸟', '声', '开']
['初', '风', '飘', '带', '柳']
['晚', '雪', '间', '花', '梅']
['碧', '林', '青', '旧', '竹']
['绿', '沼', '翠', '新', '苔']
['芝', '田', '初', '雁', '去']
['绮', '树', '巧', '莺', '来']
['晚', '霞', '聊', '自', '怡']
['初', '晴', '弥', '可', '喜']


# Embedding

In [7]:
from gensim.models import Word2Vec

In [8]:
class Embedding:
    def __init__(self, sentences, word_vec_size):
        self.sentences = sentences
        self.word_vec_size = word_vec_size
        
        self.model = None
        self.embedding_matrix = None
        
        self._word2idx = {}
        
    def word2idx(self, word):
        if self.model is None:
            raise NameError('No model, use mk_embedding first.')
        return self._word2idx[word]
    
    def idx2word(self, i):
        if self.model is None:
            raise NameError('No model, use mk_embedding first.')
        return self.model.wv.index2word[i]
    
    def mk_embedding(self, load_model_path=None, save_model_path=None):
        if load_model_path is not None:
            model = Word2Vec.load(load_model_path)
        else:
            print('Word2Vec training ...')
            model = Word2Vec(self.sentences, size=self.word_vec_size, window=5, min_count=20, workers=12, iter=10, sg=1)
            if save_model_path is not None:
                model.save(save_model_path)
            else:
                model.save('embedding/{}_word2vec.model'.format(self.word_vec_size))
        
        self.model = model
        self.embedding_matrix = model.wv.vectors
        for i, word in enumerate(model.wv.index2word):
            self._word2idx[word] = i
            
        print('Embedding OK.')

In [9]:
embedding = Embedding(word_seq, 250)
embedding.mk_embedding(load_model_path='embedding/200_word2vec.model')

Embedding OK.


In [10]:
idx_seq = []
for line in word_seq:
    try:
        idx_seq.append([embedding.word2idx(word) for word in line])
    except KeyError:
        continue

In [11]:
print(*idx_seq[:10], sep='\n')
print('用于训练的诗句数:{}'.format(len(idx_seq)))

[47, 163, 282, 1106, 514]
[17, 406, 126, 69, 90]
[171, 5, 452, 352, 238]
[164, 116, 179, 23, 666]
[267, 88, 55, 143, 160]
[236, 1701, 280, 76, 472]
[1166, 325, 171, 299, 36]
[829, 73, 1510, 714, 12]
[164, 372, 747, 22, 1976]
[171, 388, 1120, 82, 475]
用于训练的诗句数:289275


# Encoder

In [12]:
class Encoder(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, num_layers=1, dropout=0, fix_embedding=False):
        super().__init__()
        self.embedding = nn.Embedding(*embedding_matrix.shape)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = True if fix_embedding is True else False
        
        self.rnn = nn.GRU(embedding_matrix.shape[1], hidden_size, num_layers,
                          dropout=dropout, batch_first=True, bidirectional=True)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input):
        # input.shape is (batch size, sequence len, vocab size)
        embedding_seq = self.embedding(input)
        output, hidden = self.rnn(self.dropout(embedding_seq))
        # outputs shape is (batch_size, sequence_len, hid_size * directions)
        # hidden shape is (num_layers * directions, batch_size, hid_size)
        return output, hidden

# Attention

In [13]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        
    def forward(self, encoder_output, decoder_hidden):
        return None

# Decoder

In [14]:
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_words, num_layers=1, dropout=0):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.rnn = nn.GRU(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.classifier = nn.Sequential(nn.Dropout(0.2),
                                        nn.Linear(hidden_size, hidden_size*2),
                                        nn.ReLU(),
                                        
                                        nn.Dropout(0.2),
                                        nn.Linear(hidden_size*2, hidden_size*4),
                                        nn.ReLU(),
                                        
                                        nn.Linear(hidden_size*4, num_words),
                                        )
        
    def forward(self, input, hidden):
        # input shape is (batch_size, sequence_len, word_size)
        # hidden shape is (batch_size, num_directions*num_layers, hidden_size)
        output, _ = self.rnn(input, hidden)
        pred = self.classifier(output)
        return pred

# Seq2Seq

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder.to(device=device)
        self.decoder = decoder.to(device=device)
        # outputs shape is (batch_size, sequence_len, hid_size * directions)
        # hidden shape is (num_layers * directions, batch_size, hid_size)
        
    def forward(self, input):
        output, _ = self.encoder(input)
        output, hidden = output[:,:-1,:], output[:,-1,:].unsqueeze(0)
        pred = self.decoder(output, hidden)
        return pred

# Dataset

In [16]:
from torch.utils.data import DataLoader, Dataset

class PoetryDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, i):
        return self.x[i], self.y[i]
    
def x2y(x):
    y = [line[1:] for line in x]
    return y

In [17]:
y = np.array(x2y(idx_seq))
x = np.array(idx_seq)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

train_set = DataLoader(PoetryDataset(x_train, y_train), batch_size=128)
val_set = DataLoader(PoetryDataset(x_val, y_val), batch_size=128)

# Training

In [18]:
num_epoch = 5
hidden_size = 250
num_words = len(embedding.model.wv.index2word)
device = torch.device('cuda:1')

In [19]:
encoder = Encoder(embedding.embedding_matrix, hidden_size)
decoder = Decoder(hidden_size*2, hidden_size*2, num_words)
# hidden_size * 2 for the bidirection
model = Seq2Seq(encoder, decoder, device)
print('total parameters:{}'.format(sum(p.numel() for p in model.parameters())))

total parameters:14298738


In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
calc_loss = nn.CrossEntropyLoss()

In [21]:
def calc_acc(pred, y):
    correct = np.sum(np.argmax(pred.detach().cpu().numpy(), axis=1) == y.detach().cpu().numpy())
    return correct / (pred.shape[0] * 4)

In [22]:
for epoch in range(num_epoch):
    start_time = time.time()
    train_loss = 0.0
    train_acc = 0.0
    val_loss = 0.0
    val_acc = 0.0
    
    model.train()
    for data in train_set:
        x = data[0].to(dtype=torch.long, device=torch.device('cuda:1'))
        y = data[1].to(dtype=torch.long, device=torch.device('cuda:1'))
        # y shape is (batch, seq)
        
        pred = model(x).transpose(1, 2)
        # pred shape should be (batch, categories_prob, seq)
        
        optimizer.zero_grad()
        batch_loss = calc_loss(pred, y)
        batch_loss.backward()
        optimizer.step()
        
        train_loss += batch_loss / len(train_set)
        train_acc += calc_acc(pred, y) / len(train_set)
    
    model.eval()
    with torch.no_grad():
        for data in val_set:
            x = data[0].to(dtype=torch.long, device=torch.device('cuda:1'))
            y = data[1].to(dtype=torch.long, device=torch.device('cuda:1'))
            
            pred = model(x).transpose(1, 2)
            batch_loss = calc_loss(pred, y)
            
            val_loss += batch_loss / len(val_set)
            val_acc += calc_acc(pred, y) / len(val_set)
            
    print('epoch:[{:02d}/{:02d}] time:{:2.2f}(sec) train_loss:{:2.5f} train_acc:{:2.5f} | val_loss:{:2.5f} val_acc:{:2.5f}'.format(epoch+1, num_epoch, time.time() - start_time, train_loss, train_acc, val_loss, val_acc)) 

epoch:[01/05] time:24.71(sec) train_loss:2.89026 train_acc:0.44457 | val_loss:0.49929 val_acc:0.88243
epoch:[02/05] time:24.40(sec) train_loss:0.32943 train_acc:0.91135 | val_loss:0.10409 val_acc:0.97167
epoch:[03/05] time:24.45(sec) train_loss:0.11658 train_acc:0.96569 | val_loss:0.04856 val_acc:0.98583
epoch:[04/05] time:24.52(sec) train_loss:0.07313 train_acc:0.97799 | val_loss:0.02982 val_acc:0.99135
epoch:[05/05] time:24.44(sec) train_loss:0.05180 train_acc:0.98415 | val_loss:0.02033 val_acc:0.99393


# Generating

In [23]:
class Generator:
    def __init__(self, model, data, embedding):
        self.model = model 
        self.data = data
        self.embedding = embedding
    
    def generate(self, num_candidate, num_sample):
        i = random.sample(range(self.data.shape[0]), num_sample)
        x = torch.tensor(self.data[i,:], dtype=torch.long, device=device)
        first_words = [list(map(self.embedding.idx2word, line))[0] for line in x.detach().cpu().numpy()]
        pred = model(x)
        pred = np.argsort(pred.detach().cpu().numpy(), axis=2)[:,:,-num_candidate:]
        # select candidates
        pred = [pred[:,:,random.randint(0, num_candidate-1)] for _ in range(num_sample)][0]
        
        return [[first_word] + list(map(self.embedding.idx2word, line)) for first_word, line in zip(first_words, pred)]

In [24]:
generator = Generator(model, x_val, embedding)

In [31]:
generator.generate(2, 20)

[['相', '看', '话', '离', '合'],
 ['讵', '言', '才', '不', '才'],
 ['伊', '余', '心', '更', '苦'],
 ['魂', '游', '谢', '客', '诗'],
 ['朝', '庆', '千', '龄', '始'],
 ['羽', '卫', '洛', '阳', '空'],
 ['小', '径', '才', '分', '草'],
 ['当', '念', '居', '者', '思'],
 ['离', '人', '晓', '思', '惊'],
 ['堕', '枝', '伤', '翠', '羽'],
 ['莫', '上', '最', '高', '层'],
 ['鼋', '盐', '穴', '深', '水'],
 ['昔', '闻', '王', '氏', '子'],
 ['言', '与', '行', '兼', '危'],
 ['好', '是', '吴', '中', '隐'],
 ['岩', '廊', '人', '望', '在'],
 ['薄', '俸', '还', '自', '急'],
 ['含', '贞', '本', '去', '华'],
 ['尝', '登', '王', '粲', '楼'],
 ['心', '肠', '无', '邪', '欺']]