# Loading Data

In [1]:
import re
import time
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn

In [2]:
# poetry: individual verses (clauses) collected from every sufficiently long line.
# tf_word: character -> number of occurrences over the whole file. Counting runs on
# the un-stripped line, so punctuation and the trailing newline are counted too.
poetry = []
tf_word = {}
with open('poetry.txt', encoding='utf-8') as f:
    for line in f:
        # Drop full-width parenthesised spans. The pattern is a raw string because
        # '\S' inside a normal string literal is an invalid escape sequence.
        line = re.sub(r'（\S+）', '', line)
        for word in line:
            tf_word[word] = tf_word.get(word, 0) + 1
        # Only lines longer than 15 characters contribute verses.
        if len(line) > 15:
            # Split on clause punctuation; the final piece (whatever follows the
            # last delimiter) is discarded by the [:-1] slice.
            poetry += re.split('[，。！？；]', line.strip())[:-1]

In [3]:
# Report how many verses were collected and how many distinct characters were counted
# (the message itself is printed in Chinese).
print('训练集诗句总数:{}, 诗句中出现过的字数:{}'.format(len(poetry), len(tf_word)))

训练集诗句总数:459718, 诗句中出现过的字数:7551


# Preprocessing

In [4]:
# Frequency threshold for rare characters. The preprocessing loop keeps a character
# only when tf_word[word] > min_tf_word, so characters occurring exactly this many
# times (or fewer) are replaced by '<UNK>'.
min_tf_word = 60
# Every verse is truncated or right-padded with '<PAD>' to exactly this many tokens.
padding_len = 5
# Filled by the preprocessing loop: one list of padding_len tokens per verse.
x_train = []
# NOTE(review): not referenced by the other visible cells; the Embedding class keeps
# its own word2idx mapping.
word2idx = {}

In [5]:
# Rebuild the list inside this cell so that re-running it does not append a second
# copy of every verse to an x_train left over from a previous execution.
x_train = []
for line in poetry:
    # Rare characters (count not above min_tf_word) become '<UNK>'; the verse is then
    # cut to at most padding_len tokens.
    words = [word if tf_word[word] > min_tf_word else '<UNK>' for word in line][:padding_len]
    # Right-pad short verses; the multiplier is 0 when the verse is already full length.
    words += ['<PAD>'] * (padding_len - len(words))
    x_train.append(words)

In [6]:
# Peek at the first ten preprocessed verses.
x_train[:10]

[['寒', '随', '穷', '律', '变'],
 ['春', '逐', '鸟', '声', '开'],
 ['初', '风', '飘', '带', '柳'],
 ['晚', '雪', '间', '花', '梅'],
 ['碧', '林', '青', '旧', '竹'],
 ['绿', '沼', '翠', '新', '苔'],
 ['芝', '田', '初', '雁', '去'],
 ['绮', '树', '巧', '莺', '来'],
 ['晚', '霞', '聊', '自', '怡'],
 ['初', '晴', '弥', '可', '喜']]

# Word2vec

In [7]:
from gensim.models import word2vec, Word2Vec

In [8]:
def train_word2vec(data, path):
    """Return a Word2Vec model, loading it from `path` when a saved copy exists.

    If `path` cannot be found, a new skip-gram model is trained on `data`,
    saved to `path`, and returned. When the file does exist, `data` is not used.

    NOTE(review): `size` and `iter` look like the gensim 3.x keyword names;
    confirm against the installed gensim version.
    """
    try:
        return Word2Vec.load(path)
    except FileNotFoundError:
        pass  # no saved model yet: fall through and train one

    trained = Word2Vec(
        data,
        size=250,       # dimensionality of word vectors
        window=5,
        min_count=10,   # ignores all words with total frequency lower than this
        workers=12,     # num of threads
        iter=10,
        sg=1,           # 1 for skip-gram; otherwise CBOW
    )
    trained.save(path)
    return trained

In [9]:
# Load the saved word2vec model from 'word2vec_250.model', or train and save it if the
# file is missing. NOTE: the name `model` is rebound to the RNN in the training cell.
model = train_word2vec(x_train, 'word2vec_250.model')

In [10]:
class Embedding:
    """Lookup tables (word -> index, index -> word, index -> vector) for a word2vec model.

    Attributes:
        wv: the `wv` attribute of the model passed to the constructor.
        word2idx: dict mapping each word to its row index.
        idx2word: list mapping a row index back to its word.
        embedding_matrix: list of vectors, one per word, in index order.
    """

    def __init__(self, model):
        """Store `model.wv`; the tables stay empty until `mk_embedding` is called."""
        self.wv = model.wv
        self.word2idx = {}
        self.idx2word = []
        self.embedding_matrix = []

    def mk_embedding(self):
        """(Re)build all three tables from the vocabulary of `self.wv`.

        The tables are reset first, so calling this method again does not append a
        second copy of every word and vector. Words added earlier through
        `add_word2embedding` are discarded by the reset.
        """
        self.word2idx = {}
        self.idx2word = []
        self.embedding_matrix = []
        # NOTE(review): `wv.vocab` looks like the gensim 3.x attribute - confirm
        # against the installed gensim version.
        for word in self.wv.vocab:
            self.add_word2embedding(word, self.wv[word])

    def add_word2embedding(self, word, vector):
        """Append `word` with its `vector` at the next free index."""
        total_num = len(self.idx2word)
        self.word2idx[word] = total_num
        self.idx2word.append(word)
        self.embedding_matrix.append(vector)

    def word_seq2idx_seq(self, sequences):
        """Convert an iterable of word sequences into a list of index lists.

        Raises:
            KeyError: if a word is not present in `word2idx`.
        """
        return [[self.word2idx[word] for word in sequence] for sequence in sequences]

In [11]:
# Build the lookup tables from the word2vec model and index-encode every verse.
embedding = Embedding(model)
embedding.mk_embedding()
idx_seq = embedding.word_seq2idx_seq(x_train)
# embedding.embedding_matrix is a Python list with one vector per word. Stack it into
# a single ndarray first: handing torch.tensor a list of ndarrays is the slow path
# (and triggers a warning in PyTorch versions that check for it).
embedding_matrix = torch.tensor(np.array(embedding.embedding_matrix), dtype=torch.float32)
print(idx_seq[:20])

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], [20, 21, 22, 23, 24], [25, 26, 27, 28, 29], [30, 31, 10, 32, 33], [34, 35, 36, 37, 38], [15, 39, 40, 41, 42], [10, 43, 44, 45, 46], [47, 48, 49, 18, 50], [11, 51, 52, 21, 27], [53, 54, 55, 56, 57], [58, 7, 8, 59, 60], [61, 62, 63, 64, 65], [66, 67, 68, 69, 70], [71, 72, 5, 73, 74], [75, 76, 7, 18, 77], [78, 79, 80, 81, 82], [83, 84, 85, 86, 87]]


# Network

In [12]:
class RNN(nn.Module):
    """Embedding lookup followed by a (multi-layer) batch-first LSTM.

    Args:
        embedding_matrix: 2-D tensor whose rows are the pretrained word vectors.
        input_size: feature size fed to the LSTM (the embedding width).
        hidden_size: LSTM hidden size, which is also the size of each output vector.
        num_layers: number of stacked LSTM layers.
        requires_grad: whether the embedding weights are updated during training.
    """

    def __init__(self, embedding_matrix, input_size, hidden_size, num_layers=1, requires_grad=False):
        super().__init__()
        # from_pretrained wraps the given matrix directly instead of randomly
        # initialising a table of the same size and then overwriting it.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=not requires_grad)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        """Map a batch of index sequences to the LSTM output at every position."""
        x = self.embedding(x)
        # Passing None as the state lets the LSTM start from zero hidden/cell states.
        y, _ = self.lstm(x.to(dtype=torch.float32), None)
        return y

# Dataset

In [13]:
from torch.utils.data import Dataset, DataLoader

class PoetryDataset(Dataset):
    """Pairs row i of `x` with row i of `y`; both must support `[i, :]` indexing."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The dataset length is taken from x alone.
        return len(self.x)

    def __getitem__(self, i):
        inputs = self.x[i, :]
        targets = self.y[i, :]
        return inputs, targets

def x2y(x, pad_idx=None):
    """Build the next-token target for an index list: drop the first item, append padding.

    Args:
        x: list of token indices.
        pad_idx: index appended at the end. When omitted it is looked up as
            `embedding.word2idx['<PAD>']` from the notebook-level `embedding`, which is
            the original behaviour; passing it explicitly removes that hidden
            dependency.

    Returns:
        A new list with the same length as `x` (for non-empty `x`); `x` is not modified.
    """
    if pad_idx is None:
        pad_idx = embedding.word2idx['<PAD>']
    return x[1:] + [pad_idx]
    
# Inputs are the index-encoded verses; targets are the same verses passed through x2y.
# NOTE: this rebinds x_train from the list of token lists to an index array.
x_train = np.array(idx_seq)
y_train = np.array([x2y(seq) for seq in idx_seq])

In [14]:
# Show inputs next to their targets: each y row is the x row with its first index
# dropped and the '<PAD>' index appended (see x2y).
print('x:\n{}\ny:\n{}'.format(x_train, y_train))

x:
[[   0    1    2    3    4]
 [   5    6    7    8    9]
 [  10   11   12   13   14]
 ...
 [ 470  674  218   18  148]
 [  18  230  114 1137 1725]
 [1249  436  452  453  470]]
y:
[[   1    2    3    4 1725]
 [   6    7    8    9 1725]
 [  11   12   13   14 1725]
 ...
 [ 674  218   18  148 1725]
 [ 230  114 1137 1725 1725]
 [ 436  452  453  470 1725]]


In [15]:
# Mini-batches of 128 (input, target) pairs. shuffle=True re-orders the verses every
# epoch so consecutive batches are not drawn in file order.
train_set = DataLoader(PoetryDataset(x_train, y_train), batch_size=128, shuffle=True)

# Training

In [16]:
# Training configuration: 10 epochs, a 2-layer LSTM with 250-d input and hidden size,
# Adam at lr=1e-3, and mean-squared error between predicted and target vectors.
num_epoch = 10
calc_loss = nn.MSELoss()
# Module.to returns the module itself, so construction and the move to the GPU chain.
model = RNN(embedding_matrix, 250, 250, 2).to(device='cuda')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# The count covers every parameter tensor of the model, whether trainable or not.
print('total parameters:{}'.format(sum(param.numel() for param in model.parameters())))

total parameters:1736500


In [17]:
def calc_acc(pred, y):
    """Fraction of positions whose predicted vector decodes to the same word as the target.

    Both `pred` and `y` are iterated two levels deep (batch, then position). Each vector
    is decoded to its most similar word through the notebook-level `embedding.wv`.

    Returns:
        A float in [0, 1]; 0.0 when there is nothing to compare (instead of raising
        ZeroDivisionError).
    """
    def nearest_word(vec):
        # Only the single best match is used, so ask for one neighbour, not the default list.
        return embedding.wv.similar_by_vector(vec.detach().cpu().numpy(), topn=1)[0][0]

    count = 0
    correct_count = 0
    for pred_batch, true_batch in zip(pred, y):
        for pred_vec, true_vec in zip(pred_batch, true_batch):
            count += 1
            if nearest_word(pred_vec) == nearest_word(true_vec):
                correct_count += 1
    if count == 0:
        return 0.0
    return correct_count / count

In [18]:
model.train()

for epoch in range(num_epoch):

    start_time = time.time()
    train_loss = 0.0

    for x_batch, y_batch in train_set:
        x = x_batch.to(device='cuda', dtype=torch.long)
        # Regression targets are the embedding vectors of the next characters. They are
        # built without gradient tracking so the loss never back-propagates into the target.
        with torch.no_grad():
            y = model.embedding(y_batch.to(device='cuda', dtype=torch.long))

        # pred and y already have identical shapes, so no squeeze is needed.
        pred = model(x)
        batch_loss = calc_loss(pred, y)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # .item() yields a plain float. Accumulating the loss tensor itself would chain
        # every batch's autograd graph onto train_loss for the whole epoch.
        train_loss += batch_loss.item() / len(train_set)

    print('epoch:[{:02d}/{:02d}] time:{:2.2f}(sec) MSE loss:{:2.5f}'.format(epoch+1, num_epoch, time.time() - start_time, train_loss))

epoch:[01/10] time:16.15(sec) MSE loss:0.03733
epoch:[02/10] time:19.38(sec) MSE loss:0.03674
epoch:[03/10] time:19.29(sec) MSE loss:0.03655
epoch:[04/10] time:19.26(sec) MSE loss:0.03644
epoch:[05/10] time:19.28(sec) MSE loss:0.03637
epoch:[06/10] time:19.27(sec) MSE loss:0.03631
epoch:[07/10] time:16.96(sec) MSE loss:0.03627
epoch:[08/10] time:18.73(sec) MSE loss:0.03624
epoch:[09/10] time:19.32(sec) MSE loss:0.03621
epoch:[10/10] time:19.42(sec) MSE loss:0.03618


In [53]:
class Generator:
    """Produces a verse by decoding the model's per-position predictions for a random seed verse.

    Args:
        model: network mapping a (1, seq_len) index tensor to one vector per position.
            It is switched to eval mode.
        embedding: object exposing `wv.similar_by_vector` and an `idx2word` list.
        data: 2-D array of index-encoded verses to draw seed verses from.
        num_candidate_word: how many nearest real words each position is sampled from.
    """

    def __init__(self, model, embedding, data, num_candidate_word=3):
        self.model = model.eval()
        self.embedding = embedding
        self.data = data
        self.num_candidate_word = num_candidate_word

    def vec2word(self, vec):
        """Pick one word at random among the nearest non-special neighbours of `vec`.

        Exactly `num_candidate_word` candidates are considered (at least one). The
        previous `random.randint(0, n)` was inclusive at both ends and therefore drew
        from n + 1 words.
        """
        special_tokens = ('<PAD>', '<UNK>')
        num_candidates = max(1, self.num_candidate_word)
        # Request enough neighbours that the candidates survive removing the special tokens.
        neighbours = self.embedding.wv.similar_by_vector(
            vec.detach().cpu().numpy(),
            topn=num_candidates + len(special_tokens),
        )
        candidates = [word for word, _ in neighbours if word not in special_tokens]
        candidates = candidates[:num_candidates]
        if not candidates:
            raise ValueError('no candidate word left after removing special tokens')
        return random.choice(candidates)

    def sample(self):
        """Return a random row of `data` together with its words."""
        i = random.randrange(self.data.shape[0])
        origin_poetry = [self.embedding.idx2word[idx] for idx in self.data[i, :]]
        return self.data[i, :], origin_poetry

    def generate(self):
        """Return the seed verse's first word followed by the decoded predictions.

        The last prediction is dropped so the result has the same length as the seed verse.
        """
        seed_indices, original_poetry = self.sample()
        # Run on whatever device the model lives on instead of assuming 'cuda'.
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        batch = torch.tensor(seed_indices, device=device).unsqueeze(0)
        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            pred = self.model(batch)
        words = [self.vec2word(pred[0, i, :]) for i in range(pred.shape[1])]
        return [original_poetry[0]] + words[:-1]

In [54]:
def no_unk(sentence):
    """Return True when the index sequence does not contain the '<UNK>' index.

    The index is read from the notebook-level `embedding` on every call.
    """
    return embedding.word2idx['<UNK>'] not in sentence

# Seed verses for generation: only the rows of x_train without any '<UNK>' index.
x_generate = np.array([row for row in x_train if no_unk(row)])

In [55]:
# Generator over the '<UNK>'-free verses, using the default num_candidate_word (3).
generator = Generator(model, embedding, x_generate)

In [66]:
# Print 20 generated verses; every call to generate() draws a new random seed verse.
for i in range(20):
    print(generator.generate())

['狐', '狗', '佞', '无', '粘']
['晚', '朦', '蒹', '箔', '溶']
['性', '幻', '无', '禋', '鸠']
['百', '龄', '醺', '知', '曷']
['留', '不', '蕖', '山', '潸']
['曾', '向', '婿', '浪', '上']
['楚', '水', '春', '望', '无']
['振', '翎', '蕤', '潸', '匆']
['三', '十', '朦', '家', '否']
['不', '是', '潸', '享', '享']
['仰', '看', '嚬', '君', '情']
['传', '闻', '录', '录', '乙']
['帝', '乡', '不', '彰', '台']
['牡', '砂', '猩', '绽', '粘']
['潘', '浙', '侬', '我', '人']
['客', '路', '无', '淼', '淼']
['排', '箔', '栊', '箔', '风']
['饮', '盏', '醺', '耐', '穰']
['上', '有', '鹂', '门', '无']
['江', '濆', '箔', '柳', '柳']
