In [14]:
import os
import json
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import torch.optim as optim
from tqdm import tqdm

In [44]:
class Config(object):
    data_path = "json/"
    category = "poet.tang"
    author = None
    constrain = None
    poetry_max_len = 125
    sample_max_len = poetry_max_len-1
    processed_data_path = "data/tang.npz"
    word_dict_path = 'wordDic'
    model_path = 'model/tang_0.pth'
    model_prefix = 'model/tang'
    batch_size = 8
    epoch_num = 1
    embedding_dim = 256
    hidden_dim = 256
    layer_num = 2  # rnn的层数
    lr = 0.01
    weight_decay = 1e-4
    plot_every = 2
    env = 'poetry'
    use_gpu = True
    max_gen_len = 200  # 生成诗歌最长长度
    sentence_max_len = 4 # 生成诗歌的最长句子
    prefix_words = '细雨鱼儿出,微风燕子斜。'  # 不是诗歌的组成部分，用来控制生成诗歌的意境
    start_words = '闲云潭影日悠悠'  # 诗歌开始
    acrostic = False  # 是否是藏头诗

In [33]:
def parse_raw_data(data_path, category, author, constrain):
    """
    获取原数据并预处理
    :param data_path: 数据存放的路径
    :param category: 数据的类型
    :param author: 作者名称
    :param constrain: 长度限制
    :return: list
    ['床前明月光，疑是地上霜，举头望明月，低头思故乡。',
     '一去二三里，烟村四五家，亭台六七座，八九十支花。',
    .........
    ]
    """
    def sentence_parse(para):
        """对文本进行处理，取出脏数据"""
        # 去掉括号中的部分
        # para = "-181-村橋路不端，數里就迴湍。積壤連涇脉，高林上笋竿。早嘗甘蔗淡，生摘琵琶酸。（「琵琶」，嚴壽澄校《張祜詩集》云：疑「枇杷」之誤。）好是去塵俗，煙花長一欄。"
        result, number = re.subn("（.*）", "", para)
        result, number = re.subn("{.*}", "", result)
        result, number = re.subn("《.*》", "", result)
        result, number = re.subn("《.*》", "", result)
        result, number = re.subn("[\]\[]", "", result)
        # 去掉数字
        r = ""
        for s in result:
            if s not in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-']:
                r += s;
        # 处理两个句号为1个句号
        r, number = re.subn("。。", "。", r)
        # 返回预处理好的文本
        return r

    def handle_json(file):
        """读入json文件，返回诗句list，每一个元素为一首诗歌(str类型表示)"""
        rst = []
        data = json.load(open(file, 'r', encoding='utf-8'))
        for poetry in data:
            pdata = ""
            if author is not None and poetry.get("author") != author:
                continue
            p = poetry.get("paragraphs")
            flag = False
            for s in p:
                sp = re.split("[，！。]", s)
                for tr in sp:
                    if constrain is not None and len(tr) != constrain and len(tr) != 0:
                        flag = True
                        break
                    if flag:
                        break
            if flag:
                continue
            for sentence in poetry.get("paragraphs"):
                pdata += sentence
            pdata = sentence_parse(pdata)
            if pdata != "" and len(pdata) > 1:
                rst.append(pdata)
        return rst

    data = []
    for filename in os.listdir(data_path):
        if filename.startswith(category):
            data += handle_json(data_path + filename)
    return data

In [34]:
def pad_sequences(sequences,
                  maxlen=None,
                  dtype='int32',
                  padding='pre',
                  truncating='pre',
                  value=0.):
    """
    code from keras
    Pads each sequence to the same length (length of the longest sequence).
    If maxlen is provided, any sequence longer
    than maxlen is truncated to maxlen.
    Truncation happens off either the beginning (default) or
    the end of the sequence.
    Supports post-padding and pre-padding (default).
    Arguments:
        sequences: list of lists where each element is a sequence
        maxlen: int, maximum length
        dtype: type to cast the resulting sequence.
        padding: 'pre' or 'post', pad either before or after each sequence.
        truncating: 'pre' or 'post', remove values from sequences larger than
            maxlen either in the beginning or in the end of the sequence
        value: float, value to pad the sequences to the desired value.
    Returns:
        x: numpy array with dimensions (number_of_sequences, maxlen)
    Raises:
        ValueError: in case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:  
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((num_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if not len(s):  
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]  
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError(
                'Shape of sample %s of sequence at position %s is different from '
                'expected shape %s'
                % (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x

In [35]:
def get_data(config):
    # 1.获取数据
    data = parse_raw_data(config.data_path, config.category, config.author, config.constrain)
    # print(len(data), data[0])

    # 2.构建词典
    chars = {c for line in data for c in line}
    char_to_ix = {char: ix for ix, char in enumerate(chars)}
    char_to_ix['<EOP>'] = len(char_to_ix)
    char_to_ix['<START>'] = len(char_to_ix)
    char_to_ix['</s>'] = len(char_to_ix)
    ix_to_chars = {ix: char for char, ix in char_to_ix.items()}

    # 3.处理样本
    # 3.1 每首诗加上首位符号
    for i in range(len(data)):
        data[i] = ['<START>'] + list(data[i]) + ['<EOP>']
    # 3.2 文字转id
    data_id = [[char_to_ix[w] for w in line] for line in data]
    # 3.3 补全既定长度
    pad_data = pad_sequences(data_id,
                             maxlen=config.poetry_max_len,
                             padding='pre',
                             truncating='post',
                             value=len(char_to_ix) - 1)
    # 3.4 保存于返回
    np.savez_compressed(config.processed_data_path,
                        data=pad_data,
                        word2ix=char_to_ix,
                        ix2word=ix_to_chars)

    return pad_data, char_to_ix, ix_to_chars

In [6]:
config = Config()
pad_data, char_to_ix, ix_to_chars = get_data(config)

In [7]:
print(pad_data[0])

for k, v in char_to_ix.items():
    print(k, v)
    break

for k, v in ix_to_chars.items():
    print(k, v)
    break

[9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218
 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218
 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218
 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218
 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218 9218
 9218 9218 9218 9218 9218 9217 3592 3065 3575 9011  809 5603 7982 3516
 6774 5371 2721 1134  581 6614 4838 6191 5636 5603  201 4642 6459 5911
 4395 1134  888  605 5699  536 4386 5603 1237 2432 9146 8678 7183 1134
 6772 5404  486 1748 5412 5603 5940  816 6454  581 7816 1134 9216]
蛻 0
0 蛻


In [36]:
class PoetryModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, device, layer_num):
        super(PoetryModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=layer_num)
        self.linear1 = nn.Linear(self.hidden_dim, vocab_size)
        # 创建一个dropout层，训练时作用在线性层防止过拟合
        self.dropout = nn.Dropout(0.2)
        self.device = device

    def forward(self, inputs, hidden):
        seq_len, batch_size = inputs.size()
        # 将one-hot形式的input在嵌入矩阵中转换成嵌入向量，torch.Size([length, batch_size, embedding_size])
        embeds = self.embeddings(inputs)
        # output:torch.Size([length, batch_size, hidden_idm]), 每一个step的输出
        # hidden: tuple(torch.Size([layer_num, 32, 256]) torch.Size([1, 32, 256])) # 最后一层输出的ct 和 ht, 在这里是没有用的
        output, hidden = self.lstm(embeds, hidden)
        # 经过线性层，relu激活层 先转换成（max_len*batch_size, 256)维度，再过线性层（length, vocab_size)
        output = F.relu(self.linear1(output.view(seq_len*batch_size, -1)))
        # 输出最终结果，与hidden结果
        return output, hidden

    def init_hidden(self, layer_num, batch_size):
        return (torch.zeros(layer_num, batch_size, self.hidden_dim, requires_grad=True).to(self.device),
                torch.zeros(layer_num, batch_size, self.hidden_dim, requires_grad=True).to(self.device))

In [37]:
class TrainModel(object):
    def __init__(self):
        os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        self.config = Config()
        self.device = torch.device('cuda') if self.config.use_gpu else torch.device('cpu')

    def train(self, data_loader, model, optimizer, criterion, char_to_ix, ix_to_chars):
        for epoch in range(self.config.epoch_num):
            for step, x in enumerate(data_loader):
                # 1.处理数据
                # x: (batch_size,max_len) ==> (max_len, batch_size)
                x = x.long().transpose(1, 0).contiguous()
                x = x.to(self.device)
                optimizer.zero_grad()
                # input,target:  (max_len-1, batch_size)
                input_, target = x[:-1, :], x[1:, :]
                target = target.view(-1)
                # 初始化hidden为(c0, h0): ((layer_num， batch_size, hidden_dim)，(layer_num， batch_size, hidden_dim)）
                hidden = model.init_hidden(self.config.layer_num, x.size()[1])
                # 2.前向计算
                output, _ = model(input_, hidden)
                # output:(max_len*batch_size,vocab_size), target:(max_len*batch_size)
                loss = criterion(output, target) 
                loss.backward()
                optimizer.step()
                if step % 200 == 0:
                    print('epoch: %d,loss: %f' % (epoch, loss.item()))
            if epoch % 1 == 0:
                # 保存模型
                torch.save(model.state_dict(), '%s_%s.pth' % (self.config.model_prefix, epoch))
                # 分别以这几个字作为诗歌的第一个字，生成一首藏头诗
                word = '春江花月夜凉如水'
                gen_poetry = ''.join(self.generate_head_test(model, word, char_to_ix, ix_to_chars))
                print(gen_poetry)

    def run(self):
        # 1 获取数据
        data, char_to_ix, ix_to_chars = get_data(self.config)
        vocab_size = len(char_to_ix)
        print('样本数：%d' % len(data))
        print('词典大小： %d' % vocab_size)
        # 2 设置dataloader
        data = torch.from_numpy(data)
        data_loader = Data.DataLoader(data,
                                      batch_size=self.config.batch_size,
                                      shuffle=True,
                                      num_workers=1)
        # 3 创建模型
        model = PoetryModel(vocab_size=vocab_size,
                            embedding_dim=self.config.embedding_dim,
                            hidden_dim=self.config.hidden_dim,
                            device=self.device,
                            layer_num=self.config.layer_num)
        model.to(self.device)
        # 4 创建优化器
        optimizer = optim.Adam(model.parameters(), lr=self.config.lr, weight_decay=self.config.weight_decay)
        # 5 创建损失函数,使用与logsoftmax的输出
        criterion = nn.CrossEntropyLoss()
        # 6.训练
        self.train(data_loader, model, optimizer, criterion, char_to_ix, ix_to_chars)

    def generate_head_test(self, model, head_sentence, word_to_ix, ix_to_word):
        """生成藏头诗"""
        poetry = []
        head_char_len = len(head_sentence)  # 要生成的句子的数量
        sentence_len = 0  # 当前句子的数量
        pre_char = '<START>'  # 前一个已经生成的字
        # 准备第一步要输入的数据
        input = (torch.Tensor([word_to_ix['<START>']]).view(1, 1).long()).to(self.device)
        hidden = model.init_hidden(self.config.layer_num, 1)

        for i in range(self.config.max_gen_len):
            # 前向计算出概率最大的当前词
            output, hidden = model(input, hidden)
            top_index = output.data[0].topk(1)[1][0].item()
            char = ix_to_word[top_index]
            # 句首的字用藏头字代替
            if pre_char in ['。', '！', '<START>']:
                if sentence_len == head_char_len:
                    break
                else:
                    char = head_sentence[sentence_len]
                    sentence_len += 1
                    input = (input.data.new([word_to_ix[char]])).view(1,1)
            else:
                input = (input.data.new([top_index])).view(1,1)
            poetry.append(char)
            pre_char = char
        return poetry

In [38]:
model = TrainModel()
model.run()

样本数：57363
词典大小： 9219
epoch: 0,loss: 9.140446
epoch: 0,loss: 3.372333
epoch: 0,loss: 2.357655
epoch: 0,loss: 3.550220
epoch: 0,loss: 3.850133
epoch: 0,loss: 2.572953
epoch: 0,loss: 2.696341
epoch: 0,loss: 2.785728
epoch: 0,loss: 2.756162
epoch: 0,loss: 2.179537
epoch: 0,loss: 3.390952
epoch: 0,loss: 2.306022
epoch: 0,loss: 1.800979
epoch: 0,loss: 3.094452
epoch: 0,loss: 3.434525
epoch: 0,loss: 4.095837
epoch: 0,loss: 4.393913
epoch: 0,loss: 1.923944
epoch: 0,loss: 2.625626
epoch: 0,loss: 2.528784
epoch: 0,loss: 2.187022
epoch: 0,loss: 2.868070
epoch: 0,loss: 1.867179
epoch: 0,loss: 2.692230
epoch: 0,loss: 2.141259
epoch: 0,loss: 2.864788
epoch: 0,loss: 1.918024
epoch: 0,loss: 4.772787
epoch: 0,loss: 2.187701
epoch: 0,loss: 2.493193
epoch: 0,loss: 3.668999
epoch: 0,loss: 2.665974
epoch: 0,loss: 2.667639
epoch: 0,loss: 2.893961
epoch: 0,loss: 2.527862
epoch: 0,loss: 2.459997
春日千，一，一日，人時。江日日，一時，一時。花日，一時，一時。月日，日時，一時。夜日，一時，一時。凉日，一日，人時。如日，一時，一時。水日，一日，人時。


In [53]:
class Sample(object):
    def __init__(self):
        self.config = Config()
        self.device = torch.device('cuda') if self.config.use_gpu else torch.device('cpu')
        self.processed_data_path = self.config.processed_data_path
        self.model_path = self.config.model_path
        self.max_len = self.config.max_gen_len
        self.sentence_max_len = self.config.sentence_max_len
        self.load_data()
        self.load_model()

    def load_data(self):
        if os.path.exists(self.processed_data_path):
            data = np.load(self.processed_data_path, allow_pickle=True)
            self.data, self.word_to_ix, self.ix_to_word = data['data'], data['word2ix'].item(), data['ix2word'].item()

    def load_model(self):
        model = PoetryModel(len(self.word_to_ix),
                            self.config.embedding_dim,
                            self.config.hidden_dim,
                            self.device,
                            self.config.layer_num)
#         map_location = lambda s, l: s
#         model.load_state_dict(torch.load(self.config.model_path, map_location=map_location))
        model.load_state_dict(torch.load(self.config.model_path))
        model.to(self.device)
        self.model = model

    def generate_random(self, start_words='<START>'):
        """自由生成一首诗歌"""
        poetry = []
        sentence_len = 0
        input = (torch.Tensor([self.word_to_ix[start_words]]).view(1, 1).long()).to(self.device)
        hidden = self.model.init_hidden(self.config.layer_num, 1)

        for i in range(self.max_len):
            # 前向计算出概率最大的当前词
            output, hidden = self.model(input, hidden)
            top_index = output.data[0].topk(1)[1][0].item()
            char = self.ix_to_word[top_index]
            # 遇到终结符则输出
            if char == '<EOP>':
                break
            # 有8个句子则停止预测
            if char in ['。', '！']:
                sentence_len += 1
                if sentence_len == 8:
                    poetry.append(char)
                    break
            input = (input.data.new([top_index])).view(1, 1)
            poetry.append(char)
        return poetry

    def generate_head(self, head_sentence):
        """生成藏头诗"""
        poetry = []
        head_char_len = len(head_sentence)  # 要生成的句子的数量
        sentence_len = 0  # 当前句子的数量
        pre_char = '<START>'  # 前一个已经生成的字
        # 准备第一步要输入的数据
        input = (torch.Tensor([self.word_to_ix['<START>']]).view(1, 1).long()).to(self.device)
        hidden = self.model.init_hidden(self.config.layer_num, 1)

        for i in range(self.max_len):
            # 前向计算出概率最大的当前词
            output, hidden = self.model(input, hidden)
            top_index = output.data[0].topk(1)[1][0].item()
            char = self.ix_to_word[top_index]
            # 句首的字用藏头字代替
            if pre_char in ['。', '！', '<START>']:
                if sentence_len == head_char_len:
                    break
                else:
                    char = head_sentence[sentence_len]
                    sentence_len += 1
                    input = (input.data.new([self.word_to_ix[char]])).view(1,1)
            else:
                input = (input.data.new([top_index])).view(1,1)
            poetry.append(char)
            pre_char = char
        return poetry

    def generate_poetry(self, mode=1, head_sentence=None):
        """
        模式一：随机生成诗歌
        模式二：生成藏头诗
        """
        poetry = ''
        if mode == 1 or (mode == 2 and head_sentence is None):
            poetry = ''.join(self.generate_random())
        if mode == 2 and head_sentence is not None:
            head_sentence = head_sentence.replace(',', u'，').replace('.', u'。').replace('?', u'？')
            poetry = ''.join(self.generate_head(head_sentence))
        return poetry

In [55]:
obj = Sample()
poetry = obj.generate_poetry(mode=1)
# poetry = obj.generate_poetry(mode=2, head_sentence="月的")
print(poetry)

一日千，一，一日一，人時，一日，一時。一日，一時，一時。一日，一日，人時。一日，一時，一時。一日，一日，人時。一日，一日，人時。一日，一時，一時。一日，一日，人時。
