本notebook的gan模型的生成器为LSTM网络，判别器是CNN网络

## 1、导入相关包

In [1]:
import os
from tqdm import tqdm
# 数据科学包
import random                      # 随机切分数据集
import numpy as np                 # 常用数据科学包
import pandas as pd              # 图像读取
import matplotlib.pyplot as plt    # 代码中快速验证

# 深度学习包
import paddle
import paddle.vision.transforms as tf      # 数据增强
from paddle.io import Dataset, DataLoader  # 定义数据集
import paddle.nn as nn                     # 网络

  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


## 2、准备数据集

查看数据集

In [2]:
# 使用pandas读取数据集
train_data = pd.read_table('./data/train.txt', sep='\t',header=None)  # 训练集
dev_data = pd.read_table('./data/dev.txt', sep='\t',header=None)      # 验证集
test_data = pd.read_table('./data/test.txt', sep='\t',header=None)    # 测试集

# 由于数据集存放时无列名，因此手动添加列名便于对数据进行更好处理
train_data.columns = ["text",'label']
dev_data.columns = ["text",'label']
test_data.columns = ["text"]

In [3]:
train_data

Unnamed: 0,text,label
0,网易第三季度业绩低于分析师预期,科技
1,巴萨1年前地狱重现这次却是天堂 再赴魔鬼客场必翻盘,体育
2,美国称支持向朝鲜提供紧急人道主义援助,时政
3,增资交银康联 交行夺参股险商首单,股票
4,午盘：原材料板块领涨大盘,股票
...,...,...
752466,天津女排奇迹之源竟在场边 他是五冠王真正核心,体育
752467,北电网络专利拍卖推迟：可能分拆6部分拍卖,科技
752468,Spirit AeroSystems债券发行价确定,股票
752469,陆慧明必发火线：法兰克福无胜 曼联国米顺利过关,彩票


处理数据集

In [4]:
# 定义要进行分类的类别
label_list=list(train_data.label.unique())
label_word2num = { 
    label_text : idx for idx, label_text in enumerate(label_list)
}
label_num2word = { 
    idx : label_text for idx, label_text in enumerate(label_list)
}
print(label_word2num)

{'科技': 0, '体育': 1, '时政': 2, '股票': 3, '娱乐': 4, '教育': 5, '家居': 6, '财经': 7, '房产': 8, '社会': 9, '游戏': 10, '彩票': 11, '星座': 12, '时尚': 13}


In [5]:
# 标签：文本转数字
train_data.iloc[:, 1] = train_data.iloc[:, 1].map(label_word2num)
dev_data.iloc[:, 1] = dev_data.iloc[:, 1].map(label_word2num)
train_data

Unnamed: 0,text,label
0,网易第三季度业绩低于分析师预期,0
1,巴萨1年前地狱重现这次却是天堂 再赴魔鬼客场必翻盘,1
2,美国称支持向朝鲜提供紧急人道主义援助,2
3,增资交银康联 交行夺参股险商首单,3
4,午盘：原材料板块领涨大盘,3
...,...,...
752466,天津女排奇迹之源竟在场边 他是五冠王真正核心,1
752467,北电网络专利拍卖推迟：可能分拆6部分拍卖,0
752468,Spirit AeroSystems债券发行价确定,3
752469,陆慧明必发火线：法兰克福无胜 曼联国米顺利过关,11


In [6]:
# 构建词汇表
import jieba
import json
'''
def build_vocab(text_list):
    vocab = {"<unk>": 0}  # 添加一个特殊的索引，用于表示未知词

    # 遍历所有句子，构建词汇表
    for text in text_list:
        word_list = jieba.lcut(text)
        for word in word_list:
            if word not in vocab:
                vocab[word] = len(vocab)  # 将每个词映射到唯一的整数索引

    return vocab

vocabulary = build_vocab(train_data['text'].tolist() + dev_data['text'].tolist())
# 保存词汇表到 JSON 文件, 下次可以直接使用
with open('vocabulary.json', 'w', encoding='utf-8') as f:
    json.dump(vocabulary, f, ensure_ascii=False)
'''

'\ndef build_vocab(text_list):\n    vocab = {"<unk>": 0}  # 添加一个特殊的索引，用于表示未知词\n\n    # 遍历所有句子，构建词汇表\n    for text in text_list:\n        word_list = jieba.lcut(text)\n        for word in word_list:\n            if word not in vocab:\n                vocab[word] = len(vocab)  # 将每个词映射到唯一的整数索引\n\n    return vocab\n\nvocabulary = build_vocab(train_data[\'text\'].tolist() + dev_data[\'text\'].tolist())\n# 保存词汇表到 JSON 文件, 下次可以直接使用\nwith open(\'vocabulary.json\', \'w\', encoding=\'utf-8\') as f:\n    json.dump(vocabulary, f, ensure_ascii=False)\n'

In [7]:
# 读取 JSON 文件中的词汇表
with open('vocabulary.json', 'r', encoding='utf-8') as f:
    vocabulary = json.load(f)

In [8]:
# 转文本向量
class TextVector(object):
    def __init__(self, text_list, vocabulary):
        self.text_list = text_list
        self.vocabulary = vocabulary

    def text2vector(self, max_len = 30):
        all_indexed_sentences = []

        # 遍历所有句子，将词汇映射为整数索引，并进行填充
        for text in self.text_list:
            word_list = jieba.lcut(text)
            indexed_sentence = [self.vocabulary.get(word, self.vocabulary["<unk>"]) for word in word_list]

            # 填充句子至最大长度
            padded_sentence = indexed_sentence + [0] * (max_len - len(indexed_sentence))
            all_indexed_sentences.append(padded_sentence)

        return all_indexed_sentences

In [53]:
train_text_vector = TextVector(train_data['text'][:1024].tolist(), vocabulary)
train_vectors = train_text_vector.text2vector()
dev_text_vector = TextVector(dev_data['text'][:1024].tolist(), vocabulary)
dev_vectors = dev_text_vector.text2vector()

定义数据集

In [10]:
# 定义训练数据集
class TrainData(Dataset):
    def __init__(self):
        super().__init__()
    
    def __getitem__(self, index):
        text = paddle.to_tensor(train_vectors[index], dtype='int64')
        label = paddle.to_tensor(train_data['label'].tolist()[index], dtype='int64')

        return text, label
    
    def __len__(self):
        return len(train_vectors)


# 定义验证数据集
class DevData(Dataset):
    def __init__(self):
        super().__init__()
    
    def __getitem__(self, index):
        text = paddle.to_tensor(dev_vectors[index], dtype='int64')
        label = paddle.to_tensor(dev_data['label'].tolist()[index], dtype='int64')

        return text, label
    
    def __len__(self):
        return len(dev_vectors)

    
train_dataset = TrainData()
print(train_dataset.__getitem__(0))
dev_dataset = DevData()
print(dev_dataset.__getitem__(0))

(Tensor(shape=[30], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]), Tensor(shape=[1], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [0]))
(Tensor(shape=[30], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [5532, 5416, 2429, 831 , 8997, 475 , 27229, 2101, 276 , 1058, 17183, 1857,
        2054, 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ,
        0   , 0   , 0   , 0   , 0   , 0   ]), Tensor(shape=[1], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [11]))


## 3、准备网络模型

生成器

In [35]:
class Generator(nn.Layer):
    # vocab_size字典里面的个数
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(Generator, self).__init__()

        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, time_major=False)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):

        x = self.embed(x)
        out, (h, c) = self.lstm(x, h)
        out = out.reshape([out.shape[0] * out.shape[1], out.shape[2]])
        out = self.linear(out)

        return out, (h, c)

# 定义模型
embed_size = 128 # embed_size: 词嵌入后的特征数；
hidden_size = 1024 # hidden_size: lstm中隐层的节点数；
num_layers = 1 # num_layers: lstm中的隐层数量；
vocab_size = len(vocabulary) # 词汇数量
maxlength = 30  # 新闻标题的最大长度 

generator= Generator(vocab_size, embed_size, hidden_size, num_layers)

判别器

In [12]:
class Discriminator(nn.Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength):
        super().__init__()
        self.embed=nn.Embedding(vocab_size,embedding_dim,padding_idx=0)
        self.cnn=nn.Conv1D(embedding_dim, hidden_dim, kernel_size)
        self.maxpool=nn.MaxPool1D(maxlength-kernel_size+1)
        self.dense=nn.Sequential(nn.Dropout(0.3), nn.Linear(hidden_dim, output_size))
    def forward(self,x):
        embed_x=self.embed(x)
        cnn_x = self.cnn(embed_x.transpose((0, 2, 1)))
        pool_x=self.maxpool(cnn_x)
        out=self.dense(pool_x.squeeze(-1))
        out=nn.functional.sigmoid(out)
        return out


# 定义模型
vocab_size # 词汇数量
embedding_dim = 1024 # 词嵌入维度
hidden_dim = 128 # 隐藏层维度，也就是CNN网络层卷积核的个数
kernel_size = 3 # 卷积核大小
output_size = 1  # 分类的类别数

discriminator = Discriminator(vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength)


查看网络结构

In [13]:
# 输出模型结构
print(generator)
print(discriminator)

Generator(
  (embed): Embedding(251988, 128, sparse=False)
  (lstm): LSTM(128, 1024
    (0): RNN(
      (cell): LSTMCell(128, 1024)
    )
  )
  (linear): Linear(in_features=1024, out_features=251988, dtype=float32)
)
Discriminator(
  (embed): Embedding(251988, 1024, padding_idx=0, sparse=False)
  (cnn): Conv1D(1024, 128, kernel_size=[3], data_format=NCL)
  (maxpool): MaxPool1D(kernel_size=28, stride=None, padding=0)
  (dense): Sequential(
    (0): Dropout(p=0.3, axis=None, mode=upscale_in_train)
    (1): Linear(in_features=128, out_features=1, dtype=float32)
  )
)


载入预训练模型

In [14]:
# pretrain_model_path = './pretrained_models/cnn/final.pdparams'# 'pretrained_models/ResNet50_pretrained'

# 加载预训练模型参数
# model.set_state_dict(paddle.load(pretrain_model_path))

## 4、模型训练

环境设置

In [55]:
# 可视化
from visualdl import LogWriter
logwriter = LogWriter(logdir='./visualdl/gan')

# 设置GPU环境，如果没有GPU则设置为CPU
if paddle.is_compiled_with_cuda() and paddle.get_device() != 'cpu':
    paddle.set_device('gpu:0')
    print("Using GPU.")
else:
    paddle.set_device('cpu')
    print("Using CPU.")

epochs = 5
batch_size = 512

# embed_size: 词嵌入后的特征数；
embed_size = 128
# hidden_size: lstm中隐层的节点数；
hidden_size = 1024
# num_layers: lstm中的隐层数量；
num_layers = 1
# seq_length: 获取的序列长度
seq_length = 30
# learning_rate: 模型的学习率
learning_rate = 0.001

Using GPU.


辅助函数

In [16]:
def generate_random_list(length1, length2):
    random_list = [random.randint(0, length2 - 1) for _ in range(length1)]
    return random_list

In [23]:
def generate_words(model, states, num_samples=30):
    # prob对应模型中的outputs，是输入变量经过语言模型得到的输出值，相当于此时每个单词的概率分布
    # 生成概率分布，假设每个词的概率相等
    prob = paddle.ones([vocab_size], dtype='float32') / vocab_size
    # 生成随机样本
    samples = paddle.multinomial(prob, num_samples=batch_size)
    # 将结果变形成 [batch_size, 1] 的张量
    input = paddle.unsqueeze(samples, axis=1) 

    words = paddle.empty([batch_size, 0], dtype='int64')  # 创建一个空张量，用于存储结果
    for i in range(num_samples):
        output, states = model(input, states)
        # prob是对上一步得到的output进行指数化，加强高概率结果的权重；
        prob = output.exp()
        # word_id，通过torch_multinomial，以prob为权重，对结果进行加权抽样，样本数为1(即num_samples)
        word_id = paddle.multinomial(prob, num_samples=1)
        words = paddle.concat([words, word_id], axis=1)
    return words

训练过程

In [None]:
# 定义数据迭代器
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
valid_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, drop_last=False)

# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer_G = paddle.optimizer.Adam(parameters=generator.parameters(), learning_rate=learning_rate)
optimizer_D = paddle.optimizer.Adam(parameters=discriminator.parameters(), learning_rate=learning_rate)

for epoch in range(0, epochs):
    # states是参数矩阵的初始化，相当于对LSTMmodel类里的(h, c)的初始化；
    states = (paddle.zeros(shape=[num_layers, batch_size, hidden_size],dtype='float32'),
            paddle.zeros(shape=[num_layers, batch_size, hidden_size],dtype='float32'))
 
    for i, data in enumerate(train_dataloader):
        # train discriminator
        # 使用真实数据集
        real_data = data[0]
        real_labels = paddle.ones((batch_size, 1)) # [256, 1]
        optimizer_G.clear_grad()
        real_output = discriminator(real_data) # [batch]
        loss_real = criterion(real_output, real_labels) # [1]

        # 使用生成数据集
        states = [state.detach() for state in states]
        fake_data = generate_words(generator, states)
        fake_labels = paddle.zeros((batch_size, 1))
        fake_output = discriminator(fake_data)
        loss_fake = criterion(fake_output, fake_labels)
        
        loss_D = loss_real + loss_fake
        loss_D.backward()
        optimizer_D.step()

        # 训练生成器
        optimizer_G.clear_grad()
        fake_data = generate_words(generator, states)
        fake_output = discriminator(fake_data)

        loss_G = criterion(fake_output, real_labels)
        loss_G.backward()
        optimizer_G.step()

        if i%50==0:
                print('[epoch/Epoch {}/{} iter/Iter {}/{}] lossD {:.4f}, lossG {:.4f}'.format(epoch, epochs-1, i+1, len(train_dataloader), loss_D.item(), loss_G.item()))
                logwriter.add_scalar("generator_loss", value=loss_G.item(), step=i+epoch*(batch_size))
                logwriter.add_scalar("discriminator_loss", value=loss_D.item(), step=i+epoch*(batch_size))
            
        if epoch%2 == 0:
            paddle.save(generator.state_dict(), os.path.join('model', 'gan', 'generator{}.pdmodel'.format(str(epoch).zfill(4))))
            paddle.save(discriminator.state_dict(), os.path.join('model', 'gan', 'discriminator{}.pdmodel'.format(str(epoch).zfill(4))))
        
    
    paddle.save(generator.state_dict(), os.path.join('model', 'gan', 'generator.pdmodel'))
    paddle.save(discriminator.state_dict(), os.path.join('model', 'gan', 'discriminator.pdmodel'))
        

## 5、模型测试

生成器

In [40]:
reversed_vocabulary = list(vocabulary.keys())

In [45]:
generator = Generator(vocab_size, embed_size, hidden_size, num_layers)
# 模型载入
gen_path_dict = paddle.load('./model/gan/generator.pdmodel')
generator.set_state_dict(gen_path_dict)

states = (paddle.zeros(shape=[num_layers, 1, hidden_size],dtype='float32'),
            paddle.zeros(shape=[num_layers, 1, hidden_size],dtype='float32'))

states = [state.detach() for state in states]

title=''

prob = paddle.ones([vocab_size], dtype='float32') / vocab_size
input = paddle.multinomial(prob, num_samples=1).unsqueeze(1)
for i in range(10):
    output, states = generator(input, states)
    prob = output.exp()
    word_id = paddle.multinomial(prob, num_samples=1)
    # 从字典映射表Dictionary里，找到当前索引(即word_id)对应的单词；
    word = reversed_vocabulary[word_id.item()]
    title += word

print(title)

万人迷MedAssets向利丰欢愉i9088Cat可转可赔相映成辉剪系


修改判别器输出，将其调整为分类器

In [47]:
classifier = Discriminator(vocab_size, embedding_dim, hidden_dim, kernel_size, 14, maxlength)
# 模型载入
dis_path_dict = paddle.load('./model/gan/discriminator.pdmodel')
classifier.set_state_dict(dis_path_dict)



模型微调

In [50]:
epochs = 10
batch_size = 256

# 定义数据迭代器
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
valid_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, drop_last=False)

# 定义优化器
opt = paddle.optimizer.Adam(learning_rate=1e-4, parameters=classifier.parameters(), weight_decay=paddle.regularizer.L2Decay(1e-4))

# 定义损失函数
loss_fn = paddle.nn.CrossEntropyLoss()

# 用于测量准确率的评价指标对象
metric =  paddle.metric.Accuracy()

# 定义 EarlyStopping 回调函数
callback = paddle.callbacks.EarlyStopping(monitor='acc', patience=5, mode='max', verbose=1)
# 定义 ModelCheckpoint 回调函数
checkpoint_callback = paddle.callbacks.ModelCheckpoint(save_dir='./model/gan', save_freq=2)
# 设置 visualdl 路径
log_dir = './visualdl/gan'
visual_callback = paddle.callbacks.VisualDL(log_dir=log_dir)

# 使用高层API进行训练
model = paddle.Model(classifier) # 用 Model 封装
# 模型配置
model.prepare(opt, loss_fn, metric)
# 模型训练
model.fit(train_dataloader,
          valid_dataloader,
          epochs=epochs,
          batch_size=batch_size,
          verbose=1,
          callbacks= [callback, checkpoint_callback, visual_callback])

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
save checkpoint at /home/.kun/.study/ANN/final/title_classify/model/gan/0
Eval begin...
Eval samples: 1024
Epoch 2/10
Eval begin...
Eval samples: 1024
Epoch 3/10
save checkpoint at /home/.kun/.study/ANN/final/title_classify/model/gan/2
Eval begin...
Eval samples: 1024
Epoch 4/10
Eval begin...
Eval samples: 1024
Epoch 5/10
save checkpoint at /home/.kun/.study/ANN/final/title_classify/model/gan/4
Eval begin...
Eval samples: 1024
Epoch 6/10
Eval begin...
Eval samples: 1024
Epoch 7/10
save checkpoint at /home/.kun/.study/ANN/final/title_classify/model/gan/6
Eval begin...
Eval samples: 1024
Epoch 8/10
Eval begin...
Eval samples: 1024
Epoch 9/10
save checkpoint at /home/.kun/.study/ANN/final/title_classify/model/gan/8
Eval begin...
Eval samples: 1024
Epoch 10/10
Eval begin...
Eval samples: 1024
save checkpoint at /home/.kun/.study/ANN/final/title_classify/model/gan/final
