## 1、导入相关包

In [1]:
# 数据科学包
import numpy as np                 # 常用数据科学包
import pandas as pd                # 文本读取

# 深度学习包
import paddle
from paddle.io import Dataset, DataLoader  # 定义数据集
import paddle.nn as nn                     # 网络

  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


## 2、准备数据集

查看数据集

In [22]:
# 使用pandas读取数据集
train_data = pd.read_table('./data/train.txt', sep='\t',header=None)  # 训练集
dev_data = pd.read_table('./data/dev.txt', sep='\t',header=None)      # 验证集
test_data = pd.read_table('./data/test.txt', sep='\t',header=None)    # 测试集

# 由于数据集存放时无列名，因此手动添加列名便于对数据进行更好处理
train_data.columns = ["text",'label']
dev_data.columns = ["text",'label']
test_data.columns = ["text"]

In [3]:
train_data

Unnamed: 0,text,label
0,网易第三季度业绩低于分析师预期,科技
1,巴萨1年前地狱重现这次却是天堂 再赴魔鬼客场必翻盘,体育
2,美国称支持向朝鲜提供紧急人道主义援助,时政
3,增资交银康联 交行夺参股险商首单,股票
4,午盘：原材料板块领涨大盘,股票
...,...,...
752466,天津女排奇迹之源竟在场边 他是五冠王真正核心,体育
752467,北电网络专利拍卖推迟：可能分拆6部分拍卖,科技
752468,Spirit AeroSystems债券发行价确定,股票
752469,陆慧明必发火线：法兰克福无胜 曼联国米顺利过关,彩票


处理数据集

In [23]:
# 定义要进行分类的类别
label_list=list(train_data.label.unique())
label_word2num = { 
    label_text : idx for idx, label_text in enumerate(label_list)
}
label_num2word = { 
    idx : label_text for idx, label_text in enumerate(label_list)
}
print(label_word2num)

{'科技': 0, '体育': 1, '时政': 2, '股票': 3, '娱乐': 4, '教育': 5, '家居': 6, '财经': 7, '房产': 8, '社会': 9, '游戏': 10, '彩票': 11, '星座': 12, '时尚': 13}


In [5]:
# 标签：文本转数字
train_data.iloc[:, 1] = train_data.iloc[:, 1].map(label_word2num)
dev_data.iloc[:, 1] = dev_data.iloc[:, 1].map(label_word2num)
train_data

Unnamed: 0,text,label
0,网易第三季度业绩低于分析师预期,0
1,巴萨1年前地狱重现这次却是天堂 再赴魔鬼客场必翻盘,1
2,美国称支持向朝鲜提供紧急人道主义援助,2
3,增资交银康联 交行夺参股险商首单,3
4,午盘：原材料板块领涨大盘,3
...,...,...
752466,天津女排奇迹之源竟在场边 他是五冠王真正核心,1
752467,北电网络专利拍卖推迟：可能分拆6部分拍卖,0
752468,Spirit AeroSystems债券发行价确定,3
752469,陆慧明必发火线：法兰克福无胜 曼联国米顺利过关,11


In [2]:
# 构建词汇表
import jieba
import json
'''
def build_vocab(text_list):
    vocab = {"<unk>": 0}  # 添加一个特殊的索引，用于表示未知词

    # 遍历所有句子，构建词汇表
    for text in text_list:
        word_list = jieba.lcut(text)
        for word in word_list:
            if word not in vocab:
                vocab[word] = len(vocab)  # 将每个词映射到唯一的整数索引

    return vocab

vocabulary = build_vocab(train_data['text'].tolist() + dev_data['text'].tolist())
# 保存词汇表到 JSON 文件, 下次可以直接使用
with open('vocabulary.json', 'w', encoding='utf-8') as f:
    json.dump(vocabulary, f, ensure_ascii=False)
'''

'\ndef build_vocab(text_list):\n    vocab = {"<unk>": 0}  # 添加一个特殊的索引，用于表示未知词\n\n    # 遍历所有句子，构建词汇表\n    for text in text_list:\n        word_list = jieba.lcut(text)\n        for word in word_list:\n            if word not in vocab:\n                vocab[word] = len(vocab)  # 将每个词映射到唯一的整数索引\n\n    return vocab\n\nvocabulary = build_vocab(train_data[\'text\'].tolist() + dev_data[\'text\'].tolist())\n# 保存词汇表到 JSON 文件, 下次可以直接使用\nwith open(\'vocabulary.json\', \'w\', encoding=\'utf-8\') as f:\n    json.dump(vocabulary, f, ensure_ascii=False)\n'

In [5]:
# 读取 JSON 文件中的词汇表
with open('vocabulary.json', 'r', encoding='utf-8') as f:
    vocabulary = json.load(f)

In [3]:
# 转文本向量
class TextVector(object):
    def __init__(self, text_list, vocabulary):
        self.text_list = text_list
        self.vocabulary = vocabulary

    def text2vector(self, max_len = 30):
        all_indexed_sentences = []

        # 遍历所有句子，将词汇映射为整数索引，并进行填充
        for text in self.text_list:
            word_list = jieba.lcut(text)
            indexed_sentence = [self.vocabulary.get(word, self.vocabulary["<unk>"]) for word in word_list]

            # 填充句子至最大长度
            padded_sentence = indexed_sentence + [0] * (max_len - len(indexed_sentence))
            all_indexed_sentences.append(padded_sentence)

        return all_indexed_sentences

In [None]:
train_text_vector = TextVector(train_data['text'].tolist(), vocabulary)
train_vectors = train_text_vector.text2vector()
dev_text_vector = TextVector(dev_data['text'].tolist(), vocabulary)
dev_vectors = dev_text_vector.text2vector()

定义数据集

In [9]:
# 定义训练数据集
class TrainData(Dataset):
    def __init__(self):
        super().__init__()
    
    def __getitem__(self, index):
        text = paddle.to_tensor(train_vectors[index], dtype='int64')
        label = paddle.to_tensor(train_data['label'].tolist()[index], dtype='int64')

        return text, label
    
    def __len__(self):
        return len(train_vectors)


# 定义验证数据集
class DevData(Dataset):
    def __init__(self):
        super().__init__()
    
    def __getitem__(self, index):
        text = paddle.to_tensor(dev_vectors[index], dtype='int64')
        label = paddle.to_tensor(dev_data['label'].tolist()[index], dtype='int64')

        return text, label
    
    def __len__(self):
        return len(dev_vectors)

    
train_dataset = TrainData()
print(train_dataset.__getitem__(0))
dev_dataset = DevData()
print(dev_dataset.__getitem__(0))

(Tensor(shape=[30], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]), Tensor(shape=[1], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [0]))
(Tensor(shape=[30], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [5532, 5416, 2429, 831 , 8997, 475 , 27229, 2101, 276 , 1058, 17183, 1857,
        2054, 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ,
        0   , 0   , 0   , 0   , 0   , 0   ]), Tensor(shape=[1], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [11]))


## 3、准备网络模型

搭建网络

In [20]:
# 定义CNN模型：随机嵌入
class CNN(nn.Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength):
        super().__init__()
        self.embed=nn.Embedding(vocab_size,embedding_dim,padding_idx=0)
        self.cnn=nn.Conv1D(embedding_dim, hidden_dim, kernel_size)
        self.maxpool=nn.MaxPool1D(maxlength-kernel_size+1)
        self.dense=nn.Sequential(nn.Dropout(0.3), nn.Linear(hidden_dim, output_size))
    def forward(self,x):
        embed_x=self.embed(x)
        cnn_x = self.cnn(embed_x.transpose((0, 2, 1)))
        pool_x=self.maxpool(cnn_x)
        out=self.dense(pool_x.squeeze(-1))
        return out


# 定义模型
vocab_size = len(vocabulary) # 词汇数量
embedding_dim = 1024 # 词嵌入维度
hidden_dim = 128 # 隐藏层维度，也就是CNN网络层卷积核的个数
kernel_size = 3 # 卷积核大小
output_size = 14  # 分类的类别数
maxlength = 30  # 新闻标题的最大长度

model = CNN(vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength)


查看网络结构

In [11]:
# 输出模型结构
print(model)


CNN(
  (embed): Embedding(251988, 1024, padding_idx=0, sparse=False)
  (cnn): Conv1D(1024, 128, kernel_size=[3], data_format=NCL)
  (maxpool): MaxPool1D(kernel_size=28, stride=None, padding=0)
  (dense): Sequential(
    (0): Dropout(p=0.3, axis=None, mode=upscale_in_train)
    (1): Linear(in_features=128, out_features=14, dtype=float32)
  )
)


载入预训练模型

In [11]:
pretrain_model_path = './pretrained_models/cnn/final.pdparams'

# 加载预训练模型参数
model.set_state_dict(paddle.load(pretrain_model_path))

## 4、模型训练

In [13]:
epochs = 4
batch_size = 512

# 设置GPU环境，如果没有GPU则设置为CPU
if paddle.is_compiled_with_cuda() and paddle.get_device() != 'cpu':
    paddle.set_device('gpu:0')
    print("Using GPU.")
else:
    paddle.set_device('cpu')
    print("Using CPU.")

# 定义数据迭代器
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
valid_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, drop_last=False)

# 定义优化器
opt = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters(), weight_decay=paddle.regularizer.L2Decay(1e-4))

# 定义损失函数
loss_fn = paddle.nn.CrossEntropyLoss()

# 用于测量准确率的评价指标对象
metric =  paddle.metric.Accuracy()

# 定义 EarlyStopping 回调函数
callback = paddle.callbacks.EarlyStopping(monitor='acc', patience=5, mode='max', verbose=1)
# 定义 ModelCheckpoint 回调函数
checkpoint_callback = paddle.callbacks.ModelCheckpoint(save_dir='./model/cnn', save_freq=2)
# 设置 visualdl 路径
log_dir = './visualdl/cnn'
visual_callback = paddle.callbacks.VisualDL(log_dir=log_dir)

# 使用高层API进行训练
model = paddle.Model(model) # 用 Model 封装
# 模型配置
model.prepare(opt, loss_fn, metric)
# 模型训练
model.fit(train_dataloader,
          valid_dataloader,
          epochs=epochs,
          batch_size=batch_size,
          verbose=1,
          callbacks= [callback, checkpoint_callback, visual_callback])



## 5、模型测试

载入预测模型

In [24]:
pre_model = CNN(vocab_size, embedding_dim, hidden_dim, kernel_size, output_size, maxlength)
pre_model.set_state_dict(paddle.load('./model/cnn/final.pdparams'))



直接测试

In [25]:
plain_text_vector = TextVector(['小王子是一本好书'], vocabulary)
plain_vector = plain_text_vector.text2vector()
plain_tensor = paddle.to_tensor(plain_vector[:30])
pre = pre_model(plain_tensor)
prob = nn.functional.sigmoid(pre, axis=1)
num = paddle.argmax(prob, axis=1).numpy()
label = label_num2word[num[0]]
print(label)

娱乐


在测试集上预测

In [11]:
test_text_vector = TextVector(test_data['text'].tolist(), vocabulary)
test_vectors = test_text_vector.text2vector()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.883 seconds.
Prefix dict has been built successfully.


In [18]:
results = []
for vector in test_vectors:
    tensor = paddle.to_tensor(vector[:30])
    pre = pre_model(paddle.unsqueeze(tensor, axis=0))
    prob = nn.functional.sigmoid(pre, axis=1)
    num = paddle.argmax(prob, axis=1).numpy()
    label = label_num2word[num[0]]
    results.append(label)

结果提交

In [19]:
# 将list格式的预测结果存储为txt文件，提交格式要求：每行一个类别
def write_results(labels, file_path):
    with open(file_path, "w", encoding="utf8") as f:
        f.writelines("\n".join(labels))

write_results(results, "./result.txt")


In [None]:
# 因格式要求为zip，故需要将结果文件压缩为submission.zip提交文件
!zip 'submission.zip' 'result.txt'