# 1-3 文本数据建模流程范例

## 一、准备数据
这里使用的数据是imdb数据集

在torch中预处理文本数据一般使用torchtext或者自定义Dataset

下面用torchtext来构建文本分类数据集

In [1]:
import torch
import string, re
import torchtext

In [2]:
MAX_WORDS = 10000  # only the 10000 most frequent words are considered
MAX_LEN = 200  # every sample keeps a length of 200 tokens
BATCH_SIZE = 20

# Character class matching any single ASCII punctuation mark.
# re.escape is required here: string.punctuation contains "\" followed by "]",
# which an unescaped character class parses as an escaped "]" -- so the
# backslash itself would never be removed from the text.
_PUNCTUATION_RE = re.compile("[%s]" % re.escape(string.punctuation))


# Tokenizing method
def tokenizer(x):
    """Strip ASCII punctuation from ``x`` and split it on single spaces.

    Args:
        x: The raw text of one example.

    Returns:
        A list of tokens. Splitting is done on the single space character
        only, so consecutive spaces yield empty-string tokens and other
        whitespace (tabs, newlines) is left inside the tokens.
    """
    return _PUNCTUATION_RE.sub("", x).split(" ")

# Filter out low-frequency words
def filterLowFreqWords(arr, vocab):
    """Replace every token index that is >= MAX_WORDS with 0.

    Args:
        arr: A batch of numericalized examples (a list of lists of indices).
        vocab: Unused; presumably present because torchtext calls a Field's
            ``postprocessing`` hook with ``(batch, vocab)`` -- confirm against
            the torchtext version in use.

    Returns:
        A new list of lists with the same layout as ``arr``; the input is
        not modified.
    """
    filtered = []
    for example in arr:
        kept = []
        for idx in example:
            # Index 0 is used as the replacement for out-of-range indices.
            kept.append(idx if idx < MAX_WORDS else 0)
        filtered.append(kept)
    return filtered

# 1. Declare how each field (column) is preprocessed.
TEXT = torchtext.data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
    fix_length=MAX_LEN,
    postprocessing=filterLowFreqWords,
)

# Labels are used as-is: not a token sequence and no vocabulary lookup.
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

# 2. Build the tabular datasets.
# torchtext.data.TabularDataset can read csv, tsv, json and similar formats.
ds_train, ds_test = torchtext.data.TabularDataset.splits(
    path="./data/imdb/",
    train="train.tsv",
    test="test.tsv",
    format="tsv",
    fields=[("label", LABEL), ("text", TEXT)],
    skip_header=False,
)

# 3. Build the vocabulary (from the training split only).
TEXT.build_vocab(ds_train)

# 4. Build the batch iterators that feed the training / evaluation loops.
train_iter, test_iter = torchtext.data.Iterator.splits(
    (ds_train, ds_test),
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    batch_sizes=(BATCH_SIZE, BATCH_SIZE),
)


In [3]:
# Inspect a single example: its token list and its label.
first_example = ds_train[0]
print(first_example.text)
print(first_example.label)

['it', 'really', 'boggles', 'my', 'mind', 'when', 'someone', 'comes', 'across', 'a', 'movie', 'like', 'this', 'and', 'claims', 'it', 'to', 'be', 'one', 'of', 'the', 'worst', 'slasher', 'films', 'out', 'there', 'this', 'is', 'by', 'far', 'not', 'one', 'of', 'the', 'worst', 'out', 'there', 'still', 'not', 'a', 'good', 'movie', 'but', 'not', 'the', 'worst', 'nonetheless', 'go', 'see', 'something', 'like', 'death', 'nurse', 'or', 'blood', 'lake', 'and', 'then', 'come', 'back', 'to', 'me', 'and', 'tell', 'me', 'if', 'you', 'think', 'the', 'night', 'brings', 'charlie', 'is', 'the', 'worst', 'the', 'film', 'has', 'decent', 'camera', 'work', 'and', 'editing', 'which', 'is', 'way', 'more', 'than', 'i', 'can', 'say', 'for', 'many', 'more', 'extremely', 'obscure', 'slasher', 'filmsbr', 'br', 'the', 'film', 'doesnt', 'deliver', 'on', 'the', 'onscreen', 'deaths', 'theres', 'one', 'death', 'where', 'you', 'see', 'his', 'pruning', 'saw', 'rip', 'into', 'a', 'neck', 'but', 'all', 'other', 'deaths', 'a

In [4]:
# Inspect the vocabulary.
vocab = TEXT.vocab
print(len(vocab))

# itos: index to string
print(vocab.itos[0])

# stoi: string to index ("<unk>" = unknown word, "<pad>" = padding)
for special in ("<unk>", "<pad>"):
    print(vocab.stoi[special])

# freqs: word frequencies
for word in ("<unk>", "a", "good"):
    print(vocab.freqs[word])


108197
<unk>
0
1
0
129453
11457


In [5]:
# Inspect what the data pipeline yields.
# Gotcha: dimension 0 of `text` is the sentence length, not the batch.
batch = next(iter(train_iter), None)
if batch is not None:
    features, labels = batch.text, batch.label
    print(features.shape)
    print(labels)

torch.Size([200, 20])
tensor([1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0])


In [6]:
# Wrap the data pipeline so that it yields (features, label) pairs, in a form
# similar to torch.utils.data.DataLoader.
class DataLoader:
    """Adapter that turns batches with ``.text`` / ``.label`` into tensor pairs."""

    def __init__(self, data_iter):
        """Store the wrapped iterable and cache its length.

        Args:
            data_iter: A sized iterable whose items expose ``text`` and
                ``label`` tensor attributes.
        """
        super().__init__()
        self.data_iter = data_iter
        self.length = len(data_iter)

    def __len__(self):
        """Return the length of the wrapped iterable, as measured at construction."""
        return self.length

    def __iter__(self):
        """Yield ``(features, labels)`` for every batch of the wrapped iterable."""
        for batch in self.data_iter:
            # Swap dims 0 and 1 of the text tensor to make it batch-first.
            features = batch.text.transpose(0, 1)
            # Cast labels to float and append a trailing dimension of size 1.
            labels = batch.label.float().unsqueeze(1)
            yield features, labels
# Wrap both torchtext iterators so they yield batch-first (features, labels).
dl_train, dl_test = (DataLoader(data_iter) for data_iter in (train_iter, test_iter))

In [7]:
# Peek at the first wrapped batch to check the batch-first shapes.
first_pair = next(iter(dl_train), None)
if first_pair is not None:
    features, labels = first_pair
    print(features.shape)
    print(labels.shape)

torch.Size([20, 200])
torch.Size([20, 1])


## 二、定义模型
在这里采用第三种方式（继承模型基类，并用 nn.Sequential 等模型容器封装各层）进行构建，并且使用类形式的训练循环。

In [8]:
from torch import nn
import torchkeras

In [16]:
# Re-seed torch's random number generator (no fixed seed is supplied, so
# weight initialisation differs from run to run).
torch.random.seed()


class Net(torchkeras.Model):
    """Embedding -> two Conv1d/MaxPool1d/ReLU stages -> Linear + Sigmoid.

    Takes batch-first tensors of token indices and returns one value in
    (0, 1) per sample.
    """

    def __init__(self):
        super().__init__()

        # With padding_idx set, the embedding of the padding token stays
        # all-zero throughout training.
        self.embedding = nn.Embedding(num_embeddings=MAX_WORDS, embedding_dim=3, padding_idx=1)

        conv_layers = [
            ("conv_1", nn.Conv1d(in_channels=3, out_channels=16, kernel_size=5)),
            ("pool_1", nn.MaxPool1d(kernel_size=2)),
            ("relu_1", nn.ReLU()),
            ("conv_2", nn.Conv1d(in_channels=16, out_channels=128, kernel_size=2)),
            ("pool_2", nn.MaxPool1d(kernel_size=2)),
            ("relu_2", nn.ReLU()),
        ]
        self.conv = nn.Sequential()
        for layer_name, layer in conv_layers:
            self.conv.add_module(layer_name, layer)

        dense_layers = [
            ("flatten", nn.Flatten()),
            # 6144 = 128 channels x 48 positions left after pool_2 when the
            # input is 200 tokens long.
            ("linear", nn.Linear(6144, 1)),
            ("sigmoid", nn.Sigmoid()),
        ]
        self.dense = nn.Sequential()
        for layer_name, layer in dense_layers:
            self.dense.add_module(layer_name, layer)

    def forward(self, x):
        """Map a batch of token-index sequences to per-sample probabilities."""
        embedded = self.embedding(x)
        # Conv1d wants channels before length, so swap the last two dims
        # of the embedding output.
        channels_first = embedded.transpose(1, 2)
        conv_features = self.conv(channels_first)
        return self.dense(conv_features)

In [17]:
# Instantiate the network and print its module tree.
model = Net()
print(model)

# Print a per-layer summary for inputs of 200 integer indices each.
# NOTE(review): input_shape presumably excludes the batch dimension --
# confirm against the torchkeras version in use.
model.summary(input_shape=(200,), input_dtype=torch.LongTensor)

Net(
  (embedding): Embedding(10000, 3, padding_idx=1)
  (conv): Sequential(
    (conv_1): Conv1d(3, 16, kernel_size=(5,), stride=(1,))
    (pool_1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_1): ReLU()
    (conv_2): Conv1d(16, 128, kernel_size=(2,), stride=(1,))
    (pool_2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_2): ReLU()
  )
  (dense): Sequential(
    (flatten): Flatten()
    (linear): Linear(in_features=6144, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Embedding-1               [-1, 200, 3]          30,000
            Conv1d-2              [-1, 16, 196]             256
         MaxPool1d-3               [-1, 16, 98]               0
              ReLU-4               [-1, 16, 98]               0
            Conv1d-5              [-1, 128, 97]  

## 三、训练模型
这里仿照Keras定义了一个高阶的模型接口Model（即上文Net所继承的torchkeras.Model），通过compile和fit方法完成训练。

In [18]:
def accuracy(y_pred, y_true):
    """Return the fraction of predictions that match the 0/1 labels.

    Args:
        y_pred: Tensor of predicted probabilities.
        y_true: Tensor of ground-truth labels, broadcastable with ``y_pred``.

    Returns:
        A scalar tensor: the mean of ``1 - |y_true - thresholded(y_pred)|``.
    """
    # Threshold at 0.5: strictly greater becomes 1.0, everything else 0.0.
    hard_pred = (y_pred > 0.5).float()
    mismatch = torch.abs(y_true - hard_pred)
    return torch.mean(1 - mismatch)

# Binary cross-entropy on probabilities, optimised with Adagrad; accuracy is
# tracked as an additional metric.
loss_func = nn.BCELoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.02)
model.compile(loss_func=loss_func, optimizer=optimizer, metrics_dict={"accuracy": accuracy})

In [19]:
# Training sometimes fails to converge; it may need to be re-run a few times.
# NOTE(review): the first positional argument (20) is presumably the number of
# epochs and log_step_freq the logging interval in steps -- confirm against
# the torchkeras version in use.
dfhistory = model.fit(20,dl_train,dl_val=dl_test,log_step_freq= 200)

Start Training ...

{'step': 200, 'loss': 0.755, 'accuracy': 0.509}
{'step': 400, 'loss': 0.725, 'accuracy': 0.507}
{'step': 600, 'loss': 0.714, 'accuracy': 0.505}
{'step': 800, 'loss': 0.709, 'accuracy': 0.507}
{'step': 1000, 'loss': 0.706, 'accuracy': 0.509}

 +-------+-------+----------+----------+--------------+
| epoch |  loss | accuracy | val_loss | val_accuracy |
+-------+-------+----------+----------+--------------+
|   1   | 0.706 |  0.509   |  0.691   |     0.53     |
+-------+-------+----------+----------+--------------+

{'step': 200, 'loss': 0.684, 'accuracy': 0.563}
{'step': 400, 'loss': 0.684, 'accuracy': 0.563}
{'step': 600, 'loss': 0.682, 'accuracy': 0.565}
{'step': 800, 'loss': 0.682, 'accuracy': 0.566}
{'step': 1000, 'loss': 0.681, 'accuracy': 0.567}

 +-------+-------+----------+----------+--------------+
| epoch |  loss | accuracy | val_loss | val_accuracy |
+-------+-------+----------+----------+--------------+
|   2   | 0.681 |  0.567   |  0.685   |    0.556     