In [8]:
import os
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import  torch.nn.functional as F

import Functions as d2l

# Restrict CUDA to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Prefer the GPU when torch reports one, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Root data directory; the GloVe cache is placed under it further down.
DATA_ROOT = "./Datasets"


In [9]:
## 1-D cross-correlation
def corr1d(x, k):
    """Slide the kernel ``k`` along ``x`` and sum the element-wise products.

    Args:
        x: input tensor, indexed along its first dimension (expected 1-D).
        k: kernel tensor; ``k.shape[0]`` is the window width.

    Returns:
        A tensor allocated with ``torch.zeros`` holding
        ``x.shape[0] - k.shape[0] + 1`` values, one per window position.
    """
    width = k.shape[0]
    n_out = x.shape[0] - width + 1
    y = torch.zeros(n_out)
    for start in range(n_out):
        window = x[start:start + width]
        y[start] = torch.sum(window * k)
    return y


# Worked example: a length-7 integer ramp against a width-2 kernel.
x = torch.arange(7)
k = torch.tensor([1, 2])
corr1d(x, k)

tensor([ 2.,  5.,  8., 11., 14., 17.])

In [10]:
## Multi-input-channel cross-correlation
def corr1d_multi_in(x, k):
    """Cross-correlate each input channel with its own kernel and add the results.

    ``x`` and ``k`` are walked in lockstep along their first dimension; each
    (channel, kernel) pair is passed to ``corr1d`` and the per-channel outputs
    are stacked and summed over the channel axis.
    """
    per_channel = []
    for channel, kernel in zip(x, k):
        per_channel.append(corr1d(channel, kernel))
    return torch.stack(per_channel).sum(dim=0)
# Worked example: three input channels, each paired with its own width-2 kernel.
x = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
k = torch.tensor([[1, 2], [3, 4], [-1, -3]])
corr1d_multi_in(x, k)

tensor([ 2.,  8., 14., 20., 26., 32.])

### 时序最大池化层

In [14]:
class GlobalMaxPool1d(nn.Module):
    """Max-over-time pooling: reduce the last axis of a 3-D input to length 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply 1-D max pooling with a window as wide as ``x.shape[2]``.

        Args:
            x: tensor of shape (batch_size, channel, seq_len).

        Returns:
            Tensor of shape (batch_size, channel, 1).
        """
        seq_len = x.size(2)
        return F.max_pool1d(x, kernel_size=seq_len)
    

In [17]:
###读取数据
from tqdm import tqdm
import random
import collections

def read_imdb(folder='train', data_root = './Datasets/aclImdb'):
    """Read every review file under ``data_root/folder/{pos,neg}``.

    Each file is read as bytes, decoded as UTF-8, stripped of newline
    characters and lower-cased.

    Args:
        folder: sub-directory of ``data_root`` to read (default 'train').
        data_root: directory that contains ``folder``.

    Returns:
        A list of ``[review, label]`` pairs (label 1 for 'pos', 0 for 'neg'),
        shuffled with ``random.shuffle``.
    """
    data = []
    for label, target in (('pos', 1), ('neg', 0)):
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            path = os.path.join(folder_name, file)
            with open(path, 'rb') as f:
                raw = f.read()
            review = raw.decode('utf-8').replace('\n','').lower()
            data.append([review, target])
    random.shuffle(data)
    return data

## Tokenize by splitting on spaces
def get_tokenized_imdb(data):
    """Split every review on the space character and lower-case each piece.

    Args:
        data: list of [review_string, label] pairs; the labels are ignored.

    Returns:
        A list with one token list per review. Because the split is on ' '
        exactly, consecutive spaces produce empty-string tokens.
    """
    tokenized = []
    for review, _ in data:
        tokenized.append([piece.lower() for piece in review.split(' ')])
    return tokenized


### Build the vocabulary from the tokenized reviews
def get_vocab_imdb(data):
    """Count tokens across all reviews and wrap the counts in a ``Vocab.Vocab``.

    Args:
        data: list of [review_string, label] pairs.

    Returns:
        ``Vocab.Vocab`` built from the token counter with ``min_freq=5``.
    """
    counter = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        counter.update(tokens)
    return Vocab.Vocab(counter, min_freq=5)


def preprocess_imdb(data, vocab, max_len=500):
    """Convert raw reviews into a fixed-length index tensor and a label tensor.

    Args:
        data: list of [review_string, label] pairs.
        vocab: object whose ``stoi`` mapping turns a token into an integer index.
        max_len: length every review is truncated or padded to. Defaults to
            500, the value that was previously hard-coded.

    Returns:
        ``(features, labels)``: ``features`` has one row of ``max_len`` indices
        per review, ``labels`` holds the second element of every pair.
    """

    def pad(x):
        # Slicing truncates long reviews; for those the repeat count is
        # negative, which yields an empty list, so nothing is appended.
        # NOTE(review): 0 is used as the padding value -- confirm which token
        # the vocabulary assigns to index 0.
        return x[:max_len] + [0] * (max_len - len(x))

    tokenized_data = get_tokenized_imdb(data)  # split on spaces
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

batch_size = 128
# Read the training split first, then the test split.
train_data, test_data = read_imdb('train'), read_imdb('test')
# The vocabulary is built from the training split only.
vocab = get_vocab_imdb(train_data)

# Both splits are indexed with the same vocabulary and wrapped as datasets.
train_set, test_set = (
    Data.TensorDataset(*preprocess_imdb(split, vocab))
    for split in (train_data, test_data)
)
# Only the training loader shuffles its batches.
train_iter = Data.DataLoader(dataset=train_set, batch_size=batch_size,
                             shuffle=True, num_workers=4)
test_iter = Data.DataLoader(dataset=test_set, batch_size=batch_size,
                            shuffle=False, num_workers=4)

100%|██████████| 12500/12500 [00:39<00:00, 319.53it/s]
100%|██████████| 12500/12500 [00:39<00:00, 319.21it/s]
100%|██████████| 12500/12500 [00:39<00:00, 319.37it/s]
100%|██████████| 12500/12500 [00:39<00:00, 319.64it/s]


In [22]:
class TextCNN(nn.Module):
    """Text classifier: two embedding tables, parallel 1-D convolutions, max-over-time pooling.

    Args:
        vocab: sized collection; ``len(vocab)`` is the number of rows in each
            embedding table.
        embed_size: width of each embedding vector.
        kernel_sizes: kernel width of every convolution branch.
        channels: output channels of every convolution branch; must have the
            same length as ``kernel_sizes``.
        dropout: dropout probability applied before the final linear layer.
            Defaults to 0.25, the value that was previously hard-coded.
        num_classes: number of output logits. Defaults to 2.

    Raises:
        ValueError: if ``kernel_sizes`` and ``channels`` differ in length.
    """

    def __init__(self, vocab, embed_size, kernel_sizes, channels,
                 dropout=0.25, num_classes=2):
        super(TextCNN, self).__init__()
        kernel_sizes = list(kernel_sizes)
        channels = list(channels)
        # zip() below would silently drop the surplus entries while the decoder
        # is sized from every channel count, so reject a mismatch up front.
        if len(kernel_sizes) != len(channels):
            raise ValueError(
                "kernel_sizes and channels must have the same length, got "
                "{} and {}".format(len(kernel_sizes), len(channels)))
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # Meant to stay fixed during training. Nothing in this class freezes
        # it, so the caller has to set requires_grad = False on its weight.
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Linear(sum(channels), num_classes)
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()
        for c, k in zip(channels, kernel_sizes):
            # Both embedding tables are concatenated, hence 2 * embed_size input channels.
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size,
                                        out_channels = c,
                                        kernel_size = k))

    def forward(self, inputs):
        """Map a batch of token indices to class logits.

        Args:
            inputs: integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Concatenate the two (batch, seq_len, embed_size) embedding outputs
        # along the embedding axis -> (batch, seq_len, 2*embed_size).
        embeddings = torch.cat((self.embedding(inputs), self.constant_embedding(inputs)),
                               dim=2)
        # Conv1d expects channels before the sequence axis, so move the
        # embedding axis forward -> (batch, 2*embed_size, seq_len).
        embeddings = embeddings.permute(0, 2, 1)
        # Each branch yields (batch, channels, 1) after max-over-time pooling;
        # squeeze(-1) drops the trailing axis before the branches are
        # concatenated along the channel axis.
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Dropout, then the fully connected layer produces the logits.
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [27]:
# Embedding width, kernel width per convolution branch, output channels per branch.
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)


In [28]:
# Load the 100-dimensional GloVe "6B" vectors, cached under DATA_ROOT/glove.
glove_vocab = Vocab.GloVe(name='6B', dim=100,
                        cache=os.path.join(DATA_ROOT, "glove"))
# Overwrite both embedding tables with the matrix returned by the helper.
# NOTE(review): the helper is called twice with identical arguments; if it is
# deterministic the matrix could be built once and copied twice -- confirm.
net.embedding.weight.data.copy_(
    d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(
    d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
# Exclude the second table from gradient updates.
net.constant_embedding.weight.requires_grad = False


There are 21202 oov words.
There are 21202 oov words.


In [29]:
lr, num_epochs = 0.001, 20
# Hand Adam only the parameters that still have requires_grad=True.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
# d2l.train comes from the local Functions module; presumably it runs the
# train/evaluate loop for num_epochs on `device` -- verify in that module.
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)


training on  cuda
epoch 1, loss 0.4688, train acc 0.770, test acc 0.847, time 14.0 sec
epoch 2, loss 0.1522, train acc 0.871, test acc 0.869, time 14.0 sec
epoch 3, loss 0.0673, train acc 0.922, test acc 0.881, time 14.0 sec
epoch 4, loss 0.0285, train acc 0.961, test acc 0.869, time 14.0 sec
epoch 5, loss 0.0109, train acc 0.984, test acc 0.882, time 14.0 sec
epoch 6, loss 0.0036, train acc 0.996, test acc 0.880, time 14.0 sec
epoch 7, loss 0.0016, train acc 0.999, test acc 0.878, time 14.0 sec
epoch 8, loss 0.0009, train acc 0.999, test acc 0.875, time 14.1 sec
epoch 9, loss 0.0005, train acc 1.000, test acc 0.878, time 14.0 sec
epoch 10, loss 0.0003, train acc 1.000, test acc 0.877, time 14.0 sec
epoch 11, loss 0.0002, train acc 1.000, test acc 0.872, time 14.1 sec
epoch 12, loss 0.0001, train acc 1.000, test acc 0.876, time 14.0 sec
epoch 13, loss 0.0001, train acc 1.000, test acc 0.874, time 14.0 sec
epoch 14, loss 0.0001, train acc 1.000, test acc 0.874, time 14.0 sec
epoch 15, l