In [40]:
import os
import torch
import torchtext
import pandas as pd

from torch import optim
from torchtext.datasets import text_classification


NGRAMS = 2

In [3]:
torchtext.datasets.AG_NEWS

<function torchtext.datasets.text_classification.AG_NEWS(*args, **kwargs)>

## 1. 数据读取
数据集需要下载，然而因为防火墙的原因无法下载Google云上的数据集，因此手动下载了数据集，并改写了数据集读取的方法，使其不需要下载

In [42]:
# 此方法需要下载数据集，事实上数据集已经下载完成，因此通过改写下面cell的方法读取数据集
# train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
#     root='./data', ngrams=NGRAMS, vocab=None)
# BATCH_SIZE = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
import logging
import torch
import io
from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import Vocab
from tqdm import tqdm


def _csv_iterator(data_path, ngrams, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            tokens = tokenizer(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)


def _create_data_from_iterator(vocab, iterator, include_unk):
    data = []
    labels = []
    with tqdm(unit_scale=0, unit='lines') as t:
        for cls, tokens in iterator:
            if include_unk:
                tokens = torch.tensor([vocab[token] for token in tokens])
            else:
                token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token]
                                        for token in tokens]))
                tokens = torch.tensor(token_ids)
            if len(tokens) == 0:
                logging.info('Row contains no tokens.')
            data.append((cls, tokens))
            labels.append(cls)
            t.update(1)
    return data, set(labels)


class TextClassificationDataset(torch.utils.data.Dataset):
    """Defines an abstract text classification datasets.
       Currently, we only support the following datasets:

             - AG_NEWS
             - SogouNews
             - DBpedia
             - YelpReviewPolarity
             - YelpReviewFull
             - YahooAnswers
             - AmazonReviewPolarity
             - AmazonReviewFull

    """

    def __init__(self, vocab, data, labels):
        """Initiate text-classification dataset.

        Arguments:
            vocab: Vocabulary object used for dataset.
            data: a list of label/tokens tuple. tokens are a tensor after
                numericalizing the string tokens. label is an integer.
                [(label1, tokens1), (label2, tokens2), (label2, tokens3)]
            label: a set of the labels.
                {label1, label2}

        Examples:
            See the examples in examples/text_classification/

        """

        super(TextClassificationDataset, self).__init__()
        self._data = data
        self._labels = labels
        self._vocab = vocab


    def __getitem__(self, i):
        return self._data[i]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        for x in self._data:
            yield x

    def get_labels(self):
        return self._labels

    def get_vocab(self):
        return self._vocab



def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, include_unk=False):
    # dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(root)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))


def AG_NEWS(*args, **kwargs):
    """ Defines AG_NEWS datasets.
        The labels includes:
            - 0 : World
            - 1 : Sports
            - 2 : Business
            - 3 : Sci/Tech

    Create supervised learning dataset: AG_NEWS

    Separately returns the training and test dataset

    Arguments:
        root: Directory where the datasets are saved. Default: ".data"
        ngrams: a contiguous sequence of n items from s string text.
            Default: 1
        vocab: Vocabulary used for dataset. If None, it will generate a new
            vocabulary based on the train data set.
        include_unk: include unknown token in the data (Default: False)

    Examples:
        >>> train_dataset, test_dataset = torchtext.datasets.AG_NEWS(ngrams=3)

    """

    return _setup_datasets(*(("AG_NEWS",) + args), **kwargs)



DATASETS = {
    'AG_NEWS': AG_NEWS,
}


LABELS = {
    'AG_NEWS': {
        0: 'World',
        1: 'Sports',
        2: 'Business',
        3: 'Sci/Tech'
    }, 
}

查看了解数据

In [13]:
# 数据读取完成
train_dataset, test_dataset = DATASETS['AG_NEWS'](
    root='./data/ag_news_csv.tar.gz', ngrams=NGRAMS, vocab=None)

120000lines [00:07, 16939.40lines/s]
120000lines [00:13, 9015.48lines/s]
7600lines [00:00, 9396.13lines/s]


## 2. 构建模型
文本嵌入模型，将文本通过词语查询表转化为索引，再通过嵌入词袋层转化为线性层并输出

In [32]:
import torch.nn as nn
import torch.nn.functional as F


class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()
        
    def init_weights(self):
        initrange= .5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [33]:
# 参数设置
BATCH_SIZE = 16    # 每轮训练的数据量
VOCAB_SIZE = len(train_dataset.get_vocab())   # 向量大小即嵌入层的大小
EMBED_DIM = 32    # 每个嵌入层大小
NUN_CLASS = len(train_dataset.get_labels())   # 类别数量
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

## 3. 生成批量处理的函数

In [35]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [34]:
from torch.utils.data import DataLoader

In [49]:
def train_func(sub_train_):
    train_loss = 0
    train_acc = 0
    
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE,shuffle=True,
                     collate_fn=generate_batch)
    for i, (text, offsets, cla) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cla = text.to(device), offsets.to(device), cla.to(device)
        output = model(text, offsets)
        
        loss = criterion(output, cla)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cla).sum().item()
    scheduler.step()
    return train_loss / len(sub_train_), train_acc / len(sub_train_)


def test(data_):
    loss = 0 
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    
    for text, offsets, cla in data:
        text, offsets, cla = text.to(device), offsets.to(device), cla.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            
            loss = criterion(output, cla)
            loss += loss.item()
            acc += (output.argmax(1) == cla).sum().item()
    return loss / len(data_), acc / len(data_)

## 4. 分割数据集并训练模型

In [None]:
import time
from torch.utils.data.dataset import random_split


N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)  # 损失函数
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)   # 优化器
# 调度器
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)


train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = random_split(
    train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)
    
    secs = int(time.time() - start_time)
    mins = secs // 60
    secs = secs % 60
    
    print('Epoch: %d' % (epoch + 1), '| time in %d minutes, %d seconds' % (mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

In [47]:
96 % 60

36

In [24]:
# 生成数据集
from torchtext.data import TabularDataset
# Field 类处理确定如何处理数据并将其转化为数字
from torchtext.data import Field

In [23]:
# 使用空白标记分割词语，并将字母小写
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

# 处理标签
LABEL = Field(sequential=False, use_vocab=False)

In [4]:
%%time
# 加载训练集和验证集数据

tv_datafields = [
    ('id', None), ('comment_text', TEXT), ('toxic', LABEL),
    ('severe_toxic', LABEL), ('obscene', LABEL), ('threat', LABEL),
    ('insult', LABEL), ('identity_hate', LABEL), 
]

trn, vld = TabularDataset.splits(
    path='./.data',
    train='train.csv', validation='valid.csv',
    format='csv', skip_header=True, fields=tv_datafields
)

In [6]:
tst_datafields = [
    ('id', None),
    ('comment_text', TEXT)
]
tst = TabularDataset(
    path='./.data/test.csv',
    format='csv',
    skip_header=True,
    fields=tst_datafields
)

In [7]:
TEXT.build_vocab(trn)

In [8]:
TEXT.vocab.freqs.most_common(10)

[('the', 78),
 ('to', 41),
 ('you', 33),
 ('of', 30),
 ('and', 26),
 ('a', 26),
 ('is', 24),
 ('that', 22),
 ('i', 20),
 ('if', 19)]

In [11]:
from torchtext.data import Iterator, BucketIterator

In [12]:
train_iter, valid_iter = BucketIterator.splits(
    (trn, vld),
    batch_sizes=(64, 64),
    device=-1,
    sort_key=lambda x: len(x.comment_text),
    sort_within_batch=False,
    repeat=False
)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [13]:
next(train_iter.__iter__())


[torchtext.data.batch.Batch of size 25]
	[.comment_text]:[torch.LongTensor of size 494x25]
	[.toxic]:[torch.LongTensor of size 25]
	[.severe_toxic]:[torch.LongTensor of size 25]
	[.obscene]:[torch.LongTensor of size 25]
	[.threat]:[torch.LongTensor of size 25]
	[.insult]:[torch.LongTensor of size 25]
	[.identity_hate]:[torch.LongTensor of size 25]

In [15]:
test_iter = Iterator(tst, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [22]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars
        
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var)
            
            if self.y_vars is not None:
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))
            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [23]:
train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
valid_dl = BatchWrapper(valid_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
test_dl = BatchWrapper(test_iter, "comment_text", None)

In [None]:
next(train_dl.__iter__())

In [25]:
# 训练模型
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [28]:
class SimpleBiLSTMB(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [29]:
em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMB(nh, emb_dim=em_sz)

In [42]:
import tqdm     # 进度条库
import numpy as np

In [48]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs = 5

In [49]:
for epoch in range(1, epochs + 1):
    running_loss = 0
    running_corrects = 0
    model.train()
    for x, y in tqdm.tqdm(train_dl):
        opt.zero_grad()
        
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        
        opt.step()
        
        running_loss += loss.item() * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = .0
    model.eval()
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item() * x.size(0)
    val_loss /= len(vld)
    print(f'Epoch: {epoch}, Training Loss: {epoch_loss:.4f}, Validatioon Loss: {val_loss:.4f}')

100%|████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.17s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 3.0838, Validatioon Loss: 2.7874


100%|████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.14s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 3.9316, Validatioon Loss: 2.1592


100%|████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.77s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 2.9060, Validatioon Loss: 2.1450


100%|████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.79s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 3.1643, Validatioon Loss: 2.2619


100%|████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.71s/it]


Epoch: 5, Training Loss: 3.1872, Validatioon Loss: 2.3949


In [43]:
test_preds = []
for x, y in tqdm.tqdm(test_dl):
    preds = preds.data.numpy()
    preds = 1 / (1 + np.exp(-preds))
    test_preds.append(preds)
test_preds = np.hstack(test_preds)

100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.34it/s]
