In [1]:
import torch
import torchtext
import pandas as pd

from torchtext.datasets import text_classification
NGRAMS = 2
import os

In [5]:
torchtext.datasets.AG_NEWS

<function torchtext.datasets.text_classification.AG_NEWS(*args, **kwargs)>

In [None]:
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](root='./.data', ngrams=NGRAMS, vocab=None,)
BATCH_SIZE = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Field 类处理确定如何处理数据并将其转化为数字
from torchtext.data import Field

# 使用空白标记分割词语，并将字母小写
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

# 处理标签
LABEL = Field(sequential=False, use_vocab=False)

In [3]:
# 生成数据集
from torchtext.data import TabularDataset

In [18]:
pd.read_csv('./.data/train.csv').head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [4]:
# 加载训练集和验证集数据
# %%time
tv_datafields = [
    ('id', None), ('comment_text', TEXT), ('toxic', LABEL),
    ('severe_toxic', LABEL), ('obscene', LABEL), ('threat', LABEL),
    ('insult', LABEL), ('identity_hate', LABEL), 
]

trn, vld = TabularDataset.splits(
    path='./.data',
    train='train.csv', validation='valid.csv',
    format='csv', skip_header=True, fields=tv_datafields
)

In [6]:
tst_datafields = [
    ('id', None),
    ('comment_text', TEXT)
]
tst = TabularDataset(
    path='./.data/test.csv',
    format='csv',
    skip_header=True,
    fields=tst_datafields
)

In [7]:
TEXT.build_vocab(trn)

In [8]:
TEXT.vocab.freqs.most_common(10)

[('the', 78),
 ('to', 41),
 ('you', 33),
 ('of', 30),
 ('and', 26),
 ('a', 26),
 ('is', 24),
 ('that', 22),
 ('i', 20),
 ('if', 19)]

In [11]:
from torchtext.data import Iterator, BucketIterator

In [12]:
train_iter, valid_iter = BucketIterator.splits(
    (trn, vld),
    batch_sizes=(64, 64),
    device=-1,
    sort_key=lambda x: len(x.comment_text),
    sort_within_batch=False,
    repeat=False
)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [13]:
next(train_iter.__iter__())


[torchtext.data.batch.Batch of size 25]
	[.comment_text]:[torch.LongTensor of size 494x25]
	[.toxic]:[torch.LongTensor of size 25]
	[.severe_toxic]:[torch.LongTensor of size 25]
	[.obscene]:[torch.LongTensor of size 25]
	[.threat]:[torch.LongTensor of size 25]
	[.insult]:[torch.LongTensor of size 25]
	[.identity_hate]:[torch.LongTensor of size 25]

In [15]:
test_iter = Iterator(tst, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [22]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars
        
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var)
            
            if self.y_vars is not None:
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))
            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [23]:
train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
valid_dl = BatchWrapper(valid_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
test_dl = BatchWrapper(test_iter, "comment_text", None)

In [None]:
next(train_dl.__iter__())

In [25]:
# 训练模型
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [28]:
class SimpleBiLSTMB(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [29]:
em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMB(nh, emb_dim=em_sz)

In [42]:
import tqdm     # 进度条库
import numpy as np

In [48]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs = 5

In [49]:
for epoch in range(1, epochs + 1):
    running_loss = 0
    running_corrects = 0
    model.train()
    for x, y in tqdm.tqdm(train_dl):
        opt.zero_grad()
        
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        
        opt.step()
        
        running_loss += loss.item() * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = .0
    model.eval()
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item() * x.size(0)
    val_loss /= len(vld)
    print(f'Epoch: {epoch}, Training Loss: {epoch_loss:.4f}, Validatioon Loss: {val_loss:.4f}')

100%|████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.17s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 3.0838, Validatioon Loss: 2.7874


100%|████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.14s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 3.9316, Validatioon Loss: 2.1592


100%|████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.77s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 2.9060, Validatioon Loss: 2.1450


100%|████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.79s/it]
  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 3.1643, Validatioon Loss: 2.2619


100%|████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.71s/it]


Epoch: 5, Training Loss: 3.1872, Validatioon Loss: 2.3949


In [43]:
test_preds = []
for x, y in tqdm.tqdm(test_dl):
    preds = preds.data.numpy()
    preds = 1 / (1 + np.exp(-preds))
    test_preds.append(preds)
test_preds = np.hstack(test_preds)

100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.34it/s]
