### The dataset for multi-label text classification was introduced in this kaggle challenge
- https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/


In [4]:
# !wget https://www.dropbox.com/s/6dygg8esbdywogl/jigsaw-toxic-comment-classification-challenge.zip

In [5]:
# !unzip jigsaw-toxic-comment-classification-challenge.zip

In [1]:
# utils
import torch
import pandas as pd

# data
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset

# model
import torch.nn as nn
import torch.nn.functional as F

# training
import torch.optim as optim
import tqdm

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

### Data Preparation

In [4]:
root = "./jigsaw-toxic-comment-classification-challenge/"

In [5]:
# create fields
TEXT = Field(lower=True, tokenize="spacy", batch_first=True)
LABEL = LabelField(batch_first=True, dtype=torch.float32)

In [6]:
# field and dataframe
train_fields = [
    ("id", None), 
    ("comment_text", TEXT), 
    ("toxic", LABEL), 
    ('severe_toxic', LABEL), 
    ('obscene', LABEL), 
    ('threat', LABEL),
    ('insult', LABEL),
    ('identity_hate', LABEL)
]

test_fields = [
    ("id", None), 
    ("comment_text", TEXT), 
]

In [7]:
# load the training dataset
data = TabularDataset(
    path="./jigsaw-toxic-comment-classification-challenge/train.csv",
    format="CSV",
    fields=train_fields,
    skip_header=True,
)

In [8]:
# split the total labelled data into train and val
train, val = data.split(split_ratio=0.98)

In [9]:
# load test data
test = TabularDataset(
    path="./jigsaw-toxic-comment-classification-challenge/test.csv",
    format="CSV",
    fields=test_fields,
    skip_header=True,
)

In [10]:
## Build the vocabulary
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [11]:
# train and val dataloader
BATCH_SIZE = 32
train_loader, val_loader = BucketIterator.splits(
    datasets=(train, val),
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=True,
    sort=False
)

In [99]:
# test dataloader -> no specific reason to exclude it from above BucketIterator
test_loader = BucketIterator(
    dataset=test,
    batch_size=1,
    device=device,
    sort=False
)


In [12]:
print(len(train_loader), len(val_loader), len(test_loader))

4887 100 4787


In [13]:
def batch_wrapper(batch):
    """it actually combines multi-label into NxC tensor"""
    x = batch.comment_text
    y = torch.stack((batch.toxic, batch.severe_toxic, batch.obscene, batch.threat, batch.insult, batch.identity_hate), dim=1)
    y = y.type(torch.float)
    return x, y
    

In [14]:
batch = next(iter(train_loader))

In [15]:
x, y =  batch_wrapper(batch)

### Model

In [59]:
class Model(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
        super(Model, self).__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)

    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[:, -1, :]
        for layer in self.linear_layers:
          feature = layer(feature)
        preds = self.predictor(feature)
        return preds

### Trianing 

In [88]:
def eval(model, data, criterion):
    losses = []
    with torch.no_grad():
        for batch in data:
            x, y = batch_wrapper(batch)
            outputs = model(x)
            loss = criterion(outputs, y.type_as(outputs))
            losses.append(loss.item())
    return sum(losses)/len(losses)

In [112]:
# config
EPOCHS = 10
LR = 1e-2
TOTAL_STEPS = len(train_loader)

In [None]:
em_sz = 100
nh = 500
nl = 3
model = Model(nh, emb_dim=em_sz).to(device)

In [90]:
# criterion and optimizer
criterion  = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

In [113]:
epoch_progress = tqdm.tqdm(total=EPOCHS, desc="EPOCH", position=0)
steps = 0

for epoch in range(EPOCHS):
    epoch_losses = []
    step_progress = tqdm.tqdm(total=len(train_loader), desc="EPOCH", position=0)
    for batch in train_loader:
        x, y = batch_wrapper(batch)
        y_ = model(x)
        
        optimizer.zero_grad()

        loss = criterion(y_, y)

        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

        if steps%1000==0:
          print(f'Step {steps}/{TOTAL_STEPS} | Train_loss {loss.item():.4f}')

        steps += 1
        step_progress.update(1)

    train_loss = sum(epoch_losses)/len(epoch_losses)
    val_loss = eval(model, val_loader, criterion)

    print(f'Epoch {epoch}/{EPOCHS} | Train_loss {train_loss:.4f} | Val_loss {val_loss:.4f}')
    
    epoch_progress.update(1)

EPOCH: 100%|██████████| 2/2 [46:47<00:00, 1403.87s/it]
EPOCH: 100%|██████████| 4887/4887 [38:20<00:00,  2.12it/s]
EPOCH:   0%|          | 3/4887 [00:00<27:21,  2.98it/s]

Step 0/4887 | Train_loss 0.1710


EPOCH:  21%|██        | 1002/4887 [01:41<07:46,  8.33it/s]

Step 1000/4887 | Train_loss 0.0914


EPOCH:  41%|████      | 2002/4887 [03:23<04:57,  9.70it/s]

Step 2000/4887 | Train_loss 0.2057


EPOCH:  61%|██████▏   | 3001/4887 [05:05<02:58, 10.56it/s]

Step 3000/4887 | Train_loss 0.0978


EPOCH:  82%|████████▏ | 4001/4887 [06:48<01:20, 10.97it/s]

Step 4000/4887 | Train_loss 0.2807


EPOCH: 100%|██████████| 4887/4887 [08:22<00:00,  9.72it/s]

Epoch 0/10 | Train_loss 0.1493 | Val_loss 0.1496



EPOCH:   2%|▏         | 115/4887 [00:12<08:35,  9.26it/s]

Step 5000/4887 | Train_loss 0.2687


EPOCH:  23%|██▎       | 1115/4887 [01:54<05:53, 10.67it/s]

Step 6000/4887 | Train_loss 0.0852


EPOCH:  43%|████▎     | 2116/4887 [03:35<04:22, 10.57it/s]

Step 7000/4887 | Train_loss 0.1250


EPOCH:  64%|██████▎   | 3115/4887 [05:17<02:58,  9.95it/s]

Step 8000/4887 | Train_loss 0.1819


EPOCH:  84%|████████▍ | 4116/4887 [06:59<01:14, 10.29it/s]

Step 9000/4887 | Train_loss 0.1071


EPOCH: 100%|██████████| 4887/4887 [08:22<00:00,  9.72it/s]

Epoch 1/10 | Train_loss 0.1490 | Val_loss 0.1433



EPOCH:   5%|▍         | 228/4887 [00:23<07:38, 10.16it/s]

Step 10000/4887 | Train_loss 0.0389


EPOCH:  25%|██▌       | 1228/4887 [02:07<06:04, 10.04it/s]

Step 11000/4887 | Train_loss 0.0607


EPOCH:  46%|████▌     | 2229/4887 [03:49<04:34,  9.67it/s]

Step 12000/4887 | Train_loss 0.1235


EPOCH:  66%|██████▌   | 3229/4887 [05:31<02:31, 10.91it/s]

Step 13000/4887 | Train_loss 0.1843


EPOCH:  87%|████████▋ | 4229/4887 [07:09<00:59, 11.05it/s]

Step 14000/4887 | Train_loss 0.0304


EPOCH: 100%|██████████| 4887/4887 [08:21<00:00,  9.75it/s]

Epoch 2/10 | Train_loss 0.1484 | Val_loss 0.1420



EPOCH:   7%|▋         | 341/4887 [00:35<08:32,  8.87it/s]

Step 15000/4887 | Train_loss 0.1693


EPOCH:  27%|██▋       | 1340/4887 [02:17<05:44, 10.29it/s]

Step 16000/4887 | Train_loss 0.1717


EPOCH:  48%|████▊     | 2341/4887 [03:57<04:02, 10.49it/s]

Step 17000/4887 | Train_loss 0.1505


EPOCH:  68%|██████▊   | 3340/4887 [05:37<02:22, 10.83it/s]

Step 18000/4887 | Train_loss 0.1034


EPOCH:  89%|████████▉ | 4340/4887 [07:18<00:49, 11.13it/s]

Step 19000/4887 | Train_loss 0.0966


EPOCH: 100%|██████████| 4887/4887 [08:18<00:00,  9.80it/s]

Epoch 3/10 | Train_loss 0.1487 | Val_loss 0.1490



EPOCH:   9%|▉         | 453/4887 [00:45<07:29,  9.87it/s]

Step 20000/4887 | Train_loss 0.1385


EPOCH:  30%|██▉       | 1453/4887 [02:28<05:19, 10.74it/s]

Step 21000/4887 | Train_loss 0.1463


EPOCH:  50%|█████     | 2454/4887 [04:09<04:19,  9.39it/s]

Step 22000/4887 | Train_loss 0.1295


EPOCH:  71%|███████   | 3453/4887 [05:52<02:32,  9.42it/s]

Step 23000/4887 | Train_loss 0.1475


EPOCH:  91%|█████████ | 4455/4887 [07:33<00:47,  9.03it/s]

Step 24000/4887 | Train_loss 0.1086


EPOCH: 100%|██████████| 4887/4887 [08:20<00:00,  9.76it/s]

Epoch 4/10 | Train_loss 0.1492 | Val_loss 0.1465



EPOCH:  12%|█▏        | 566/4887 [00:57<06:51, 10.49it/s]

Step 25000/4887 | Train_loss 0.0991


EPOCH:  32%|███▏      | 1567/4887 [02:39<06:18,  8.78it/s]

Step 26000/4887 | Train_loss 0.0869


EPOCH:  53%|█████▎    | 2567/4887 [04:21<04:35,  8.43it/s]

Step 27000/4887 | Train_loss 0.1190


EPOCH:  73%|███████▎  | 3567/4887 [06:00<01:47, 12.32it/s]

Step 28000/4887 | Train_loss 0.1721


EPOCH:  93%|█████████▎| 4568/4887 [07:43<00:40,  7.89it/s]

Step 29000/4887 | Train_loss 0.2126


EPOCH: 100%|██████████| 4887/4887 [08:19<00:00,  9.78it/s]

Epoch 5/10 | Train_loss 0.1483 | Val_loss 0.1478



EPOCH:  14%|█▍        | 681/4887 [01:09<09:22,  7.47it/s]

Step 30000/4887 | Train_loss 0.1291


EPOCH:  34%|███▍      | 1679/4887 [02:49<06:12,  8.60it/s]

Step 31000/4887 | Train_loss 0.2054


EPOCH:  55%|█████▍    | 2680/4887 [04:28<03:22, 10.90it/s]

Step 32000/4887 | Train_loss 0.1196


EPOCH:  75%|███████▌  | 3680/4887 [06:09<02:00,  9.98it/s]

Step 33000/4887 | Train_loss 0.1082


EPOCH:  96%|█████████▌| 4681/4887 [07:53<00:22,  9.15it/s]

Step 34000/4887 | Train_loss 0.1375


EPOCH: 100%|██████████| 4887/4887 [08:18<00:00,  9.81it/s]

Epoch 6/10 | Train_loss 0.1481 | Val_loss 0.1394



EPOCH:  16%|█▌        | 793/4887 [01:19<06:31, 10.45it/s]

Step 35000/4887 | Train_loss 0.1736


EPOCH:  37%|███▋      | 1793/4887 [03:00<04:39, 11.08it/s]

Step 36000/4887 | Train_loss 0.1546


EPOCH:  57%|█████▋    | 2792/4887 [04:43<03:06, 11.24it/s]

Step 37000/4887 | Train_loss 0.0839


EPOCH:  78%|███████▊  | 3792/4887 [06:24<01:57,  9.35it/s]

Step 38000/4887 | Train_loss 0.1222


EPOCH:  98%|█████████▊| 4792/4887 [08:07<00:08, 10.96it/s]

Step 39000/4887 | Train_loss 0.1999


EPOCH: 100%|██████████| 4887/4887 [08:20<00:00,  9.77it/s]

Epoch 7/10 | Train_loss 0.1474 | Val_loss 0.1420



EPOCH:  19%|█▊        | 907/4887 [01:32<06:35, 10.07it/s]

Step 40000/4887 | Train_loss 0.1321


EPOCH:  39%|███▉      | 1906/4887 [03:13<04:41, 10.59it/s]

Step 41000/4887 | Train_loss 0.0848


EPOCH:  59%|█████▉    | 2907/4887 [04:55<03:13, 10.24it/s]

Step 42000/4887 | Train_loss 0.0788


EPOCH:  80%|███████▉  | 3905/4887 [06:36<01:40,  9.72it/s]

Step 43000/4887 | Train_loss 0.1844


EPOCH: 100%|██████████| 4887/4887 [08:18<00:00,  9.81it/s]

Epoch 8/10 | Train_loss 0.1472 | Val_loss 0.1385



EPOCH:   0%|          | 19/4887 [00:02<11:13,  7.23it/s]

Step 44000/4887 | Train_loss 0.1548


EPOCH:  21%|██        | 1019/4887 [01:46<05:53, 10.94it/s]

Step 45000/4887 | Train_loss 0.0301


EPOCH:  41%|████▏     | 2019/4887 [03:27<05:40,  8.42it/s]

Step 46000/4887 | Train_loss 0.2728


EPOCH:  62%|██████▏   | 3018/4887 [05:09<02:36, 11.96it/s]

Step 47000/4887 | Train_loss 0.0855


EPOCH:  82%|████████▏ | 4019/4887 [06:50<01:28,  9.76it/s]

Step 48000/4887 | Train_loss 0.1897


EPOCH: 100%|██████████| 10/10 [1:23:22<00:00, 499.66s/it]

Epoch 9/10 | Train_loss 0.1478 | Val_loss 0.1443
