In [2]:
import os
import sys
from importlib import reload

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

import torch
from torch.utils import data as D

# Local imports
sys.path.append('../src')
import dataset
import trainer
import models
import utils
import preprocessing

# Transformers
import transformers
from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

# Setup device
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed 
utils.seed_everything()

print('use', device)

[nltk_data] Downloading package punkt to
[nltk_data]     /gpfs/hpc/home/papkov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
use cuda


## Datasets

In [3]:
reload(dataset)

<module 'dataset' from '../src/dataset.py'>

In [4]:
debug = False
use_features = True

In [5]:
%%time
if debug:
    valid = dataset.Dataset('../input/validatio_debug_32.npz')
else:
    valid = dataset.Dataset('../input/validation.npz', use_features=use_features)
valid.x.shape, valid.y.shape

CPU times: user 0 ns, sys: 147 ms, total: 147 ms
Wall time: 345 ms


((8000, 3072), (8000,))

In [6]:
%%time
if debug:
    test = dataset.Dataset('../input/test_debug_32.npz')
else:
    test = dataset.Dataset('../input/test.npz', use_features=use_features)
test.x.shape, test.y.shape

CPU times: user 236 ms, sys: 990 ms, total: 1.23 s
Wall time: 1.53 s


((63812, 3072), (63812,))

In [7]:
%%time
if debug:
    train = dataset.Dataset('../input/jigsaw-toxic-comment-trai_debug_32.npz')
else:
    train = dataset.Dataset('../input/jigsaw-toxic-comment-train.npz', use_features=use_features)
train.x.shape, train.y.shape

CPU times: user 910 ms, sys: 1.47 s, total: 2.38 s
Wall time: 3.02 s


((223549, 3072), (223549,))

## Model

In [8]:
reload(models)

<module 'models' from '../src/models.py'>

In [4]:
backbone = XLMRobertaModel(XLMRobertaConfig.from_pretrained('xlm-roberta-large'))

In [150]:
# to reload the module and not overload gpu
del model

In [11]:
model = models.Model(backbone, mix=True, dropout=0.25)

In [9]:
model = models.SimplePoolingHead(mix=False, dropout=0.5)
# model = models.TransformersPoolingHead(mix=False, dropout=0.5)

## Feature extraction

In [5]:
reload(preprocessing)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to
[nltk_data]     /gpfs/hpc/home/papkov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<module 'preprocessing' from '../src/preprocessing.py'>

In [6]:
preprocessing.extract_roberta_features_to_file('../input/validation.npz', backbone=backbone, device=device)

feature extraction: 100%|##########| 63/63 [04:36<00:00,  4.38s/it]


In [14]:
preprocessing.extract_roberta_features_to_file('../input/test.npz', backbone=backbone, device=device)

feature extraction: 100%|##########| 499/499 [36:46<00:00,  4.42s/it]


In [None]:
preprocessing.extract_roberta_features_to_file('../input/jigsaw-toxic-comment-train.npz', backbone=backbone, device=device, batch_size=128)

feature extraction:  28%|##8       | 490/1747 [36:10<1:32:47,  4.43s/it]

## Data loaders

In [22]:
batch_size = 64
num_workers = 8

loader_train = D.DataLoader(train, 
                            sampler=train.weighted_sampler(), 
#                             shuffle=True,
                            batch_size=batch_size, num_workers=num_workers)
loader_valid = D.DataLoader(valid, 
                            batch_size=batch_size, num_workers=num_workers)
loader_test = D.DataLoader(test, 
                           batch_size=batch_size, num_workers=num_workers)

In [11]:
len(loader_train), len(loader_valid), len(loader_test)

(3493, 125, 998)

## Trainer

In [18]:
reload(trainer)

<module 'trainer' from '../src/trainer.py'>

In [19]:
# we may optimize only head (with encoder pretrained)
# optimizer = AdamW(model.head.parameters(), lr=1e-4)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-6)
criterion = trainer.FocalLoss(gamma=2)
scheduler = get_linear_schedule_with_warmup(optimizer, 5, 5)

In [23]:
trnr = trainer.Trainer('head', model, 
                       loader_train, loader_valid, loader_test,
                       epochs=10,
                       monitor='val_loss',
                       optimizer=optimizer,
#                        criterion=criterion,
#                        scheduler=scheduler,
                      )

Sanity check for output

In [34]:
x, y, am = next(iter(loader_train))

In [21]:
#out, loss = trnr(x, y, am)

In [22]:
#out

tensor([[1.0928, 1.2950],
        [0.8992, 1.2395],
        [1.1048, 1.3800],
        [1.1402, 1.3978]], device='cuda:0', grad_fn=<AddmmBackward>)

In [23]:
#loss

tensor(0.6357, device='cuda:0', grad_fn=<MeanBackward0>)

## Training

In [24]:
trnr.fit()

ep. 0000 (lr 1.00e-04): 100%|##########| 669/669 [00:05<00:00, 132.90it/s, loss=0.64, acc=0.64]  
valid: 100%|##########| 125/125 [00:00<00:00, 232.76it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [00:37<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 0 complete. loss=0.6002, val_loss=0.7908, val_acc=0.3807, val_auc=0.5613
Saved model to ../checkpoints//head_last.pth


ep. 0001 (lr 9.76e-05): 100%|##########| 669/669 [00:05<00:00, 133.50it/s, loss=0.636, acc=0.644]
valid: 100%|##########| 125/125 [00:00<00:00, 175.08it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [00:45<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 1 complete. loss=0.6212, val_loss=0.6470, val_acc=0.6541, val_auc=0.5606
Saved model to ../checkpoints//head.pth


ep. 0002 (lr 9.05e-05): 100%|##########| 669/669 [00:05<00:00, 120.23it/s, loss=0.63, acc=0.648] 
valid: 100%|##########| 125/125 [00:00<00:00, 186.84it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [00:53<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 2 complete. loss=0.6243, val_loss=0.6557, val_acc=0.6396, val_auc=0.5638
Saved model to ../checkpoints//head_last.pth


ep. 0003 (lr 7.94e-05): 100%|##########| 669/669 [00:04<00:00, 137.44it/s, loss=0.628, acc=0.651]
valid: 100%|##########| 125/125 [00:00<00:00, 230.02it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [01:00<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 3 complete. loss=0.4341, val_loss=0.6606, val_acc=0.6295, val_auc=0.5663
Saved model to ../checkpoints//head_last.pth


ep. 0004 (lr 6.55e-05): 100%|##########| 669/669 [00:05<00:00, 122.36it/s, loss=0.634, acc=0.646]
valid: 100%|##########| 125/125 [00:00<00:00, 212.83it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [01:07<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 4 complete. loss=0.5589, val_loss=0.7334, val_acc=0.4850, val_auc=0.5658
Saved model to ../checkpoints//head_last.pth


ep. 0005 (lr 5.00e-05): 100%|##########| 669/669 [00:05<00:00, 126.22it/s, loss=0.626, acc=0.652]
valid: 100%|##########| 125/125 [00:00<00:00, 225.68it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [01:15<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 5 complete. loss=0.7629, val_loss=0.6771, val_acc=0.6008, val_auc=0.5653
Saved model to ../checkpoints//head_last.pth


ep. 0006 (lr 3.45e-05): 100%|##########| 669/669 [00:05<00:00, 118.32it/s, loss=0.625, acc=0.653]
valid: 100%|##########| 125/125 [00:00<00:00, 176.28it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [01:23<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 6 complete. loss=0.6498, val_loss=0.6605, val_acc=0.6320, val_auc=0.5664
Saved model to ../checkpoints//head_last.pth


ep. 0007 (lr 2.06e-05): 100%|##########| 669/669 [00:04<00:00, 137.56it/s, loss=0.621, acc=0.658]
valid: 100%|##########| 125/125 [00:00<00:00, 214.33it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [01:30<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 7 complete. loss=0.6559, val_loss=0.5858, val_acc=0.7488, val_auc=0.5660
Saved model to ../checkpoints//head.pth


ep. 0008 (lr 9.55e-06): 100%|##########| 669/669 [00:05<00:00, 131.19it/s, loss=0.622, acc=0.659]
valid: 100%|##########| 125/125 [00:00<00:00, 193.00it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [01:38<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 8 complete. loss=0.4891, val_loss=0.6460, val_acc=0.6591, val_auc=0.5654
Saved model to ../checkpoints//head_last.pth


ep. 0009 (lr 2.45e-06): 100%|##########| 669/669 [00:05<00:00, 122.64it/s, loss=0.619, acc=0.661]
valid: 100%|##########| 125/125 [00:00<00:00, 221.04it/s]
ep. 0002 (lr 9.05e-05):  58%|#####7    | 2015/3493 [01:46<00:12, 118.20it/s, loss=0.297, acc=0.904]

Epoch 9 complete. loss=0.5682, val_loss=0.6390, val_acc=0.6744, val_auc=0.5654
Saved model to ../checkpoints//head_last.pth


## Prediction

In [148]:
pred, loss, acc, auc = trnr.validate()

valid: 100%|##########| 125/125 [00:00<00:00, 217.28it/s]


In [19]:
pred, loss, acc = trnr.test()

test: 100%|##########| 31906/31906 [38:58<00:00, 13.65it/s]


In [22]:
loss

2.320265071657932