In [1]:
import os
import sys
from importlib import reload

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

import torch
from torch.utils import data as D

# Local imports
sys.path.append('../src')
import dataset
import trainer
import models
import utils

# Transformers
import transformers
from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule


os.environ['CUDA_VISIBLE_DEVICES'] = '0'
utils.seed_everything()

## Datasets

In [64]:
reload(dataset)

<module 'dataset' from '../src/dataset.py'>

In [65]:
%%time
# valid = dataset.Dataset('../input/validatio_debug_32.npz')
valid = dataset.Dataset('../input/validation.npz')
valid.x.shape, valid.y.shape

CPU times: user 141 ms, sys: 47.9 ms, total: 189 ms
Wall time: 253 ms


((8000, 512), (8000,))

In [66]:
%%time
# test = dataset.Dataset('../input/test_debug_32.npz')
test = dataset.Dataset('../input/test.npz')
test.x.shape, test.y.shape

CPU times: user 706 ms, sys: 230 ms, total: 936 ms
Wall time: 1.02 s


((63812, 512), (63812,))

In [67]:
%%time
# train = dataset.Dataset('../input/jigsaw-toxic-comment-trai_debug_32.npz')
train = dataset.Dataset('../input/jigsaw-toxic-comment-train.npz')
train.x.shape, train.y.shape

CPU times: user 1.83 s, sys: 691 ms, total: 2.52 s
Wall time: 2.65 s


((223549, 512), (223549,))

## Data loaders

In [68]:
batch_size = 2
num_workers = 4

loader_train = D.DataLoader(train, 
                            sampler=train.weighted_sampler(), 
                            batch_size=batch_size, num_workers=num_workers)
loader_valid = D.DataLoader(valid, 
                            batch_size=batch_size, num_workers=num_workers)
loader_test = D.DataLoader(test, 
                           batch_size=batch_size, num_workers=num_workers)

In [69]:
len(loader_train), len(loader_valid), len(loader_test)

(21384, 4000, 31906)

## Model

In [13]:
reload(models)

<module 'models' from '../src/models.py'>

In [14]:
backbone = XLMRobertaModel(XLMRobertaConfig.from_pretrained('xlm-roberta-large'))

We can turn of regularization to make debugging easier

In [70]:
# to reload the module and not overload gpu
del model

In [38]:
# model = models.Model(backbone, mix=False, dropout=0)

In [71]:
model = models.Model(backbone, mix=True, dropout=0.25)

## Trainer

In [72]:
reload(trainer)

<module 'trainer' from '../src/trainer.py'>

In [73]:
# we may optimize only head (with encoder pretrained)
optimizer = AdamW(model.head.parameters(), lr=1e-4)
# scheduler = get_linear_schedule_with_warmup()

In [74]:
trnr = trainer.Trainer('base', model, 
                       loader_train, loader_valid, loader_test,
                       epochs=5,
                       monitor='val_loss',
                       optimizer=optimizer,
                      )

Sanity check for output

In [8]:
#x, y, am = next(iter(loader_train))

In [21]:
#out, loss = trnr(x, y, am)

In [22]:
#out

tensor([[1.0928, 1.2950],
        [0.8992, 1.2395],
        [1.1048, 1.3800],
        [1.1402, 1.3978]], device='cuda:0', grad_fn=<AddmmBackward>)

In [23]:
#loss

tensor(0.6357, device='cuda:0', grad_fn=<MeanBackward0>)

## Training

In [None]:
trnr.fit()

ep. 0000 (lr 1.00e-04):   1%|          | 177/21384 [00:40<1:20:06,  4.41it/s, loss=0.726, acc=0.525]

## Prediction

In [15]:
pred, loss, acc = trnr.test()

test: 100%|##########| 16/16 [00:01<00:00, 12.31it/s]


In [16]:
acc

1.0