## Explore the data

In [2]:
%%capture
import concurrent.futures
from functools import partial
import hashlib
from pathlib import Path
import pickle
import string
import time
from typing import Callable, List, Union, Tuple

import numpy as np
import pandas as pd
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from tqdm import tqdm_notebook

from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
import torch.optim as optim

%load_ext autoreload
%autoreload 2

In [3]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [4]:
project_dirname = Path('../').resolve()
raw_data_dirname = project_dirname / 'data/raw'
data_files = list(raw_data_dirname.iterdir())

### Read Data

In [5]:
def compute_df_sha(df: pd.DataFrame) -> str:
    id_str = ''.join(map(str, df.id))
    id_bstr = id_str.encode()
    sha = hashlib.sha256(id_bstr)
    return sha.hexdigest()

    
class RawDataset:
    def __init__(self, project_dirname:Path, subsample: int = None) -> None:
        self.subsample = subsample
        self.project_dirname = project_dirname
        self.data_dirname = self.project_dirname / 'data'
        self.raw_data_dirname = self.data_dirname / 'raw'
        self.train_filename = self.raw_data_dirname / 'train.csv'
        self.test_filename = self.raw_data_dirname / 'test.csv'
        
        assert self.train_filename.exists()
        assert self.test_filename.exists()

        self.train_df = pd.read_csv(self.train_filename)
        self.test_df = pd.read_csv(self.test_filename)
        
        if self.subsample != 0:
            self.train_df = self.train_df.iloc[0: self.subsample]
        self.identifier = compute_df_sha(self.train_df)

class ProcessedDataset:
    toxicity_subtypes = ['severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']
    identity_attributes = [
        'male', 'female', 'transgender', 'other_gender',
        'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation',
        'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion',
        'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
        'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
        'other_disability'
    ]
    _binarize_columns = ['target'] + toxicity_subtypes
    binary_columns = [f'b_{name}' for name in _binarize_columns]
    
    def __init__(self, raw_dataset: RawDataset, overwrite: bool = False) -> None:
        self.identifier = raw_dataset.identifier
        self.project_dirname = raw_dataset.project_dirname
        self.data_dirname = raw_dataset.data_dirname
        self.cache_path = self.data_dirname / f'{self.identifier}.pklb'
        
        if self.cache_path.exists() and not overwrite:
            print(f'Loading processed dataset from {self.cache_path}')
            self.load()
        else:
            self.train_df = raw_dataset.train_df
            self.test_df = raw_dataset.test_df
            self.featurize()
            self.save()

    def featurize(self) -> None:
        print('Featurizing....')
        self._prepare_df_labels(self.train_df)
        self._prepare_features(self.train_df)
        self._prepare_features(self.test_df)
        print('Done.')
            
    def _prepare_df_labels(self, df: pd.DataFrame) -> None:
        for column, new_column in zip(self._binarize_columns, self.binary_columns):
            df[new_column] = df[column].apply(self._binarize_label)

    def _prepare_features(self, df: pd.DataFrame) -> None:
        df['comment_words'] = df.comment_text.apply(
                lambda x: [w.text for w in tokenizer(x)]
            )

        
    def _binarize_label(self, target: float) -> int:
        """According to competition rules, target values >= 0.5 are considered the positive class."""
        return int(target >= 0.5)
    
    def save(self) -> None:
        cache_data = {
            'train_df': self.train_df,
            'test_df': self.test_df
        }
        with open(self.cache_path, 'wb') as fw:
            pickle.dump(cache_data, fw)
        
    def load(self) -> None:
        with open(self.cache_path, 'rb') as fo:
            cache_data = pickle.load(fo)
        self.train_df = cache_data['train_df']
        self.test_df = cache_data['test_df']
        
    

def split_df(df: pd.DataFrame, frac: float = 0.1) -> Tuple[pd.DataFrame]:
    n_val = int(np.ceil(df.shape[0]*frac))
    df = df.sample(frac=1.)
    val_df = df.iloc[0: n_val]
    train_df = df.iloc[n_val:]
    assert val_df.shape[0] + train_df.shape[0] == df.shape[0]
    return train_df, val_df
    
class TrainableDataset:
    def __init__(self, processed_dataset: ProcessedDataset) -> None:
        self.trainval_df = processed_dataset.train_df
        self.train_df, self.val_df = split_df(self.trainval_df, frac=0.1)
        self.test_df = processed_dataset.test_df
        
        self.n_train = self.train_df.shape[0]
        self.n_val = self.val_df.shape[0]
        self.n_test = self.test_df.shape[0]
        
    def __repr__(self) -> str:
        return f'Train samples: {self.n_train}, Val samples: {self.n_val}, Test samples: {self.n_test}'

### Vocab

In [6]:
class Vocabulary:
    PAD = "<PAD>"
    BOS = "<BOS>"
    EOS = "<EOS>"
    BOT = "<BOT>"
    EOT = "<EOT>"
    UNK = "<UNK>"
    
    specials = [PAD, BOS, EOS, BOT, EOT, UNK]
    vocab = []
    token_to_int = dict()
    int_to_token = dict()
    
    def __init__(self, tokens: list = []) -> None:
        [self.add_token(t) for t in self.specials]
        [self.add_token(t) for t in tokens]
        self.UNK_IDX = self.vocab.index(self.UNK)
            
    def add_token(self, token) -> None:
        if token in self.vocab:
            return
        else:
            idx = len(self.vocab)
            self.token_to_int[token] = idx
            self.int_to_token[idx] = token
            self.vocab.append(token)
            
    def __getitem__(self, token: str) -> int:
        return self.token_to_int.get(token, self.UNK_IDX)
    
    def __contains__(self, token: str) -> bool:
        return token in self.vocab
    
    def get(self, x: Union[str, int], reverse: bool = False) -> int:
        if reverse:
            return self.int_to_token.get(x, self.UNK)
        else:
            return self.token_to_int.get(x, self.UNK_IDX)
        
    def __len__(self) -> int:
        return len(self.token_to_int)
    
    @property
    def size(self) -> int:
        return len(self)
    
    
class VocabEncoder:
    def __init__(self, vocab: Vocabulary) -> None:
        self.vocab = vocab
        
    def _encode_token(self, token: str) -> int:
        encoded = []
        if Vocabulary.BOT in self.vocab:
            encoded.append(self.vocab[self.vocab.BOT])
        encoded += [self.vocab[c] for c in token]
        if Vocabulary.EOT in self.vocab:
            encoded.append(self.vocab[self.vocab.EOT])
        return encoded
        
    def encode(self, seq: List[str]):
        encoded = []
        if Vocabulary.BOS in self.vocab:
            encoded.append(self.vocab[self.vocab.BOS])
        for token in seq:
            encoded += self._encode_token(token)
        if Vocabulary.EOS in self.vocab:
            encoded.append(self.vocab[self.vocab.EOS])
        return encoded
    
    def decode(self, seq: List[int]) -> List[str]:
        raise NotImplementedError

### Datasets and DataLoader

In [59]:
class Transformer:
    def __init__(self, X_cols: Union[list, str], Y_cols: Union[list, str] = None) -> None:
        self.X_cols = X_cols
        self.Y_cols = Y_cols
        self.vocab = Vocabulary(string.printable)
        self.encoder = VocabEncoder(self.vocab)
        assert 'a' in self.vocab

    def _column_selector(self, df: pd.DataFrame, columns: Union[list, str]) -> pd.Series:
        return df[columns]
    
    def _vocab_encoder(self, seq: List[str]) -> pd.Series:
        return encode_sequence(seq, self.vocab)

    def __call__(self, sample: pd.Series) -> Tuple[torch.Tensor, torch.Tensor]:
        X = self._column_selector(sample, self.X_cols)
        X = self.encoder.encode(X)
        X = torch.LongTensor(X)
        if self.Y_cols is not None:
            Y = self._column_selector(sample, self.Y_cols)
            Y = torch.Tensor([Y])
            return X, Y
        else:
            return (X, )


class BatchSampler(torch.utils.data.Sampler):
    def __init__(self, dataset: torch.utils.data.Dataset, batch_size: int, shuffle=False) -> None:
        self.N = len(dataset)
        self.batch_size = batch_size
        self.shuffle = shuffle
        
    def _index_sampler(self) -> None:
        if self.shuffle:
            self.idxs = torch.randperm(self.N).tolist()
        else:
            self.idxs = torch.arange(self.N).tolist()
        
    def __iter__(self) -> List[int]:
        self._index_sampler()
        for i in range(0, self.N, self.batch_size):
            yield self.idxs[i: i+self.batch_size]


class CommentDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame, transform: Callable) -> None:
        self.df = df
        self.transform = transform
        
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        sample = self.df.iloc[idx]
        return self.transform(sample)
    
    def __len__(self) -> int:
        return self.df.shape[0]


def collate(batch: List[tuple]) -> Tuple[torch.Tensor]:
    batch = tuple(zip(*batch))
    X = torch.nn.utils.rnn.pad_sequence(batch[0], batch_first=True, padding_value=0)
    if len(batch)==2:
        Y = torch.stack(batch[1])
        return X, Y
    return (X,)

In [60]:
class DataLoaderBunch:
    def __init__(self, train_dl: torch.utils.data.DataLoader, val_dl: torch.utils.data.DataLoader, test_dl: torch.utils.data.DataLoader = None, c = None) -> None:
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.test_dl = test_dl
        self.c = c
    
    @property
    def train_ds(self) -> torch.utils.data.Dataset:
        return self.train_dl.dataset
    
    @property
    def val_ds(self) -> torch.utils.data.Dataset:
        return self.val_dl.dataset
    
    @property
    def test_ds(self) -> torch.utils.data.Dataset:
        if self.test_dl is not None:
            return self.test_dl.dataset
        else:
            raise ValueError('No test dataloader available.')

### Model

In [9]:
class SimpleModel(nn.Module):
    def __init__(self, model_params: dict) -> None:
        super(SimpleModel, self).__init__()
        self.embedding_layer = nn.Embedding(
            model_params['vocab_size'],
            model_params['embedding_size'],
            padding_idx=model_params['padding_idx'],
        )
        self.conv_layer = nn.Conv1d(
            model_params['embedding_size'],
            model_params['d_model'],
            model_params['width'],
            
        )
        self.conv_pool_layer = nn.MaxPool1d(
            model_params['width'],
            stride=model_params['width'] - 1
        )
        self.transformer_layer = nn.TransformerEncoderLayer(
            model_params['d_model'],
            model_params['nhead'],
            model_params['d_model']
        )
        self.transformer_pool_layer = nn.AdaptiveMaxPool1d(1)
        self.logit_layer = nn.Linear(model_params['d_model'], 1)
        self.conv_act = nn.Tanh()
        
    def forward(self, token_ids) -> torch.Tensor:
        emb = self.embedding_layer(token_ids)
        emb_channels_first = torch.transpose(emb, 1, 2)
        conv = self.conv_layer(emb_channels_first)
        conv_pool = self.conv_pool_layer(conv)
        conv_pool = torch.transpose(conv_pool, 1, 2)
        conv_pool = self.conv_act(conv_pool)
        encoded = self.transformer_layer(conv_pool)
        encoded = torch.transpose(encoded, 1, 2)
        encoded_pool = self.transformer_pool_layer(encoded)
        encoded_pool = encoded_pool[..., 0]
        logit = self.logit_layer(encoded_pool)
        return logit

### Learner

In [10]:
def accuracy_with_logits(y_hat, y_true):
    pred = y_hat >= 0
    truth = y_true >= 0.5
    acc = (pred==truth).float().mean()
    return acc

    
class Learner:
    def __init__(self, model: nn.Module, optimizer: optim.Optimizer, loss_fn: nn.modules.loss, data: DataLoaderBunch) -> None:
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.device = list(model.parameters())[0].device
        self.data = data
    
class Tester(Learner):
    def __init__(self, learner: nn.Module, data: DataLoaderBunch) -> None:
        self.learner = learner
        self.data = data
        self.recorder = Recorder()
        self.callbacks = CallbackHandler([self.recorder])
        self.callbacks.learner = learner
        
    def evaluate(self) -> None:
        self.callbacks.begin_validate()
        with torch.no_grad():
            with self._pbar(self.data.test_dl) as pbar:
                for batch in self.data.test_dl:
                    X, Y = self._convert_batch(*batch)
                    y_hat = self.learner.model(X)
                    batch_size = y_hat.shape[0]
                    cost = self.learner.loss_fn(y_hat, Y) * batch_size
        return 

### Callbacks

#### Base classes

In [82]:
def random_string(length: int) -> str:
    return ''.join(map(str, np.random.choice(list(string.ascii_lowercase+string.digits), length)))

class Callback:
    def set_run(self, cb) -> bool:
        self.run = cb
        return True

    def on_fit_begin(self, learner: Learner) -> bool:
        self.learner = learner
        return True
    
    def on_fit_end(self) -> bool:
        return True
    
    def on_epoch_begin(self, epoch: int) -> bool:
        self.epoch = epoch
        return True
    
    def on_epoch_end(self) -> bool:
        return True
    
    def begin_validate(self) -> bool:
        return True
    
    def end_validate(self) -> bool:
        return True
    
    def begin_test(self) -> bool:
        return True
    
    def end_test(self) -> bool:
        return True
    
    def on_batch_begin(self, batch: Tuple[torch.Tensor]) -> bool:
        return True
    
    def on_loss_begin(self, y_hat: torch.Tensor) -> bool:
        return True
    
    def on_loss_end(self, loss: torch.Tensor) -> bool:
        return True
    
    def on_backward_begin(self) -> bool:
        return True
    
    def on_backward_end(self) -> bool:
        return True
    
    def on_step_begin(self) -> bool:
        return True
    
    def on_step_end(self) -> bool:
        return True
    
    def on_batch_end(self) -> bool:
        return True
    
    def do_stop(self) -> bool:
        return True
    
    @property
    def name(self) -> str:
        return self.__class__.__name__.lower()


class CallbackHandler:
    def __init__(self, callbacks: List[Callback] = [], id: str = None) -> None:
        self.identifier = id if id else random_string(12)
        self.callbacks = []
        [self.add(cb) for cb in callbacks]
        
    def __call__(self, fn_name: str, *args, **kwargs) -> bool:
        res = True
        for cb in sorted(self.callbacks, key=lambda x: x._order):
            fn = getattr(cb, fn_name, None)
            res = res and fn(*args, **kwargs)
        return res
        
    def add(self, cb: Callback) -> None:
        if cb in self.callbacks:
            return
        else:
            setattr(self, cb.name, cb)
            self.callbacks.append(cb)
            
    def set_run(self) -> bool:
        print(f'Beginning run {self.identifier}')
        return self('set_run', self)

    def on_fit_begin(self, learner: Learner) -> bool:
        self.learner = learner
        learner.stop = False
        return self('on_fit_begin', learner)
    
    def on_fit_end(self) -> bool:
        res = not self.in_train
        return self('on_fit_end')
        
    
    def on_epoch_begin(self, epoch: int) -> bool:
        self.learner.model.train()
        return self('on_epoch_begin', epoch)
    
    def on_epoch_end(self) -> bool:
        return self('on_epoch_end')
        
    def begin_validate(self) -> bool:
        self.learner.model.eval()
        return self('begin_validate')
    
    def end_validate(self) -> bool:
        self.learner.model.train()
        return self('end_validate')
    
    def begin_test(self) -> bool:
        self.learner.model.eval()
        return self('begin_test')
    
    def end_test(self) -> bool:
        self.learner.model.train()
        return self('end_test')
    
    def on_batch_begin(self, batch: Tuple[torch.Tensor]) -> bool:
        return self('on_batch_begin', batch)
    
    def on_loss_begin(self, y_hat: torch.Tensor) -> bool:
        return self('on_loss_begin', y_hat)
    
    def on_loss_end(self, loss: torch.Tensor) -> bool:
        return self('on_loss_end', loss)
    
    def on_backward_begin(self) -> bool:
        return self('on_backward_begin')
    
    def on_backward_end(self) -> bool:
        return self('on_backward_end')
    
    def on_step_begin(self) -> bool:
        return self('on_step_begin')
    
    def on_step_end(self) -> bool:
        return self('on_step_end')
    
    def on_batch_end(self) -> bool:
        return self('on_batch_end')
    
    def do_stop(self) -> bool:
        self('do_stop')
        return self.learner.stop
    

#### Useful classes

In [103]:
class Recorder(Callback):
    _order = 0
    def __init__(self) -> None:
        self.records = dict(
            loss=[], val_loss=[], 
            acc=[], val_acc=[],
            correct=[], val_correct=[],
            train_time=[], val_time=[],
            train_samples=[], val_samples=[],
        )
        self.test_records = dict(loss=None, val_loss=None, test_time=None, test_samples=None)

    def on_epoch_begin(self, epoch: int) -> bool:
        super(Recorder, self).on_epoch_begin(epoch)
        self._reset_state()
        return True
        
    def on_batch_begin(self, batch) -> bool:
        if len(batch) > 1:
            self.Y = batch[1].cpu().numpy()
            self.y_true = self.Y >= 0.5
        self.batch_size = batch[0].shape[0]
        return True
    
    def on_loss_begin(self, y_hat) -> bool:
        self.y_hat = y_hat.detach().cpu().numpy()
        self.y_pred = self.y_hat >= 0.
        return True
    
    def on_loss_end(self, loss) -> bool:
        self.total_correct += (self.y_pred==self.y_true).sum()
        self.total_loss += loss.item() * self.batch_size
        self.total_samples += self.batch_size
        return True
    
    def on_epoch_end(self) -> bool:
        super(Recorder, self).on_epoch_end()
        self._log_metrics()
        return True
    
    def begin_validate(self) -> bool:
        super(Recorder, self).begin_validate()
        self._reset_state()
        return True
    
    def end_validate(self) -> bool:
        self._log_metrics()
        print(f'{self._display_latest_metrics()}')
        return True
    
    def end_test(self) -> bool:
        self._log_metrics()
        return True
    
    def _log_metrics(self) -> None:
        elapsed_time = time.time() - self.time_start
        self.total_loss /= self.total_samples
        self.total_acc = self.total_correct/self.total_samples
        if self.run.in_train:
            self.records['loss'].append(self.total_loss)
            self.records['train_time'].append(elapsed_time)
            self.records['train_samples'].append(self.total_samples)
            self.records['correct'].append(self.total_correct)
            self.records['acc'].append(self.total_acc)
        elif self.run.in_test:
            self.records['test_loss'] = self.total_loss
            self.records['test_time'] = elapsed_time
            self.records['test_samples'] = self.total_samples
        else:
            self.records['val_loss'].append(self.total_loss)
            self.records['val_time'].append(elapsed_time)
            self.records['val_samples'].append(self.total_samples)
            self.records['val_correct'].append(self.total_correct)
            self.records['val_acc'].append(self.total_acc)
    
    def _reset_state(self) -> None:
        self.total_loss = 0.
        self.total_correct = 0.
        self.total_samples = 0
        self.time_start = time.time()
        
    def _display_latest_metrics(self) -> str:
        total_time = int(self.records["train_time"][-1] + self.records["val_time"][-1])
        epoch = f'Epoch {self.run.n_epochs} ({total_time} sec):'
        train_metrics = f'loss = {self.records["loss"][-1]:0.5f} acc = {100 * self.records["acc"][-1]:0.2f}%'
        val_metrics = f'val loss = {self.records["val_loss"][-1]:0.5f} val_acc = {100 * self.records["val_acc"][-1]:0.2f}%'
        return f'{epoch} {train_metrics} {val_metrics}'


class Checkpointer(Callback):
    _order = 1
    
    def __init__(self) -> None:
        super(Checkpointer, self).__init__()
        self.checkpoint_dir = Path(f'checkpoints/').resolve()
        self.checkpoint_dir.mkdir(exist_ok=True, parents=True)

    def end_validate(self) -> bool:
        n_epochs = self.run.n_epochs
        metrics = self.run.recorder.records
        val_loss = metrics['val_loss'][-1]
        if val_loss == min(metrics['val_loss']):
            save_name = f'epoch-{int(n_epochs)}_val_loss-{val_loss:0.5f}.pth'
            save_path = self.save_dirname / save_name
            self.save_model(save_path)
        return True
        
    def save_model(self, path: Union[str, Path]) -> bool:
        torch.save(
            self.learner.model.state_dict(),
            str(path)
        )
        return True
    
    @property
    def save_dirname(self) -> Path:
        dirname = self.checkpoint_dir / f'{self.run.identifier}'
        dirname.mkdir(exist_ok=True, parents=True)
        return dirname


class TrainValCallback(Callback):
    _order = 0
    def on_fit_begin(self, learner: Learner) -> bool:
        super(TrainValCallback, self).on_fit_begin(learner)
        self.run.n_epochs = 0
        self.run.n_iter = 0
        self.run.in_train = False
        self.run.in_test = False
        return True

    def on_epoch_begin(self, epoch: int) -> bool:
        self.learner.model.train()
        self.run.in_train = True
        self.run.in_test = False
        self.run.epoch_iters = 0
        return True
    
    def on_epoch_end(self) -> bool:
        self.run.n_epochs += 1
        return True
    
    def on_loss_begin(self, y_hat) -> bool:
        return not self.run.in_test
        
    def on_loss_end(self, loss) -> bool:
        super(TrainValCallback, self).on_loss_end(loss)
        self.run.epoch_iters += 1
        return True
    
    def on_backward_begin(self) -> bool:
        return self.run.in_train

    def on_batch_end(self) -> bool:
        if self.run.in_train:
            self.run.n_iter += 1
        return True

    def begin_validate(self) -> bool:
        self.learner.model.eval()
        self.run.in_train = False
        self.run.in_test = False
        self.run.epoch_iters = 0
        return True
    
    def begin_test(self) -> bool:
        self.learner.model.eval()
        self.run.in_test = True
        self.run.in_train = False
        self.run.epoch_iters = 0
        return True
    
    def end_test(self) -> bool:
        self.run.in_test = False
        return True
    
class TestCallback(Callback):
    _order = -1
    def do_stop(self) -> bool:
        self.learner.stop = self.run.epoch_iters >= 10
        return True

### Training callback routine

In [120]:
class Fitter:
    def __init__(self, learner: Learner, cb: CallbackHandler = None, testing: bool = False) -> None:
        self.learner = learner
        self.cb = cb if cb is not None else CallbackHandler()
        self.testing = testing
        self._default_callbacks()
        self.device = self.learner.device
        
    def _default_callbacks(self) -> None:
        self.recorder = Recorder()
        self.cb.add(self.recorder)
        self.cb.add(Checkpointer())
        self.cb.add(TrainValCallback())
        if self.testing: self.cb.add(TestCallback())
        
    def one_batch(self, batch: Tuple[torch.Tensor]) -> None:
        batch = self._convert_batch(batch)
        X = batch[0]
        if not self.cb.on_batch_begin(batch): return
        y_hat = self.learner.model(X)
        if not self.cb.on_loss_begin(y_hat): return
        Y = batch[1]
        loss = self.learner.loss_fn(y_hat, Y)
        self.cb.on_loss_end(loss)
        if not self.cb.on_backward_begin(): return
        loss.backward()  # get grads
        self.cb.on_backward_end()
        self.cb.on_step_begin()
        self.learner.optimizer.step()  # apply grads
        self.cb.on_step_end()
        self.learner.optimizer.zero_grad()  # zero grads
        self.cb.on_batch_end()
        
    def all_batches(self, dataloader: torch.utils.data.DataLoader) -> None:
        with self.pbar(dataloader) as pbar:
            for batch in dataloader:
                self.one_batch(batch)
                pbar.update(self.recorder.batch_size)
                if self.cb.do_stop(): break
        
    def _fit_one_epoch(self, epoch: int = None) -> None:
        self.cb.on_epoch_begin(epoch)
        dataloader = self.learner.data.train_dl
        self.all_batches(dataloader)
        self.cb.on_epoch_end()
        return 
    
    def fit(self, epochs: int) -> None:
        self.cb.set_run()
        self.cb.on_fit_begin(self.learner)
        for epoch in range(epochs):
            self._fit_one_epoch(epoch)
            self.validate()
        return
                
    def validate(self) -> None:
        self.cb.begin_validate()
        dataloader = self.learner.data.val_dl
        with torch.no_grad():
            self.all_batches(dataloader)
        self.cb.end_validate()
        return
    
    def test(self) -> list:
        self.cb.begin_test()
        dataloader = self.learner.data.test_dl
        chunks = []
        with torch.no_grad():
            for batch in dataloader:
                batch = self._convert_batch(batch)
                y_hat = self.learner.model(batch[0])
                conf = torch.nn.functional.sigmoid(y_hat)
                conf = conf.detach().cpu().numpy()
                chunks.append(conf)
        self.cb.end_test()
        return np.concatenate(chunks, axis=1)

    def pbar(self, dataloader: torch.utils.data.DataLoader, **kwargs) -> tqdm_notebook:
        params = {
            'total': len(dataloader.dataset),
            'unit': 'samples',
            'leave': False
        }
        params.update(kwargs)
        return tqdm_notebook(**params)
        
    
    def _convert_batch(self, batch: Tuple[torch.Tensor]) -> Tuple[torch.Tensor]:
        """Send batch data to the model's device"""
        return tuple(x.to(self.device) for x in batch)
    

## Training

In [14]:
raw_dataset = RawDataset(project_dirname, subsample=0)
processed_dataset = ProcessedDataset(raw_dataset, overwrite=False)
trainable_dataset = TrainableDataset(processed_dataset)


Loading processed dataset from /home/myen/Learn/Kaggle/toxic_comment/data/b99b7e3dc5efe6cd58de07884a1829563aa094f60141b25554fff4bef39a23c5.pklb


In [121]:
if torch.cuda.is_available():
    device = torch.device('cuda:1')
else:
    device = torch.device('cpu')

params = {'batch_size': 128, 'num_workers': 3, 'drop_last': False, 'collate_fn': collate}

transform = Transformer('comment_words', 'b_target')
test_transform = Transformer('comment_words', None)
train_ds = CommentDataset(trainable_dataset.train_df, transform)
val_ds = CommentDataset(trainable_dataset.val_df, transform)
test_ds = CommentDataset(trainable_dataset.test_df, test_transform)

train_dl = torch.utils.data.DataLoader(train_ds, shuffle=True, **params)
val_dl = torch.utils.data.DataLoader(val_ds, shuffle=False, **params)
test_dl = torch.utils.data.DataLoader(test_ds, shuffle=False, **params)

In [122]:
model_params = {
    'vocab_size': transform.vocab.size,
    'embedding_size': 64,
    'padding_idx': transform.vocab[transform.vocab.PAD],
    'nhead': 8,
    'd_model': 256,
    'width': 3
}
model = SimpleModel(model_params).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss = nn.BCEWithLogitsLoss()
data_bunch = DataLoaderBunch(train_dl, val_dl, test_dl)

model_learner = Learner(model, optimizer, loss, data_bunch)
fitter = Fitter(model_learner, testing=True)

In [123]:
fitter.fit(1)


Beginning run li6flrl41kka


HBox(children=(IntProgress(value=0, max=1624386), HTML(value='')))



HBox(children=(IntProgress(value=0, max=180488), HTML(value='')))

Epoch 1 (3 sec): loss = 0.32757 acc = 93.59% val loss = 0.31289 val_acc = 90.70%


In [124]:
fitter.test()



[array([[0.09793811],
        [0.09672317],
        [0.0896183 ],
        [0.05187909],
        [0.07185435],
        [0.05683123],
        [0.07690277],
        [0.07318582],
        [0.09367617],
        [0.08482978],
        [0.08897717],
        [0.07439633],
        [0.06658682],
        [0.09981773],
        [0.06600633],
        [0.09581152],
        [0.08780236],
        [0.07255817],
        [0.08339865],
        [0.06272858],
        [0.0989299 ],
        [0.0885161 ],
        [0.08097576],
        [0.08044925],
        [0.09227213],
        [0.09117831],
        [0.08346997],
        [0.08183854],
        [0.09491818],
        [0.09449887],
        [0.09402139],
        [0.06928744],
        [0.06747768],
        [0.0929569 ],
        [0.05315695],
        [0.09297023],
        [0.09338074],
        [0.06813194],
        [0.08872155],
        [0.07428375],
        [0.07842431],
        [0.07437821],
        [0.07294655],
        [0.08729815],
        [0.06134611],
        [0