## Explore the data

In [58]:
%%capture
from functools import partial
import hashlib
from pathlib import Path
import pickle
import string
import time
from typing import Callable, List, Union, Tuple

import numpy as np
import pandas as pd
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from tqdm import tqdm_notebook

from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
import torch.optim as optim

%load_ext autoreload
%autoreload 2

In [2]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [3]:
project_dirname = Path('../').resolve()
raw_data_dirname = project_dirname / 'data/raw'
data_files = list(raw_data_dirname.iterdir())

### Read Data

In [4]:
def compute_df_sha(df: pd.DataFrame) -> str:
    id_str = ''.join(map(str, df.id))
    id_bstr = id_str.encode()
    sha = hashlib.sha256(id_bstr)
    return sha.hexdigest()

    
class RawDataset:
    def __init__(self, project_dirname:Path, subsample: int = None) -> None:
        self.subsample = subsample
        self.project_dirname = project_dirname
        self.data_dirname = self.project_dirname / 'data'
        self.raw_data_dirname = self.data_dirname / 'raw'
        self.train_filename = self.raw_data_dirname / 'train.csv'
        self.test_filename = self.raw_data_dirname / 'test.csv'
        
        assert self.train_filename.exists()
        assert self.test_filename.exists()
        
        self.train_df = pd.read_csv(self.train_filename)
        self.test_df = pd.read_csv(self.test_filename)
        
        if self.subsample != 0:
            self.train_df = self.train_df.iloc[0: self.subsample]
        self.identifier = compute_df_sha(self.train_df)

class ProcessedDataset:
    toxicity_subtypes = ['severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']
    identity_attributes = [
        'male', 'female', 'transgender', 'other_gender',
        'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation',
        'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion',
        'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
        'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
        'other_disability'
    ]
    _binarize_columns = ['target'] + toxicity_subtypes
    binary_columns = [f'b_{name}' for name in _binarize_columns]
    
    def __init__(self, raw_dataset: RawDataset, overwrite: bool = False) -> None:
        self.identifier = raw_dataset.identifier
        self.project_dirname = raw_dataset.project_dirname
        self.data_dirname = raw_dataset.data_dirname
        self.cache_path = self.data_dirname / f'{self.identifier}.pklb'
        
        if self.cache_path.exists() and not overwrite:
            print(f'Loading processed dataset from {self.cache_path}')
            self.load()
        else:
            self.train_df = raw_dataset.train_df
            self.test_df = raw_dataset.test_df
            self.featurize()
            self.save()

    def featurize(self) -> None:
        print('Featurizing....')
        self._prepare_df_labels(self.train_df)
        self._prepare_features(self.train_df)
        self._prepare_features(self.test_df)
            
    def _prepare_df_labels(self, df: pd.DataFrame) -> None:
        for column, new_column in zip(self._binarize_columns, self.binary_columns):
            df[new_column] = df[column].apply(self._binarize_label)

    def _prepare_features(self, df: pd.DataFrame) -> None:
        doc = df.comment_text.apply(tokenizer)
        df['comment_words'] = doc.apply(lambda x: [w.text for w in x])
        
    def _binarize_label(self, target: float) -> int:
        """According to competition rules, target values >= 0.5 are considered the positive class."""
        return int(target >= 0.5)
    
    def save(self) -> None:
        cache_data = {
            'train_df': self.train_df,
            'test_df': self.test_df
        }
        with open(self.cache_path, 'wb') as fw:
            pickle.dump(cache_data, fw)
        
    def load(self) -> None:
        with open(self.cache_path, 'rb') as fo:
            cache_data = pickle.load(fo)
        self.train_df = cache_data['train_df']
        self.test_df = cache_data['test_df']
        
    

def split_df(df: pd.DataFrame, frac: float = 0.1) -> Tuple[pd.DataFrame]:
    n_val = int(np.ceil(df.shape[0]*frac))
    df = df.sample(frac=1.)
    val_df = df.iloc[0: n_val]
    train_df = df.iloc[n_val:]
    assert val_df.shape[0] + train_df.shape[0] == df.shape[0]
    return train_df, val_df
    
class TrainableDataset:
    def __init__(self, processed_dataset: ProcessedDataset) -> None:
        self.trainval_df = processed_dataset.train_df
        self.train_df, self.val_df = split_df(self.trainval_df, frac=0.1)
        self.test_df = processed_dataset.test_df
        
        self.n_train = self.train_df.shape[0]
        self.n_val = self.val_df.shape[0]
        self.n_test = self.test_df.shape[0]
        
    def __repr__(self) -> str:
        return f'Train samples: {self.n_train}, Val samples: {self.n_val}, Test samples: {self.n_test}'

### Vocab

In [5]:
class Vocabulary:
    PAD = "<PAD>"
    BOS = "<BOS>"
    EOS = "<EOS>"
    BOT = "<BOT>"
    EOT = "<EOT>"
    UNK = "<UNK>"
    
    specials = [PAD, BOS, EOS, BOT, EOT, UNK]
    vocab = []
    token_to_int = dict()
    int_to_token = dict()
    
    def __init__(self, tokens: list = []) -> None:
        [self.add_token(t) for t in self.specials]
        [self.add_token(t) for t in tokens]
        self.UNK_IDX = self.vocab.index(self.UNK)
            
    def add_token(self, token) -> None:
        if token in self.vocab:
            return
        else:
            idx = len(self.vocab)
            self.token_to_int[token] = idx
            self.int_to_token[idx] = token
            self.vocab.append(token)
            
    def __getitem__(self, token: str) -> int:
        return self.token_to_int.get(token, self.UNK_IDX)
    
    def __contains__(self, token: str) -> bool:
        return token in self.vocab
    
    def get(self, x: Union[str, int], reverse: bool = False) -> int:
        if reverse:
            return self.int_to_token.get(x, self.UNK)
        else:
            return self.token_to_int.get(x, self.UNK_IDX)
        
    def __len__(self) -> int:
        return len(self.token_to_int)
    
    @property
    def size(self) -> int:
        return len(self)
    
    
class VocabEncoder:
    def __init__(self, vocab: Vocabulary) -> None:
        self.vocab = vocab
        
    def _encode_token(self, token: str) -> int:
        encoded = []
        if Vocabulary.BOT in self.vocab:
            encoded.append(self.vocab[self.vocab.BOT])
        encoded += [self.vocab[c] for c in token]
        if Vocabulary.EOT in self.vocab:
            encoded.append(self.vocab[self.vocab.EOT])
        return encoded
        
    def encode(self, seq: List[str]):
        encoded = []
        if Vocabulary.BOS in self.vocab:
            encoded.append(self.vocab[self.vocab.BOS])
        for token in seq:
            encoded += self._encode_token(token)
        if Vocabulary.EOS in self.vocab:
            encoded.append(self.vocab[self.vocab.EOS])
        return encoded
    
    def decode(self, seq: List[int]) -> List[str]:
        raise NotImplementedError

### Datasets and DataLoader

In [6]:
class Transformer:
    def __init__(self, X_cols: Union[list, str], Y_cols: Union[list, str] = None) -> None:
        self.X_cols = X_cols
        self.Y_cols = Y_cols
        self.vocab = Vocabulary(string.printable)
        self.encoder = VocabEncoder(self.vocab)
        assert 'a' in self.vocab

    def _column_selector(self, df: pd.DataFrame, columns: Union[list, str]) -> pd.Series:
        return df[columns]
    
    def _vocab_encoder(self, seq: List[str]) -> pd.Series:
        return encode_sequence(seq, self.vocab)

    def __call__(self, sample: pd.Series) -> Tuple[torch.Tensor, torch.Tensor]:
        X = self._column_selector(sample, self.X_cols)
        X = self.encoder.encode(X)
        # maxlen = max(map(len, X))
        # X = pad_sequences(X, maxlen=maxlen, value=self.vocab[self.vocab.PAD])
        if self.Y_cols is not None:
            Y = self._column_selector(sample, self.Y_cols)
        else:
            Y = None
        X = torch.LongTensor(X)
        Y = torch.Tensor([Y])
        return X, Y


class BatchSampler(torch.utils.data.Sampler):
    def __init__(self, dataset: torch.utils.data.Dataset, batch_size: int, shuffle=False) -> None:
        self.N = len(dataset)
        self.batch_size = batch_size
        self.shuffle = shuffle
        
    def _index_sampler(self) -> None:
        if self.shuffle:
            self.idxs = torch.randperm(self.N).tolist()
        else:
            self.idxs = torch.arange(self.N).tolist()
        
    def __iter__(self) -> List[int]:
        self._index_sampler()
        for i in range(0, self.N, self.batch_size):
            yield self.idxs[i: i+self.batch_size]


class CommentDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame, transform: Callable) -> None:
        self.df = df
        self.transform = transform
        
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        sample = self.df.iloc[idx]
        X, Y = self.transform(sample)
        return X, Y
    
    def __len__(self) -> int:
        return self.df.shape[0]


def collate(batch: List[tuple]) -> Tuple[torch.Tensor]:
    X, Y = zip(*batch)
    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_value=0)
    Y = torch.stack(Y)
    return X, Y

In [7]:
class DataLoaderBunch:
    def __init__(self, train_dl: torch.utils.data.DataLoader, val_dl: torch.utils.data.DataLoader, test_dl: torch.utils.data.DataLoader = None, c = None) -> None:
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.test_dl = test_dl
        self.c = c
    
    @property
    def train_ds(self) -> torch.utils.data.Dataset:
        return self.train_dl.dataset
    
    @property
    def val_ds(self) -> torch.utils.data.Dataset:
        return self.val_dl.dataset
    
    @property
    def test_ds(self) -> torch.utils.data.Dataset:
        if self.test_dl is not None:
            return self.test_dl.dataset
        else:
            raise ValueError('No test dataloader available.')

### Model

In [8]:
class SimpleModel(nn.Module):
    def __init__(self, model_params: dict) -> None:
        super(SimpleModel, self).__init__()
        self.embedding_layer = nn.Embedding(
            model_params['vocab_size'],
            model_params['embedding_size'],
            padding_idx=model_params['padding_idx'],
        )
        self.conv_layer = nn.Conv1d(
            model_params['embedding_size'],
            model_params['d_model'],
            model_params['width'],
            
        )
        self.conv_pool_layer = nn.MaxPool1d(
            model_params['width'],
            stride=model_params['width'] - 1
        )
        self.transformer_layer = nn.TransformerEncoderLayer(
            model_params['d_model'],
            model_params['nhead'],
            model_params['d_model']
        )
        self.transformer_pool_layer = nn.AdaptiveMaxPool1d(1)
        self.logit_layer = nn.Linear(model_params['d_model'], 1)
        self.conv_act = nn.Tanh()
        
    def forward(self, token_ids) -> torch.Tensor:
        emb = self.embedding_layer(token_ids)
        emb_channels_first = torch.transpose(emb, 1, 2)
        conv = self.conv_layer(emb_channels_first)
        conv_pool = self.conv_pool_layer(conv)
        conv_pool = torch.transpose(conv_pool, 1, 2)
        conv_pool = self.conv_act(conv_pool)
        encoded = self.transformer_layer(conv_pool)
        encoded = torch.transpose(encoded, 1, 2)
        encoded_pool = self.transformer_pool_layer(encoded)
        encoded_pool = encoded_pool[..., 0]
        logit = self.logit_layer(encoded_pool)
        return logit

### Learner

In [81]:
def accuracy_with_logits(y_hat, y_true):
    pred = y_hat >= 0
    truth = y_true >= 0.5
    acc = (pred==truth).float().mean()
    return acc

    
class Learner:
    def __init__(self, model: nn.Module, optimizer: optim.Optimizer, loss_fn: nn.modules.loss, data: DataLoaderBunch) -> None:
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.device = list(model.parameters())[0].device
        self.data = data
        self.recorder = Recorder()
        self.callbacks = CallbackHandler([self.recorder])
        
    def one_batch(self, X: torch.Tensor, Y: torch.Tensor) -> None:
        X, Y = self._convert_batch((X, Y))
        if not self.callbacks.on_batch_begin(X, Y): return
        y_hat = self.model(X)
        if not self.callbacks.on_loss_begin(y_hat): return
        loss = self.loss_fn(y_hat, Y)
        self.callbacks.on_loss_end(loss)
        if not self.callbacks.on_backward_begin(): return
        loss.backward()  # get grads
        self.callbacks.on_backward_end()
        self.callbacks.on_step_begin()
        self.optimizer.step()  # apply grads
        self.callbacks.on_step_end()
        self.optimizer.zero_grad()  # zero grads
        self.callbacks.on_batch_end()
        
    def fit_one_epoch(self, epoch: int = None) -> None:
        if epoch == 0: self.callbacks.on_fit_begin(self)
        self.callbacks.on_epoch_begin(epoch)
        with self.pbar(self.data.train_dl) as pbar:
            for batch in self.data.train_dl:
                self.one_batch(*batch)
                batch_size = self.recorder.batch_size
                pbar.update(batch_size)
        self.callbacks.on_epoch_end()
        return 
                
    def validate(self) -> None:
        self.callbacks.begin_validate()
        with torch.no_grad():
            with self.pbar(self.data.val_dl) as pbar:
                for batch in self.data.val_dl:
                    self.one_batch(*batch)
                    batch_size = self.recorder.batch_size
                    pbar.update(batch_size)
        self.callbacks.end_validate()
        return

    def pbar(self, dataloader: torch.utils.data.DataLoader, **kwargs) -> tqdm_notebook:
        params = {
            'total': len(dataloader.dataset),
            'unit': 'samples',
            'leave': False
        }
        params.update(kwargs)
        return tqdm_notebook(**params)
        
    
    def _convert_batch(self, batch: Tuple[torch.Tensor]) -> Tuple[torch.Tensor]:
        """Send batch data to the model's device"""
        return tuple(x.to(self.device) for x in batch)
    
class Tester(Learner):
    def __init__(self, learner: nn.Module, data: DataLoaderBunch) -> None:
        self.learner = learner
        self.data = data
        self.recorder = Recorder()
        self.callbacks = CallbackHandler([self.recorder])
        self.callbacks.learner = learner
        
    def evaluate(self) -> None:
        self.callbacks.begin_validate()
        with torch.no_grad():
            with self._pbar(self.data.test_dl) as pbar:
                for batch in self.data.test_dl:
                    X, Y = self._convert_batch(*batch)
                    y_hat = self.learner.model(X)
                    batch_size = y_hat.shape[0]
                    cost = self.learner.loss_fn(y_hat, Y) * batch_size
        return 

### Callbacks

#### Base classes

In [56]:
class Callback:
    def on_fit_begin(self, learner: Learner) -> bool:
        self.learner = learner
        return True
    
    def on_fit_end(self) -> bool:
        return True
    
    def on_epoch_begin(self, epoch) -> bool:
        self.epoch = epoch
        return True
    
    def on_epoch_end(self) -> bool:
        return True
    
    def begin_validate(self) -> bool:
        return True
    
    def end_validate(self) -> bool:
        return True
    
    def begin_test(self) -> bool:
        return True
    
    def end_test(self) -> bool:
        return True
    
    def on_batch_begin(self, X: torch.Tensor, Y: torch.Tensor) -> bool:
        return True
    
    def on_loss_begin(self, y_hat: torch.Tensor) -> bool:
        return True
    
    def on_loss_end(self, loss: torch.Tensor) -> bool:
        return True
    
    def on_backward_begin(self) -> bool:
        return True
    
    def on_backward_end(self) -> bool:
        return True
    
    def on_step_begin(self) -> bool:
        return True
    
    def on_step_end(self) -> bool:
        return True
    
    def on_batch_end(self) -> bool:
        return True


class CallbackHandler:
    def __init__(self, callbacks: List[Callback] = []) -> None:
        self.callbacks = callbacks
        
    def on_fit_begin(self, learner: Learner) -> bool:
        self.learner = learner
        self.in_train = True
        learner.stop = False
        res = True
        for cb in self.callbacks:
            res = res and cb.on_fit_begin(learner)
        return res
    
    def on_fit_end(self) -> bool:
        res = not self.in_train
        for cb in self.callbacks:
            res = res and cb.after_fit()
        return res
    
    def on_epoch_begin(self, epoch: int) -> bool:
        self.learner.model.train()
        res = True
        for cb in self.callbacks:
            res = res and cb.on_epoch_begin(epoch)
        return res
    
    def on_epoch_end(self) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_epoch_end()
        return res
        
    def begin_validate(self) -> bool:
        self.learner.model.eval()
        self.in_train = False
        res = True
        for cb in self.callbacks:
            res = res and cb.begin_validate()
        return res
    
    def end_validate(self) -> bool:
        self.learner.model.train()
        self.in_train = True
        res = True
        for cb in self.callbacks:
            res = res and cb.end_validate()
        return res
    
    def begin_test(self) -> bool:
        self.learner.model.eval()
        self.in_train = False
        self.in_test = True
        res = True
        for cb in self.callbacks:
            res = res and cb.begin_test()
        return res
    
    def end_test(self) -> bool:
        self.learner.model.train()
        self.in_train = True
        self.in_test = False
        res = True
        for cb in self.callbacks:
            res = res and cb.end_test()
        return res
    
    def on_batch_begin(self, X: torch.Tensor, Y: torch.Tensor) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_batch_begin(X, Y)
        return res
    
    def on_loss_begin(self, y_hat: torch.Tensor) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_loss_begin(y_hat)
        return res
    
    def on_loss_end(self, loss: torch.Tensor) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_loss_end(loss)
        return res
    
    def on_backward_begin(self) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_backward_begin()
        return res
    
    def on_backward_end(self) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_backward_end()
        return res
    
    def on_step_begin(self) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_step_begin()
        return res
    
    def on_step_end(self) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_step_end()
        return res
    
    def on_batch_end(self) -> bool:
        res = True
        for cb in self.callbacks:
            res = res and cb.on_batch_end()
        return res
    
    def do_stop(self) -> bool:
        try:
            return self.learner.stop
        finally:
            self.learner.stop = False

#### Useful classes

In [77]:
class Recorder(Callback):
    def __init__(self) -> None:
        self.records = dict(
            loss=[], val_loss=[], 
            train_time=[], val_time=[],
            train_samples=[], val_samples=[],
        )
        self.test_records = dict(loss=None, val_loss=None, test_time=None, test_samples=None)

    def on_fit_begin(self, learner) -> bool:
        super(Recorder, self).on_fit_begin(learner)
        self.in_train = True
        self.in_test = False
        return True

    def on_epoch_begin(self, epoch: int) -> bool:
        super(Recorder, self).on_epoch_begin(epoch)
        self._reset_state()
        return True
        
    def on_batch_begin(self, X, Y) -> bool:
        self.batch_size = Y.shape[0]
        return True
    
    def on_loss_begin(self, y_hat) -> bool:
        return True
    
    def on_loss_end(self, loss) -> bool:
        self.total_loss += loss.item() * self.batch_size
        self.total_samples += self.batch_size
        return True
    
    def on_backward_begin(self) -> bool:
        return self.in_train
    
    def on_epoch_end(self) -> bool:
        super(Recorder, self).on_epoch_end()
        self._log_metrics()
        return True
    
    def begin_validate(self) -> bool:
        super(Recorder, self).begin_validate()
        self.in_train = False
        self._reset_state()
        return True
    
    def end_validate(self) -> bool:
        self._log_metrics()
        self.in_train = True
        print(f'{self._display_latest_metrics()}')
        return True
    
    def begin_test(self) -> bool:
        super(Recorder, self).begin_test()
        self.in_train = False
        self.in_test = True
        return True
    
    def end_test(self) -> bool:
        self._log_metrics()
        self.in_test = False
        return True
    
    def _log_metrics(self) -> None:
        elapsed_time = time.time() - self.time_start
        self.total_loss /= self.total_samples
        if self.in_train:
            self.records['loss'].append(self.total_loss)
            self.records['train_time'].append(elapsed_time)
            self.records['train_samples'].append(self.total_samples)
        elif self.in_test:
            self.records['test_loss'] = self.total_loss
            self.records['test_time'] = elapsed_time
            self.records['test_samples'] = self.total_samples
        else:
            self.records['val_loss'].append(self.total_loss)
            self.records['val_time'].append(elapsed_time)
            self.records['val_samples'].append(self.total_samples)
    
    def _reset_state(self) -> None:
        self.total_loss = 0.
        self.total_samples = 0
        self.time_start = time.time()
        
    def _display_latest_metrics(self) -> str:
        total_time = int(self.records["train_time"][-1] + self.records["val_time"][-1])
        epoch = f'Epoch {self.epoch} ({total_time} sec):'
        train_loss = f'loss = {self.records["loss"][-1]:0.5f}'
        val_loss = f'val loss = {self.records["val_loss"][-1]:0.5f}'
        return f'{epoch} {train_loss} {val_loss}'

### Training callback routine

In [12]:
def one_batch(X: torch.Tensor, Y:torch.Tensor, cb: CallbackHandler) -> None:
    if not cb.on_batch_begin(X, Y): return
    y_hat = cb.learner.model(X)
    if not cb.on_loss_begin(y_hat): return
    loss = cb.learner.loss_fn(cb.learner.model(X), Y)
    cb.on_loss_end(loss)
    if not cb.on_backward_begin(): return
    loss.backward()
    cb.on_backward_end()
    cb.on_step_begin()
    cb.learner.optimizer.step()
    cb.on_step_end()
    cb.learner.optimizer.zero_grad()
    cb.on_batch_end()
        
def all_batches(dataloader: torch.utils.data.DataLoader, cb) -> None:
    for X, Y in dataloader:
        one_batch(X, Y, cb)
        if cb.do_stop(): return
        
def fit(epochs: int, learner: Learner, cb: CallbackHandler) -> None:
    if not cb.on_fit_begin(learner): return
    print('Begin')
    for epoch in range(epochs):
        print('Training...')
        cb.on_epoch_begin(epoch)
        all_batches(learner.data.train_dl, cb)
        cb.on_epoch_end()
        
        if cb.begin_validate():
            print('Validating...')
            with torch.no_grad():
                cb.on_epoch_begin(epoch)
                all_batches(learner.data.val_dl, cb)
                cb.on_epoch_end()

        if cb.do_stop() or not cb.on_epoch_end(): break
    cb.on_fit_end()

## Training

In [80]:
raw_dataset = RawDataset(project_dirname, subsample=10000)
processed_dataset = ProcessedDataset(raw_dataset, overwrite=False)
trainable_dataset = TrainableDataset(processed_dataset)

Loading processed dataset from /home/mayen/Learn/Kaggle/toxic_comment/data/488f084da443568790e2d337ae61163f8e6a3e0cdbb87cdc7ed7a0108936e241.pklb


In [82]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

params = {'batch_size': 128, 'num_workers': 3, 'drop_last': False, 'collate_fn': collate}

transform = Transformer('comment_words', 'b_target')
train_ds = CommentDataset(trainable_dataset.train_df, transform)
val_ds = CommentDataset(trainable_dataset.val_df, transform)
test_ds = CommentDataset(trainable_dataset.test_df, transform)

train_dl = torch.utils.data.DataLoader(train_ds, shuffle=True, **params)
val_dl = torch.utils.data.DataLoader(val_ds, shuffle=False, **params)
test_dl = torch.utils.data.DataLoader(test_ds, shuffle=False, **params)

In [85]:
model_params = {
    'vocab_size': transform.vocab.size,
    'embedding_size': 64,
    'padding_idx': transform.vocab[transform.vocab.PAD],
    'nhead': 4,
    'd_model': 128,
    'width': 3
}
model = SimpleModel(model_params).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss = nn.BCEWithLogitsLoss()
data_bunch = DataLoaderBunch(train_dl, val_dl, test_dl)

model_learner = Learner(model, optimizer, loss, data_bunch)
callbacks = CallbackHandler([Recorder()])

In [86]:
for epoch in range(5):
    model_learner.fit_one_epoch(epoch)
    model_learner.validate()

HBox(children=(IntProgress(value=0, max=9000), HTML(value='')))



HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 0 (8 sec): loss: 0.26054 val loss: 0.22936


HBox(children=(IntProgress(value=0, max=9000), HTML(value='')))



HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 1 (8 sec): loss: 0.22199 val loss: 0.22735


HBox(children=(IntProgress(value=0, max=9000), HTML(value='')))



HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 2 (8 sec): loss: 0.22181 val loss: 0.22619


HBox(children=(IntProgress(value=0, max=9000), HTML(value='')))



HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 3 (8 sec): loss: 0.22136 val loss: 0.22378


HBox(children=(IntProgress(value=0, max=9000), HTML(value='')))



HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Epoch 4 (8 sec): loss: 0.21641 val loss: 0.21064


In [87]:
model_learner.test()

HBox(children=(IntProgress(value=0, max=97320), HTML(value='')))



KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/mayen/miniconda3/envs/kaggle/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 4736, in get_value
    return libindex.get_value_box(s, key)
  File "pandas/_libs/index.pyx", line 51, in pandas._libs.index.get_value_box
  File "pandas/_libs/index.pyx", line 47, in pandas._libs.index.get_value_at
  File "pandas/_libs/util.pxd", line 98, in pandas._libs.util.get_value_at
  File "pandas/_libs/util.pxd", line 83, in pandas._libs.util.validate_indexer
TypeError: 'str' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mayen/miniconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/mayen/miniconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/mayen/miniconda3/envs/kaggle/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-6-87ae24501f27>", line 54, in __getitem__
    X, Y = self.transform(sample)
  File "<ipython-input-6-87ae24501f27>", line 21, in __call__
    Y = self._column_selector(sample, self.Y_cols)
  File "<ipython-input-6-87ae24501f27>", line 10, in _column_selector
    return df[columns]
  File "/home/mayen/miniconda3/envs/kaggle/lib/python3.7/site-packages/pandas/core/series.py", line 1068, in __getitem__
    result = self.index.get_value(self, key)
  File "/home/mayen/miniconda3/envs/kaggle/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 4744, in get_value
    raise e1
  File "/home/mayen/miniconda3/envs/kaggle/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 4730, in get_value
    return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
  File "pandas/_libs/index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
  File "pandas/_libs/index.pyx", line 88, in pandas._libs.index.IndexEngine.get_value
  File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'b_target'


In [23]:
type(model_learner.recorder.total_loss.data.item())

float

In [122]:
torch.device('cuda')

device(type='cuda')

In [None]:
torch.