In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from catalyst import dl
import wandb
import joblib


DEVICE = 'cuda'
mydir = '/data2/competitions/quora-insincere-questions-classification'
SEED = 1234

tqdm.pandas()
# seed everything
import os
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from pandas import Panel
I0302 18:09:55.359832 139869237638976 file_utils.py:41] PyTorch version 1.4.0 available.
I0302 18:09:55.992824 139869237638976 file_utils.py:57] TensorFlow version 2.0.0 available.

The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version



In [2]:
def get_param_size(model, trainable=True):
    if trainable:
        psize = np.sum([np.prod(p.size()) for p in model.parameters() if p.requires_grad])
    else:
        psize = np.sum([np.prod(p.size()) for p in model.parameters()])
    return psize

## EMA

In [3]:
# https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856
class EMA():
    def __init__(self, model, mu, level='batch', n=1):
        """
        level: 'batch' or 'epoch'
          'batch': Update params every n batches.
          'epoch': Update params every epoch.
        """
        # self.ema_model = copy.deepcopy(model)
        self.mu = mu
        self.level = level
        self.n = n
        self.cnt = self.n
        self.shadow = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data

    def _update(self, model):
        for name, param in model.named_parameters():
            if param.requires_grad:
                new_average = (1 - self.mu) * param.data + self.mu * self.shadow[name]
                self.shadow[name] = new_average.clone()

    def set_weights(self, ema_model):
        for name, param in ema_model.named_parameters():
            if param.requires_grad:
                param.data = self.shadow[name]

    def on_batch_end(self, model):
        if self.level is 'batch':
            self.cnt -= 1
            if self.cnt == 0:
                self._update(model)
                self.cnt = self.n

    def on_epoch_end(self, model):
        if self.level is 'epoch':
            self._update(model)



## Loading embeddings

In [4]:
import gc
from gensim.models import KeyedVectors


def load_glove(word_index):
    def get_coefs(word, *arr): 
        return word, np.asarray(arr, dtype='float32')
    EMBEDDING_FILE = f'{mydir}/embeddings/glove.840B.300d/glove.840B.300d.txt'
    embeddings_index = dict(get_coefs(*o.split(' ')) for o in open(EMBEDDING_FILE))

    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    unknown_words = []
    
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index), embed_size))
        
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is None:
                unknown_words.append((word, i))
            else:
                embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = embedding_vector
    print('\nTotal unknowns glove', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words


def load_wiki(word_index):
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    EMBEDDING_FILE = f'{mydir}/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    embeddings_index = dict(get_coefs(*o.split(' ')) for o in open(EMBEDDING_FILE) if len(o) > 100)

    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    unknown_words = []
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index), embed_size))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is None:
                unknown_words.append((word, i))
            else:
                embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = embedding_vector
    print('\nTotal unknowns wiki', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words


def load_parag(word_index):
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    EMBEDDING_FILE = f'{mydir}/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    embeddings_index = dict(get_coefs(*o.split(' '))
                            for o in open(EMBEDDING_FILE, encoding='utf8', errors='ignore')
                            if len(o) > 100)

    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    unknown_words = []
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index), embed_size))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is None:
                unknown_words.append((word, i))
            else:
                embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = embedding_vector
    print('\nTotal unknowns parag', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words


# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
def load_ggle(word_index):
    EMBEDDING_FILE = f'{mydir}/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
    embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
    embed_size = embeddings_index.get_vector('known').size

    unknown_words = []
    embedding_matrix = (np.random.rand(len(word_index), embed_size) - 0.5) / 5.0
    
    for word, i in word_index.items():
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index.get_vector(word)
        else:
            word_lower = word.lower()
            if word_lower in embeddings_index:
                embedding_matrix[i] = embeddings_index.get_vector(word_lower)
            else:
                unknown_words.append((word, i))

    print('\nTotal unknowns ggle', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words

## GRU Model

In [5]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, h_size, n_layers, dropout, padding_idx, 
                 pretrained_embedding=None, fix_embedding=True):
        super(GRUModel, self).__init__()
        self.is_pretrained = pretrained_embedding is not None
        
        if self.is_pretrained:
            self.embed = nn.Embedding.from_pretrained(pretrained_embedding, freeze=fix_embedding)
            self.embed.padding_idx = padding_idx
        else:
            self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        
        self.embed_drop = nn.Dropout(dropout)
        self.gru = nn.GRU(embed_dim, h_size, n_layers, batch_first=True, bidirectional=True, dropout=dropout)
        
        self.out = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(2*n_layers*h_size, h_size),
            nn.BatchNorm1d(h_size),
            nn.PReLU(),
            nn.Linear(h_size, 1),
        )
        self.init_weights()

    def init_weights(self):
        if not self.is_pretrained:
            d = self.embed.weight.size(1)
            nn.init.uniform_(self.embed.weight, -1/np.sqrt(d), 1/np.sqrt(d))

    def forward(self, x):
        x = self.embed(x)
        x = self.embed_drop(x)
        x, _ = self.gru(x)
        x, _ = torch.max(x, 1)
        x = self.out(x).squeeze(1)
        return x

In [6]:
class DataFrameDataset(data.Dataset):
     """Class for using pandas DataFrames as a datasource"""
     def __init__(self, examples, fields, filter_pred=None):
         """
         Create a dataset from a pandas dataframe of examples and Fields
         Arguments:
             examples pd.DataFrame: DataFrame of examples
             fields {str: Field}: The Fields to use in this tuple. The
                 string is a field name, and the Field is the associated field.
             filter_pred (callable or None): use only exanples for which
                 filter_pred(example) is true, or use all examples if None.
                 Default is None
         """
         self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
         if filter_pred is not None:
             self.examples = filter(filter_pred, self.examples)
         self.fields = dict(fields)
         # Unpack field tuples
         for n, f in list(self.fields.items()):
             if isinstance(n, tuple):
                 self.fields.update(zip(n, f))
                 del self.fields[n]
                    
class SeriesExample(data.Example):
     """Class to convert a pandas Series to an Example"""

     @classmethod
     def fromSeries(cls, data, fields):
         return cls.fromdict(data.to_dict(), fields)

     @classmethod
     def fromdict(cls, data, fields):
         ex = cls()

         for key, field in fields.items():
             if key not in data:
                 raise ValueError("Specified key {} was not found in "
                 "the input data".format(key))
             if field is not None:
                 setattr(ex, key, field.preprocess(data[key]))
             else:
                 setattr(ex, key, data[key])
         return ex

# Simple wrapper to join torchtext and catalyst API

class IteratorWrapper(torch.utils.data.DataLoader):
    __initialized__ = False

    def __init__(self, iter: iter):
        self.batch_size = iter.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iter
        self.batch_sampler = iter
        self.__initialized__ = True

    def __iter__(self):
        return map(lambda batch: {
                    'features': batch.text,
                    'targets': batch.target,
                }, self.batch_sampler.__iter__())

    def __len__(self):
        return len(self.batch_sampler)

## Data preprocessing

In [7]:
import re


puncts = ',.":)(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║\
―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'


def clean_text(x, puncts=puncts): #добавляет пробелы вокруг пунктуации
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [8]:
df_train = pd.read_csv(f'{mydir}/train.csv', index_col=0)
df_train = df_train.rename(columns={'question_text': 'text'})

df_train['text'] = df_train['text'].progress_apply(str.lower)
df_train['text'] = df_train['text'].progress_apply(clean_text)
df_train['text'] = df_train['text'].progress_apply(clean_numbers)

df_train, df_test = train_test_split(df_train, train_size=0.7, random_state=SEED)

HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




In [9]:
max_len = 50

TEXT = data.Field(
                  postprocessing = lambda batch, vocab: [x[:max_len] for x in batch],
                  lower=True,
                  tokenize='spacy', 
                  tokenizer_language='en', 
                  batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

train = DataFrameDataset(df_train, fields={'text': TEXT, 'target': LABEL})
test = DataFrameDataset(df_test, fields={'text': TEXT, 'target': LABEL})

TEXT.build_vocab(train, test, min_freq=1)
LABEL.build_vocab(train)

train, valid = train.split(split_ratio=0.9)
len(train), len(valid)

(822856, 91429)

In [10]:
vocab_size = len(TEXT.vocab)
vocab_size

185888

In [11]:
# word_index = dict(TEXT.vocab.stoi)
# embedding_matrix_1, _ = load_glove(word_index)
# embedding_matrix_2, _ = load_wiki(word_index)
# embedding_matrix_3, _ = load_parag(word_index)
# embedding_matrix_4, _ = load_ggle(word_index)

# embedding_matrix = np.hstack((embedding_matrix_1, 
#                               embedding_matrix_2,
#                               embedding_matrix_3,
#                               embedding_matrix_4))
# del embedding_matrix_1, embedding_matrix_2, embedding_matrix_3, embedding_matrix_4

# joblib.dump(embedding_matrix, f'{mydir}/embedding_matrix')

embedding_matrix = joblib.load(f'{mydir}/embedding_matrix')

In [12]:
h_size = 128
num_epochs = 10 
n_layers = 1
dropout = 0.2
embed_dim = embedding_matrix.shape[1]
batch_size = 512


train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                                               batch_size=batch_size, 
                                                               sort_key=lambda x: len(x.text),
                                                               sort=True,
                                                               device=DEVICE)
train_iter = IteratorWrapper(train_iter)
valid_iter = IteratorWrapper(valid_iter)
test_iter = IteratorWrapper(test_iter)
loaders = {'train': train_iter, 'valid': valid_iter}


model = GRUModel(vocab_size=vocab_size, 
                 embed_dim=embed_dim, 
                 h_size=h_size, 
                 n_layers=n_layers, 
                 dropout=dropout, 
                 padding_idx=TEXT.vocab.stoi['<pad>'], 
                 pretrained_embedding=torch.tensor(embedding_matrix).float(), 
                 fix_embedding=True)


optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, patience=2, factor=0.5)


dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1



In [13]:
logdir = f'{mydir}/log_quora1'
!rm -rf {logdir}

In [14]:
# use SupervisedWandbRunner runner to send statistics to wandb
runner = dl.SupervisedWandbRunner(DEVICE)
runner.train(model, 
             loaders=loaders,
             num_epochs=num_epochs,
             logdir=logdir,
             criterion=nn.BCEWithLogitsLoss(),
             optimizer=optimizer, 
             scheduler=scheduler,  
             callbacks=[
                dl.callbacks.CheckpointCallback(2), # save 2 best models (by epoch) into logdir
                dl.callbacks.EarlyStoppingCallback(3), # stop training, if valid loss does not improve last 3 epochs
             ],
             # send current hyperparam values to wandb
             monitoring_params={
                 'entity': 'denaas', # your wandb username
                 'project': 'text-augmentation', # project name
                 'name': 'quora-embed-original', # name of the specific run
                 'group': 'examples',
                 'config': {
                     'model': 'bigru',
                     'optimizer': str(optimizer),
                     'scheduler': 'plateau',
                     'early_stop': 3,
                     'vocab_size': len(TEXT.vocab.stoi),
                     'h_size': h_size,
                     'n_layers': n_layers,
                     'dropout': dropout,
                     'batch_size': batch_size,
                     'embed_dim': embed_dim,
                     'max_len': max_len,
                 },
             },
#              check=True, # set if you want to check pipeline for correctness, without actual training
             verbose=True)



I0302 18:12:37.242854 139869237638976 run_manager.py:924] system metrics and metadata threads started
I0302 18:12:37.244071 139869237638976 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0302 18:12:37.543257 139869237638976 run_manager.py:951] resuming run from id: UnVuOnYxOjBhcDR6MTdtOnRleHQtYXVnbWVudGF0aW9uOmRlbmFhcw==
I0302 18:12:37.565651 139869237638976 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0302 18:12:37.842063 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/config.yaml
I0302 18:12:37.877508 139861820270336 run_manager.py:1048] saving patches
I0302 18:12:38.440521 139861820270336 run_manager.py:1052] saving pip packages
I0302 18:12:38.442026 139861820270336 run_manager.py:1054] initializing streaming files api
I0302 18:12:38.443096 139861820270336 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers
I0302 18:12:38.727312 1398618

1/10 * Epoch (train):  89% 1424/1608 [00:15<00:02, 84.98it/s, loss=0.138] 

I0302 18:12:55.610726 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


1/10 * Epoch (train): 100% 1608/1608 [00:18<00:00, 87.68it/s, loss=0.672]
1/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 162.19it/s, loss=0.329]


I0302 18:13:00.663387 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-history.jsonl
I0302 18:13:00.664492 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-summary.json


[2020-03-02 18:13:02,986] 
1/10 * Epoch 1 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=116159.7151 | _timers/batch_time=0.0068 | _timers/data_time=0.0056 | _timers/model_time=0.0012 | loss=0.1197
1/10 * Epoch 1 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=128642.3233 | _timers/batch_time=0.0046 | _timers/data_time=0.0036 | _timers/model_time=0.0010 | loss=0.1221


I0302 18:13:02.986194 139869237638976 logging.py:153] 
1/10 * Epoch 1 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=116159.7151 | _timers/batch_time=0.0068 | _timers/data_time=0.0056 | _timers/model_time=0.0012 | loss=0.1197
1/10 * Epoch 1 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=128642.3233 | _timers/batch_time=0.0046 | _timers/data_time=0.0036 | _timers/model_time=0.0010 | loss=0.1221


2/10 * Epoch (train):  55% 887/1608 [00:07<00:05, 121.62it/s, loss=0.072]

I0302 18:13:10.133784 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl


2/10 * Epoch (train):  69% 1115/1608 [00:09<00:04, 111.05it/s, loss=0.115]

I0302 18:13:12.134839 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


2/10 * Epoch (train): 100% 1608/1608 [00:15<00:00, 104.90it/s, loss=0.240]
2/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 160.08it/s, loss=0.287]


I0302 18:13:20.170768 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-history.jsonl
I0302 18:13:20.171578 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-summary.json
I0302 18:13:29.177090 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json
I0302 18:13:41.186951 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl
I0302 18:13:45.189335 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json
I0302 18:14:00.294060 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json
I0302 18:14:12.307579 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl
I0302 18:14:16.310575 139861851895552 run_ma

[2020-03-02 18:14:19,388] 
2/10 * Epoch 2 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120482.3016 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.1002
2/10 * Epoch 2 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=125724.3892 | _timers/batch_time=0.0046 | _timers/data_time=0.0036 | _timers/model_time=0.0010 | loss=0.1175


I0302 18:14:19.388062 139869237638976 logging.py:153] 
2/10 * Epoch 2 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120482.3016 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.1002
2/10 * Epoch 2 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=125724.3892 | _timers/batch_time=0.0046 | _timers/data_time=0.0036 | _timers/model_time=0.0010 | loss=0.1175


3/10 * Epoch (train):  91% 1461/1608 [00:13<00:01, 75.04it/s, loss=0.131] 

I0302 18:14:33.323537 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


3/10 * Epoch (train): 100% 1608/1608 [00:16<00:00, 96.90it/s, loss=0.121]
3/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 148.96it/s, loss=0.292]


I0302 18:14:37.954852 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-history.jsonl
I0302 18:14:37.957623 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-summary.json
I0302 18:14:42.958442 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl
I0302 18:14:48.961453 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


[2020-03-02 18:15:05,109] 
3/10 * Epoch 3 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=112558.8061 | _timers/batch_time=0.0054 | _timers/data_time=0.0042 | _timers/model_time=0.0011 | loss=0.0944
3/10 * Epoch 3 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=118660.6164 | _timers/batch_time=0.0049 | _timers/data_time=0.0039 | _timers/model_time=0.0010 | loss=0.1072


I0302 18:15:05.109247 139869237638976 logging.py:153] 
3/10 * Epoch 3 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=112558.8061 | _timers/batch_time=0.0054 | _timers/data_time=0.0042 | _timers/model_time=0.0011 | loss=0.0944
3/10 * Epoch 3 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=118660.6164 | _timers/batch_time=0.0049 | _timers/data_time=0.0039 | _timers/model_time=0.0010 | loss=0.1072


4/10 * Epoch (train):   3% 43/1608 [00:00<05:00,  5.21it/s, loss=0.084]

I0302 18:15:05.973800 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


4/10 * Epoch (train):  62% 991/1608 [00:11<00:05, 107.13it/s, loss=0.062]

I0302 18:15:16.483069 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl


4/10 * Epoch (train):  98% 1572/1608 [00:18<00:00, 56.47it/s, loss=0.175] 

I0302 18:15:23.498250 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


4/10 * Epoch (train): 100% 1608/1608 [00:19<00:00, 83.94it/s, loss=0.110]
4/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 153.41it/s, loss=0.313]


I0302 18:15:26.175103 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-history.jsonl
I0302 18:15:26.175870 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-summary.json
I0302 18:15:41.882728 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


[2020-03-02 18:15:45,530] 
4/10 * Epoch 4 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=112276.2091 | _timers/batch_time=0.0069 | _timers/data_time=0.0057 | _timers/model_time=0.0011 | loss=0.0895
4/10 * Epoch 4 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120393.5368 | _timers/batch_time=0.0048 | _timers/data_time=0.0038 | _timers/model_time=0.0010 | loss=0.1083


I0302 18:15:45.530226 139869237638976 logging.py:153] 
4/10 * Epoch 4 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=112276.2091 | _timers/batch_time=0.0069 | _timers/data_time=0.0057 | _timers/model_time=0.0011 | loss=0.0895
4/10 * Epoch 4 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120393.5368 | _timers/batch_time=0.0048 | _timers/data_time=0.0038 | _timers/model_time=0.0010 | loss=0.1083


5/10 * Epoch (train):  23% 369/1608 [00:03<00:09, 129.20it/s, loss=0.046]

I0302 18:15:48.938265 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl


5/10 * Epoch (train):  77% 1246/1608 [00:11<00:03, 91.35it/s, loss=0.106] 

I0302 18:15:56.948723 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


5/10 * Epoch (train): 100% 1608/1608 [00:16<00:00, 95.29it/s, loss=0.062]
5/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 150.28it/s, loss=0.324]


I0302 18:16:04.360232 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-history.jsonl
I0302 18:16:04.360900 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-summary.json
I0302 18:16:13.366167 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json
I0302 18:16:20.370108 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl


[2020-03-02 18:16:26,908] 
5/10 * Epoch 5 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=111920.5808 | _timers/batch_time=0.0054 | _timers/data_time=0.0042 | _timers/model_time=0.0012 | loss=0.0852
5/10 * Epoch 5 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120370.8362 | _timers/batch_time=0.0049 | _timers/data_time=0.0039 | _timers/model_time=0.0010 | loss=0.1088


I0302 18:16:26.908911 139869237638976 logging.py:153] 
5/10 * Epoch 5 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=111920.5808 | _timers/batch_time=0.0054 | _timers/data_time=0.0042 | _timers/model_time=0.0012 | loss=0.0852
5/10 * Epoch 5 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120370.8362 | _timers/batch_time=0.0049 | _timers/data_time=0.0039 | _timers/model_time=0.0010 | loss=0.1088


6/10 * Epoch (train):  17% 281/1608 [00:02<00:10, 126.11it/s, loss=0.047]

I0302 18:16:29.437052 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json


6/10 * Epoch (train): 100% 1608/1608 [00:16<00:00, 99.99it/s, loss=0.063] 
6/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 158.69it/s, loss=0.344]


I0302 18:16:44.857912 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-history.jsonl
I0302 18:16:44.858759 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-summary.json
I0302 18:16:45.859146 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json
I0302 18:16:50.863534 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl


Early stop at 5 epoch
[2020-03-02 18:16:59,211] 
6/10 * Epoch 6 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=117947.9777 | _timers/batch_time=0.0051 | _timers/data_time=0.0040 | _timers/model_time=0.0011 | loss=0.0809
6/10 * Epoch 6 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=129336.7603 | _timers/batch_time=0.0046 | _timers/data_time=0.0036 | _timers/model_time=0.0010 | loss=0.1119


I0302 18:16:59.211848 139869237638976 logging.py:153] 
6/10 * Epoch 6 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=117947.9777 | _timers/batch_time=0.0051 | _timers/data_time=0.0040 | _timers/model_time=0.0011 | loss=0.0809
6/10 * Epoch 6 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=129336.7603 | _timers/batch_time=0.0046 | _timers/data_time=0.0036 | _timers/model_time=0.0010 | loss=0.1119
I0302 18:16:59.215524 139869237638976 run_manager.py:1068] shutting down system stats and metadata service
I0302 18:16:59.334183 139869237638976 run_manager.py:1080] stopping streaming files and file change observer
I0302 18:16:59.336037 139861851895552 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-events.jsonl
I0302 18:16:59.343093 139869237638976 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json
I0302 18:16:59.344762 139869237638976 run_manager.py:677] file/dir created: /tmp

Top best models:
/data2/competitions/quora-insincere-questions-classification/log_quora1_dup/checkpoints/train.3.pth	0.1072
/data2/competitions/quora-insincere-questions-classification/log_quora1_dup/checkpoints/train.4.pth	0.1083


In [15]:
dl.utils.unpack_checkpoint(dl.utils.load_checkpoint(f'{logdir}/checkpoints/best_full.pth'), model=model)

In [16]:
import scipy


# find threshold
runner = dl.SupervisedRunner()
y_proba = runner.predict_loader(model, valid_iter)
y_proba = 1 / (1 + np.exp(-y_proba))
y_true = np.concatenate([x['targets'].cpu().numpy() for x in valid_iter])
res = scipy.optimize.minimize(
    lambda t: -metrics.f1_score(y_true, (y_proba >= t).astype(np.int)),
    x0=0.5,
    method='Nelder-Mead',
    tol=1e-3,
)
threshold = res.x[0]


runner = dl.SupervisedRunner()
y_proba = runner.predict_loader(model, test_iter)
y_proba = 1 / (1 + np.exp(-y_proba))
y_true = np.concatenate([x['targets'].cpu().numpy() for x in test_iter])

auc_test = metrics.roc_auc_score(y_true, y_proba)
f1_test = metrics.f1_score(y_true, (y_proba >= threshold).astype(np.int))

print(f1_test, threshold, auc_test)
wandb.log({'scores/f1': f1_test, 'scores/f1_threshold': threshold, 'scores/f1_auc': auc_test})

0.6731092270903737 0.4875 0.9639730904008862


I0302 18:17:11.432411 139869237638976 run_manager.py:924] system metrics and metadata threads started
I0302 18:17:11.435420 139869237638976 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0302 18:17:12.047194 139861811877632 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_151236-0ap4z17m/wandb-metadata.json
I0302 18:17:13.733177 139869237638976 run_manager.py:951] resuming run from id: UnVuOnYxOjBhcDR6MTdtOnRleHQtYXVnbWVudGF0aW9uOmRlbmFhcw==
I0302 18:17:13.756480 139869237638976 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0302 18:17:14.024659 139861354964736 run_manager.py:1048] saving patches
I0302 18:17:14.610379 139861354964736 run_manager.py:1052] saving pip packages
I0302 18:17:14.612020 139861354964736 run_manager.py:1054] initializing streaming files api
I0302 18:17:14.613062 139861354964736 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers
I0302 18:17:14.614069