In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from catalyst import dl
import wandb
import joblib


DEVICE = 'cuda'
mydir = '/data2/competitions/quora-insincere-questions-classification'
SEED = 1234

tqdm.pandas()
# seed everything
import os
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from pandas import Panel
I0302 18:19:43.302976 140359865444160 file_utils.py:41] PyTorch version 1.4.0 available.
I0302 18:19:43.909866 140359865444160 file_utils.py:57] TensorFlow version 2.0.0 available.

The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version



In [2]:
def get_param_size(model, trainable=True):
    if trainable:
        psize = np.sum([np.prod(p.size()) for p in model.parameters() if p.requires_grad])
    else:
        psize = np.sum([np.prod(p.size()) for p in model.parameters()])
    return psize

## EMA

In [3]:
# https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856
class EMA():
    def __init__(self, model, mu, level='batch', n=1):
        """
        level: 'batch' or 'epoch'
          'batch': Update params every n batches.
          'epoch': Update params every epoch.
        """
        # self.ema_model = copy.deepcopy(model)
        self.mu = mu
        self.level = level
        self.n = n
        self.cnt = self.n
        self.shadow = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data

    def _update(self, model):
        for name, param in model.named_parameters():
            if param.requires_grad:
                new_average = (1 - self.mu) * param.data + self.mu * self.shadow[name]
                self.shadow[name] = new_average.clone()

    def set_weights(self, ema_model):
        for name, param in ema_model.named_parameters():
            if param.requires_grad:
                param.data = self.shadow[name]

    def on_batch_end(self, model):
        if self.level is 'batch':
            self.cnt -= 1
            if self.cnt == 0:
                self._update(model)
                self.cnt = self.n

    def on_epoch_end(self, model):
        if self.level is 'epoch':
            self._update(model)



## Loading embeddings

In [4]:
import gc
from gensim.models import KeyedVectors


def load_glove(word_index):
    def get_coefs(word, *arr): 
        return word, np.asarray(arr, dtype='float32')
    EMBEDDING_FILE = f'{mydir}/embeddings/glove.840B.300d/glove.840B.300d.txt'
    embeddings_index = dict(get_coefs(*o.split(' ')) for o in open(EMBEDDING_FILE))

    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    unknown_words = []
    
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index), embed_size))
        
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is None:
                unknown_words.append((word, i))
            else:
                embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = embedding_vector
    print('\nTotal unknowns glove', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words


def load_wiki(word_index):
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    EMBEDDING_FILE = f'{mydir}/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    embeddings_index = dict(get_coefs(*o.split(' ')) for o in open(EMBEDDING_FILE) if len(o) > 100)

    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    unknown_words = []
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index), embed_size))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is None:
                unknown_words.append((word, i))
            else:
                embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = embedding_vector
    print('\nTotal unknowns wiki', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words


def load_parag(word_index):
    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    EMBEDDING_FILE = f'{mydir}/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    embeddings_index = dict(get_coefs(*o.split(' '))
                            for o in open(EMBEDDING_FILE, encoding='utf8', errors='ignore')
                            if len(o) > 100)

    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    unknown_words = []
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index), embed_size))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is None:
                unknown_words.append((word, i))
            else:
                embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = embedding_vector
    print('\nTotal unknowns parag', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words


# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
def load_ggle(word_index):
    EMBEDDING_FILE = f'{mydir}/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
    embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
    embed_size = embeddings_index.get_vector('known').size

    unknown_words = []
    embedding_matrix = (np.random.rand(len(word_index), embed_size) - 0.5) / 5.0
    
    for word, i in word_index.items():
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index.get_vector(word)
        else:
            word_lower = word.lower()
            if word_lower in embeddings_index:
                embedding_matrix[i] = embeddings_index.get_vector(word_lower)
            else:
                unknown_words.append((word, i))

    print('\nTotal unknowns ggle', len(unknown_words))
    print(unknown_words[-10:])

    del embeddings_index
    gc.collect()
    return embedding_matrix, unknown_words

## GRU Model

In [5]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, h_size, n_layers, dropout, padding_idx, 
                 pretrained_embedding=None, fix_embedding=True):
        super(GRUModel, self).__init__()
        self.is_pretrained = pretrained_embedding is not None
        
        if self.is_pretrained:
            self.embed = nn.Embedding.from_pretrained(pretrained_embedding, freeze=fix_embedding)
            self.embed.padding_idx = padding_idx
        else:
            self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
            
        self.embed_drop = nn.Dropout(dropout)
        self.gru = nn.GRU(embed_dim, h_size, n_layers, batch_first=True, bidirectional=True, dropout=dropout)
        
        self.out = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(2*n_layers*h_size, h_size),
            nn.BatchNorm1d(h_size),
            nn.PReLU(),
            nn.Linear(h_size, 1),
        )
        self.init_weights()

    def init_weights(self):
        if not self.is_pretrained:
            d = self.embed.weight.size(1)
            nn.init.uniform_(self.embed.weight, -1/np.sqrt(d), 1/np.sqrt(d))

    def forward(self, x):
        x = self.embed(x)
        x = self.embed_drop(x)
        x, _ = self.gru(x)
        x, _ = torch.max(x, 1)
        x = self.out(x).squeeze(1)
        return x

In [6]:
class DataFrameDataset(data.Dataset):
     """Class for using pandas DataFrames as a datasource"""
     def __init__(self, examples, fields, filter_pred=None):
         """
         Create a dataset from a pandas dataframe of examples and Fields
         Arguments:
             examples pd.DataFrame: DataFrame of examples
             fields {str: Field}: The Fields to use in this tuple. The
                 string is a field name, and the Field is the associated field.
             filter_pred (callable or None): use only exanples for which
                 filter_pred(example) is true, or use all examples if None.
                 Default is None
         """
         self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
         if filter_pred is not None:
             self.examples = filter(filter_pred, self.examples)
         self.fields = dict(fields)
         # Unpack field tuples
         for n, f in list(self.fields.items()):
             if isinstance(n, tuple):
                 self.fields.update(zip(n, f))
                 del self.fields[n]
                    
class SeriesExample(data.Example):
     """Class to convert a pandas Series to an Example"""

     @classmethod
     def fromSeries(cls, data, fields):
         return cls.fromdict(data.to_dict(), fields)

     @classmethod
     def fromdict(cls, data, fields):
         ex = cls()

         for key, field in fields.items():
             if key not in data:
                 raise ValueError("Specified key {} was not found in "
                 "the input data".format(key))
             if field is not None:
                 setattr(ex, key, field.preprocess(data[key]))
             else:
                 setattr(ex, key, data[key])
         return ex

# Simple wrapper to join torchtext and catalyst API

class IteratorWrapper(torch.utils.data.DataLoader):
    __initialized__ = False

    def __init__(self, iter: iter):
        self.batch_size = iter.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iter
        self.batch_sampler = iter
        self.__initialized__ = True

    def __iter__(self):
        return map(lambda batch: {
                    'features': batch.text,
                    'targets': batch.target,
                }, self.batch_sampler.__iter__())

    def __len__(self):
        return len(self.batch_sampler)

## Data preprocessing

In [7]:
import re


puncts = ',.":)(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║\
―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'


def clean_text(x, puncts=puncts): #добавляет пробелы вокруг пунктуации
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [8]:
df_train = pd.read_csv(f'{mydir}/train.csv', index_col=0)
df_train = df_train.rename(columns={'question_text': 'text'})

df_train['text'] = df_train['text'].progress_apply(str.lower)
df_train['text'] = df_train['text'].progress_apply(clean_text)
df_train['text'] = df_train['text'].progress_apply(clean_numbers)

df_train, df_test = train_test_split(df_train, train_size=0.7, random_state=SEED)

augmented = pd.read_csv(f'{mydir}/augmented_fairseq.csv', index_col=0)
augmented = augmented.rename(columns={'question_text': 'text'})
augmented['text'] = augmented['text'].str[2:-2]

augmented['text'] = augmented['text'].progress_apply(str.lower)
augmented['text'] = augmented['text'].progress_apply(clean_text)
augmented['text'] = augmented['text'].progress_apply(clean_numbers)

df_train = pd.concat([df_train, augmented]).reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=914280.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=914280.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=914280.0), HTML(value='')))




In [9]:
max_len = 50

TEXT = data.Field(
                  postprocessing = lambda batch, vocab: [x[:max_len] for x in batch],
                  lower=True,
                  tokenize='spacy', 
                  tokenizer_language='en', 
                  batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

train = DataFrameDataset(df_train, fields={'text': TEXT, 'target': LABEL})
test = DataFrameDataset(df_test, fields={'text': TEXT, 'target': LABEL})

TEXT.build_vocab(train, test, min_freq=1)
LABEL.build_vocab(train)

train, valid = train.split(split_ratio=0.9)
len(train), len(valid)

(1645708, 182857)

In [10]:
vocab_size = len(TEXT.vocab)
vocab_size

195394

In [11]:
word_index = dict(TEXT.vocab.stoi)
embedding_matrix_1, _ = load_glove(word_index)
embedding_matrix_2, _ = load_wiki(word_index)
embedding_matrix_3, _ = load_parag(word_index)
embedding_matrix_4, _ = load_ggle(word_index)

embedding_matrix = np.hstack((embedding_matrix_1, 
                              embedding_matrix_2,
                              embedding_matrix_3,
                              embedding_matrix_4))
del embedding_matrix_1, embedding_matrix_2, embedding_matrix_3, embedding_matrix_4

joblib.dump(embedding_matrix, f'{mydir}/embedding_matrix_dup')


Total unknowns glove 74234
[('할', 195382), ('했다', 195383), ('행복하게', 195384), ('혀', 195385), ('호', 195386), ('흡', 195387), ('\uf0d8what', 195388), ('\ufeffwhat', 195389), ('ｈow', 195392), ('ｘ', 195393)]

Total unknowns wiki 102249
[('했다', 195383), ('행복하게', 195384), ('혀', 195385), ('호', 195386), ('흡', 195387), ('\uf0d8what', 195388), ('\ufeffwhat', 195389), ('＄', 195390), ('ｈow', 195392), ('ｘ', 195393)]

Total unknowns parag 53170
[('할', 195382), ('했다', 195383), ('행복하게', 195384), ('혀', 195385), ('호', 195386), ('흡', 195387), ('\uf0d8what', 195388), ('\ufeffwhat', 195389), ('ｈow', 195392), ('ｘ', 195393)]


I0302 18:28:48.087625 140359865444160 utils_any2vec.py:341] loading projection weights from /data2/competitions/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin
I0302 18:29:34.826177 140359865444160 utils_any2vec.py:405] loaded (3000000, 300) matrix from /data2/competitions/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin



Total unknowns ggle 116591
[('행복하게', 195384), ('혀', 195385), ('호', 195386), ('흡', 195387), ('\uf0d8what', 195388), ('\ufeffwhat', 195389), ('＄', 195390), ('＞', 195391), ('ｈow', 195392), ('ｘ', 195393)]


['/data2/competitions/quora-insincere-questions-classification/embedding_matrix_dup']

In [11]:
embedding_matrix = joblib.load(f'{mydir}/embedding_matrix_dup')

In [12]:
h_size = 128
num_epochs = 10 
n_layers = 1
dropout = 0.1
embed_dim = embedding_matrix.shape[1]
batch_size = 512


train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                                               batch_size=batch_size, 
                                                               sort_key=lambda x: len(x.text),
                                                               sort=True,
                                                               device=DEVICE)
train_iter = IteratorWrapper(train_iter)
valid_iter = IteratorWrapper(valid_iter)
test_iter = IteratorWrapper(test_iter)
loaders = {'train': train_iter, 'valid': valid_iter}


model = GRUModel(vocab_size=vocab_size, 
                 embed_dim=embed_dim, 
                 h_size=h_size, 
                 n_layers=n_layers, 
                 dropout=dropout, 
                 padding_idx=TEXT.vocab.stoi['<pad>'], 
                 pretrained_embedding=torch.tensor(embedding_matrix).float(), 
                 fix_embedding=True)


optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, patience=2, factor=0.5)


dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.1 and num_layers=1



In [13]:
logdir = f'{mydir}/log_quora1_dup'
!rm -rf {logdir}

In [14]:
# use SupervisedWandbRunner runner to send statistics to wandb
runner = dl.SupervisedWandbRunner(DEVICE)
runner.train(model, 
             loaders=loaders,
             num_epochs=num_epochs,
             logdir=logdir,
             criterion=nn.BCEWithLogitsLoss(),
             optimizer=optimizer, 
             scheduler=scheduler,  
             callbacks=[
                dl.callbacks.CheckpointCallback(2), # save 2 best models (by epoch) into logdir
                dl.callbacks.EarlyStoppingCallback(3), # stop training, if valid loss does not improve last 3 epochs
             ],
             # send current hyperparam values to wandb
             monitoring_params={
                 'entity': 'denaas', # your wandb username
                 'project': 'text-augmentation', # project name
                 'name': 'quora-embed-dup', # name of the specific run
                 'group': 'examples',
                 'config': {
                     'model': 'bigru',
                     'optimizer': str(optimizer),
                     'scheduler': 'plateau',
                     'early_stop': 3,
                     'vocab_size': vocab_size,
                     'h_size': h_size,
                     'n_layers': n_layers,
                     'dropout': dropout,
                     'batch_size': batch_size,
                     'embed_dim': embed_dim,
                     'max_len': max_len,
                 },
             },
#              check=True, # set if you want to check pipeline for correctness, without actual training
             verbose=True)



I0302 18:29:43.742481 140359865444160 run_manager.py:924] system metrics and metadata threads started
I0302 18:29:43.743592 140359865444160 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0302 18:29:44.008087 140359865444160 run_manager.py:951] resuming run from id: UnVuOnYxOmJqZHp6YjZhOnRleHQtYXVnbWVudGF0aW9uOmRlbmFhcw==
I0302 18:29:44.027082 140359865444160 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0302 18:29:44.320137 140352964880128 run_manager.py:1048] saving patches
I0302 18:29:44.371698 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/config.yaml
I0302 18:29:45.041044 140352964880128 run_manager.py:1052] saving pip packages
I0302 18:29:45.042504 140352964880128 run_manager.py:1054] initializing streaming files api
I0302 18:29:45.043643 140352964880128 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers
I0302 18:29:45.350311 1403529

1/10 * Epoch (train):  51% 1631/3215 [00:13<00:12, 129.34it/s, loss=0.059]

I0302 18:30:00.632044 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


1/10 * Epoch (train):  94% 3018/3215 [00:26<00:02, 82.47it/s, loss=0.166] 

I0302 18:30:13.648898 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


1/10 * Epoch (train): 100% 3205/3215 [00:29<00:00, 49.03it/s, loss=0.244]

I0302 18:30:16.656267 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


1/10 * Epoch (train): 100% 3215/3215 [00:29<00:00, 107.98it/s, loss=0.353]
1/10 * Epoch (valid): 100% 358/358 [00:02<00:00, 158.53it/s, loss=0.314]


I0302 18:30:19.708699 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-history.jsonl
I0302 18:30:19.710001 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-summary.json


[2020-03-02 18:30:22,139] 
1/10 * Epoch 1 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=123572.9128 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.1126
1/10 * Epoch 1 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=126760.2132 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0009 | loss=0.1213


I0302 18:30:22.139730 140359865444160 logging.py:153] 
1/10 * Epoch 1 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=123572.9128 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.1126
1/10 * Epoch 1 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=126760.2132 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0009 | loss=0.1213


2/10 * Epoch (train):  36% 1152/3215 [00:10<00:16, 121.56it/s, loss=0.048]

I0302 18:30:32.413002 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


2/10 * Epoch (train):  77% 2474/3215 [00:22<00:08, 91.82it/s, loss=0.094] 

I0302 18:30:44.418489 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


2/10 * Epoch (train):  88% 2820/3215 [00:26<00:04, 85.28it/s, loss=0.153]

I0302 18:30:48.425047 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


2/10 * Epoch (train): 100% 3215/3215 [00:32<00:00, 99.17it/s, loss=0.210]
2/10 * Epoch (valid): 100% 358/358 [00:07<00:00, 48.22it/s, loss=0.237] 


I0302 18:31:02.786560 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-history.jsonl
I0302 18:31:02.787737 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-summary.json
I0302 18:31:09.791024 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json
I0302 18:31:19.795473 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl
I0302 18:31:25.799352 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


[2020-03-02 18:31:35,077] 
2/10 * Epoch 2 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=113461.0343 | _timers/batch_time=0.0053 | _timers/data_time=0.0041 | _timers/model_time=0.0011 | loss=0.0982
2/10 * Epoch 2 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=119237.6418 | _timers/batch_time=0.0189 | _timers/data_time=0.0179 | _timers/model_time=0.0010 | loss=0.1142


I0302 18:31:35.077609 140359865444160 logging.py:153] 
2/10 * Epoch 2 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=113461.0343 | _timers/batch_time=0.0053 | _timers/data_time=0.0041 | _timers/model_time=0.0011 | loss=0.0982
2/10 * Epoch 2 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=119237.6418 | _timers/batch_time=0.0189 | _timers/data_time=0.0179 | _timers/model_time=0.0010 | loss=0.1142


3/10 * Epoch (train):  19% 625/3215 [00:05<00:19, 130.49it/s, loss=0.073]

I0302 18:31:40.894947 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


3/10 * Epoch (train):  58% 1879/3215 [00:15<00:11, 120.47it/s, loss=0.055]

I0302 18:31:50.909386 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


3/10 * Epoch (train):  79% 2538/3215 [00:21<00:06, 104.32it/s, loss=0.138]

I0302 18:31:56.921505 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


3/10 * Epoch (train): 100% 3215/3215 [00:30<00:00, 104.73it/s, loss=0.130]
3/10 * Epoch (valid): 100% 358/358 [00:02<00:00, 158.78it/s, loss=0.270]


I0302 18:32:08.943109 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-history.jsonl
I0302 18:32:08.943978 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-summary.json
I0302 18:32:12.945218 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json
I0302 18:32:21.954688 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl
I0302 18:32:28.959042 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


[2020-03-02 18:32:36,919] 
3/10 * Epoch 3 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=118075.0616 | _timers/batch_time=0.0050 | _timers/data_time=0.0039 | _timers/model_time=0.0011 | loss=0.0911
3/10 * Epoch 3 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=125208.4177 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0010 | loss=0.1113


I0302 18:32:36.919386 140359865444160 logging.py:153] 
3/10 * Epoch 3 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=118075.0616 | _timers/batch_time=0.0050 | _timers/data_time=0.0039 | _timers/model_time=0.0011 | loss=0.0911
3/10 * Epoch 3 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=125208.4177 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0010 | loss=0.1113


4/10 * Epoch (train):  31% 995/3215 [00:08<00:17, 129.27it/s, loss=0.060]

I0302 18:32:45.016277 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


4/10 * Epoch (train):  61% 1947/3215 [00:16<00:11, 110.97it/s, loss=0.065]

I0302 18:32:53.025110 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


4/10 * Epoch (train):  87% 2789/3215 [00:24<00:04, 95.21it/s, loss=0.140] 

I0302 18:33:01.045060 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


4/10 * Epoch (train): 100% 3215/3215 [00:30<00:00, 106.92it/s, loss=0.107]
4/10 * Epoch (valid): 100% 358/358 [00:02<00:00, 167.60it/s, loss=0.236]


I0302 18:33:10.130794 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-history.jsonl
I0302 18:33:10.131500 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-summary.json
I0302 18:33:17.134274 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json
I0302 18:33:23.137892 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl
I0302 18:33:33.143790 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


[2020-03-02 18:33:38,327] 
4/10 * Epoch 4 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=121530.4187 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.0847
4/10 * Epoch 4 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=134389.2059 | _timers/batch_time=0.0044 | _timers/data_time=0.0035 | _timers/model_time=0.0009 | loss=0.1099


I0302 18:33:38.327310 140359865444160 logging.py:153] 
4/10 * Epoch 4 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=121530.4187 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.0847
4/10 * Epoch 4 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=134389.2059 | _timers/batch_time=0.0044 | _timers/data_time=0.0035 | _timers/model_time=0.0009 | loss=0.1099


5/10 * Epoch (train):  42% 1344/3215 [00:10<00:14, 129.12it/s, loss=0.050]

I0302 18:33:49.245618 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


5/10 * Epoch (train):  60% 1940/3215 [00:15<00:11, 107.30it/s, loss=0.094]

I0302 18:33:54.251821 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


5/10 * Epoch (train):  94% 3031/3215 [00:26<00:02, 80.37it/s, loss=0.188] 

I0302 18:34:05.265293 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


5/10 * Epoch (train): 100% 3215/3215 [00:29<00:00, 107.37it/s, loss=0.067]
5/10 * Epoch (valid): 100% 358/358 [00:02<00:00, 156.88it/s, loss=0.271]


I0302 18:34:11.279843 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-history.jsonl
I0302 18:34:11.280933 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-summary.json
I0302 18:34:22.585362 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json
I0302 18:34:25.586606 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


[2020-03-02 18:34:25,760] 
5/10 * Epoch 5 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=122039.9658 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.0790
5/10 * Epoch 5 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=128173.4590 | _timers/batch_time=0.0047 | _timers/data_time=0.0038 | _timers/model_time=0.0009 | loss=0.1140


I0302 18:34:25.760475 140359865444160 logging.py:153] 
5/10 * Epoch 5 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=122039.9658 | _timers/batch_time=0.0049 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.0790
5/10 * Epoch 5 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=128173.4590 | _timers/batch_time=0.0047 | _timers/data_time=0.0038 | _timers/model_time=0.0009 | loss=0.1140


6/10 * Epoch (train):  42% 1353/3215 [00:11<00:14, 124.45it/s, loss=0.048]

I0302 18:34:42.360959 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


6/10 * Epoch (train):  94% 3035/3215 [00:33<00:02, 70.73it/s, loss=0.148] 

I0302 18:34:59.380938 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


6/10 * Epoch (train):  96% 3098/3215 [00:34<00:01, 62.56it/s, loss=0.133]

I0302 18:35:00.385976 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


6/10 * Epoch (train): 100% 3215/3215 [00:37<00:00, 86.76it/s, loss=0.065]
6/10 * Epoch (valid): 100% 358/358 [00:02<00:00, 153.05it/s, loss=0.294]


I0302 18:35:05.633958 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-history.jsonl
I0302 18:35:05.634685 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-summary.json
I0302 18:35:15.955268 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


[2020-03-02 18:35:20,083] 
6/10 * Epoch 6 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=115520.0736 | _timers/batch_time=0.0068 | _timers/data_time=0.0057 | _timers/model_time=0.0011 | loss=0.0742
6/10 * Epoch 6 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=122178.5187 | _timers/batch_time=0.0048 | _timers/data_time=0.0039 | _timers/model_time=0.0009 | loss=0.1105


I0302 18:35:20.083850 140359865444160 logging.py:153] 
6/10 * Epoch 6 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=115520.0736 | _timers/batch_time=0.0068 | _timers/data_time=0.0057 | _timers/model_time=0.0011 | loss=0.0742
6/10 * Epoch 6 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=122178.5187 | _timers/batch_time=0.0048 | _timers/data_time=0.0039 | _timers/model_time=0.0009 | loss=0.1105


7/10 * Epoch (train):  40% 1282/3215 [00:11<00:15, 122.04it/s, loss=0.024]

I0302 18:35:31.167263 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


7/10 * Epoch (train):  44% 1406/3215 [00:12<00:14, 124.59it/s, loss=0.061]

I0302 18:35:32.170906 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl


7/10 * Epoch (train):  92% 2959/3215 [00:27<00:03, 78.96it/s, loss=0.122] 

I0302 18:35:47.190322 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


7/10 * Epoch (train): 100% 3215/3215 [00:31<00:00, 102.31it/s, loss=0.054]
7/10 * Epoch (valid): 100% 358/358 [00:02<00:00, 158.33it/s, loss=0.284]


I0302 18:35:54.230973 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-history.jsonl
I0302 18:35:54.231799 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-summary.json
I0302 18:36:03.236207 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl
I0302 18:36:03.244806 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json


Early stop at 6 epoch
[2020-03-02 18:36:08,848] 
7/10 * Epoch 7 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=116627.4067 | _timers/batch_time=0.0051 | _timers/data_time=0.0040 | _timers/model_time=0.0011 | loss=0.0700
7/10 * Epoch 7 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=126607.9177 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0009 | loss=0.1157


I0302 18:36:08.848874 140359865444160 logging.py:153] 
7/10 * Epoch 7 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=116627.4067 | _timers/batch_time=0.0051 | _timers/data_time=0.0040 | _timers/model_time=0.0011 | loss=0.0700
7/10 * Epoch 7 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=126607.9177 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0009 | loss=0.1157
I0302 18:36:08.852340 140359865444160 run_manager.py:1068] shutting down system stats and metadata service


Top best models:
/data2/competitions/quora-insincere-questions-classification/log_quora1_dup/checkpoints/train.4.pth	0.1099
/data2/competitions/quora-insincere-questions-classification/log_quora1_dup/checkpoints/train.6.pth	0.1105


I0302 18:36:09.238575 140359865444160 run_manager.py:1080] stopping streaming files and file change observer
I0302 18:36:09.242645 140352999483136 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-events.jsonl
I0302 18:36:09.246645 140359865444160 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json
I0302 18:36:09.248886 140359865444160 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_152942-bjdzzb6a/log.txt
I0302 18:36:09.251935 140359865444160 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_152942-bjdzzb6a/valid_log/events.out.tfevents.1583163016.UNIT-1482.9632.1
I0302 18:36:09.252740 140359865444160 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_152942-bjdzzb6a/train_log/events.out.tfevents.1583162987.UNIT-1482.9632.0
I0302 18:36:09.253377 140359865444160 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_152942-bjdzzb6a/train_log
I0302 18:36:09.25401

In [15]:
dl.utils.unpack_checkpoint(dl.utils.load_checkpoint(f'{logdir}/checkpoints/best_full.pth'), model=model)

In [16]:
import scipy


# find threshold
runner = dl.SupervisedRunner()
y_proba = runner.predict_loader(model, valid_iter)
y_proba = 1 / (1 + np.exp(-y_proba))
y_true = np.concatenate([x['targets'].cpu().numpy() for x in valid_iter])
res = scipy.optimize.minimize(
    lambda t: -metrics.f1_score(y_true, (y_proba >= t).astype(np.int)),
    x0=0.5,
    method='Nelder-Mead',
    tol=1e-3,
)
threshold = res.x[0]


runner = dl.SupervisedRunner()
y_proba = runner.predict_loader(model, test_iter)
y_proba = 1 / (1 + np.exp(-y_proba))
y_true = np.concatenate([x['targets'].cpu().numpy() for x in test_iter])

auc_test = metrics.roc_auc_score(y_true, y_proba)
f1_test = metrics.f1_score(y_true, (y_proba >= threshold).astype(np.int))

print(f1_test, threshold, auc_test)
wandb.log({'scores/f1': f1_test, 'scores/f1_threshold': threshold, 'scores/f1_auc': auc_test})

0.6905000556854883 0.31874999999999987 0.9697459995230024


I0302 18:36:26.615827 140359865444160 run_manager.py:924] system metrics and metadata threads started
I0302 18:36:26.616446 140359865444160 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0302 18:36:27.233987 140352956487424 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_152942-bjdzzb6a/wandb-metadata.json
I0302 18:36:27.633795 140359865444160 run_manager.py:951] resuming run from id: UnVuOnYxOmJqZHp6YjZhOnRleHQtYXVnbWVudGF0aW9uOmRlbmFhcw==
I0302 18:36:27.669797 140359865444160 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0302 18:36:27.936141 140352899876608 run_manager.py:1048] saving patches
I0302 18:36:28.545298 140352899876608 run_manager.py:1052] saving pip packages
I0302 18:36:28.547137 140352899876608 run_manager.py:1054] initializing streaming files api
I0302 18:36:28.548027 140352899876608 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers
I0302 18:36:28.549411