In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from catalyst import dl
import wandb
import joblib


DEVICE = 'cuda'
# your working dir
mydir = '/data2/competitions/quora-insincere-questions-classification'
SEED = 1234

tqdm.pandas()
# seed everything
import os
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from pandas import Panel
I0302 18:21:31.321669 140447729411904 file_utils.py:41] PyTorch version 1.4.0 available.
I0302 18:21:31.967522 140447729411904 file_utils.py:57] TensorFlow version 2.0.0 available.

The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version



In [2]:
def get_param_size(model, trainable=True):
    if trainable:
        psize = np.sum([np.prod(p.size()) for p in model.parameters() if p.requires_grad])
    else:
        psize = np.sum([np.prod(p.size()) for p in model.parameters()])
    return psize

## EMA

In [3]:
# https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856
class EMA():
    def __init__(self, model, mu, level='batch', n=1):
        """
        level: 'batch' or 'epoch'
          'batch': Update params every n batches.
          'epoch': Update params every epoch.
        """
        # self.ema_model = copy.deepcopy(model)
        self.mu = mu
        self.level = level
        self.n = n
        self.cnt = self.n
        self.shadow = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data

    def _update(self, model):
        for name, param in model.named_parameters():
            if param.requires_grad:
                new_average = (1 - self.mu) * param.data + self.mu * self.shadow[name]
                self.shadow[name] = new_average.clone()

    def set_weights(self, ema_model):
        for name, param in ema_model.named_parameters():
            if param.requires_grad:
                param.data = self.shadow[name]

    def on_batch_end(self, model):
        if self.level is 'batch':
            self.cnt -= 1
            if self.cnt == 0:
                self._update(model)
                self.cnt = self.n

    def on_epoch_end(self, model):
        if self.level is 'epoch':
            self._update(model)



## GRU Model

In [4]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, h_size, n_layers, dropout, padding_idx, 
                 pretrained_embedding=None, fix_embedding=True):
        super(GRUModel, self).__init__()
        self.is_pretrained = pretrained_embedding is not None
        
        if self.is_pretrained:
            self.embed = nn.Embedding.from_pretrained(pretrained_embedding, freeze=fix_embedding)
            self.embed.padding_idx = padding_idx
        else:
            self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
            
        # dropout after embeddings
        self.embed_drop = nn.Dropout(dropout)
        self.gru = nn.GRU(embed_dim, h_size, n_layers, 
                          batch_first=True, 
                          bidirectional=True, dropout=dropout)
        
        # use Sequential for FFNN
        self.out = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(2*n_layers*h_size, h_size),
            nn.BatchNorm1d(h_size),
            nn.PReLU(),
            nn.Linear(h_size, 1),
        )
        self.init_weights()

    def init_weights(self):
        if not self.is_pretrained:
            d = self.embed.weight.size(1)
            nn.init.uniform_(self.embed.weight, -1/np.sqrt(d), 1/np.sqrt(d))

    def forward(self, x):
        x = self.embed(x)
        x = self.embed_drop(x)
        x, _ = self.gru(x)
        x, _ = torch.max(x, 1)
        x = self.out(x).squeeze(1)
        return x

## Data preprocessing

In [5]:
class DataFrameDataset(data.Dataset):
     """Class for using pandas DataFrames as a datasource"""
     def __init__(self, examples, fields, filter_pred=None):
         """
         Create a dataset from a pandas dataframe of examples and Fields
         Arguments:
             examples pd.DataFrame: DataFrame of examples
             fields {str: Field}: The Fields to use in this tuple. The
                 string is a field name, and the Field is the associated field.
             filter_pred (callable or None): use only exanples for which
                 filter_pred(example) is true, or use all examples if None.
                 Default is None
         """
         self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
         if filter_pred is not None:
             self.examples = filter(filter_pred, self.examples)
         self.fields = dict(fields)
         # Unpack field tuples
         for n, f in list(self.fields.items()):
             if isinstance(n, tuple):
                 self.fields.update(zip(n, f))
                 del self.fields[n]
                    
class SeriesExample(data.Example):
     """Class to convert a pandas Series to an Example"""

     @classmethod
     def fromSeries(cls, data, fields):
         return cls.fromdict(data.to_dict(), fields)

     @classmethod
     def fromdict(cls, data, fields):
         ex = cls()

         for key, field in fields.items():
             if key not in data:
                 raise ValueError("Specified key {} was not found in "
                 "the input data".format(key))
             if field is not None:
                 setattr(ex, key, field.preprocess(data[key]))
             else:
                 setattr(ex, key, data[key])
         return ex

# Simple wrapper to join torchtext and catalyst API

class IteratorWrapper(torch.utils.data.DataLoader):
    __initialized__ = False

    def __init__(self, iter: iter):
        self.batch_size = iter.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iter
        self.batch_sampler = iter
        self.__initialized__ = True

    def __iter__(self):
        return map(lambda batch: {
                    'features': batch.text,
                    'targets': batch.target,
                }, self.batch_sampler.__iter__())

    def __len__(self):
        return len(self.batch_sampler)

In [6]:
import re


puncts = ',.":)(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║\
―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'


def clean_text(x, puncts=puncts): #добавляет пробелы вокруг пунктуации
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [7]:
df_train = pd.read_csv(f'{mydir}/train.csv', index_col=0)
df_train = df_train.rename(columns={'question_text': 'text'})

df_train['text'] = df_train['text'].progress_apply(str.lower)
df_train['text'] = df_train['text'].progress_apply(clean_text)
df_train['text'] = df_train['text'].progress_apply(clean_numbers)

df_train, df_test = train_test_split(df_train, train_size=0.7, random_state=SEED)

HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




In [8]:
max_len = 50

TEXT = data.Field(
#     include_lengths=True,
                  postprocessing = lambda batch, vocab: [x[:max_len] for x in batch],
                  lower=True,
                  tokenize='spacy', 
                  tokenizer_language='en', 
                  batch_first=True
                 )
LABEL = data.LabelField(dtype=torch.float)

train = DataFrameDataset(df_train, fields={'text': TEXT, 'target': LABEL})
test = DataFrameDataset(df_test, fields={'text': TEXT, 'target': LABEL})

TEXT.build_vocab(train, test, min_freq=5)
LABEL.build_vocab(train)

train, valid = train.split(split_ratio=0.9)
len(train), len(valid)

(822856, 91429)

In [9]:
vocab_size = len(TEXT.vocab)
vocab_size

49185

In [10]:
h_size = 128
num_epochs = 10 
n_layers = 1
dropout = 0.1
embed_dim = 128
batch_size = 512


train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                                               batch_size=batch_size, 
                                                               sort_key=lambda x: len(x.text),
                                                               sort=True,
                                                               device=DEVICE)

In [11]:
train_iter = IteratorWrapper(train_iter)
valid_iter = IteratorWrapper(valid_iter)
test_iter = IteratorWrapper(test_iter)
loaders = {'train': train_iter, 'valid': valid_iter}


model = GRUModel(vocab_size=vocab_size, 
                 embed_dim=embed_dim, 
                 h_size=h_size, 
                 n_layers=n_layers, 
                 dropout=dropout, 
                 padding_idx=TEXT.vocab.stoi['<pad>'], 
                 pretrained_embedding=None, 
                 fix_embedding=False)


optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, patience=2, factor=0.5)


dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.1 and num_layers=1



In [12]:
logdir = f'{mydir}/log_quora2'
!rm -rf {logdir}

In [13]:
# use SupervisedWandbRunner runner to send statistics to wandb
runner = dl.SupervisedWandbRunner(DEVICE)
runner.train(model, 
             loaders=loaders,
             num_epochs=num_epochs,
             logdir=logdir,
             criterion=nn.BCEWithLogitsLoss(),
             optimizer=optimizer, 
             scheduler=scheduler,  
             callbacks=[
                dl.callbacks.CheckpointCallback(2), # save 2 best models (by epoch) into logdir
                dl.callbacks.EarlyStoppingCallback(3), # stop training, if valid loss does not improve last 3 epochs
             ],
             # send current hyperparam values to wandb
             monitoring_params={
                 'entity': 'denaas', # your wandb username
                 'project': 'text-augmentation', # project name
                 'name': 'quora-noembed-original', # name of the specific run
                 'group': 'examples',
                 'config': {
                     'model': 'bigru',
                     'optimizer': str(optimizer),
                     'scheduler': 'plateau',
                     'early_stop': 3,
                     'vocab_size': vocab_size,
                     'h_size': h_size,
                     'n_layers': n_layers,
                     'dropout': dropout,
                     'batch_size': batch_size,
                     'embed_dim': embed_dim,
                     'max_len': max_len,
                 },
             },
#              check=True, # set if you want to check pipeline for correctness, without actual training
             verbose=True)



I0302 18:38:28.034538 140447729411904 run_manager.py:924] system metrics and metadata threads started
I0302 18:38:28.035439 140447729411904 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0302 18:38:28.312668 140447729411904 run_manager.py:951] resuming run from id: UnVuOnYxOml2MHhjeTkxOnRleHQtYXVnbWVudGF0aW9uOmRlbmFhcw==
I0302 18:38:28.332295 140447729411904 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0302 18:38:28.607837 140442043508480 run_manager.py:1048] saving patches
I0302 18:38:28.671578 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/config.yaml
I0302 18:38:29.026587 140442043508480 run_manager.py:1052] saving pip packages
I0302 18:38:29.028063 140442043508480 run_manager.py:1054] initializing streaming files api
I0302 18:38:29.028974 140442043508480 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers
I0302 18:38:29.356307 1404420

1/10 * Epoch (train):  92% 1481/1608 [00:16<00:01, 84.23it/s, loss=0.219] 

I0302 18:38:46.850527 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-metadata.json


1/10 * Epoch (train): 100% 1608/1608 [00:17<00:00, 89.64it/s, loss=0.578]
1/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 161.39it/s, loss=0.251]


I0302 18:38:49.866588 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-history.jsonl
I0302 18:38:49.867388 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-summary.json


[2020-03-02 18:38:50,003] 
1/10 * Epoch 1 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=118858.6015 | _timers/batch_time=0.0065 | _timers/data_time=0.0054 | _timers/model_time=0.0011 | loss=0.1298
1/10 * Epoch 1 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=114324.3365 | _timers/batch_time=0.0051 | _timers/data_time=0.0041 | _timers/model_time=0.0010 | loss=0.1558


I0302 18:38:50.003142 140447729411904 logging.py:153] 
1/10 * Epoch 1 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=118858.6015 | _timers/batch_time=0.0065 | _timers/data_time=0.0054 | _timers/model_time=0.0011 | loss=0.1298
1/10 * Epoch 1 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=114324.3365 | _timers/batch_time=0.0051 | _timers/data_time=0.0041 | _timers/model_time=0.0010 | loss=0.1558


2/10 * Epoch (train):  69% 1106/1608 [00:09<00:04, 110.66it/s, loss=0.089]

I0302 18:38:59.879318 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-events.jsonl


2/10 * Epoch (train):  81% 1307/1608 [00:11<00:03, 98.57it/s, loss=0.146] 

I0302 18:39:01.881912 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-metadata.json


2/10 * Epoch (train): 100% 1608/1608 [00:15<00:00, 102.25it/s, loss=0.245]
2/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 170.50it/s, loss=0.277]


I0302 18:39:06.891876 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-history.jsonl
I0302 18:39:06.892608 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-summary.json


[2020-03-02 18:39:07,643] 
2/10 * Epoch 2 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=117076.5020 | _timers/batch_time=0.0050 | _timers/data_time=0.0039 | _timers/model_time=0.0011 | loss=0.0961
2/10 * Epoch 2 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120055.0663 | _timers/batch_time=0.0049 | _timers/data_time=0.0039 | _timers/model_time=0.0010 | loss=0.1750


I0302 18:39:07.643361 140447729411904 logging.py:153] 
2/10 * Epoch 2 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=117076.5020 | _timers/batch_time=0.0050 | _timers/data_time=0.0039 | _timers/model_time=0.0011 | loss=0.0961
2/10 * Epoch 2 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=120055.0663 | _timers/batch_time=0.0049 | _timers/data_time=0.0039 | _timers/model_time=0.0010 | loss=0.1750


3/10 * Epoch (train):  73% 1166/1608 [00:10<00:04, 100.44it/s, loss=0.100]

I0302 18:39:17.985757 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-metadata.json


3/10 * Epoch (train): 100% 1608/1608 [00:15<00:00, 103.85it/s, loss=0.085]
3/10 * Epoch (valid): 100% 179/179 [00:01<00:00, 178.23it/s, loss=0.359]


I0302 18:39:24.997493 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-history.jsonl
I0302 18:39:24.999510 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-summary.json


Early stop at 2 epoch
[2020-03-02 18:39:25,011] 
3/10 * Epoch 3 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=119700.9695 | _timers/batch_time=0.0050 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.0745
3/10 * Epoch 3 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=124843.4703 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0010 | loss=0.2056


I0302 18:39:25.011820 140447729411904 logging.py:153] 
3/10 * Epoch 3 (train): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=119700.9695 | _timers/batch_time=0.0050 | _timers/data_time=0.0038 | _timers/model_time=0.0011 | loss=0.0745
3/10 * Epoch 3 (valid): _base/lr=0.0010 | _base/momentum=0.9000 | _timers/_fps=124843.4703 | _timers/batch_time=0.0047 | _timers/data_time=0.0037 | _timers/model_time=0.0010 | loss=0.2056
I0302 18:39:25.016955 140447729411904 run_manager.py:1068] shutting down system stats and metadata service


Top best models:
/data2/competitions/quora-insincere-questions-classification/log_quora2/checkpoints/train.1.pth	0.1558
/data2/competitions/quora-insincere-questions-classification/log_quora2/checkpoints/train.2.pth	0.1750


I0302 18:39:25.927115 140447729411904 run_manager.py:1080] stopping streaming files and file change observer
I0302 18:39:25.998388 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-metadata.json
I0302 18:39:26.000784 140447729411904 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-events.jsonl
I0302 18:39:26.002572 140447729411904 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_153827-iv0xcy91/valid_log/events.out.tfevents.1583163528.UNIT-1482.9902.1
I0302 18:39:26.003966 140447729411904 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_153827-iv0xcy91/log.txt
I0302 18:39:26.005576 140447729411904 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_153827-iv0xcy91/train_log/events.out.tfevents.1583163510.UNIT-1482.9902.0
I0302 18:39:26.007193 140447729411904 run_manager.py:677] file/dir created: /tmp/wandb/run-20200302_153827-iv0xcy91/train_log
I0302 18:39:26.00876

In [14]:

dl.utils.unpack_checkpoint(dl.utils.load_checkpoint(f'{logdir}/checkpoints/best_full.pth'), model=model)

In [15]:
import scipy


# find threshold on valid dataset
runner = dl.SupervisedRunner()
y_proba = runner.predict_loader(model, valid_iter)
# convert logits to probabilities
y_proba = 1 / (1 + np.exp(-y_proba))
y_true = np.concatenate([x['targets'].cpu().numpy() for x in valid_iter])
res = scipy.optimize.minimize(
    lambda t: -metrics.f1_score(y_true, (y_proba >= t).astype(np.int)),
    x0=0.5,
    method='Nelder-Mead',
    tol=1e-3,
)
threshold = res.x[0]


runner = dl.SupervisedRunner()
y_proba = runner.predict_loader(model, test_iter)
# convert logits to probabilities
y_proba = 1 / (1 + np.exp(-y_proba))
y_true = np.concatenate([x['targets'].cpu().numpy() for x in test_iter])

auc_test = metrics.roc_auc_score(y_true, y_proba)
f1_test = metrics.f1_score(y_true, (y_proba >= threshold).astype(np.int))

print(f1_test, threshold, auc_test)
wandb.log({'scores/f1': f1_test, 'scores/f1_threshold': threshold, 'scores/f1_auc': auc_test})

0.6141959868823301 0.6382812500000004 0.9489274833350626


I0302 18:39:38.873837 140447729411904 run_manager.py:924] system metrics and metadata threads started
I0302 18:39:38.877126 140447729411904 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0302 18:39:39.151937 140447729411904 run_manager.py:951] resuming run from id: UnVuOnYxOml2MHhjeTkxOnRleHQtYXVnbWVudGF0aW9uOmRlbmFhcw==
I0302 18:39:39.173845 140447729411904 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0302 18:39:39.442445 140442060293888 run_manager.py:1048] saving patches
I0302 18:39:39.501732 140442075133696 run_manager.py:688] file/dir modified: /tmp/wandb/run-20200302_153827-iv0xcy91/wandb-metadata.json
I0302 18:39:39.930966 140442060293888 run_manager.py:1052] saving pip packages
I0302 18:39:39.932473 140442060293888 run_manager.py:1054] initializing streaming files api
I0302 18:39:39.933523 140442060293888 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers
I0302 18:39:39.934738