In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import time
import gc
import sys

import numpy as np
import pandas as pd
import random
import shutil
import pickle

import torch
from torch import nn
from torch.utils import data
import torch.nn.functional as F
from torch.autograd import Variable

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from random import shuffle

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
from apex import amp
from apex.optimizers import FusedAdam

from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing.sequence import pad_sequences

# Enable tqdm's pandas integration for this notebook session.
tqdm.pandas()
# List the contents of the data folder as a quick check that the inputs are present.
print(os.listdir("./data/"))

In [None]:
def seed_everything(seed=1235):
    """Seed every random number generator used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
    ``random`` module, NumPy and PyTorch (CPU and CUDA), and switches cuDNN
    to its deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeder takes the same integer, so apply them uniformly.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


seed_everything(1235)

In [None]:
class MyBertClassifier(BertPreTrainedModel):
    """BERT encoder with a residual dense block and two linear output heads.

    The forward pass returns raw logits: column 0 is the main target and the
    remaining ``num_aux_targets`` columns are the auxiliary targets.
    """

    def __init__(self, config, num_aux_targets):
        """Build the encoder and heads.

        Args:
            config: BERT configuration; ``config.hidden_size`` sizes every head.
            num_aux_targets: Number of auxiliary output logits.
        """
        super().__init__(config)

        hidden_size = config.hidden_size
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.linear_out = nn.Linear(hidden_size, 1)
        self.linear_aux_out = nn.Linear(hidden_size, num_aux_targets)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return the concatenated main and auxiliary logits.

        Args:
            input_ids: Token id tensor passed to the BERT encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            attention_mask: Optional attention mask forwarded to the encoder.
            labels: Accepted for call compatibility; not used by this method.

        Returns:
            Tensor whose dimension 1 holds ``1 + num_aux_targets`` logits.
        """
        # Only the second value returned by the encoder is used.
        _sequence_output, pooled = self.bert(
            input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled = self.dropout(pooled)

        # Residual branch: dense layer + ReLU + dropout, added back onto the pooled vector.
        branch = self.dropout(F.relu(self.linear(pooled)))
        hidden = pooled + branch

        main_logit = self.linear_out(hidden)
        aux_logits = self.linear_aux_out(hidden)
        return torch.cat([main_logit, aux_logits], 1)

In [None]:
start_time = time.time()
print("Establishing Global Variables ...")

# Root folder for the inputs and the converted BERT files
directory = './data/'

# Device that batches and the model are moved to
device = torch.device('cuda')

# Sequence length, batching and schedule
max_length = 220
batch_size = 64
n_epochs = 2
accumulation_steps = 1

# Seed and fold selection
# Change model_seed with every new/different model; current_split picks the held-out fold.
# NOTE(review): the original note also asks to keep a `split_seed` fixed,
# but no variable of that name is defined in this cell.
model_seed = 1234
current_split = 0

# Input files
TRAIN_FILE = directory + 'train.csv'
TEST_FILE = directory + 'test.csv'
PROCESSED_FILE = 'train_seq.pickle'

# BERT checkpoint locations
WORK_DIR = directory
BERT_MODEL_PATH = directory + 'uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
BERT_WEIGHT_PATH = directory + 'bert_pytorch.bin'

# Convert the TensorFlow checkpoint into a PyTorch weight file inside WORK_DIR,
# and place a copy of the config next to it.
bert_config_json = BERT_MODEL_PATH + 'bert_config.json'
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt',
    bert_config_json,
    WORK_DIR + 'pytorch_model.bin')
shutil.copyfile(bert_config_json, WORK_DIR + 'bert_config.json')

bert_config = BertConfig(bert_config_json)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Loading Initial Data ...")

# CAUTION: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(PROCESSED_FILE, 'rb') as f:
    x_train = pickle.load(f)

# Record each sequence's length before padding, then pad (at the end) to max_length columns.
lengths = torch.from_numpy(np.array(list(map(len, x_train))))
x_train = torch.tensor(
    pad_sequences(x_train, maxlen=max_length, padding='post'),
    dtype=torch.long,
)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Initial Data ...")

train_df = pd.read_csv(TRAIN_FILE)
train_df['comment_text'] = train_df['comment_text'].astype(str)
train_df = train_df.fillna(0)

y_aux_train = train_df[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']].values
identity_columns = ['asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian']

# Boolean building blocks shared by the weighting terms below.
# NaNs were already replaced by 0 above, so no further fillna is needed.
identity_values = train_df[identity_columns].values
is_toxic = train_df['target'].values >= 0.5
is_non_toxic = train_df['target'].values < 0.5
identity_hits = (identity_values >= 0.5).sum(axis=1)           # number of identity columns >= 0.5 per row
any_identity_hit = identity_hits.astype(bool)                  # at least one identity column >= 0.5
any_identity_miss = (identity_values < 0.5).sum(axis=1).astype(bool)  # at least one identity column < 0.5

# Per-sample weight, built in quarters:
#   1/4 base
# + 1/4 for every identity column >= 0.5
# + 1/4 when the row is toxic and at least one identity column is < 0.5
# + 1/4 when the row is non-toxic and at least one identity column is >= 0.5
# The builtin `int` replaces the `np.int` alias, which no longer exists in NumPy >= 1.24.
weights = np.ones((len(train_df['comment_text']),)) / 4
weights += identity_hits / 4
weights += (is_toxic & any_identity_miss).astype(int) / 4
weights += (is_non_toxic & any_identity_hit).astype(int) / 4

# Label layout of y_train_torch: column 0 = binary target, column 1 = sample weight,
# columns 2.. = the auxiliary targets in y_aux_train.
y_train = np.vstack([is_toxic.astype(int), weights]).T
y_train_torch = torch.tensor(np.hstack([y_train, y_aux_train]), dtype=torch.float32)
y_identities = (train_df[identity_columns] >= 0.5).astype(int).values

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
class SequenceBucketCollator():
    """Collate function that trims padded sequences to a per-batch length.

    Every sample is a tuple of tensors. The fields are stacked per position,
    then the sequence field is cut to the first ``choose_length(lengths)``
    columns.
    """

    def __init__(self, choose_length, sequence_index, length_index, label_index=None):
        """Store the batch layout.

        Args:
            choose_length: Callable that receives the stacked lengths tensor
                and returns the number of columns to keep.
            sequence_index: Position of the padded sequence in each sample.
            length_index: Position of the sequence length in each sample.
            label_index: Position of the labels in each sample, or ``None``
                when the samples carry no labels.
        """
        self.choose_length = choose_length
        self.sequence_index = sequence_index
        self.length_index = length_index
        self.label_index = label_index

    def __call__(self, batch):
        """Collate ``batch`` (a list of sample tuples).

        Returns:
            ``(sequences, labels)`` when ``label_index`` is set, otherwise
            just the trimmed ``sequences`` tensor.
        """
        fields = [torch.stack(field) for field in zip(*batch)]
        sequences = fields[self.sequence_index]
        lengths = fields[self.length_index]

        length = self.choose_length(lengths)
        # Build the column mask from the batch's own width instead of a notebook
        # global, so the collator works for any padded width.
        mask = torch.arange(start=0, end=sequences.shape[1], step=1) < length
        trimmed_sequences = sequences[:, mask]

        if self.label_index is not None:
            return trimmed_sequences, fields[self.label_index]

        return trimmed_sequences

class LenMatchBatchSampler(data.BatchSampler):
    """Batch sampler that groups samples with a similar number of zero tokens.

    Indices drawn from the wrapped sampler are placed into buckets keyed by
    ``zero_count // 64`` and a batch is emitted as soon as a bucket holds
    ``batch_size`` indices. Indices left in partially filled buckets are
    batched together at the end. The wrapped sampler must expose
    ``data_source``, and the first field of every dataset item is the tensor
    whose zeros are counted.
    """

    def __iter__(self):
        # A dict gives every bucket its own list (no aliasing between buckets)
        # and puts no upper bound on the bucket index.
        buckets = {}
        yielded = 0

        for idx in self.sampler:
            zero_count = torch.sum(self.sampler.data_source[idx][0] == 0)
            # Bucket width is a fixed 64 zeros; it is independent of batch_size.
            bucket_key = int(zero_count) // 64

            bucket = buckets.setdefault(bucket_key, [])
            bucket.append(idx)

            if len(bucket) == self.batch_size:
                yield bucket
                yielded += 1
                buckets[bucket_key] = []

        # Flush the partially filled buckets in ascending bucket order.
        batch = []
        leftover = [idx for key in sorted(buckets) for idx in buckets[key]]

        for idx in leftover:
            batch.append(idx)
            if len(batch) == self.batch_size:
                yielded += 1
                yield batch
                batch = []

        if len(batch) > 0 and not self.drop_last:
            yielded += 1
            yield batch

        assert len(self) == yielded, "produced an inccorect number of batches. expected %i, but yielded %i" % (len(self), yielded)

In [None]:
start_time = time.time()
print("Finalizing Datasets ...")

# The fold assignment must stay the same across models so their out-of-fold
# predictions remain comparable. It is therefore seeded with its own fixed
# value rather than with model_seed, which is meant to change per model.
# 1234 reproduces the folds obtained with model_seed = 1234.
split_seed = 1234

splits = list(KFold(n_splits=5, shuffle=True, random_state=split_seed)
              .split(x_train, y_train_torch))

# Row indices of the training part and the held-out part of the selected fold.
train_idx = splits[current_split][0]
valid_idx = splits[current_split][1]

# Both collators trim each batch to its longest recorded length; only the
# first one also returns the labels.
train_collator = SequenceBucketCollator(
    lambda length_vals: length_vals.max(), sequence_index=0, length_index=1, label_index=2)
test_collator = SequenceBucketCollator(
    lambda length_vals: length_vals.max(), sequence_index=0, length_index=1)

y_train_id_set = y_identities[train_idx]
y_valid_id_set = y_identities[valid_idx]

# Each dataset item is (token ids, sequence length, labels).
train_dataset = data.TensorDataset(x_train[train_idx], lengths[train_idx], y_train_torch[train_idx])
valid_dataset = data.TensorDataset(x_train[valid_idx], lengths[valid_idx], y_train_torch[valid_idx])
# The validation loader keeps dataset order (shuffle=False) and uses the
# label-returning collator.
valid_loader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=train_collator,
                               pin_memory=True)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss computed from logits.

    For each element the loss is ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``,
    where ``p_t`` is the predicted probability of the true class and
    ``alpha_t`` is ``alpha`` for positive targets and ``1 - alpha`` otherwise.
    """

    def __init__(self, weight=None, alpha=0.5, gamma=3, size_average=True):
        """Configure the loss.

        Args:
            weight: Optional tensor multiplied element-wise into the loss
                (must broadcast against the per-element loss).
            alpha: Class-balancing factor applied to positive targets.
            gamma: Focusing exponent applied to ``1 - p_t``.
            size_average: Return the mean when true, the sum otherwise.
        """
        super(FocalLoss, self).__init__()

        self.alpha = alpha
        self.gamma = gamma
        self.size_average = size_average
        self.eps = 1e-12
        self.weight = weight

    def forward(self, inputs, targets):
        """Return the reduced focal loss.

        Args:
            inputs: Raw logits.
            targets: Tensor of the same shape; values > 0 count as positive.
        """
        p = torch.sigmoid(inputs)
        pt = torch.where(targets > 0, p, 1 - p)
        t = torch.ones_like(targets)
        alpha = torch.where(targets > 0, self.alpha * t, (1 - self.alpha) * t)

        # Cap at 1 so the log never turns positive. clamp works on whatever
        # device and dtype `pt` has, so no CUDA-only helper tensor is needed.
        log_pt = torch.log(torch.clamp(pt + self.eps, max=1.0))
        loss = -alpha * torch.pow(1 - pt, self.gamma) * log_pt

        if self.weight is not None:
            loss = loss * self.weight

        return loss.mean() if self.size_average else loss.sum()

In [None]:
def custom_loss(data, targets):
    """Combine the weighted focal loss on the main logit with BCE on the auxiliary logits.

    Args:
        data: Model output; column 0 is the main logit, columns 1.. are the
            auxiliary logits.
        targets: Label tensor; column 0 is the main target, column 1 the
            per-sample weight, columns 2.. the auxiliary targets.

    Returns:
        Scalar tensor: focal loss plus auxiliary BCE-with-logits loss.
    """
    main_logits, aux_logits = data[:, :1], data[:, 1:]
    main_labels = targets[:, :1]
    sample_weights = targets[:, 1:2]
    aux_labels = targets[:, 2:]

    focal_term = FocalLoss(weight=sample_weights)(main_logits, main_labels)
    aux_term = nn.BCEWithLogitsLoss()(aux_logits, aux_labels)
    return focal_term + aux_term

In [None]:
start_time = time.time()
print("Establishing Models ...")

# Base learning rate per epoch; the training loop indexes this list by epoch.
lrs = [2e-5, 1e-5]

# Load from WORK_DIR: that is where the configuration cell writes the converted
# `pytorch_model.bin` and the copied `bert_config.json`.
model = MyBertClassifier.from_pretrained(WORK_DIR, cache_dir=None, num_aux_targets=y_aux_train.shape[-1])

model.zero_grad()
model = model.to(device)

# Apply weight decay to every parameter except biases and LayerNorm parameters.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# The initial lr equals the first epoch's rate; the training loop overwrites it on every step.
optimizer = FusedAdam(optimizer_grouped_parameters, lr=lrs[0], bias_correction=False, max_grad_norm=1.0)
model, optimizer = amp.initialize(model, optimizer,  keep_batchnorm_fp32=False, opt_level="O2",loss_scale="dynamic", verbosity=0)
model = model.train()

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Training Models ...")

tk = tqdm(range(n_epochs))
for epoch in tk:

    step = 0           # optimizer steps taken in this epoch; drives the warmup schedule
    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None       # exponentially smoothed loss shown in the progress bar

    lr = lrs[epoch]
    # Re-seed per epoch and fold so the shuffling below is reproducible.
    seed_everything(model_seed + 2 * epoch + current_split)
    ran_sampler = data.RandomSampler(train_dataset)
    len_sampler = LenMatchBatchSampler(ran_sampler, batch_size=batch_size, drop_last=False)
    train_loader = data.DataLoader(train_dataset, batch_sampler=len_sampler, collate_fn=train_collator, pin_memory=True)
    n_batches = len(train_loader)

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    tk0 = tqdm(enumerate(train_loader), total=n_batches, leave=False)
    # `step` counts optimizer steps, so the schedule length has to be expressed
    # in optimizer steps too (identical to n_batches when accumulation_steps == 1).
    warmup = WarmupLinearSchedule(warmup=0.05, t_total=max(1, n_batches // accumulation_steps))

    optimizer.zero_grad()
    for i, (x_batch, y_batch) in tk0:

        y_pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
        loss = custom_loss(y_pred, y_batch.to(device))

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i+1) % accumulation_steps == 0:
            # NOTE(review): the second positional argument is passed through unchanged from
            # the original code; confirm what it means in the installed pytorch_pretrained_bert.
            lr_this_step = lr * warmup.get_lr(step, 0.05)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            step += 1

        # Read the scalar once; every .item() call forces a device synchronisation.
        loss_value = loss.item()

        # Compare against None so a smoothed loss of exactly 0.0 is not mistaken for "unset".
        if lossf is not None:
            lossf = 0.98*lossf+0.02*loss_value
        else:
            lossf = loss_value

        tk0.set_postfix(loss = lossf)
        avg_loss += loss_value / n_batches
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/n_batches

    tk.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)
    # One checkpoint per epoch.
    output_model_file = "bert_pytorch_model_{}.bin".format(epoch)
    torch.save(model.state_dict(), output_model_file)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
class Evaluator:
    """Bias-aware AUC metric over binarised labels and identity subgroups.

    The final score blends the overall ROC-AUC with the power means of three
    per-subgroup AUC families (subgroup, BPSN, BNSP).
    """

    def __init__(self, y_true, y_identity, power=-5, overall_model_weight=0.25):
        """Binarise the inputs at 0.5 and store the metric settings.

        Args:
            y_true: Array of target scores.
            y_identity: 2-D array with one column per identity subgroup.
            power: Exponent of the generalised (power) mean.
            overall_model_weight: Share of the overall AUC in the final score.
        """
        self.y = (y_true >= 0.5).astype(int)
        self.y_i = (y_identity >= 0.5).astype(int)
        self.n_subgroups = self.y_i.shape[1]
        self.power = power
        self.overall_model_weight = overall_model_weight

    @staticmethod
    def _compute_auc(y_true, y_pred):
        """ROC-AUC, or NaN when ``roc_auc_score`` raises ``ValueError``."""
        try:
            score = roc_auc_score(y_true, y_pred)
        except ValueError:
            score = np.nan
        return score

    def _auc_on(self, mask, y_pred):
        """AUC restricted to the rows selected by ``mask``."""
        return self._compute_auc(self.y[mask], y_pred[mask])

    def _compute_subgroup_auc(self, i, y_pred):
        """AUC over the rows that belong to subgroup ``i``."""
        return self._auc_on(self.y_i[:, i] == 1, y_pred)

    def _compute_bpsn_auc(self, i, y_pred):
        """AUC over rows where exactly one of (subgroup membership, label) is 1."""
        membership = self.y_i[:, i]
        return self._auc_on(membership + self.y == 1, y_pred)

    def _compute_bnsp_auc(self, i, y_pred):
        """AUC over rows where subgroup membership and label are both 1 or both 0."""
        membership = self.y_i[:, i]
        return self._auc_on(membership + self.y != 1, y_pred)

    def compute_bias_metrics_for_model(self, y_pred):
        """Return a (3, n_subgroups) array: rows are subgroup, BPSN and BNSP AUCs."""
        metric_rows = (
            self._compute_subgroup_auc,
            self._compute_bpsn_auc,
            self._compute_bnsp_auc,
        )
        records = np.zeros((3, self.n_subgroups))
        for row, metric in enumerate(metric_rows):
            for col in range(self.n_subgroups):
                records[row, col] = metric(col, y_pred)
        return records

    def _calculate_overall_auc(self, y_pred):
        """ROC-AUC over all rows."""
        return roc_auc_score(self.y, y_pred)

    def _power_mean(self, array):
        """Generalised mean of ``array`` with exponent ``self.power``."""
        powered = np.power(array, self.power)
        mean_of_powers = sum(powered) / len(array)
        return np.power(mean_of_powers, 1 / self.power)

    def get_final_metric(self, y_pred):
        """Weighted sum of the overall AUC and the averaged bias power means."""
        bias_metrics = self.compute_bias_metrics_for_model(y_pred)
        per_family_means = [self._power_mean(bias_metrics[row]) for row in range(3)]
        bias_score = np.average(per_family_means)

        overall_score = self.overall_model_weight * self._calculate_overall_auc(y_pred)
        bias_score = (1 - self.overall_model_weight) * bias_score
        return overall_score + bias_score

In [None]:
start_time = time.time()
print("Validating Models ...")

# Turn off gradient tracking on every parameter and switch the model to evaluation mode.
for param in model.parameters():
    param.requires_grad = False
model.eval()

# Buffer for the column-0 logits of the held-out rows, filled batch by batch in loader order.
valid_preds = np.zeros((len(y_train_torch[valid_idx])))
evaluator = Evaluator(np.array(y_train_torch[valid_idx])[:, 0], y_valid_id_set)
tk1 = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)

for i, (x_batch, y_batch) in tk1:

    batch_start = i * batch_size
    batch_stop = (i + 1) * batch_size

    y_pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
    main_logits = y_pred[:, 0].detach().cpu().squeeze().numpy()
    valid_preds[batch_start:batch_stop] = main_logits

    # Periodic progress report; entries not yet written are still 0 at this point.
    if i % 200 == 0:
        print("Model: 1 | Current CV Score: {}".format(evaluator.get_final_metric(valid_preds)))

# Convert the collected logits to probabilities.
valid_preds = torch.sigmoid(torch.tensor(valid_preds)).numpy().ravel()

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Report the metric on the complete validation predictions (`valid_preds` and `evaluator` come from the validation cell).
print("Model: 1 | Final CV Score: {}".format(evaluator.get_final_metric(valid_preds)))