## This is BERT based fine tune model for binary prediction

Loss function is customized. The custom loss functions requires 7 field including the target toxic indication. This means our model need to return 7 fields prediction to be able to use this loss function. Considering there is no existing model that returns 7 labels, I will need to use pretrained BERT model and add a Linear layer with 7 output fields (BertClass).

In [1]:
%autosave 0

Autosave disabled


In [2]:
import os
if not os.path.exists('data'):
  !mkdir data
  # Download the training and the test corpus
  !wget -nv --show-progress -O data/test.csv.zip https://www.dropbox.com/s/xp6bo8yo1vbv5yg/test.csv.zip?dl=1
  !wget -nv --show-progress -O data/train.csv.zip https://www.dropbox.com/s/xei6z41mfrcnxcd/train.csv.zip?dl=1
  # Download the pretrained weights for bert base. 
  !wget -nv --show-progress -O data/uncased_L-12_H-768_A-12.zip \
          https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

  !wget -nv --show-progress  -O data/cased_L-12_H-768_A-12.zip \
          https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
  # unzip weights & conifg and remove the original zip
  !unzip -d data/ data/cased_L-12_H-768_A-12.zip && rm data/cased_L-12_H-768_A-12.zip
  !unzip -d data/ data/uncased_L-12_H-768_A-12.zip && rm data/uncased_L-12_H-768_A-12.zip

In [3]:
import sys, os
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

%load_ext autoreload
%autoreload 2
%matplotlib inline
from tqdm import tqdm, notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
import shutil

In [4]:
# Let's activate CUDA for GPU based operations
device=torch.device('cuda')

In [5]:
# In bert we need all inputs to have the same length, we will use the first 220 characters. 
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
# We shall run a single epoch (ie. one pass over the data)
EPOCHS = 1
PATH = '.'#'/root/v2/week06/hw' # /root/v2/week06/hw"
DATA_DIR = os.path.join(PATH, "data")
WORK_DIR = os.path.join(PATH, "workingdir")

# Validation and training sizes are here. 
train_size= 1000000 # 1000000 
valid_size= 500000  # 500000

This should be the files you downloaded earlier when you ran `download.sh`

In [6]:
!pip install transformers



In [7]:
%%capture
from transformers import BertModel, BertConfig,BertTokenizer, AdamW, get_linear_schedule_with_warmup,BertForSequenceClassification
import transformers
import random
#from transformers import AdamW as BertAdam

We shall now load the model. When you run this, comment out the `capture` command to understand the archecture.

In [8]:
%%capture
bert_config = BertConfig()

Bert needs a special formatting of sentences, so we have a sentence start and end token, as well as separators.   
Thanks to this [script](https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming) for a fast convertor of the sentences.

In [9]:
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in notebook.tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)

Now we load the BERT tokenizer and convert the sentences.

In [10]:
%%time
# tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_all = pd.read_csv(os.path.join(DATA_DIR, "train.csv.zip")).sample(train_size+valid_size,random_state=SEED)
print('loaded %d records' % len(train_all))

# Make sure all comment_text values are strings
train_all['comment_text'] = train_all['comment_text'].astype(str) 

sequences = convert_lines(train_all["comment_text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer)
train_all=train_all.fillna(0)

loaded 1500000 records


HBox(children=(FloatProgress(value=0.0, max=1500000.0), HTML(value='')))


33722
CPU times: user 28min 5s, sys: 6.35 s, total: 28min 11s
Wall time: 28min 4s


In [11]:
train_all['target']=(train_all['target']>=0.5).astype(float)
# Training data - sentences
X = sequences[:train_size] 
# Target - the toxicity. 
y = train_all[['target']].values[:train_size]
X_val = sequences[train_size:]                
y_val = train_all[['target']].values[train_size:]

# Training data creations
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), torch.tensor(y,dtype=torch.long))

In [12]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

seed_everything(SEED)
torch.backends.cudnn.deterministic = True


In [13]:
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Overall
weights = np.ones((len(X),)) / 4

X_train = train_all[:train_size]

# Subgroup
weights += (X_train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4

# Background Positive, Subgroup Negative
weights += (( (X_train['target'].values>=0.5).astype(bool).astype(np.int) +
   (X_train[identity_columns].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4

# Background Negative, Subgroup Positive
weights += (( (X_train['target'].values<0.5).astype(bool).astype(np.int) +
   (X_train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
loss_weight = 1.0 / weights.mean()

y_train = np.vstack([(X_train['target'].values>=0.5).astype(np.int),weights]).T
y_aux_train = X_train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
y_train_torch = torch.tensor(np.hstack([y_train, y_aux_train]), dtype=torch.float32)

#prepar validation data
X_val_all = train_all[train_size:]
y_val = (X_val_all['target'].values>=0.5).astype(np.int)


In [14]:
def custom_loss(data, targets):
    ''' Define custom loss function for weighted BCE on 'target' column '''
    bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:,1:2])(data[:,:1],targets[:,:1])
    bce_loss_2 = nn.BCEWithLogitsLoss()(data[:,1:],targets[:,2:])
    return (bce_loss_1 * loss_weight) + bce_loss_2

### Define a BERT fine tuning model that generate 7 output numbers:

In [15]:
%%capture

# Creating the customized model, by adding a drop out and a dense layer on top of bert to get the final output for the model. 
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bertlayer = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(self.bertlayer.config.hidden_dropout_prob)
        self.linearlayer = nn.Linear(self.bertlayer.config.hidden_size, 7, bias = False) 
        self.output = nn.Sigmoid()
    
    def forward(self, ids, attention_mask, token_type_ids = None):
        output_1= self.bertlayer(ids, attention_mask, token_type_ids)
        dropout = self.dropout(output_1[1])
        output_2 = self.linearlayer(dropout)
        #output = self.output(output_2)

        return output_2

model = BERTClass()
model.to(device)

In [18]:
batch_size = 32
total_steps = len(train_dataset) * EPOCHS

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=10,
  num_training_steps=total_steps
)

scaler = torch.cuda.amp.GradScaler()
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), y_train_torch)

tq = notebook.tqdm(range(EPOCHS))

for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    tk0 = notebook.tqdm(enumerate(train_loader),total=len(train_loader),leave=False)
    
    train_loss = 0
    for i,(x_batch, y_batch) in tk0:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            # with labels, it outputs loss/
            out = model(x_batch.to(device), attention_mask=(x_batch>0).to(device))
            loss = custom_loss(out, y_batch.to(device))
            train_loss += loss
            
        # Backprop using loss
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        ret = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        
    print(f"Training loss:{train_loss/len(train_dataset)}")


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=31250.0), HTML(value='')))

Training loss:0.007288726046681404



In [19]:
val_dataset = torch.utils.data.TensorDataset(torch.tensor(X_val,dtype=torch.long), torch.tensor(y_val,dtype=torch.long))
valid_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#tk0 = tqdm_notebook(valid_loader)
tk0 = notebook.tqdm(enumerate(valid_loader),total=len(valid_loader),leave=False)

valid_preds = np.zeros((len(X_val)))
val_loss =0
for i,(x_batch, y_batch)  in tk0:
    with torch.no_grad():       
        # Get loss and logits for both classes
        y_pred =model(x_batch.to(device), attention_mask=(x_batch>0).to(device))[:,0]
        valid_preds[i*batch_size:(i+1)*batch_size] = y_pred.detach().cpu().squeeze().numpy()

        # accumulate validatio losses
        val_loss += loss
print(f"Validation loss:{val_loss/len(X_val)}")
print('AUC score : {:.5f}'.format(roc_auc_score(y_val.squeeze(), valid_preds)))

HBox(children=(FloatProgress(value=0.0, max=15625.0), HTML(value='')))

Validation loss:0.015533684752881527
AUC score : 0.96235
