In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import time
import datetime
import gc
import random
import re
import operator
import pickle
from tqdm import tqdm
import pkg_resources
import scipy.stats as stats
import sys


import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,TensorDataset,Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim.optimizer import Optimizer

# sklearn
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,log_loss


# stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")

%load_ext autoreload
%autoreload 2
%matplotlib inline

import shutil

def seed_everything(SEED=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU, the current CUDA device
    and all CUDA devices), exports PYTHONHASHSEED, and switches cuDNN to
    deterministic, non-benchmarking mode.

    Parameters
    ----------
    SEED : int
        Seed value handed to each generator.
    """
    # Process-level switches first.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Then every library-level generator, all with the same seed.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(SEED)

def init_func(worker_id):
    """Seed NumPy with the global `SEED` offset by `worker_id`.

    NOTE(review): shaped like a DataLoader `worker_init_fn`, but no visible
    cell passes it to a DataLoader - confirm whether it is still needed.
    """
    worker_seed = SEED + worker_id
    np.random.seed(worker_seed)

# Global seed. It must be defined before the seed_everything call below and is
# also read (at call time) by init_func and by later cells.
SEED=42
seed_everything(SEED=SEED)

# Enable tqdm's pandas integration.
tqdm.pandas()
# Timestamp taken when this cell runs; not read by any other visible cell.
t1 = datetime.datetime.now()

In [2]:
# Install NVIDIA Apex from the local copy under ../input/nvidiaapex, passing the
# --cpp_ext and --cuda_ext build options. `from apex import amp` in the next cell needs it.
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/nvidiaapex/repository/NVIDIA-apex-39e153a

  cmdoptions.check_install_build_global(options)
Created temporary directory: /tmp/pip-ephem-wheel-cache-5ibrgn_b
Created temporary directory: /tmp/pip-req-tracker-8txmqvn6
Created requirements tracker '/tmp/pip-req-tracker-8txmqvn6'
Created temporary directory: /tmp/pip-install-8kkqgtos
Processing /kaggle/input/nvidiaapex/repository/NVIDIA-apex-39e153a
  Created temporary directory: /tmp/pip-req-build-nj3yyggs
  Added file:///kaggle/input/nvidiaapex/repository/NVIDIA-apex-39e153a to build tracker '/tmp/pip-req-tracker-8txmqvn6'
    Running setup.py (path:/tmp/pip-req-build-nj3yyggs/setup.py) egg_info for package from file:///kaggle/input/nvidiaapex/repository/NVIDIA-apex-39e153a
    Running command python setup.py egg_info
    torch.__version__  =  1.0.1.post2
    running egg_info
    creating pip-egg-info/apex.egg-info
    writing pip-egg-info/apex.egg-info/PKG-INFO
    writing dependency_links to pip-egg-info/apex.egg-info/dependency_links.txt
    writing top-level na

In [3]:
from apex import amp
# --- Run configuration ---------------------------------------------------
EPOCHS = 1
MAX_SEQUENCE_LENGTH = 200

# Directory layout, relative to the notebook's working directory.
Input_dir = "../input"
Data_dir = "../input/jigsaw-unintended-bias-in-toxicity-classification"
WORK_DIR = "../working/"

# CUDA device handle.
device = torch.device('cuda')

## LOADING THE PRETRAINED MODEL PACKAGE AND CONFIG

In [4]:
# Put the bundled pytorch-pretrained-BERT checkout at the front of sys.path.
# This has to happen before the `pytorch_pretrained_bert` import below.
package_dir_a = "../input/gpt2-pytorch/pytorch-pretrained-bert-master/pytorch-pretrained-BERT-master/"
import sys
sys.path.insert(0,package_dir_a)
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2ClassificationHeadModel,GPT2Config,OpenAIAdam

# Identity subgroup columns; used for the sample weights and for the bias metrics.
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Label columns: the main `target` plus five auxiliary toxicity labels.
# (The data-loading cell assigns this same list again.)
aux_cols=['target','severe_toxicity','obscene','identity_attack','insult','threat']

# GPT-2 configuration built from the JSON file in ../input/gpt2-models; the
# validation cell uses it to rebuild the model before loading the saved weights.
gpt_config = GPT2Config(vocab_size_or_config_json_file="../input/gpt2-models/config.json")

## READING THE DATA

In [5]:
%%time
# Saved array of comment lines (max length 200 according to the file name).
X = np.load("../input/train-lines-for-gpt2/lines_array_max_length_200.npy")

train = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")

# Missing identity annotations are treated as 0.
train[identity_columns] = train[identity_columns].fillna(0)

aux_cols = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
print(len(aux_cols))

# The auxiliary labels keep the raw `target` score, so they are extracted
# before `target` is thresholded to 0/1 in place.
y_aux = train[aux_cols].values
train['target'] = (train['target'] >= 0.5).astype(int)

# Column 0: binary target; remaining columns: the auxiliary labels.
y = np.column_stack((train['target'].values, y_aux))

print("Total Data Shape", X.shape, y.shape)

6
Total Data Shape (1804874, 200) (1804874, 7)
CPU times: user 12.5 s, sys: 5.61 s, total: 18.1 s
Wall time: 19.4 s


In [6]:
# Display the first row of X as a sanity check on the loaded array.
X[0]

array([ 1212,   318,   523,  3608,    13,   632,   338,   588,    11,
         705, 19188,   345,   765,   534,  2802,   284,  1100,   428,
        3548,     6, 16123,  1049,  2126,    11,   880,  1760,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## WEIGHTS

In [7]:
# Per-sample loss weights, following
# https://www.kaggle.com/tanreinama/simple-lstm-using-identity-parameters-solution
#
# Every row starts at weight 1 and gains +1 for each condition it satisfies;
# the result is rescaled to mean 1. Builtin bool/int replace the np.bool/np.int
# aliases, which were removed in NumPy 1.24.
identity_values = train[identity_columns].fillna(0).values
target_positive = (train['target'] >= 0.5).values
target_negative = (train['target'] < 0.5).values

# +1 when the identity columns sum to a non-zero value.
mentions_identity = identity_values.sum(axis=1).astype(bool)

# "Background Positive, Subgroup Negative": target >= 0.5 and at least one
# identity column < 0.5.
# NOTE(review): the identity test is true unless *every* identity column is
# >= 0.5; kept exactly as in the referenced kernel.
bpsn = target_positive & (identity_values < 0.5).any(axis=1)

# "Background Negative, Subgroup Positive": target < 0.5 and at least one
# identity column >= 0.5.
bnsp = target_negative & (identity_values >= 0.5).any(axis=1)

weights = np.ones((train.shape[0],))
weights = weights + mentions_identity.astype(int)
weights = weights + bpsn.astype(int)
weights = weights + bnsp.astype(int)

# Normalise so the average weight is 1.
weights = weights / np.mean(weights)
weights[0:10]

array([0.78609906, 0.78609906, 0.78609906, 0.78609906, 1.57219811,
       1.57219811, 0.78609906, 0.78609906, 0.78609906, 0.78609906])

## METRICS

In [8]:
# Metric column names, following https://www.kaggle.com/dborkan/benchmark-kernel
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # background negative, subgroup positive

# Binarise each identity column in place: 1 where the score is >= 0.5, else 0
# (a NaN compares False and therefore becomes 0).
for subgroup in identity_columns:
    train[subgroup] = train[subgroup].ge(0.5).astype(np.int8)

def auc_score(y_true,y_pred):
    """Return the ROC AUC of `y_pred` against `y_true` via `roc_auc_score`."""
    score = roc_auc_score(y_true, y_pred)
    return score

def compute_bpsn(df,subgroup):
    """Background Positive, Subgroup Negative AUC for one subgroup.

    The evaluation set is the union of
      * rows inside the subgroup with target == 0, and
      * rows outside the subgroup with target == 1.

    `df` must carry the `subgroup` column plus `target` and `preds`.
    Returns the ROC AUC of `preds` against `target` on that set.
    """
    inside_and_negative = (df[subgroup] == 1) & (df['target'] == 0)
    outside_and_positive = (df[subgroup] == 0) & (df['target'] == 1)

    selected = pd.concat([df.loc[inside_and_negative], df.loc[outside_and_positive]])
    labels = selected['target'].values
    scores = selected['preds'].values
    return roc_auc_score(labels, scores)
    
    
def compute_bnsp(df,subgroup):
    """Background Negative, Subgroup Positive AUC for one subgroup.

    The evaluation set is the union of
      * rows inside the subgroup with target == 1, and
      * rows outside the subgroup with target == 0.

    `df` must carry the `subgroup` column plus `target` and `preds`.
    Returns the ROC AUC of `preds` against `target` on that set.
    """
    inside_and_positive = (df[subgroup] == 1) & (df['target'] == 1)
    outside_and_negative = (df[subgroup] == 0) & (df['target'] == 0)

    selected = pd.concat([df.loc[inside_and_positive], df.loc[outside_and_negative]])
    labels = selected['target'].values
    scores = selected['preds'].values
    return roc_auc_score(labels, scores)

def compute_bias_auc(indices=None,preds=None,df=None):
    """Compute the subgroup, BPSN and BNSP AUCs for every identity subgroup.

    Either pass a ready-made `df` (it must contain the identity columns,
    `target` and `preds`), or pass `preds` and let the frame be built from
    the global `train`:

    * `preds` as long as `train`: attached to the full frame, which is then
      restricted to `indices` (all rows are kept when `indices` is None);
    * otherwise: `train` is restricted to `indices` first and `preds` must
      hold one value per selected row, matching how `compute_final_metric`
      builds its frame.

    Returns
    -------
    pandas.DataFrame
        One row per subgroup with the columns subgroup, subgroup_size,
        subgroup_auc, bpsn_auc and bnsp_auc.

    Raises
    ------
    ValueError
        If neither `df` nor `preds` is given, or if `preds` is not
        full-length and `indices` is missing.
    """
    if df is None:
        if preds is None:
            raise ValueError("compute_bias_auc needs either `df` or `preds`")
        if len(preds) == len(train):
            # Full-length predictions: attach first, then select rows.
            df = train.copy()
            df['preds'] = preds
            if indices is not None:
                df = df.loc[indices].copy()
        else:
            # Predictions for the selected rows only: select first, then attach.
            if indices is None:
                raise ValueError("`indices` is required when `preds` does not cover all of `train`")
            df = train.loc[indices].copy()
            df['preds'] = preds

    records=[]
    for subgroup in identity_columns:
        in_subgroup = df[subgroup]==1
        record={
            'subgroup': subgroup,
            'subgroup_size': int(in_subgroup.sum())
        }
        record[SUBGROUP_AUC]=roc_auc_score(df.loc[in_subgroup,'target'].values,
                                           df.loc[in_subgroup,'preds'].values)
        record[BPSN_AUC]=compute_bpsn(df,subgroup)
        record[BNSP_AUC]=compute_bnsp(df,subgroup)
        records.append(record)

    return pd.DataFrame(records)[['subgroup','subgroup_size',SUBGROUP_AUC,BPSN_AUC,BNSP_AUC]]

def compute_power_mean(series,p):
    """Return the power mean of `series` with exponent `p`.

    Computed as (sum(series ** p) / len(series)) ** (1 / p).
    """
    powered = np.power(series, p)
    mean_of_powers = np.sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def compute_final_metric(indices,preds,overall_auc=None,p=-5,w=0.25):
    """Blend the overall AUC with the power-mean bias AUCs into one score.

    Parameters
    ----------
    indices : row labels of the global `train` frame to evaluate.
    preds : predictions, one per selected row.
    overall_auc : optional precomputed overall ROC AUC; computed from the
        selected rows when None.
    p : exponent for the power mean over subgroups.
    w : weight of the overall AUC; the bias score gets 1 - w.

    Returns
    -------
    (bias_df, score) : the per-subgroup table from `compute_bias_auc` and
        w * overall_auc + (1 - w) * mean of the three power means.
    """
    df = train.loc[indices].copy()
    df['preds'] = preds

    if overall_auc is None:
        overall_auc = roc_auc_score(df['target'], df['preds'])

    bias_df = compute_bias_auc(df=df)
    power_means = [
        compute_power_mean(bias_df[metric], p)
        for metric in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(power_means)

    final_score = w * overall_auc + (1 - w) * bias_score
    return bias_df, final_score

## SPLITTING THE DATA INTO TRAIN AND VALIDATION

In [9]:
n_folds = 4
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
oof_preds = np.zeros((X.shape[0],))

# Only the first fold is used in this notebook, so take just that split
# from the stratified splitter.
fold, (train_index, val_index) = next(enumerate(kfold.split(X, train['target'].values)))
FOLD = fold

X_train, X_val = X[train_index].copy(), X[val_index].copy()
y_train, y_val = y[train_index].copy(), y[val_index].copy()
weights_train = weights[train_index].copy()
val_indices = val_index
train_indices = train_index

# The full arrays are no longer needed once the fold has been copied out.
gc.enable()
del X, y, weights
print(gc.collect())

print("Training Shape", X_train.shape, y_train.shape)
print("Validation Shape", X_val.shape, y_val.shape)

35
Training Shape (1353655, 200) (1353655, 7)
Validation Shape (451219, 200) (451219, 7)


In [10]:
# Wrap the training fold as tensors: inputs as long, targets and per-sample
# weights as float. torch.tensor() always copies its input, so the NumPy arrays
# are passed directly; an extra .copy() would only duplicate them at peak memory.
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_train, dtype=torch.long),
    torch.tensor(y_train, dtype=torch.float),
    torch.tensor(weights_train, dtype=torch.float),
)

# The tensors own their data now, so the NumPy copies can be released.
print("Deleting the X_train and y_train")
gc.enable()
del X_train,y_train,weights_train
gc.collect()

Deleting the X_train and y_train


0

## LOSS FUNCTIONS

In [11]:
class BceLoss(nn.Module):
    """Per-sample-weighted binary cross-entropy computed from raw logits.

    The logits are passed through a sigmoid, clamped to [eps, 1 - eps], and
    the element-wise log-likelihood is summed over the label dimension. Each
    sample's sum is multiplied by its weight, and the negated total is
    divided by the batch size.

    NOTE(review): with the default eps=1e-8 the upper bound 1 - eps rounds to
    1.0 in float32, so the clamp does not guard log(1 - p) against p == 1.
    """
    def __init__(self,eps=1e-8):
        super().__init__()
        self.eps = eps
        self.sigmoid = nn.Sigmoid()

    def forward(self,y_true,y_pred,weights=None):
        """
        Computes the BCE Loss with respect to y_true.

        y_true  : targets, shape (batch_size,) or (batch_size, n_labels)
        y_pred  : raw logits with the same shape as y_true
        weights : optional per-sample weights of shape (batch_size,);
                  defaults to all ones, created on y_pred's device so the
                  loss also works on CPU or on a non-default GPU.

        Returns a scalar tensor.
        """
        y_pred = self.sigmoid(y_pred)

        if weights is None:
            weights = torch.ones((y_true.shape[0],), dtype=torch.float, device=y_pred.device)

        y_pred = torch.clamp(y_pred,self.eps,1-self.eps)
        m = y_pred.shape[0]

        if len(y_true.shape)==1:
            # Give 1-D inputs a label dimension so the dim=1 reduction applies.
            y_true = torch.unsqueeze(y_true,1)
            y_pred = torch.unsqueeze(y_pred,1)

        # Log-likelihood per sample, summed over labels.
        loss = torch.sum(y_true*torch.log(y_pred)+(1-y_true)*torch.log(1-y_pred),dim=1)
        # Weighted negative log-likelihood, averaged over the batch.
        loss = -torch.sum(weights*loss)/m

        return loss
        
class BinaryFocalLoss(nn.Module):
    """Per-sample-weighted binary focal loss computed from raw logits.

    Same structure as a weighted BCE, except that each log term is scaled by
    the modulating factor (1 - p) ** gamma for the positive part and
    p ** gamma for the negative part. gamma=0 reduces to plain weighted BCE.
    """
    def __init__(self,gamma=0,eps=1e-3):
        super().__init__()
        self.gamma=gamma
        self.eps=eps
        self.sigmoid = nn.Sigmoid()

    def forward(self,y_true,y_pred,weights=None):
        """
        Computes the focal loss with respect to y_true.

        y_true  : targets, shape (batch_size,) or (batch_size, n_labels)
        y_pred  : raw logits with the same shape as y_true
        weights : optional per-sample weights of shape (batch_size,);
                  defaults to all ones, created on y_pred's device so the
                  loss also works on CPU or on a non-default GPU.

        Returns a scalar tensor.
        """
        y_pred = self.sigmoid(y_pred)

        if weights is None:
            weights = torch.ones((y_true.shape[0],), dtype=torch.float, device=y_pred.device)

        y_pred = torch.clamp(y_pred,self.eps,1-self.eps)
        m = y_pred.shape[0]

        if len(y_true.shape)==1:
            # Give 1-D inputs a label dimension so the dim=1 reduction applies.
            y_true = torch.unsqueeze(y_true,1)
            y_pred = torch.unsqueeze(y_pred,1)

        # Modulated log-likelihood per sample, summed over labels.
        loss = torch.sum(y_true*torch.pow(1-y_pred,self.gamma)*torch.log(y_pred) + \
               (1-y_true)*torch.pow(y_pred,self.gamma)*torch.log(1-y_pred),dim=1)

        # Weighted negative log-likelihood, averaged over the batch.
        loss = -torch.sum(weights*loss)/m

        return loss

In [12]:
# Optimisation hyper-parameters.
batch_size = 32
lr = 5e-5
accumulation_steps = 2  # number of backward passes accumulated per optimizer update

# Checkpoint file name; encodes the 1-based fold number and the seed.
output_model_file = "".join(["fine_tuned_gpt2_fold_", str(FOLD + 1), "_seed_", str(SEED), ".bin"])

## TRAINING

In [13]:
# Pretrained GPT-2 with a classification head: one output for the main target
# plus one per entry of aux_cols.
model = GPT2ClassificationHeadModel.from_pretrained(pretrained_model_name_or_path="../input/gpt2-models/",clf_dropout=0.4,
                                                    n_class = len(aux_cols)+1)
model.zero_grad()
model.cuda()

loss_fn = BinaryFocalLoss(gamma = 2)

# Two parameter groups: weight decay 0.01 everywhere except parameters whose
# name contains one of the `no_decay` fragments (biases and LayerNorm).
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle = True)

# The schedule length must match the number of optimizer updates actually
# performed: one per `accumulation_steps` batches, plus one for a trailing
# partial group. Counting from len(train_loader) (ceil over batches) avoids
# the off-by-one of flooring len(train_dataset)/batch_size/accumulation_steps.
updates_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
num_train_optimization_steps = EPOCHS * updates_per_epoch

optimizer = OpenAIAdam(optimizer_grouped_parameters,
                       lr=lr,
                       warmup=0.05,
                       t_total=num_train_optimization_steps)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)

# Make training mode explicit so dropout is active; a no-op if the model is
# already in training mode.
model.train()

# NOTE(review): gradients of the accumulated batches are summed, not averaged
# (the loss is not divided by accumulation_steps); left unchanged.
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    has_pending_grads = False
    # Batch-local names keep the loop from rebinding X_train / y_train / weights.
    for batch,(X_batch,y_batch,w_batch) in tqdm(enumerate(train_loader),total=len(train_loader),leave=False):
        X_batch = X_batch.cuda()
        y_batch = y_batch.cuda()
        w_batch = w_batch.cuda()
        # Call the module itself (not .forward) so registered hooks run.
        y_pred = model(X_batch)
        loss = loss_fn(y_batch,y_pred,w_batch)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        has_pending_grads = True
        if (batch+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()
            has_pending_grads = False
    # Apply gradients left over when the batch count is not a multiple of
    # accumulation_steps, instead of silently discarding them.
    if has_pending_grads:
        optimizer.step()
        optimizer.zero_grad()

 18%|█▊        | 7416/42302 [1:21:13<6:24:36,  1.51it/s]

In [14]:
# Save only the model's state_dict under the fold/seed-specific file name;
# the validation cell reloads it into a freshly constructed model.
torch.save(model.state_dict(), output_model_file)

In [15]:
# Drop the training dataset, its loader and the trained model, then run the
# garbage collector. The validation cell builds a new `model` from the saved file.
gc.enable()
del train_dataset,train_loader,model
gc.collect()

0

## VALIDATION

In [16]:
# Run validation: rebuild the model from its config, load the fine-tuned
# weights, and score the held-out fold in inference mode.
model = GPT2ClassificationHeadModel(gpt_config,clf_dropout=0.4,n_class=len(aux_cols)+1)
model.load_state_dict(torch.load(output_model_file))
model.cuda()
for param in model.parameters():
    param.requires_grad=False
model.eval()

batch_size = 128
valid_preds = np.zeros((X_val.shape[0],))
valid_dataset = TensorDataset(torch.tensor(X_val,dtype=torch.long))
valid_loader = DataLoader(valid_dataset,batch_size=batch_size,shuffle = False)

# Batch-local names keep the loop from rebinding X_val and val_index, so this
# cell can be re-run.
write_offset = 0
with torch.no_grad():
    for (X_batch,) in tqdm(valid_loader,total = len(valid_loader),leave = False):
        X_batch = X_batch.cuda()
        # Call the module itself (not .forward) so registered hooks run.
        y_pred = model(X_batch)
        n_rows = X_batch.shape[0]
        # Column 0 is the main target; the stored values are the raw model
        # outputs (no sigmoid is applied here).
        valid_preds[write_offset:write_offset+n_rows] = y_pred[:,0].cpu().detach().numpy().reshape((-1,))
        write_offset = write_offset + n_rows

                                                   

In [17]:
%%time
# Score the validation fold: `m` is the per-subgroup AUC table and `l` is the
# final blended metric returned by compute_final_metric.
m,l = compute_final_metric(val_indices,valid_preds)
print(l)
display(m)

0.9376397865158378


Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,male,11104,0.932398,0.946864,0.962621
1,female,13407,0.929282,0.949486,0.959423
2,homosexual_gay_or_lesbian,2780,0.868999,0.888718,0.965972
3,christian,10050,0.937523,0.965937,0.946866
4,jewish,1868,0.905857,0.937881,0.954745
5,muslim,5271,0.881239,0.917447,0.958869
6,black,3653,0.858428,0.886464,0.964542
7,white,6228,0.862236,0.892632,0.964781
8,psychiatric_or_mental_illness,1242,0.939351,0.935404,0.971645


CPU times: user 4.7 s, sys: 848 ms, total: 5.55 s
Wall time: 5.57 s


In [18]:
# Write this fold's validation predictions into the full-length out-of-fold
# vector; rows outside the fold keep their initial 0. The values are the raw
# outputs stored by the validation cell.
oof_preds[val_indices] = valid_preds

oof = pd.DataFrame({'id': train['id'], 'prediction': oof_preds})
oof.to_csv("oof.csv", index=False)
print(oof.shape)
oof.head()

(1804874, 2)


Unnamed: 0,id,prediction
0,59848,0.0
1,59849,0.0
2,59852,-3.037109
3,59855,0.0
4,59856,1.761719
