In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sys
import os
import time
import datetime
import gc
import random
import re
import operator
import pickle
from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,log_loss


import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader,TensorDataset,Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim.optimizer import Optimizer

from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

# stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")

def seed_everything(SEED=42):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN flags.

    Args:
        SEED: integer seed handed to every generator (default 42).
    """
    # Export the seed as PYTHONHASHSEED.
    os.environ['PYTHONHASHSEED'] = str(SEED)

    # CPU-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(SEED)

    # CUDA generators: current device first, then every device.
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

    # Deterministic cuDNN kernels on, autotuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def init_func(worker_id):
    """Seed NumPy's global RNG with ``SEED + worker_id``.

    Relies on a module-level ``SEED`` being bound before the call; in this
    notebook it is assigned in a later cell, not next to this definition.

    Args:
        worker_id: integer offset added to ``SEED``.
    """
    worker_seed = SEED + worker_id
    np.random.seed(worker_seed)

# Enable tqdm's pandas integration (no pandas progress call appears in the visible cells).
tqdm.pandas()

# Start timestamp of the kernel; the final cell prints time.time() - t1 as the total run time.
t1=time.time()

def logit(x):
    """Return the log-odds ``log(x / (1 - x))`` of ``x`` (scalar or NumPy array)."""
    odds = x / (1 - x)
    return np.log(odds)

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x`` (scalar or NumPy array)."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

Using TensorFlow backend.


In [2]:
# Submission template. Its 'prediction' column is overwritten by the blend cell
# near the end of the notebook and the frame is then written to submission.csv.
sample=pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv")
# print(sample.shape)

# display(sample.head())

## NEW BERT INFERENCE

In [3]:
%%time
# Maximum number of tokens kept per comment. convert_lines_bert adds [CLS] and
# [SEP] on top of this, so each encoded row has MAX_SEQUENCE_LENGTH + 2 entries.
MAX_SEQUENCE_LENGTH = 250
# Global seed; init_func (first cell) reads this name at call time.
SEED = 42

# NOTE(review): package_dir_a is not referenced anywhere in the visible cells.
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT/"
# Directory holding the pretrained BERT files (bert_config.json is read from it below).
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertConfig
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer,BertAdam,BertModel
# Model hyper-parameters; the same file is loaded again later in this cell,
# right after the tokenizer is created.
bert_config = BertConfig.from_json_file(BERT_MODEL_PATH + 'bert_config.json')

class BertForSequenceClassification(BertPreTrainedModel):
    """BERT encoder with a linear head over concatenated pooled features.

    The head input is the concatenation of three vectors of size
    ``config.hidden_size``: the mean of the last encoder layer over the
    sequence axis, the max over the same axis, and BERT's pooled output.

    Args:
        config: ``BertConfig`` describing the encoder.
        num_labels: number of logits produced by the head.
    """
    def __init__(self, config, num_labels=2):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # avg-pool + max-pool + pooled output -> 3 * hidden_size
        # (2304 when hidden_size is 768, matching the previously hard-coded value).
        self.classifier = nn.Linear(3 * config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return logits, or the cross-entropy loss when ``labels`` is given.

        Args:
            input_ids: token ids passed to the BERT encoder.
            token_type_ids: optional segment ids passed through to the encoder.
            attention_mask: optional mask passed through to the encoder.
            labels: optional targets; when present the loss is returned
                instead of the logits.
        """
        layer_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        # NOTE(review): both pools run over every sequence position, padding
        # included. Left unchanged because the fine-tuned checkpoints were
        # presumably trained with exactly this pooling.
        avg_pool = torch.mean(layer_output, 1)
        max_pool, _ = torch.max(layer_output, 1)
        h_conc = torch.cat((avg_pool, max_pool, pooled_output), 1)
        logits = self.classifier(self.dropout(h_conc))

        if labels is None:
            return logits

        # Use the fully qualified name: a bare `CrossEntropyLoss` is never
        # imported in this notebook and would raise NameError here.
        loss_fct = nn.CrossEntropyLoss()
        return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

# Tokenizer loaded from BERT_MODEL_PATH with lower-casing enabled.
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)
# NOTE(review): repeats the bert_config load performed earlier in this cell.
bert_config = BertConfig.from_json_file(BERT_MODEL_PATH + 'bert_config.json')

def convert_lines_bert(x):
    """Encode one comment as a fixed-length array of token ids.

    At most the last MAX_SEQUENCE_LENGTH tokens are kept, wrapped in
    [CLS] ... [SEP], and right-padded with 0 so that every result has
    MAX_SEQUENCE_LENGTH + 2 entries.

    Args:
        x: comment text.

    Returns:
        1-D NumPy array of token ids.
    """
    pieces = bert_tokenizer.tokenize(x)
    # Over-long comments keep their tail rather than their head.
    pieces = pieces[-MAX_SEQUENCE_LENGTH:] if len(pieces) > MAX_SEQUENCE_LENGTH else pieces

    ids = bert_tokenizer.convert_tokens_to_ids(["[CLS]", *pieces, "[SEP]"])
    padding = [0] * (MAX_SEQUENCE_LENGTH - len(pieces))
    return np.array(ids + padding)

# Load the test comments; missing texts are replaced by a placeholder string.
test = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
test['comment_text'] = test['comment_text'].fillna("__NOT AVAILABLE__").astype(str)

# Encode every comment with convert_lines_bert in 4 worker processes; the rows
# are stacked into one 2-D array whose shape is printed below.
X_test = np.array(Parallel(n_jobs=4,backend="multiprocessing")(delayed(convert_lines_bert)(x) for x in test['comment_text'].copy()))
print(X_test.shape)

(97320, 252)
CPU times: user 2.56 s, sys: 516 ms, total: 3.07 s
Wall time: 1min 28s


In [4]:
! ls ../input/machaogongbertmodels

fine_tuned_bert_uncased_drop_0.2_fold_1_seed_42.bin
fine_tuned_bert_uncased_drop_0.2_fold_2_seed_42.bin
fine_tuned_bert_uncased_drop_0.2_fold_3_seed_42.bin
fine_tuned_bert_uncased_drop_0.2_fold_4_seed_42.bin
fine_tuned_bert_uncased_drop_0.2_fold_5_seed_42.bin
fine_tuned_bert_uncased_fold_1_seed_42.bin
fine_tuned_bert_uncased_fold_2_seed_42.bin
fine_tuned_bert_uncased_fold_3_seed_42.bin
fine_tuned_bert_uncased_fold_4_seed_42.bin
fine_tuned_bert_uncased_fold_5_seed_42.bin
fine_tuned_bert_uncased_lr_decrease_drop_0.4_fold_1_seed_10796.bin
fine_tuned_bert_uncased_lr_decrease_drop_0.4_fold_2_seed_10796.bin
fine_tuned_bert_uncased_lr_decrease_drop_0.4_fold_3_seed_10796.bin
fine_tuned_bert_uncased_lr_decrease_drop_0.4_fold_4_seed_10796.bin
fine_tuned_bert_uncased_lr_decrease_drop_0.4_fold_5_seed_10796.bin
fine_tuned_bert_uncased_lr_decrease_fold_1_seed_10796.bin
fine_tuned_bert_uncased_lr_decrease_fold_2_seed_10796.bin
fine_tuned_bert_uncased_lr_decrease_fold_3_seed_10796.bin

In [5]:
# Fine-tuned checkpoints (custom pooled-head BERT) whose predictions are averaged.
file_names = [
              'fine_tuned_bert_uncased_drop_0.2_fold_4_seed_42.bin',
              'fine_tuned_bert_uncased_fold_4_seed_42.bin',
              'fine_tuned_bert_uncased_lr_decrease_fold_2_seed_10796.bin',
              'fine_tuned_bert_uncased_drop_0.2_fold_1_seed_42.bin',
              'fine_tuned_bert_uncased_drop_0.2_fold_3_seed_42.bin',
              'fine_tuned_bert_uncased_fold_1_seed_42.bin'
             ]

# The head has len(aux_cols) + 1 outputs; only output 0 is used as the prediction.
aux_cols=['target','severe_toxicity','obscene','identity_attack','insult','threat']

batch_size = 128

# The encoded inputs are the same for every checkpoint, so build the loader once.
# shuffle=False keeps predictions in the row order of X_test.
test_dataset = TensorDataset(torch.tensor(X_test,dtype=torch.long))
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle = False)

# Running mean of the per-checkpoint probabilities.
test_preds_new_uncased_bert = np.zeros((X_test.shape[0],))

gc.enable()

for filename in file_names:
    model = BertForSequenceClassification(bert_config,num_labels=len(aux_cols)+1)
    # Deserialize on the CPU regardless of the device the checkpoint was saved
    # from, then move the whole model to the GPU in one step.
    model.load_state_dict(torch.load("../input/machaogongbertmodels/"+filename, map_location="cpu"))
    model.cuda()
    model.eval()

    test_index = 0
    test_preds_per_file = np.zeros((X_test.shape[0],))

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        for (x_test,) in tqdm(test_loader,total = len(test_loader),leave = False):
            x_test = x_test.cuda()
            # Token id 0 is the padding value written by convert_lines_bert.
            attention_mask = x_test>0
            # The model returns logits; squash them to probabilities.
            y_pred = torch.sigmoid(model(x_test,attention_mask=attention_mask,labels=None))

            n_rows = x_test.shape[0]
            test_preds_per_file[test_index : test_index+n_rows] = y_pred[:,0].cpu().numpy()
            test_index = test_index + n_rows

    test_preds_new_uncased_bert = test_preds_new_uncased_bert + test_preds_per_file/len(file_names)

    # Drop the model before the next checkpoint is instantiated.
    del model,x_test
    gc.collect()

# Free the inputs and tokenizer; the next section rebuilds its own.
del test_dataset,test_loader
del test,X_test,bert_tokenizer,bert_config
gc.collect()

test_preds_new_uncased_bert[0:10]

                                                 

array([0.05506802, 0.02594015, 0.13303767, 0.07483855, 0.81706617,
       0.02735929, 0.08431552, 0.10653062, 0.21092247, 0.16753298])

## BERT UNCASED MODEL INFERENCE

In [6]:
%%time
# Maximum number of tokens kept per comment (rows end up MAX_SEQUENCE_LENGTH + 2 long).
MAX_SEQUENCE_LENGTH = 250
SEED = 42

# NOTE(review): package_dir_a is not referenced anywhere in the visible cells.
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT/"
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

# CAUTION: this import rebinds the name BertForSequenceClassification, replacing
# the custom class defined earlier in the notebook with the library's class.
# Cells that run after this one get the library version, so re-running the
# earlier inference cell out of order would build a different architecture.
from pytorch_pretrained_bert.modeling import BertConfig,BertForSequenceClassification
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Tokenizer built from the vocab file with lower-casing enabled.
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH+"vocab.txt", cache_dir=None,do_lower_case=True)
bert_config = BertConfig.from_json_file(BERT_MODEL_PATH + 'bert_config.json')

def convert_lines_bert(x):
    """Encode one comment as a fixed-length array of token ids.

    At most the last MAX_SEQUENCE_LENGTH tokens are kept, wrapped in
    [CLS] ... [SEP], and right-padded with 0 so that every result has
    MAX_SEQUENCE_LENGTH + 2 entries.

    Args:
        x: comment text.

    Returns:
        1-D NumPy array of token ids.
    """
    pieces = bert_tokenizer.tokenize(x)
    # Over-long comments keep their tail rather than their head.
    pieces = pieces[-MAX_SEQUENCE_LENGTH:] if len(pieces) > MAX_SEQUENCE_LENGTH else pieces

    ids = bert_tokenizer.convert_tokens_to_ids(["[CLS]", *pieces, "[SEP]"])
    padding = [0] * (MAX_SEQUENCE_LENGTH - len(pieces))
    return np.array(ids + padding)

# Reload the test comments (the previous inference cell deleted `test` and `X_test`).
# NOTE(review): this appears to repeat the earlier encoding step with the same
# model directory and lower-casing setting — confirm whether X_test could be reused.
test = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
test['comment_text'] = test['comment_text'].fillna("__NOT AVAILABLE__").astype(str)

# Encode every comment with convert_lines_bert in 4 worker processes.
X_test = np.array(Parallel(n_jobs=4,backend="multiprocessing")(delayed(convert_lines_bert)(x) for x in test['comment_text'].copy()))
print(X_test.shape)

(97320, 252)
CPU times: user 2.22 s, sys: 780 ms, total: 3 s
Wall time: 1min 27s


In [7]:
# bert_config

In [8]:
# Fine-tuned checkpoints whose predictions are averaged. They are loaded into
# whatever class the name BertForSequenceClassification is bound to when this
# cell runs (the preceding import cell binds the library implementation).
file_names = [
              'fine_tuned_bert_uncased_fold_4_seed_42.bin',
              'fine_tuned_bert_uncased_fold_5_seed_42.bin'
            ]

# The head has len(aux_cols) + 1 outputs; only output 0 is used as the prediction.
aux_cols=['target','severe_toxicity','obscene','identity_attack','insult','threat']

batch_size = 128

# The encoded inputs are the same for every checkpoint, so build the loader once.
# shuffle=False keeps predictions in the row order of X_test.
test_dataset = TensorDataset(torch.tensor(X_test,dtype=torch.long))
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle = False)

# Running mean of the per-checkpoint probabilities.
test_preds_uncased_bert = np.zeros((X_test.shape[0],))

gc.enable()

for filename in file_names:
    model = BertForSequenceClassification(bert_config,num_labels=len(aux_cols)+1)
    # Deserialize on the CPU regardless of the device the checkpoint was saved
    # from, then move the whole model to the GPU in one step.
    model.load_state_dict(torch.load("../input/jigsaw-models/"+filename, map_location="cpu"))
    model.cuda()
    model.eval()

    test_index = 0
    test_preds_per_file = np.zeros((X_test.shape[0],))

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        for (x_test,) in tqdm(test_loader,total = len(test_loader),leave = False):
            x_test = x_test.cuda()
            # Token id 0 is the padding value written by convert_lines_bert.
            attention_mask = x_test>0
            # The model returns logits; squash them to probabilities.
            y_pred = torch.sigmoid(model(x_test,attention_mask=attention_mask,labels=None))

            n_rows = x_test.shape[0]
            test_preds_per_file[test_index : test_index+n_rows] = y_pred[:,0].cpu().numpy()
            test_index = test_index + n_rows

    test_preds_uncased_bert = test_preds_uncased_bert + test_preds_per_file/len(file_names)

    # Drop the model before the next checkpoint is instantiated.
    del model,x_test
    gc.collect()

# Free the inputs and tokenizer now that inference is finished.
del test_dataset,test_loader
del test,X_test,bert_tokenizer,bert_config
gc.collect()

test_preds_uncased_bert[0:10]

                                                 

array([0.05971995, 0.02074253, 0.12473429, 0.11312987, 0.85660744,
       0.02587287, 0.08236159, 0.08498987, 0.19745827, 0.12781212])

## BERT CASED MODEL INFERENCE

In [9]:
# %%time
# MAX_SEQUENCE_LENGTH = 250
# SEED = 42

# package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT/"
# BERT_MODEL_PATH = '../input/bert-pretrained-models/cased_l-12_h-768_a-12/cased_L-12_H-768_A-12/'

# from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertConfig
# from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
# from pytorch_pretrained_bert import BertTokenizer,BertAdam,BertModel
# bert_config = BertConfig.from_json_file(BERT_MODEL_PATH + 'bert_config.json')

# class BertForSequenceClassification(BertPreTrainedModel):
#     """BERT model for classification."""
#     def __init__(self, config, num_labels=2):
#         super(BertForSequenceClassification, self).__init__(config)
#         self.num_labels = num_labels
#         self.bert = BertModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.classifier = nn.Linear(2304, num_labels)
#         self.apply(self.init_bert_weights)

#     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
#         layer_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
#         avg_pool = torch.mean(layer_output, 1)
#         max_pool, _ = torch.max(layer_output, 1)
#         h_conc = torch.cat((avg_pool, max_pool, pooled_output), 1)
#         # final_output = torch.cat((h_conc, pooled_output), 1)
#         pooled_output = self.dropout(h_conc)
#         logits = self.classifier(pooled_output)

#         if labels is not None:
#             loss_fct = CrossEntropyLoss()
#             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
#             return loss
#         else:
#             return logits
        
# bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH+"vocab.txt", cache_dir=None,do_lower_case=False)
# bert_config = BertConfig.from_json_file(BERT_MODEL_PATH + 'bert_config.json')

# def convert_lines_bert(x):
#     tokens_a = bert_tokenizer.tokenize(x)
#     if len(tokens_a) > MAX_SEQUENCE_LENGTH:
#         tokens_a = tokens_a[-MAX_SEQUENCE_LENGTH:]
    
#     one_token = bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) + \
#                 [0]*(MAX_SEQUENCE_LENGTH-len(tokens_a))
    
#     return np.array(one_token)

# test = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
# test['comment_text'] = test['comment_text'].fillna("__NOT AVAILABLE__").astype(str)

# X_test = np.array(Parallel(n_jobs=4,backend="multiprocessing")(delayed(convert_lines_bert)(x) for x in test['comment_text'].copy()))
# print(X_test.shape)

In [10]:
# ! ls ../input/

In [11]:

# file_names = [
#               'fine_tuned_bert_cased_fold_1_seed_42.bin',
#               'fine_tuned_bert_cased_fold_3_seed_42.bin'
#              ]


# aux_cols=['target','severe_toxicity','obscene','identity_attack','insult','threat']

# test_preds_cased_bert = np.zeros((X_test.shape[0],))

# is_sigmoid = True

# for filename in file_names:
#     model = BertForSequenceClassification(bert_config,num_labels=len(aux_cols)+1)
#     model.load_state_dict(torch.load("../input/new-cased-bert-models-fold012/"+filename))
#     model = model.cuda()
#     for param in model.parameters():
#         param.requires_grad=False
#     model.eval()
    
#     batch_size = 128
#     test_index = 0
#     test_preds_per_file = np.zeros((X_test.shape[0],))
#     test_dataset = TensorDataset(torch.tensor(X_test,dtype=torch.long))
#     test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle = False)  # set shuffle = False.
    
#     for batch,(x_test,) in  tqdm(enumerate(test_loader),total = len(test_loader),leave = False):
#         x_test = x_test.cuda()
#         attention_mask = (x_test>0).cuda()
#         # the model gives logits .... 
#         if is_sigmoid:
#             y_pred = torch.sigmoid(model.forward(x_test,attention_mask=attention_mask,labels=None))
#         else:
#             y_pred = (model.forward(x_test,attention_mask=attention_mask,labels=None))
#         test_preds_per_file[test_index : test_index+x_test.shape[0]] = y_pred[:,0].cpu().detach().squeeze().numpy()
        
#         test_index = test_index + x_test.shape[0]
        
#     test_preds_cased_bert = test_preds_cased_bert + test_preds_per_file/len(file_names)
    
#     gc.enable()
#     del model,test_dataset,test_loader,x_test
#     gc.collect()

# if not is_sigmoid:
#     test_preds_cased_bert = sigmoid(test_preds_cased_bert)

# gc.enable()
# del test,X_test,bert_tokenizer,bert_config
# gc.collect()
    
# test_preds_cased_bert[0:10]


## SUBMISSION FILE

In [12]:
# Probability-space blend of the two ensembles.
# Earlier variant, kept for reference (test_preds_gpt2 / test_preds_cased_bert
# are not computed in this version of the notebook):
# sample['prediction'] = 0.7*(test_preds_uncased_bert) + 0.0*(test_preds_gpt2) + 0.3*(test_preds_cased_bert) 

# Blend weights, named so they can be tuned in one place.
W_UNCASED_BERT = 0.25
W_NEW_UNCASED_BERT = 0.75

sample['prediction'] = W_UNCASED_BERT*(test_preds_uncased_bert) + W_NEW_UNCASED_BERT*(test_preds_new_uncased_bert)

# Both inputs are averaged sigmoid outputs, so the blend must be a probability;
# this also catches NaNs before the submission file is written.
assert sample['prediction'].between(0, 1).all(), "blended predictions fall outside [0, 1]"

print(sample.shape)
sample.head()

(97320, 2)


Unnamed: 0,id,prediction
0,7000000,0.056231
1,7000001,0.024641
2,7000002,0.130962
3,7000003,0.084411
4,7000004,0.826951


In [13]:
# sample1=pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv")
# print(sample1.shape)
# sample1['prediction'] = 0.4*(test_preds_uncased_bert) + 0.6*(test_preds_new_uncased_bert)
# sample1['prediction1'] = 0.3*(test_preds_uncased_bert) + 0.7*(test_preds_new_uncased_bert)
# sample1.head()
# sample1.to_csv("submission1.csv",index=False)

In [14]:
# # logit blend
# test_preds_logit = logit(test_preds)
# test_preds_cased_bert_logit = logit(test_preds_cased_bert)
# test_preds_uncased_bert_logit = logit(test_preds_uncased_bert)

# sample['prediction'] = 0.34*(test_preds_logit) + 0.33*(test_preds_cased_bert_logit) + \
#                         0.33*(test_preds_uncased_bert_logit)

# print(sample.shape)
# sample.head()

In [15]:
# Write the blended predictions to the submission file, without the index column.
sample.to_csv("submission.csv",index=False)

In [16]:
# Seconds elapsed since t1 was recorded in the first cell.
print("The total run time of the kernel is",time.time()-t1)

The total run time of the kernel is 6653.8334374427795
