In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import time
import gc
import sys
import gzip

import numpy as np
import pandas as pd
import random
import shutil
import pickle

import torch
from torch import nn
from torch.utils import data
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from random import shuffle

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel
from pytorch_pretrained_bert.optimization import BertAdam

from keras.preprocessing import text, sequence
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from nltk.tokenize.treebank import TreebankWordTokenizer

# Print the entries of the ./data/ folder so the cell output shows which input files are present.
print(os.listdir("./data/"))

In [None]:
def seed_everything(seed=1235):
    """Seed the random number generators used by this notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy, and PyTorch
    (both ``torch.manual_seed`` and ``torch.cuda.manual_seed``), exports it
    as ``PYTHONHASHSEED``, and turns on cuDNN's deterministic mode.

    Args:
        seed: Integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeder takes the same single integer argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


seed_everything(1235)

In [None]:
# Configuration cell: defines the globals read by the later cells, converts the
# TensorFlow BERT checkpoint to PyTorch format, and loads the BERT config.
start_time = time.time()
print("Establishing Global Variables ...")

# Data Directory
# Relative folder that holds the CSV files and the BERT checkpoint folder.
directory = './data/'

# Torch Device
# NOTE(review): hard-coded to CUDA; presumably this notebook is only meant to run on a GPU host.
device = torch.device('cuda')

# Model Parameters
# max_length is passed to convert_lines as the padded sequence length;
# batch_size is used for the test DataLoader and the inference call.
# n_epochs and accumulation_steps are not read by any cell visible in this notebook.
max_length = 220
batch_size = 32
n_epochs = 2
accumulation_steps = 1

# Model/Split Seed/Parameters
# Change model_seed with every new/different model
# NOTE(review): the original note said to keep "split_seed" fixed, but no such variable
# is defined here; model_seed and current_split are also not read by any visible cell.
model_seed = 1234
current_split = 0

# Model File Paths
# TEST_FILE is read by the preprocessing and submission cells; TRAIN_FILE and
# PROCESSED_FILE are not read by any cell visible in this notebook.
TRAIN_FILE = directory + 'train.csv'
TEST_FILE  = directory + 'test.csv'
PROCESSED_FILE = 'train_seq.pickle'

# Directory/BERT Paths
# BERT_MODEL_PATH is the folder containing the TF checkpoint, its config and
# (as used by BertTokenizer.from_pretrained later) the vocabulary.
# BERT_WEIGHT_PATH is not read by any cell visible in this notebook.
WORK_DIR = directory
BERT_MODEL_PATH = directory + 'uncased_L-12_H-768_A-12/'
BERT_WEIGHT_PATH = 'bert_pytorch_model.bin'

# Convert the TensorFlow checkpoint into WORK_DIR/pytorch_model.bin.
# NOTE(review): this runs on every execution of the cell, and no visible cell loads
# the converted file — confirm whether it is still needed for inference.
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt',
    BERT_MODEL_PATH + 'bert_config.json',
    WORK_DIR + 'pytorch_model.bin')

# Place a copy of the config next to the converted weights, then load it;
# bert_config is what bert_model_inference uses to build the classifier.
shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')
bert_config = BertConfig(BERT_MODEL_PATH + 'bert_config.json')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def convert_lines(example, max_seq_length, tokenizer):
    """Encode texts as fixed-length rows of BERT vocabulary ids.

    Each text is tokenized, truncated to ``max_seq_length - 2`` tokens,
    wrapped in ``[CLS]`` / ``[SEP]`` and right-padded with id ``0`` so that
    every row holds exactly ``max_seq_length`` ids.

    Args:
        example: Iterable of strings to encode.
        max_seq_length: Length of each output row, including the two
            special tokens. Must be at least 2.
        tokenizer: Object exposing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)``.

    Returns:
        ``np.ndarray`` with one row of token ids per input text.

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2; there would be
            no room for ``[CLS]`` and ``[SEP]`` and rows would come out with
            inconsistent lengths.
    """
    if max_seq_length < 2:
        raise ValueError(
            "max_seq_length must be >= 2 to fit [CLS] and [SEP], got %s" % max_seq_length)

    # Budget left for real tokens once the two special tokens are reserved.
    max_tokens = max_seq_length - 2
    all_tokens = []

    # `comment` rather than `text`, to avoid shadowing the keras `text` module
    # imported at the top of the notebook.
    for comment in tqdm(example):
        tokens = tokenizer.tokenize(comment)[:max_tokens]
        token_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
        padding = [0] * (max_tokens - len(tokens))
        all_tokens.append(token_ids + padding)

    return np.array(all_tokens)

In [None]:
# Load the test set and encode its comments into padded BERT token-id rows.
start_time = time.time()
print("Processing Data ...")

bert_test = pd.read_csv(TEST_FILE)
# Fill missing comments BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna() has nothing left to replace.
bert_test['comment_text'] = bert_test['comment_text'].fillna("DUMMY_VALUE").astype(str)
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
bert_test_set = convert_lines(bert_test["comment_text"], max_length, tokenizer)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
class MyBertClassifier(BertPreTrainedModel):
    """BERT encoder followed by a residual dense layer and two linear heads.

    The forward pass returns, per input row, one main output value followed
    by ``num_aux_targets`` auxiliary output values.
    """

    def __init__(self, config, num_aux_targets):
        """Build the encoder, the dense layer and both output heads.

        Args:
            config: BERT configuration; ``config.hidden_size`` sizes the
                dense layer and the inputs of both heads.
            num_aux_targets: Number of outputs of the auxiliary head.
        """
        super(MyBertClassifier, self).__init__(config)

        hidden_size = config.hidden_size

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.linear_out = nn.Linear(hidden_size, 1)
        self.linear_aux_out = nn.Linear(hidden_size, num_aux_targets)
        # Run the inherited weight initialiser over this module and its submodules.
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Compute the concatenated main and auxiliary outputs.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder.
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Tensor whose column 0 is the main output and whose remaining
            columns are the auxiliary outputs.
        """
        # Only the pooled output is needed; the per-token encoding is discarded.
        _, pooled = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False,
        )
        pooled = self.dropout(pooled)

        # Dense + ReLU + dropout branch, added back onto the pooled vector.
        branch = self.dropout(F.relu(self.linear(pooled)))
        hidden = pooled + branch

        main_out = self.linear_out(hidden)
        aux_out = self.linear_aux_out(hidden)

        return torch.cat([main_out, aux_out], 1)

In [None]:
# Wrap the encoded test rows in a Dataset/DataLoader for batched inference.
start_time = time.time()
print("Establishing DataLoaders ...")

device = torch.device('cuda')
bert_test_size = len(bert_test_set)

# Single-tensor dataset: each item is a 1-tuple holding one row of token ids.
bert_test_dataset = data.TensorDataset(
    torch.tensor(bert_test_set, dtype=torch.long),
)
# Keep the original row order so predictions line up with the test ids.
bert_test_loader = data.DataLoader(
    bert_test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def bert_model_inference(model_data, test_loader, test_size, batch_size):
    """Score ``test_loader`` with a saved ``MyBertClassifier`` checkpoint.

    Builds the classifier from the notebook-level ``bert_config``, loads the
    weights from ``model_data`` and returns the sigmoid of the first output
    column for every row produced by the loader.

    Args:
        model_data: Path (or file-like object) of a ``state_dict`` that
            ``torch.load`` can read.
        test_loader: Sized iterable yielding 1-tuples ``(x_batch,)`` of
            token-id tensors, in the order predictions should be returned.
        test_size: Total number of rows the loader yields.
        batch_size: Kept for backward compatibility. Rows are now placed by
            a running offset, so the result no longer depends on this value
            matching the loader's real batch size.

    Returns:
        1-D ``np.ndarray`` of length ``test_size`` with values in (0, 1).
    """
    # Prefer the GPU, but stay usable on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE(review): 6 auxiliary targets is assumed to match the checkpoint's
    # architecture — confirm against the training notebook.
    model = MyBertClassifier(bert_config, 6)
    # Load onto CPU first so a checkpoint saved from a GPU also opens on
    # hosts without CUDA; the whole model is moved to `device` right after.
    model.load_state_dict(torch.load(model_data, map_location='cpu'))
    model.to(device)
    model.eval()

    test_preds = np.zeros((test_size))
    tk = tqdm(test_loader, total=len(test_loader), leave=False)

    offset = 0
    # Inference only: disable autograd bookkeeping for the whole loop.
    with torch.no_grad():
        for (x_batch,) in tk:
            x_batch = x_batch.to(device)
            # Token id 0 is padding, so it is masked out of attention.
            y_pred = model(x_batch, attention_mask=(x_batch > 0), labels=None)
            n_rows = x_batch.size(0)
            test_preds[offset:offset + n_rows] = y_pred[:, 0].cpu().numpy()
            offset += n_rows

    test_preds = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()
    return test_preds

In [None]:
# Run the classifier over the test loader and keep the per-row predictions in `preds`.
start_time = time.time()
print("Inferencing Predictions ...")

# NOTE(review): `bert_model_data` is not defined in any cell visible in this notebook,
# so this line raises NameError on a fresh kernel. Presumably it is a list of
# checkpoint paths whose first entry is the model to score — define it in the
# configuration cell and confirm the intended path.
preds = bert_model_inference(bert_model_data[0], bert_test_loader, bert_test_size, batch_size)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Pair every test id with its prediction and write the submission CSV.
start_time = time.time()
print("Generate Submission ...")

test = pd.read_csv(TEST_FILE)
submission = pd.DataFrame(
    {
        'id': test['id'],
        'prediction': preds,
    }
)
# index=False keeps the file to just the two named columns.
submission.to_csv('bert_submission.csv', index=False)

print("--- %s seconds ---" % (time.time() - start_time))