In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch.nn as nn
import torch
import transformers
from tqdm import tqdm
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


# import torch_xla
# import torch_xla.debug.metrics as met
# import torch_xla.distributed.data_parallel as dp
# import torch_xla.distributed.parallel_loader as pl
# import torch_xla.utils.utils as xu
# import torch_xla.core.xla_model as xm
# import torch_xla.distributed.xla_multiprocessing as xmp
# import torch_xla.test.test_utils as test_utils

import warnings
warnings.filterwarnings("ignore")


In [None]:
# MAX_LEN = 160 # 192 128
# TRAIN_BATCH_SIZE = 64 # 128
# VALID_BATCH_SIZE = 64
# EPOCHS = 2
# # BERT_PATH = "../input/bert-base-multilingual-uncased/" #"../input/bert_base_uncased/" bert-base-multilingual-uncased
# BERT_PATH = "../input/bert-base-uncased/"
# MODEL_PATH = "model.bin"
# TOKENIZER = transformers.BertTokenizer.from_pretrained(
#     BERT_PATH,
#     do_lower_case=True
# )

In [None]:
class BERTDataset:
    """Map-style dataset that tokenizes comments and right-pads them with zeros.

    Args:
        comment_text: Indexable collection of raw comments.
        target: Indexable collection of labels, aligned with ``comment_text``.
        tokenizer: Object exposing ``encode_plus`` (called with the text,
            ``None``, ``add_special_tokens=True`` and ``max_length``).
        max_length: Length every encoded field is padded up to.
    """

    def __init__(self, comment_text, target, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.comment_text = comment_text
        self.target = target

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, item):
        """Encode one comment and return its padded tensors together with its label.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` (``torch.long``)
            and ``targets`` (``torch.float``).
        """
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.comment_text[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len
        )

        # One shared zero-padding tail, sized from the input ids.
        pad_tail = [0] * (self.max_len - len(encoded["input_ids"]))

        field_map = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        sample = {
            out_key: torch.tensor(encoded[src_key] + pad_tail, dtype=torch.long)
            for out_key, src_key in field_map
        }
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample

In [None]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by a mean+max pooled linear head emitting one logit.

    Args:
        bert_path: Location handed to ``transformers.BertModel.from_pretrained``.
    """

    def __init__(self, bert_path):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        # Twice the feature width because forward() concatenates the mean- and
        # max-pooled features; assumes the encoder's hidden size is 768 — TODO confirm.
        self.out = nn.Linear(768*2, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw (pre-sigmoid) logit per example.

        Args:
            ids: Token ids passed to the encoder as its first argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Passed to the encoder as ``token_type_ids``.

        Returns:
            Output of the final linear layer (one column per example row).
        """
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Take the first output by index rather than tuple-unpacking: newer
        # Transformers releases return a dict-like object whose iteration yields
        # key names, while integer indexing works for both return forms.
        o1 = encoder_outputs[0]

        # Pool over dim 1 (presumably the sequence axis — TODO confirm).
        # NOTE(review): pooling does not apply `mask`, so padded positions contribute.
        mean_pooling = torch.mean(o1, 1)
        max_pooling, _ = torch.max(o1, 1)
        cat = torch.cat((mean_pooling, max_pooling), 1)

        bo = self.bert_drop(cat)
        output = self.out(bo)
        return output

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits.

    ``targets`` is viewed as a single column (``-1, 1``) before being compared
    with ``outputs``.
    """
    criterion = nn.BCEWithLogitsLoss()
    column_targets = targets.view(-1, 1)
    return criterion(outputs, column_targets)


def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one pass over ``data_loader``, stepping the optimizer and scheduler per batch.

    Relies on the module-level names ``xm`` (used for ``optimizer_step`` and
    ``master_print``) and ``loss_fn``.

    Args:
        data_loader: Iterable of dicts with ``ids``, ``token_type_ids``,
            ``mask`` and ``targets`` tensors.
        model: Module called with ``ids``, ``mask`` and ``token_type_ids`` keywords.
        optimizer: Optimizer whose gradients are zeroed before every batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
    """
    model.train()

    for step, batch in enumerate(data_loader):
        # Move each field to the target device with the dtype the model/loss expects.
        input_ids = batch["ids"].to(device, dtype=torch.long)
        segment_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        attention = batch["mask"].to(device, dtype=torch.long)
        labels = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(ids=input_ids, mask=attention, token_type_ids=segment_ids)

        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        xm.optimizer_step(optimizer)
        scheduler.step()

        # Report the loss every tenth batch.
        if step % 10 == 0:
            xm.master_print(f"bi={step}, loss={batch_loss}")


def eval_fn(data_loader, model, device):
    """Score every batch of ``data_loader`` and collect predictions and labels.

    Args:
        data_loader: Iterable of dicts with ``ids``, ``token_type_ids``,
            ``mask`` and ``targets`` tensors.
        model: Module called with ``ids``, ``mask`` and ``token_type_ids`` keywords.
        device: Device the batch tensors are moved to.

    Returns:
        ``(fin_outputs, fin_targets)``: the sigmoid of the model outputs and the
        targets, each converted with ``.tolist()`` and accumulated in loader order.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    # Evaluation never calls backward(), so turn autograd off: otherwise a graph
    # is recorded for every batch, wasting memory and time.
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
# tox = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"])
# bias = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"])
# val = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/validation.csv")

In [None]:
# test = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/test.csv")

In [None]:
# val.lang.value_counts()

In [None]:
# val[val.lang == 'tr']['toxic'].value_counts()

In [None]:
# val[val.lang == 'es']['toxic'].value_counts()

In [None]:
# val[val.lang == 'it']['toxic'].value_counts()

In [None]:
# test.lang.value_counts()

In [None]:
# len(test[test.lang == 'pt'].content[6])

In [None]:
def run():
    """Fine-tune the BERT classifier on an XLA device, checkpointing every epoch.

    Steps:
      1. Read the two training CSVs, concatenate them, shuffle with a fixed
         seed and keep the first 200,000 rows.
      2. Read the translated validation CSV and append it to the training rows.
      3. Build per-replica samplers/loaders, the model, AdamW and a linear
         schedule without warmup.
      4. For each epoch: train, evaluate on the validation loader, save the
         weights to ``MODEL_PATH`` and, when the AUC improves, to
         ``best_model.bin``.

    Requires these module-level names to be defined before the call:
    ``TOKENIZER``, ``MAX_LEN``, ``TRAIN_BATCH_SIZE``, ``EPOCHS``, ``BERT_PATH``,
    ``MODEL_PATH`` and the torch_xla aliases ``xm`` and ``pl``.
    """
    df1 = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"])

    df_all = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    df_train = df_all.sample(frac=1, random_state=100).reset_index(drop=True).head(200_000)

    df_valid = pd.read_csv("../input/jigsaw-translate-en/validation_en.csv", usecols=["content_en", "toxic"])
    # NOTE(review): `comment_text` is created by this assignment, so rows whose
    # content_en equals 'pass' are left as NaN — confirm that is intended.
    df_valid.loc[df_valid.content_en != 'pass' , 'comment_text'] = df_valid.loc[df_valid.content_en != 'pass' , 'content_en']
    df_valid = df_valid.loc[:, ["comment_text", "toxic"]]

    # NOTE(review): the validation rows become part of the training data, so the
    # AUC computed below is not measured on held-out examples.
    df_train = pd.concat([df_train, df_valid], axis=0).reset_index(drop=True)
    df_train = df_train.sample(frac=1, random_state=100).reset_index(drop=True)

    train_dataset = BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values,
        tokenizer=TOKENIZER,
        max_length=MAX_LEN
    )

    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=1,
        sampler=train_sampler,
        drop_last=True
    )

    valid_dataset = BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values,
        tokenizer=TOKENIZER,
        max_length=MAX_LEN
    )

    valid_sampler = torch.utils.data.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=16, #VALID_BATCH_SIZE,
        num_workers=1,#
        sampler=valid_sampler, #
        drop_last=False #
    )

    device = xm.xla_device()
    model = BERTBaseUncased(bert_path=BERT_PATH).to(device)

    # Parameters whose names contain one of these substrings get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    # Scheduler steps per replica: dataset size / batch size / replicas * epochs.
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)
    xm.master_print(f'num_train_steps = {num_train_steps}, world_size={xm.xrt_world_size()}')
    # Base learning rate is scaled linearly with the number of replicas.
    lr = 0.4 * 1e-5 * xm.xrt_world_size()
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    best_score = float('-inf')
    for epoch in range(EPOCHS):
        # DistributedSampler seeds its shuffle from the epoch number; without
        # this call every epoch would replay exactly the same ordering.
        train_sampler.set_epoch(epoch)

        # train
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        train_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler)

        # valid
        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        outputs, targets = eval_fn(para_loader.per_device_loader(device), model, device)

        # Always keep the latest weights; the best-by-AUC copy is saved below.
        xm.save(model.state_dict(), MODEL_PATH)
        # Binarize the targets at 0.5 before computing ROC AUC.
        targets = np.array(targets) >= 0.5
        auc_score = metrics.roc_auc_score(targets, outputs)
        xm.master_print(f"AUC = {auc_score}")
        # NOTE(review): each replica scores only its own sampler shard, so with
        # more than one process this branch may differ between replicas — verify
        # before running with nprocs > 1.
        if auc_score > best_score:
            xm.save(model.state_dict(), "best_model.bin")
            best_score = auc_score
            xm.master_print("New best score: %.5f" % best_score)

In [None]:
# def _mp_fn(rank, flags):
#     torch.set_default_tensor_type('torch.FloatTensor')
#     a = run()

# FLAGS={}
# xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=1, start_method='fork')

In [None]:
# %tb

# inference

In [None]:
import torch
import pandas as pd
from scipy import stats
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
import sys

In [None]:
class BERTBaseUncased(nn.Module):
    """Inference-time copy of the classifier: BERT encoder plus a mean+max pooled linear head.

    Args:
        bert_path: Location handed to ``transformers.BertModel.from_pretrained``.
    """

    def __init__(self, bert_path):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        # Twice the feature width because forward() concatenates the mean- and
        # max-pooled features; assumes the encoder's hidden size is 768 — TODO confirm.
        self.out = nn.Linear(768 * 2, 1)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        """Return one raw (pre-sigmoid) logit per example.

        Args:
            ids: Token ids passed to the encoder as its first argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Passed to the encoder as ``token_type_ids``.

        Returns:
            Output of the final linear layer (one column per example row).
        """
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)
        # Take the first output by index rather than tuple-unpacking: newer
        # Transformers releases return a dict-like object whose iteration yields
        # key names, while integer indexing works for both return forms. The
        # second output was never used, so it is no longer bound.
        o1 = encoder_outputs[0]

        # Pool over dim 1 (presumably the sequence axis — TODO confirm).
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        cat = torch.cat((apool, mpool), 1)

        bo = self.bert_drop(cat)
        p2 = self.out(bo)
        return p2


class BERTDatasetTest:
    """Label-free dataset: tokenizes comments and right-pads them with zeros.

    Args:
        comment_text: Indexable collection of raw comments.
        tokenizer: Object exposing ``encode_plus`` (called with the text,
            ``None``, ``add_special_tokens=True`` and ``max_length``).
        max_length: Length every encoded field is padded up to.
    """

    def __init__(self, comment_text, tokenizer, max_length):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.comment_text = comment_text

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, item):
        """Encode one comment into padded ``ids``, ``mask`` and ``token_type_ids`` tensors."""
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.comment_text[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
        )

        # One shared zero-padding tail, sized from the input ids.
        pad_tail = [0] * (self.max_length - len(encoded["input_ids"]))

        def _padded(field):
            # Append the padding tail and convert the field to a long tensor.
            return torch.tensor(encoded[field] + pad_tail, dtype=torch.long)

        return {
            'ids': _padded("input_ids"),
            'mask': _padded("attention_mask"),
            'token_type_ids': _padded("token_type_ids"),
        }

In [None]:
# Path of the fine-tuned checkpoint; evaluating the bare string simply echoes it
# as the cell output. (A stray trailing quote previously made this a SyntaxError.)
'../input/model3-30-03/best_model3_30_03.bin'

In [None]:
# Read the translated test set (the original multilingual file is kept below
# as a commented-out alternative).
# df = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/test.csv")
df = pd.read_csv("../input/jigsaw-translate-en/test_en.csv")

# Rows whose content_en is anything other than the literal 'pass' get their
# `content` overwritten with content_en; the remaining rows keep their original
# `content`. The content_en column is then dropped.
not_pass = df.content_en != 'pass'
df.loc[not_pass, 'content'] = df.loc[not_pass, 'content_en']
df = df.drop('content_en', axis=1)

# Lower-casing tokenizer loaded from the local bert-base-uncased directory.
# tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-multilingual-uncased/", do_lower_case=True)
tokenizer = transformers.BertTokenizer.from_pretrained(
    "../input/bert-base-uncased/",
    do_lower_case=True,
)

In [None]:
# Prefer the GPU, but fall back to the CPU so this cell still runs on a machine
# without CUDA instead of failing at `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"
# model = BERTBaseUncased(bert_path="../input/bert-base-multilingual-uncased/").to(device)
model = BERTBaseUncased(bert_path="../input/bert-base-uncased/").to(device)
# model.load_state_dict(torch.load('../input/best-m2/best_model2.bin'))
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a different device type still loads here.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(
    torch.load('../input/model3-30-03/best_model3_30_03.bin', map_location=device)
)
# Switch to evaluation mode (disables dropout); as the last expression this
# also echoes the model as the cell output.
model.eval()

In [None]:
# Despite the "valid_" prefix, this wraps the `content` column of `df`, with
# every comment padded to 160 tokens.
valid_dataset = BERTDatasetTest(df.content.values, tokenizer, max_length=160)

# Sequential (unshuffled) batches that keep the final partial batch, so the
# outputs stay in the same order as the rows of `df`.
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    shuffle=False,
    drop_last=False,
    batch_size=64,
    num_workers=4,
)

In [None]:
# Run the model over every batch and collect its outputs in loader order.
# The values appended are the model's raw outputs: no sigmoid is applied here.
with torch.no_grad():
    fin_outputs = []
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show a total/ETA; the batch index was never used.
    for d in tqdm(valid_data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # No .detach() needed: tensors created under no_grad carry no graph.
        outputs_np = outputs.cpu().numpy().tolist()
        fin_outputs.extend(outputs_np)

In [None]:
# Load the sample submission, overwrite its "toxic" column with the collected
# model outputs, and write the result next to the notebook.
sample = pd.read_csv(
    "../input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv"
)
sample.loc[:, "toxic"] = fin_outputs
sample.to_csv(
    "submission.csv",
    index=False,
)

In [None]:
# Preview the first five rows of the submission frame.
sample.head(n=5)