In [1]:
"""
pseudo labeling
tweet dataset
"""
class Config:
    """Experiment configuration for pseudo-label training on the tweet dataset.

    Used as a plain attribute namespace: `setup` attaches runtime values
    (directories, device, logger) to it, and `class2dict` dumps every
    non-dunder attribute to Weights & Biases.
    """
    # --- identity / experiment tracking ---
    author = "mst8823"
    wandb_entity = "mst8823"
    
    competition = "jigsaw-toxic-severity-rating"  # used as the W&B project name
    name = "Pseudo-Labeling-001"  # experiment name; prefix of checkpoint filenames
    debug = False
    inference_only = False  # True: skip training-only installs, imports and data loading
    use_pretrain_model = True  # False: build the backbone from config only (weights come from a checkpoint)
    target_cols = ["pseudo_label"]
    
    # --- backbone / tokenization ---
    model_name = "unitary/multilingual-toxic-xlm-roberta"
    hidden_size = 768  # input width of the regression head
    head = 256  # tokens kept from the start of the sequence
    tail = 0  # tokens kept from the end of the sequence (0 = head-only truncation)
    max_length = head + tail

    # --- cross validation ---
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]  # folds that are actually trained / predicted
    seed = 2022

    # --- optimisation ---
    max_epochs = 4
    gradient_clip_val = 100
    accumulate_grad_batches = 1
    early_stopping = False
    optimizer = dict(
        optimizer="AdamW", 
        lr=1e-5, 
        weight_decay=2e-5
        )
    scheduler = dict(
        interval = "step",
        scheduler="CosineAnnealingWarmupRestarts",
        max_lr=1e-5,
        min_lr=1e-6,
        T_mult=1,
        warmup_steps=10,
        gamma=1)
    
    # --- data loading ---
    train_batch_size = 8
    valid_batch_size = 32
    num_workers = 4
    resume_from_checkpoint = None

    # --- Google Colab paths (only used when running on Colab) ---
    colab_dir = "/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments"
    drive_path = colab_dir + f"/{author}"
    api_path = drive_path + "/kaggle.json"  # Kaggle API token file read by `setup`

    upload_from_colab = False
    kaggle_dataset_path = None  # if set (Kaggle), checkpoints are read from this dataset's "model" directory

    # The bare string below is kept as an inline example of an alternative scheduler config.
    """
    - step scheduler example
    scheduler = dict(
        interval = "step",
        scheduler="get_cosine_schedule_with_warmup",
        num_warmup_steps=256, 
        num_cycles=0.5)

    """

In [2]:
import os
import re
import sys
import logging
import shutil
import json
import datetime
import requests
import itertools
import functools
import warnings
import joblib
import gc
import random
import string
import re
import collections

import pandas as pd
import numpy as np
import nltk

from tqdm.auto import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.special import softmax
from bs4 import BeautifulSoup

import torch
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    MultiStepLR, 
    ReduceLROnPlateau
    )
from torch.utils.data import Dataset, DataLoader

In [3]:
# =========================
# Utils
# =========================
class Logger:
    """Timestamped logger that writes to the console and to `Experiment.log`.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """

    def __init__(self, path):
        """Attach console and file handlers to the logger named `path`, once.

        Args:
            path: existing directory; the log file is `Experiment.log` inside it.
                The same string is used as the `logging` logger name.

        Handlers are only created when the named logger has none yet, so
        re-instantiating (e.g. re-running the notebook cell) neither duplicates
        log lines nor opens another file handle on the log file.
        """
        self.general_logger = logging.getLogger(path)
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log')))
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log `message` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def seed_everything(seed=2022):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode."""
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every generator the notebook relies on.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # No cuDNN autotuning, deterministic kernels only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def setup(cfg):
    cfg.COLAB = "google.colab" in sys.modules
    if cfg.COLAB:
        print("This environment is Google Colab")
        
        # mount
        from google.colab import drive
        if not os.path.isdir("/content/drive"):
            drive.mount('/content/drive') 
        
        # import library
        ! pip install --quiet pytorch_lightning
        ! pip install --quiet transformers
        ! pip install --quiet wandb
        ! pip install --quiet sentencepiece
        ! pip install --quiet 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        os.environ["KAGGLE_KEY"] = json_data["key"]
        
        # set dirs
        cfg.DRIVE = cfg.drive_path
        cfg.EXP = (cfg.name if cfg.name is not None 
            else requests.get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
        cfg.INPUT = os.path.join(cfg.DRIVE, "Input")
        cfg.OUTPUT = os.path.join(cfg.DRIVE, "Output")
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, "Submission")
        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, "model")
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, "preds")

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS] + cfg.jigsaw_inputs:
            os.makedirs(d, exist_ok=True)

        if not os.path.isfile(os.path.join(cfg.INPUT_JIGSAW_04, "comments_to_score.csv")):
            print("load dataset")
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p $cfg.INPUT_JIGSAW_01 
            ! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p $cfg.INPUT_JIGSAW_02 
            ! kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification -p $cfg.INPUT_JIGSAW_03 
            ! kaggle competitions download -c jigsaw-toxic-severity-rating -p $cfg.INPUT_JIGSAW_04 
            ! kaggle datasets download -d rajkumarl/ruddit-jigsaw-dataset -p $cfg.INPUT_RUDDIT

            for input_path in cfg.jigsaw_inputs:
                filepath = f'{input_path}/{input_path.split("/")[-1]}'
                ! unzip -d $input_path $filepath

    else:
        print("This environment is Kaggle Kernel")
        if not cfg.inference_only:
            ! pip install --quiet pytorch_lightning==1.5.8 
            ! pip install --quiet 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

        # set dirs
        cfg.INPUT = f"../input"

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        cfg.EXP = cfg.OUTPUT_EXP = "./"
        if cfg.kaggle_dataset_path is not None:
            cfg.EXP_MODEL = os.path.join(cfg.kaggle_dataset_path, "model")
        else:
            cfg.EXP_MODEL = os.path.join(cfg.EXP, "model")

        cfg.SUBMISSION = "./"
        cfg.EXP_FIG = os.path.join(cfg.EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.EXP, "preds")

        # make dirs
        make_dirs = [cfg.EXP_FIG, cfg.EXP_PREDS]
        if not cfg.inference_only:
            make_dirs.append(cfg.EXP_MODEL)
        for d in make_dirs:
            os.makedirs(d, exist_ok=True)

    # set device    
    cfg.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    warnings.filterwarnings("ignore")
    seed_everything(cfg.seed)

    cfg.logger = Logger(cfg.OUTPUT_EXP)

    return cfg


# =========================
# SetUp
# =========================
# Must run before the imports below: on a training run `setup` pip-installs
# the packages they need (always on Colab; on Kaggle unless `inference_only`).
Config = setup(Config)

# 2nd import
import pytorch_lightning as pl
import wandb

from transformers import (AutoConfig, AutoModel, AutoTokenizer)
from transformers import (get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup)

# the warmup-restart scheduler package is only installed (and needed) for training
if not Config.inference_only:
    from cosine_annealing_warmup import CosineAnnealingWarmupRestarts

# wandb setting
# Kaggle: log in with the "WANDB_API" user secret (training runs only).
# Colab: fall back to the regular `wandb.login()` flow.
if not Config.COLAB:
    if  not Config.inference_only:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("WANDB_API")
        wandb.login(key=api_key)
else:
    wandb.login()


This environment is Kaggle Kernel


[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Dataset

In [4]:
# =============================
# Dataset
# =============================
class JigsawTrainDataset(Dataset):
    """Dataset of (tokenized inputs, float target tensor) pairs built from a DataFrame.

    The text comes from column `text_col`, the targets from `cfg.target_cols`.
    """

    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.comment_text = df[text_col].values
        self.targets = df[cfg.target_cols].values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        # The text is coerced to str before tokenization.
        encoded = prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)
        label = torch.tensor(self.targets[idx]).float()
        return encoded, label


class JigsawTestDataset(Dataset):
    """Inference dataset yielding tokenized inputs only (no targets).

    Missing values in `text_col` are replaced by the literal string "none".
    """

    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.comment_text = df[text_col].fillna("none").values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        return prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)


def prepare_input(cfg, text, tokenizer):
    """Tokenize `text` into fixed-length (`cfg.max_length`) long tensors.

    Two truncation modes, selected by `cfg.tail`:

    * `cfg.tail == 0` (head only): the tokenizer itself truncates and pads to
      `cfg.max_length`.
    * otherwise (head + tail): the text is encoded without padding; sequences
      longer than `cfg.max_length` keep their first `cfg.head` and last
      `cfg.tail` tokens, and shorter ones are right-padded
      (`tokenizer.pad_token_id` for `input_ids`, 0 for every other field).

    Args:
        cfg: object exposing `head`, `tail` and `max_length`.
        text: string to encode.
        tokenizer: object exposing `encode_plus(...)` and `pad_token_id`.

    Returns:
        The mapping returned by `encode_plus`, with every value replaced by a
        1-D `torch.long` tensor of length `cfg.max_length`.
    """
    if cfg.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=cfg.max_length,
            # `padding="max_length"` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True)

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True)

    for k, v in inputs.items():
        if len(v) > cfg.max_length:
            # keep the beginning and the end of the sequence, drop the middle
            v = np.hstack([v[:cfg.head], v[-cfg.tail:]])

        pad_value = tokenizer.pad_token_id if k == 'input_ids' else 0
        padded = np.full(cfg.max_length, pad_value, dtype=np.int64)
        # use the length of the (possibly shortened) sequence, not the original one
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)

    return inputs


class JigsawDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train / validation DataFrames of one fold."""

    def __init__(self, cfg, tokenizer, train_df, valid_df, text_col):
        """Store the fold data; the datasets themselves are built in `setup`.

        Args:
            cfg: configuration exposing the batch sizes, `num_workers` and the
                fields read by `JigsawTrainDataset`.
            tokenizer: tokenizer handed to the datasets.
            train_df: training rows of the fold.
            valid_df: validation rows of the fold.
            text_col: name of the text column in both frames.
        """
        # `super(JigsawDataModule).__init__()` built an *unbound* super object and
        # never ran LightningDataModule.__init__ on this instance.
        super().__init__()

        self.cfg = cfg
        self.text_col = text_col
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df

        self.train_dataset = None
        self.val_dataset = None

    def setup(self, stage=None):
        """Build the train and validation datasets (both carry targets)."""
        self.train_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.train_df, tokenizer=self.tokenizer, text_col=self.text_col)
        self.val_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.valid_df, tokenizer=self.tokenizer, text_col=self.text_col)

    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=self.cfg.train_batch_size,
            shuffle=True,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=True)

        return train_dataloader

    def val_dataloader(self):
        """Ordered validation loader that keeps every sample."""
        val_dataloader = DataLoader(
            self.val_dataset,
            batch_size=self.cfg.valid_batch_size,
            shuffle=False,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=False)

        return val_dataloader

## Model

In [5]:
# =============================
# Model
# =============================
def get_optimizer(cfg, parameters):
    """Build the optimizer described by `cfg.optimizer`.

    `cfg.optimizer` is a dict with keys "optimizer" ("AdamW" or "Adam"),
    "lr" and "weight_decay". Any other optimizer name raises
    NotImplementedError.
    """
    settings = cfg.optimizer
    supported = {"AdamW": AdamW, "Adam": Adam}

    if settings["optimizer"] not in supported:
        raise NotImplementedError

    optimizer_cls = supported[settings["optimizer"]]
    return optimizer_cls(
        parameters,
        lr=settings["lr"],
        weight_decay=settings["weight_decay"],
    )


def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the LR scheduler described by `cfg.scheduler`.

    `cfg.scheduler["scheduler"]` selects one of
    "get_linear_schedule_with_warmup", "get_cosine_schedule_with_warmup",
    "MultiStepLR" or "CosineAnnealingWarmupRestarts"; the remaining keys of
    the dict supply that scheduler's hyper-parameters. Any other name raises
    NotImplementedError.
    """
    sch = cfg.scheduler
    kind = sch["scheduler"]

    if kind == "get_linear_schedule_with_warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=sch["num_warmup_steps"],
            num_training_steps=num_train_steps,
        )

    if kind == "get_cosine_schedule_with_warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=sch["num_warmup_steps"],
            num_training_steps=num_train_steps,
            num_cycles=sch["num_cycles"],
        )

    if kind == "MultiStepLR":
        return MultiStepLR(optimizer, milestones=sch["milestones"], gamma=sch["gamma"])

    if kind == "CosineAnnealingWarmupRestarts":
        steps_per_epoch = num_train_steps // cfg.max_epochs
        # NOTE(review): the cycle length is steps-per-epoch times the batch size,
        # i.e. it looks like one cycle spans `train_batch_size` epochs - confirm
        # this is intended.
        first_cycle_steps = steps_per_epoch * cfg.train_batch_size
        print(first_cycle_steps)
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=int(first_cycle_steps),
            cycle_mult=sch['T_mult'],
            max_lr=sch["max_lr"],
            min_lr=sch['min_lr'],
            warmup_steps=sch['warmup_steps'],
            gamma=sch['gamma'],
        )

    raise NotImplementedError


class JigsawModel(pl.LightningModule):
    """Transformer backbone plus a linear regression head, trained with MSE loss."""

    def __init__(self, cfg):
        """Build the backbone and the head.

        Args:
            cfg: configuration exposing `hidden_size`, `target_cols`, the
                optimizer/scheduler settings and the fields read by `get_backborn`.
        """
        super().__init__()
        self.cfg = cfg
        self.total_steps = None   # number of optimizer steps, computed in `setup`
        self.dataset_size = None  # number of training samples; may be set by the caller

        # NOTE: the attribute names `backborn` and `out` are part of the checkpoint
        # state-dict keys and must not be renamed.
        self.backborn = get_backborn(cfg)
        self.out = nn.Linear(cfg.hidden_size, len(cfg.target_cols))

    def forward(self, inputs):
        """Return the head output computed from the first-token representation."""
        x = self.backborn(**inputs)
        x = x[0]        # first element of the backbone output
        x = x[:, 0, :]  # keep position 0 of the sequence axis

        x_out = self.out(x)

        return x_out

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        inputs, targets = batch
        outputs = self.forward(inputs)
        loss = self.loss(outputs, targets)
        self.log("train_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        inputs, targets = batch
        outputs = self.forward(inputs)
        loss = self.loss(outputs, targets)
        self.log("val_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def loss(self, outputs, targets):
        """Mean squared error between predictions and targets."""
        loss_fn = nn.MSELoss()
        loss = loss_fn(outputs, targets)
        # loss = torch.sqrt(loss)
        return loss

    def setup(self, stage=None):
        """Compute `total_steps` for the LR scheduler (only for the "fit" stage)."""
        if stage != "fit":
            return

        # calculate total steps
        if self.dataset_size is None:
            train_loader = self.trainer._data_connector._train_dataloader_source.dataloader()
            # len(DataLoader) is the number of *batches*; the sample count is the
            # length of the underlying dataset. Using the batch count here would
            # divide by the batch size twice below.
            self.dataset_size = len(train_loader.dataset)
        # max(1, ...) guards the device count (original note, translated: with
        # gpus=-1 that raw value would otherwise be reflected here)
        num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
        effective_batch_size = self.cfg.train_batch_size * self.trainer.accumulate_grad_batches * num_devices
        print(self.dataset_size, effective_batch_size)
        self.total_steps = (self.dataset_size // effective_batch_size) * self.cfg.max_epochs

    def configure_optimizers(self):
        """Return the optimizer, plus the scheduler when `cfg.scheduler` is set."""
        optimizer = get_optimizer(self.cfg, parameters=self.parameters())

        if self.cfg.scheduler is None:
            return [optimizer]
        else:
            scheduler = get_scheduler(self.cfg, optimizer, num_train_steps=self.total_steps)
            return [optimizer], [{"scheduler": scheduler, "interval": self.cfg.scheduler["interval"]}]


## Metrics

In [6]:
# =============================
# Metrics
# ============================= 
def get_validation_data_hat(cfg, tokenizer, filename, validation_data):
    """Score every unique validation comment and attach the scores to the pairs.

    Each distinct text of the `less_toxic` / `more_toxic` columns is predicted
    once: with the single checkpoint `filename`, or with the fold ensemble
    (`predict_cv`) when `filename` is None. Multi-target predictions are
    averaged across targets.

    Returns:
        A copy of `validation_data` with the extra columns
        `less_toxic_preds` and `more_toxic_preds`.
    """
    pairs = validation_data.copy()
    unique_texts = set(pairs["less_toxic"].unique()) | set(pairs["more_toxic"].unique())
    df = pd.DataFrame({"text": sorted(unique_texts)})

    if filename is not None:
        preds = predict(cfg, df, tokenizer, filename, text_col="text")
    else:
        preds = predict_cv(cfg, df, tokenizer, text_col="text")

    # mean of targets when there are several, otherwise a flat vector
    df["preds"] = np.mean(preds, axis=1) if np.ndim(preds) > 1 else preds.reshape(-1)

    def _attach_scores(frame, side):
        """Left-join the scores onto column `side` and store them as '<side>_preds'."""
        joined = pd.merge(frame, df, left_on=side, right_on="text", how="left")
        return joined.rename(columns={"preds": f"{side}_preds"}).drop("text", axis=1)

    for side in ("less_toxic", "more_toxic"):
        pairs = _attach_scores(pairs, side)

    return pairs


def get_score(validation_data_hat):
    """Fraction of pairs whose `more_toxic` comment is scored strictly higher."""
    ranked_correctly = (
        validation_data_hat["more_toxic_preds"] > validation_data_hat["less_toxic_preds"]
    )
    return np.mean(ranked_correctly)

## Train & Predict

In [7]:
# =============================
# Train & Predict
# =============================
def class2dict(f):
    """Return every attribute of `f` whose name does not start with '__' as a dict."""
    return {
        attr: getattr(f, attr)
        for attr in dir(f)
        if not attr.startswith('__')
    }


def train_fold(cfg, train_df, valid_df, tokenizer, filename, text_col):
    """Train one fold with PyTorch Lightning and log the run to Weights & Biases.

    The checkpoint with the lowest `val_loss` is written to `cfg.EXP_MODEL`
    under the name `filename`. The W&B run is named after the last two
    '-'-separated parts of `filename`.
    """
    wandb_logger = pl.loggers.WandbLogger(
        project=cfg.competition,
        config=class2dict(cfg),
        group=f"{cfg.author}_{cfg.name}",
        name="_".join(filename.split("-")[-2:]),
        job_type="train",
        reinit=True,
        anonymous=None,
        entity=cfg.wandb_entity,
    )

    datamodule = JigsawDataModule(
        cfg=cfg,
        tokenizer=tokenizer,
        train_df=train_df,
        valid_df=valid_df,
        text_col=text_col,
    )

    model = JigsawModel(cfg=cfg)
    # handed over explicitly so the model can compute its total step count
    model.dataset_size = len(train_df)

    callbacks = [
        pl.callbacks.ModelCheckpoint(
            dirpath=cfg.EXP_MODEL,
            filename=filename,
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
        ),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
    ]
    if cfg.early_stopping:
        callbacks.append(
            pl.callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=8,
                mode='min',
            )
        )

    trainer = pl.Trainer(
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandb_logger],
        gradient_clip_val=cfg.gradient_clip_val,
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        deterministic=False,
        gpus=-1,
        precision=16,
    )
    trainer.fit(model, datamodule=datamodule)

    # close the W&B run and release cached GPU memory before the next fold
    wandb.finish(quiet=True)
    torch.cuda.empty_cache()


def get_filname_listdir(dirctory):
    """List the entries of `dirctory` with their last file extension removed."""
    return [os.path.splitext(entry)[0] for entry in os.listdir(dirctory)]


def train_cv(cfg, df, tokenizer, text_col=None, validation_data=None, get_oof=True):
    """Run cross validation over the folds in `cfg.trn_fold` and collect OOF predictions.

    For each selected fold the model is trained unless a checkpoint with the
    fold's name already exists in `cfg.EXP_MODEL`. Optionally the fold model is
    scored on `validation_data`, and its predictions on the held-out rows are
    written into the out-of-fold frame.

    Args:
        cfg: configuration (uses `n_fold`, `trn_fold`, `name`, `seed`,
            `target_cols`, `EXP_MODEL` and, when scoring, `logger`).
        df: training frame with a `fold` column.
        tokenizer: tokenizer passed through to training / prediction.
        text_col: name of the text column in `df`.
        validation_data: optional pair data; when given, each fold's score is logged.
        get_oof: whether to compute out-of-fold predictions.

    Returns:
        A DataFrame (one row per row of `df`, columns `cfg.target_cols`) holding
        the out-of-fold predictions of all trained folds, or None when
        `get_oof` is False.
    """
    oof_df = pd.DataFrame(np.zeros((len(df), len(cfg.target_cols))), columns=cfg.target_cols)

    for i_fold in range(cfg.n_fold):
        if i_fold not in cfg.trn_fold:
            continue

        filename = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}"
        filelist = get_filname_listdir(cfg.EXP_MODEL)

        # positional boolean mask: independent of the index labels of `df`
        val_mask = (df["fold"] == i_fold).to_numpy(dtype=bool)
        train_df = df[~val_mask].reset_index(drop=True)
        valid_df = df[val_mask].reset_index(drop=True)

        # skip training when the fold's checkpoint already exists
        if filename not in filelist:
            print(f"# --------- # Start Training Fold={i_fold} # --------- #")
            # training
            train_fold(
                cfg=cfg,
                train_df=train_df,
                valid_df=valid_df,
                tokenizer=tokenizer,
                filename=filename,
                text_col=text_col
                )

        # get validation data score
        if validation_data is not None:
            validation_data_hat = get_validation_data_hat(cfg, tokenizer, filename, validation_data)
            val_score = get_score(validation_data_hat)
            log = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}: validation data score={val_score:.4f}"
            cfg.logger.info(log)

        # get validation prediction
        if get_oof:
            preds = predict(
                cfg=cfg,
                df=valid_df,
                tokenizer=tokenizer,
                filename=filename,
                text_col=text_col)

            oof_df.loc[val_mask] = preds

    # Return only after every fold has been processed; returning inside the loop
    # stopped the whole cross validation after the first fold.
    return oof_df if get_oof else None


def predict(cfg, df, tokenizer, filename, text_col):
    """Predict every row of `df` with the checkpoint `filename`.

    Args:
        cfg: configuration (uses `valid_batch_size`, `num_workers`, `DEVICE`,
            `EXP_MODEL` and `target_cols`).
        df: frame holding the texts to score.
        tokenizer: tokenizer used by the dataset.
        filename: checkpoint name inside `cfg.EXP_MODEL`, without the ".ckpt" suffix.
        text_col: name of the text column in `df`.

    Returns:
        A float array of shape (len(df), len(cfg.target_cols)).
    """
    test_dataset = JigsawTestDataset(
        cfg=cfg, tokenizer=tokenizer, df=df, text_col=text_col)

    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.valid_batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=False
        )

    lightning_model = JigsawModel(cfg=cfg).to(cfg.DEVICE).eval()
    checkpoint_path = os.path.join(cfg.EXP_MODEL, filename + ".ckpt")
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host
    checkpoint = torch.load(checkpoint_path, map_location=cfg.DEVICE)
    lightning_model.load_state_dict(checkpoint['state_dict'])

    num_targets = len(cfg.target_cols)
    preds = np.zeros((len(df), num_targets))  # N * num targets
    fill_start_idx = 0

    for inputs in tqdm(test_dataloader, total=len(test_dataloader)):
        # move the batch to the inference device
        for k, v in inputs.items():
            inputs[k] = v.to(cfg.DEVICE)

        with torch.no_grad():
            pred = lightning_model(inputs)
            pred = pred.cpu().numpy()  # bs * num targets

        # write the batch into its slice of the output (the loader is not shuffled)
        fill_end_idx = pred.shape[0] + fill_start_idx  # bs + idx
        preds[fill_start_idx:fill_end_idx] = pred
        fill_start_idx = fill_end_idx

    del test_dataset, test_dataloader, lightning_model
    gc.collect()

    return preds


def predict_cv(cfg, df, tokenizer, text_col):
    """Average the predictions of every fold checkpoint listed in `cfg.trn_fold`."""
    fold_preds = [
        predict(cfg, df, tokenizer, f"{cfg.name}-seed{cfg.seed}-fold{i_fold}", text_col)
        for i_fold in range(cfg.n_fold)
        if i_fold in cfg.trn_fold
    ]
    return np.mean(fold_preds, axis=0)  # fold mean


## Load Model

In [8]:
# =============================
# Load Model
# =============================
def get_tokenizer(cfg):
    """Load the tokenizer, caching it under the "Pretrain" directory of `cfg.EXP_MODEL`.

    When the cached `tokenizer_config.json` exists the tokenizer is loaded from
    the cache; otherwise it is loaded from `cfg.model_name` and saved there.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    cached_config = os.path.join(pretrained_dir, "tokenizer_config.json")  # tokenizer.json??

    if os.path.isfile(cached_config):
        return AutoTokenizer.from_pretrained(pretrained_dir)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    tokenizer.save_pretrained(pretrained_dir)
    return tokenizer


def get_backborn(cfg):
    """Build the transformer backbone with dropout disabled, using a local cache.

    The cache lives in the "Pretrain" directory of `cfg.EXP_MODEL`:

    * no cached weights: download `cfg.model_name`, then save it to the cache;
    * cached weights and `cfg.use_pretrain_model`: load from the cache;
    * cached weights and not `cfg.use_pretrain_model`: build the architecture
      from the cached config only (pretrained weights are not needed at
      inference time, where a fine-tuned checkpoint is loaded afterwards).

    The cache is recognised by either weight file name written by
    `save_pretrained`: `pytorch_model.bin` or `model.safetensors`.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    weight_files = ("pytorch_model.bin", "model.safetensors")
    is_cached = any(
        os.path.isfile(os.path.join(pretrained_dir, name)) for name in weight_files)

    model_config = AutoConfig.from_pretrained(pretrained_dir if is_cached else cfg.model_name)

    # No dropout
    model_config.attention_probs_dropout_prob = 0.0
    model_config.hidden_dropout_prob = 0.0

    if not is_cached:
        backborn = AutoModel.from_pretrained(cfg.model_name, config=model_config)
        backborn.save_pretrained(pretrained_dir)
    elif cfg.use_pretrain_model:
        backborn = AutoModel.from_pretrained(pretrained_dir, config=model_config)
    else:
        # architecture only; weights come from a checkpoint loaded by the caller
        backborn = AutoModel.from_config(model_config)

    return backborn

## Create DataSet

In [9]:
# =============================
# Create Data
# =============================
def read_csv(filepath, **kwargs):
    """Read a CSV that may be stored plain, zipped, or inside a same-named directory.

    Args:
        filepath: path to the CSV. When it is a directory, the file with the
            same name inside that directory is read instead.
        **kwargs: forwarded to `pandas.read_csv`.

    Returns:
        The loaded DataFrame. When `filepath` does not exist, `filepath + ".zip"`
        is tried instead.

    Raises:
        FileNotFoundError: when neither the plain nor the zipped file exists.
        Any other error from `pandas.read_csv` (bad arguments, parse errors)
        is propagated unchanged instead of being hidden behind the zip fallback.
    """
    if os.path.isdir(filepath):
        filepath = os.path.join(filepath, os.path.basename(filepath))

    try:
        return pd.read_csv(filepath, **kwargs)
    except FileNotFoundError:
        # only a missing file triggers the fallback to the zipped variant
        return pd.read_csv(filepath + ".zip", **kwargs)


_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"  # dingbats
                       u"\U00002600-\U000026FF"  # miscellaneous symbols
                       u"\U000024C2"             # circled M
                       u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                       "]+", flags=re.UNICODE)
_MULTI_SPACE_RE = re.compile(' +')
_IP_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')


def text_cleaning(text):
    '''
    ref) # https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train

    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes embedded URL links
    2. Removes HTML tags
    3. Removes emojis
    4. Collapses runs of spaces
    5. Removes IP addresses
    6. Replaces "!" with a space; removes newlines, quotes, "|" and "="
    7. Un-masks a few obfuscated words and expands " u " to " you "
    8. Strips leading / trailing whitespace

    The emoji pattern deliberately avoids the single range U+24C2..U+1F251:
    that range also covers CJK, Hangul and many other scripts and would erase
    all non-Latin text.

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    text = _URL_RE.sub(r'', text)  # Removes website links

    text = BeautifulSoup(text, 'lxml').get_text()  # Removes HTML tags

    text = _EMOJI_RE.sub(r'', text)

    # text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = _MULTI_SPACE_RE.sub(' ', text)  # Remove extra spaces
    text = _IP_RE.sub(r'', text)  # Removes IP address
    text = text.replace('!', ' ')  # Removes exclamation marks
    text = text.replace('\n','')
    text = text.replace("\'","")
    text = text.replace("|","")
    text = text.replace("=","")
    text = text.replace("F**K", "FUCK")
    text = text.replace("F__K", "FUCK")
    text = text.replace("f**k", "fuck")
    text = text.replace("f__k", "fuck")
    text = text.replace("f*ck", "fuck")
    text = text.replace("S$X", "SEX")
    text = text.replace("s$x", "sex")
    # each replacement is applied twice: matches cannot overlap, so the second
    # pass catches back-to-back occurrences such as " u u "
    text = text.replace(" u ", " you ")
    text = text.replace(" u ", " you ")
    text = text.replace(" U ", " you ")
    text = text.replace(" U ", " you ")
    text = text.replace("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU")
    text = text.strip() # remove spaces at the beginning and at the end of string
    return text


def text_normalization(s:pd.Series):
    """Apply `text_cleaning` to every element of the series and return the result."""
    return s.apply(text_cleaning)


def get_jigsaw_01_dataset(cfg):
    """
    jigsaw-toxic-comment-classification-challenge
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    Returns the train set extended with the labelled part of the test set
    (test rows whose "toxic" label is -1 are excluded).
    """
    base_dir = cfg.INPUT_JIGSAW_01
    train_part = read_csv(os.path.join(base_dir, "train.csv"))
    test_text = read_csv(os.path.join(base_dir, "test.csv"))
    test_labels = read_csv(os.path.join(base_dir, "test_labels.csv"))

    # keep only the test rows that carry a usable label, then join text and labels by id
    is_scored = test_labels["toxic"] != -1
    test_part = pd.merge(test_text[is_scored], test_labels[is_scored], on="id", how="left")

    return pd.concat([train_part, test_part], axis=0).reset_index(drop=True)


def get_jigsaw_02_dataset(cfg, cat_threshold=0.5):
    """
    jigsaw-unintended-bias-in-toxicity-classification
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    Individual annotations are averaged per comment id; when `cat_threshold`
    is not None the averages are binarised (1 if >= threshold, else 0).
    Comments without annotations are dropped.
    """
    base_dir = cfg.INPUT_JIGSAW_02
    comments = read_csv(os.path.join(base_dir, "all_data.csv"), usecols=["id", "comment_text"])
    annotations = read_csv(os.path.join(base_dir, "toxicity_individual_annotations.csv"))

    labels = annotations.groupby(["id"]).agg("mean")
    if cat_threshold is not None:
        binarised = np.where(labels >= cat_threshold, 1, 0)
        labels = pd.DataFrame(binarised, index=labels.index, columns=labels.columns)

    merged = pd.merge(comments, labels, on="id", how="left")
    merged = merged.dropna(axis=0).reset_index(drop=True)
    merged = merged.rename(columns={"identity_attack":"identity_hate"})
    return merged.drop(["sexual_explicit", "worker"], axis=1)


def get_ruddit_dataset(cfg):
    """
    Ruddit Dataset
    - text_col : "comment_text"
    - target_cols : "offensiveness_score"

    Rows whose text is "[deleted]" or "[removed]" are dropped; missing texts
    become the literal string "none". The raw "txt" column is replaced by
    "comment_text".
    """
    raw = read_csv(os.path.join(cfg.INPUT_RUDDIT, "Dataset", "ruddit_with_text.csv"))

    is_placeholder = raw["txt"].isin(["[deleted]", "[removed]"])
    kept = raw[~is_placeholder].reset_index(drop=True)

    kept["comment_text"] = kept["txt"].fillna("none")
    return kept.drop("txt", axis=1)


def get_fold_idx(cfg, df):
    """Assign a shuffled K-fold index to every row of `df`.

    Args:
        cfg: configuration (uses `target_cols`, `n_fold` and `seed`).
        df: frame to split; it is modified in place.

    Returns:
        The same `df`, with an integer `fold` column in [0, cfg.n_fold).

    The fold ids are written by row position, so the result is correct for any
    index (label-based `.loc` assignment only worked for a default RangeIndex).
    """
    fold = np.full(len(df), -1, dtype=int)
    # plain KFold ignores `y`; it is still passed through to the splitter
    y = df[cfg.target_cols].sum(axis=1)
    cv_strategy = KFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
    for i_fold, (_, va_idx) in enumerate(cv_strategy.split(X=df, y=y)):
        fold[va_idx] = i_fold  # va_idx holds positional indices

    df["fold"] = fold
    return df


def get_custom_jigsaw_dataset(cfg, train_data, validation_data):
    """
    ref) https://www.kaggle.com/toru59er/0-866-tfidf-ridge-simple-baseline

    Balance toxic / non-toxic rows by undersampling, then remove every training
    comment that also occurs in the validation pairs.

    A row is treated as toxic when the plain (unweighted) sum of
    `cfg.target_cols` is > 0 and as non-toxic when that sum is exactly 0.

    Args:
        cfg: config object; `cfg.target_cols` and `cfg.seed` are read.
        train_data: DataFrame with a "comment_text" column and the target
            columns. It is not modified.
        validation_data: DataFrame with "less_toxic" and "more_toxic" text columns.

    Returns:
        New DataFrame (fresh 0..n-1 index) holding all toxic rows plus an
        equally sized random sample of non-toxic rows (or all of them when
        fewer are available), minus rows whose text appears in `validation_data`.
    """
    # Keep the helper score in a local Series so the caller's frame is not
    # polluted with a temporary "toxic_score" column.
    toxic_score = train_data[cfg.target_cols].sum(axis=1)
    toxic_mask = toxic_score > 0
    non_toxic = train_data[toxic_score == 0]

    # undersample: cap at the number of available non-toxic rows, otherwise
    # DataFrame.sample raises when asked for more rows than exist
    n_sample = min(int(toxic_mask.sum()), len(non_toxic))
    sampled_data = non_toxic.sample(n=n_sample, random_state=cfg.seed)

    balanced = pd.concat([train_data[toxic_mask], sampled_data]).reset_index(drop=True)
    # The result never carried a "toxic_score" column before; keep stripping one
    # if the input happened to bring its own.
    balanced = balanced.drop(columns="toxic_score", errors="ignore")

    # drop training comments that leak into the validation pairs
    val_comment_unq = np.unique(validation_data['less_toxic'].tolist() + validation_data['more_toxic'].tolist())
    duplicate_idx = np.isin(balanced['comment_text'], val_comment_unq)
    return balanced.iloc[~duplicate_idx].reset_index(drop=True)


## Main

In [10]:
# Main pipeline cell: load data, optionally train + validate (skipped when
# Config.inference_only is set), run inference on the comments to score, write
# the submission file, and optionally upload the experiment folder to Kaggle.
print("# ------------------ # Load Data # ------------------ #")

# load tokenizer
tokenizer = get_tokenizer(Config)

# files read from Config.INPUT_JIGSAW_04: the comments to score and the submission template
comments_to_score = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "comments_to_score.csv"))
# text normalization is currently disabled:
# comments_to_score["text"] = text_normalization(comments_to_score["text"])
sample_submission = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "sample_submission.csv"))

if not Config.inference_only:

    # load validation data
    validation_data = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "validation_data.csv"))

    # load train data (pseudo-labelled tweet dataset stored under Config.drive_path)
    datapath = "PuseudoLabelingJigsaw/toxic-twitter-dataset/PseudoLabelDataset (2).csv"
    train_data = read_csv(f"{Config.drive_path}/Input/{datapath}")
    # adds the "fold" column used to split the data per fold
    train_data = get_fold_idx(cfg=Config, df=train_data)

# text normalization is currently disabled for train / validation text as well:
#     train_data["comment_text"] = text_normalization(train_data["comment_text"])
#     validation_data["less_toxic"] = text_normalization(validation_data["less_toxic"])
#     validation_data["more_toxic"] = text_normalization(validation_data["more_toxic"])

    print("# ------------------ # Training # ------------------ #")
    # training
    train_cv(
        cfg=Config, 
        df=train_data, 
        tokenizer=tokenizer, 
        text_col="tweet",  # comment_text
        validation_data=validation_data, 
        get_oof=False)

    print("# ------------------ # Validation # ------------------ #")
    # validation: predict on the validation pairs, persist the predictions, log the score
    validation_data_hat = get_validation_data_hat(
        cfg=Config, 
        tokenizer=tokenizer, 
        filename=None, 
        validation_data=validation_data
        )
    filepath = os.path.join(Config.EXP_PREDS, "validation_data.csv")
    validation_data_hat.to_csv(filepath, index=False)
    score = get_score(validation_data_hat)
    Config.logger.info(f"validation score = {score:.4f}")

# inference runs in both modes (training + inference, or inference only)
print("# ------------------ # Inference # ------------------ #")
preds = predict_cv(
    cfg=Config, 
    df=comments_to_score, 
    tokenizer=tokenizer, 
    text_col="text")

print(preds.shape)
# collapse 2-D predictions to a single score per comment
if np.ndim(preds) > 1:
    sub_preds = np.mean(preds, axis=1)  # mean of target
else:
    sub_preds = preds

sample_submission["score"] = sub_preds
# on Colab the file is named after the experiment; otherwise it is "submission.csv"
filename = Config.name + ".csv" if Config.COLAB else "submission.csv"
sample_submission.to_csv(os.path.join(Config.SUBMISSION, filename), index=False)

# upload output folder to kaggle dataset
if Config.upload_from_colab:
    # imported only when an upload is requested
    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name, upload_dir):
        """
        Create a new Kaggle dataset from the contents of `upload_dir`.

        Writes a `dataset-metadata.json` (id, CC0-1.0 license, title) into
        `upload_dir`, authenticates with the Kaggle API and uploads the folder
        with `dir_mode='tar'`. Requires the KAGGLE_USERNAME environment variable.
        """
        dataset_metadata = {}
        dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
        dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
        dataset_metadata['title'] = dataset_name
        with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

# ------------------ # Load Data # ------------------ #


Downloading:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

# ------------------ # Training # ------------------ #
# --------- # Start Training Fold=0 # --------- #


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

45396 8
45392


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mfeedback_prize[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

  0%|          | 0/446 [00:00<?, ?it/s]

[2022-02-05 16:40:28] - Pseudo-Labeling-001-seed2022-fold0: validation data score=0.7159


# --------- # Start Training Fold=1 # --------- #
45396 8
45392


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

  0%|          | 0/446 [00:00<?, ?it/s]

[2022-02-05 18:39:06] - Pseudo-Labeling-001-seed2022-fold1: validation data score=0.7162


# --------- # Start Training Fold=2 # --------- #
45396 8
45392


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

  0%|          | 0/446 [00:00<?, ?it/s]

[2022-02-05 20:37:43] - Pseudo-Labeling-001-seed2022-fold2: validation data score=0.7146


# --------- # Start Training Fold=3 # --------- #
45396 8
45392


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

  0%|          | 0/446 [00:00<?, ?it/s]

[2022-02-05 22:35:52] - Pseudo-Labeling-001-seed2022-fold3: validation data score=0.7150


# --------- # Start Training Fold=4 # --------- #
45396 8
45392


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

  0%|          | 0/446 [00:00<?, ?it/s]

[2022-02-06 00:34:05] - Pseudo-Labeling-001-seed2022-fold4: validation data score=0.7158


# ------------------ # Validation # ------------------ #


  0%|          | 0/446 [00:00<?, ?it/s]

  0%|          | 0/446 [00:00<?, ?it/s]

  0%|          | 0/446 [00:00<?, ?it/s]

  0%|          | 0/446 [00:00<?, ?it/s]

  0%|          | 0/446 [00:00<?, ?it/s]

[2022-02-06 00:48:11] - validation score = 0.7155


# ------------------ # Inference # ------------------ #


  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

(7537, 1)
