In [2]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
from sklearn.model_selection import train_test_split

import datetime

from transformers import AutoTokenizer

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [7]:
#pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [8]:
DATA_PATH = Path('./data/')
LABEL_PATH = Path('./labels/')

#AUG_DATA_PATH = Path('./data/data_augmentation/')

MODEL_PATH=Path('./models/')
LOG_PATH=Path('./logs/')
MODEL_PATH.mkdir(exist_ok=True)

model_state_dict = None
#BERT_PRETRAINED_PATH = Path('./models/base_model')
LOG_PATH.mkdir(exist_ok=True)

OUTPUT_PATH = MODEL_PATH

FINETUNED_PATH_BERT = OUTPUT_PATH/'bert_plain/model_out' 
FINETUNED_PATH_XLNET = OUTPUT_PATH/'xlnet/model_out'


OUTPUT_PATH.mkdir(exist_ok=True)

# Bert

In [9]:
args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "toxic_classification_lib",
    "no_cuda": False,
    "bert_model": None,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "multi_gpu": False,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'bert-base-cased',
    "model_type": 'bert'
})

In [10]:

import logging

logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

In [20]:
device = torch.device('cuda:3')

In [12]:
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [13]:
from fast_bert.prediction import BertClassificationPredictor

In [16]:
type(str(FINETUNED_PATH_BERT))

str

In [17]:
tokenizer = AutoTokenizer.from_pretrained(str(FINETUNED_PATH_BERT), use_fast=True)

05/13/2020 10:06:44 - INFO - transformers.configuration_utils -   loading configuration file models/bert_plain/model_out/config.json
05/13/2020 10:06:44 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "_num_labels": 6,
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "la

In [18]:
databunch = BertDataBunch(args['data_dir'], LABEL_PATH, tokenizer=tokenizer, train_file='train.csv', val_file='val.csv',
                          test_data='test.csv', label_file='labels.csv',
                          text_col="comment_text", label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type)

05/13/2020 10:06:52 - INFO - root -   Loading features from cached file data/cache/cached_bert_train_multi_label_512_train.csv
05/13/2020 10:07:02 - INFO - root -   Loading features from cached file data/cache/cached_bert_dev_multi_label_512_val.csv
05/13/2020 10:07:06 - INFO - root -   Loading features from cached file data/cache/cached_bert_test_multi_label_512_test


In [21]:
learner = BertLearner.from_pretrained_model(
            databunch,
            FINETUNED_PATH_BERT,
            metrics=[],
            device=device,
            logger=None,
            output_dir=None,
            warmup_steps=0,
            multi_gpu=False,
            is_fp16=False,
            multi_label=True,
            logging_steps=0,
        )

05/13/2020 10:08:37 - INFO - transformers.configuration_utils -   loading configuration file models/bert_plain/model_out/config.json
05/13/2020 10:08:37 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "_num_labels": 6,
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "la

In [22]:
output = learner.predict_batch(list(pd.read_csv(DATA_PATH/"test.csv")['comment_text'].values))

05/13/2020 10:08:44 - INFO - root -   Writing example 0 of 63978
05/13/2020 10:08:46 - INFO - root -   Writing example 10000 of 63978
05/13/2020 10:08:48 - INFO - root -   Writing example 20000 of 63978
05/13/2020 10:08:51 - INFO - root -   Writing example 30000 of 63978
05/13/2020 10:08:53 - INFO - root -   Writing example 40000 of 63978
05/13/2020 10:08:56 - INFO - root -   Writing example 50000 of 63978
05/13/2020 10:08:59 - INFO - root -   Writing example 60000 of 63978


In [25]:
#pd.DataFrame(output).to_csv(DATA_PATH/'output_bert.csv')
#results = pd.read_csv(DATA_PATH/'output_bert.csv')

In [23]:
preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in output])
#preds.to_csv(DATA_PATH/'predictions_bert.csv')

In [25]:
preds.head()

Unnamed: 0,toxic,insult,obscene,identity_hate,threat,severe_toxic
0,0.000597,0.000272,0.000248,0.000241,0.000175,0.000163
1,0.518612,0.023604,0.007075,0.003877,0.000791,0.000709
2,0.069452,0.002453,0.001782,0.000824,0.000218,0.000195
3,0.000662,0.000281,0.000259,0.00023,0.000145,0.000157
4,0.000555,0.000255,0.000246,0.000228,0.000167,0.000164


In [26]:
from sklearn.metrics import accuracy_score

In [27]:
test = pd.read_csv(DATA_PATH/"test.csv")

In [28]:
target = np.array(test.iloc[:,3:])
prediction = np.where(np.array(preds)>0.5, 1, 0)

In [29]:
tox_only_target = target[:,0]
tox_only_prediction = prediction[:,0]

In [30]:
print(accuracy_score(target, prediction))

0.8465097377223421


In [31]:
print(accuracy_score(tox_only_target, tox_only_prediction))

0.9158929632061021


# XLNet

In [None]:
args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "toxic_classification_lib",
    "no_cuda": False,
    "bert_model": None,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "multi_gpu": False,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'xlnet-base-cased',
    "model_type": 'xlnet'
})

In [32]:
tokenizer = AutoTokenizer.from_pretrained(str(FINETUNED_PATH_XLNET), use_fast=True)

05/13/2020 10:25:50 - INFO - transformers.configuration_utils -   loading configuration file models/xlnet/model_out/config.json
05/13/2020 10:25:50 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "_num_labels": 6,
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_n

In [33]:
databunch = BertDataBunch(args['data_dir'], LABEL_PATH, tokenizer=tokenizer, train_file='train.csv', val_file='val.csv',
                          test_data='test.csv', label_file='labels.csv',
                          text_col="comment_text", label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type)

05/13/2020 10:26:07 - INFO - root -   Loading features from cached file data/cache/cached_bert_train_multi_label_512_train.csv
05/13/2020 10:26:16 - INFO - root -   Loading features from cached file data/cache/cached_bert_dev_multi_label_512_val.csv
05/13/2020 10:26:20 - INFO - root -   Loading features from cached file data/cache/cached_bert_test_multi_label_512_test


In [34]:
learner = BertLearner.from_pretrained_model(
            databunch,
            FINETUNED_PATH_XLNET,
            metrics=[],
            device=device,
            logger=None,
            output_dir=None,
            warmup_steps=0,
            multi_gpu=False,
            is_fp16=False,
            multi_label=True,
            logging_steps=0,
        )

05/13/2020 10:26:43 - INFO - transformers.configuration_utils -   loading configuration file models/xlnet/model_out/config.json
05/13/2020 10:26:43 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "_num_labels": 6,
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_n

In [35]:
output = learner.predict_batch(list(pd.read_csv(DATA_PATH/"test.csv")['comment_text'].values))

05/13/2020 10:26:57 - INFO - root -   Writing example 0 of 63978
05/13/2020 10:26:59 - INFO - root -   Writing example 10000 of 63978
05/13/2020 10:27:02 - INFO - root -   Writing example 20000 of 63978
05/13/2020 10:27:04 - INFO - root -   Writing example 30000 of 63978
05/13/2020 10:27:07 - INFO - root -   Writing example 40000 of 63978
05/13/2020 10:27:10 - INFO - root -   Writing example 50000 of 63978
05/13/2020 10:27:12 - INFO - root -   Writing example 60000 of 63978


In [36]:
pd.DataFrame(output).to_csv(DATA_PATH/'output_xlnet.csv')
results = pd.read_csv(DATA_PATH/'output_xlnet.csv')

In [37]:
preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in output])
preds.to_csv(DATA_PATH/'predictions_xlnet.csv')

In [40]:
prediction = np.where(np.array(preds)>0.5, 1, 0)
tox_only_prediction = prediction[:,0]

In [41]:
print(accuracy_score(target, prediction))

0.8488074025446247


In [42]:
print(accuracy_score(tox_only_target, tox_only_prediction))

0.916830785582544
