In [1]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
from sklearn.model_selection import train_test_split

import datetime

from transformers import AutoTokenizer

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [2]:
# Ask PyTorch to release cached, currently unused GPU memory before the
# tokenizer, data and model are loaded in the cells that follow.
torch.cuda.empty_cache()

In [3]:
# Never truncate long text columns (e.g. full comments) when a frame is shown.
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Timestamp of this run; it becomes part of the log file name.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [4]:
# --- Input locations --------------------------------------------------------
DATA_PATH = Path('./data/')
AUG_DATA_PATH = Path('./data/data_augmentation/')
LABEL_PATH = Path('./labels/')

# --- Model / log locations --------------------------------------------------
MODEL_PATH = Path('./models/')
LOG_PATH = Path('./logs/')
BERT_PRETRAINED_PATH = Path('./models/base_model')
OUTPUT_PATH = MODEL_PATH.joinpath('output')
FINETUNED_PATH = OUTPUT_PATH.joinpath('model_out')

# No pre-loaded weights are supplied.
model_state_dict = None

# Create the directories this notebook writes into. OUTPUT_PATH lives under
# MODEL_PATH, so MODEL_PATH has to be created first.
for _required_dir in (MODEL_PATH, LOG_PATH, OUTPUT_PATH):
    _required_dir.mkdir(exist_ok=True)

In [5]:
# Run configuration, accessible both as args["key"] and args.key.
#
# The original literal listed "no_cuda", "seed" and "task_name" twice. Python
# keeps only the last value of a repeated key, so "task_name" silently ended up
# as 'intent' rather than "toxic_classification_lib". Each key now appears
# exactly once with the value that was actually in effect.
args = Box({
    # Run identification / locations
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": 'intent',
    "no_cuda": False,
    "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,

    # Data / batching
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,

    # Optimisation
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "multi_gpu": False,
    "warmup_steps": 500,

    # Logging / checkpoints
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "loss_scale": 128,

    # Model selection
    "model_name": 'bert-base-cased',
    "model_type": 'bert'
})

In [6]:

import logging

# One log file per run: the name combines the run timestamp and the run description.
log_name = 'log-{}-{}.txt'.format(run_start_time, args["run_text"])
logfile = str(LOG_PATH / log_name)

# Send every record both to the log file and to the notebook's stdout.
log_handlers = [
    logging.FileHandler(logfile),
    logging.StreamHandler(sys.stdout),
]

logging.basicConfig(
    handlers=log_handlers,
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Root logger, shared by the libraries used below.
logger = logging.getLogger()

In [7]:
# Run on GPU 4 when the host actually exposes it; otherwise fall back to the
# CPU so the notebook still runs on machines with fewer (or no) GPUs instead of
# failing later when the model is moved to a non-existent device.
GPU_INDEX = 4
if torch.cuda.is_available() and torch.cuda.device_count() > GPU_INDEX:
    device = torch.device(f'cuda:{GPU_INDEX}')
else:
    device = torch.device('cpu')

In [8]:
# Target columns of the multi-label task, one binary label per column.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [9]:
from fast_bert.prediction import BertClassificationPredictor

In [10]:
# Sanity check: confirm that FINETUNED_PATH converts to a plain `str`, which is
# the form handed to AutoTokenizer.from_pretrained.
type(str(FINETUNED_PATH))

str

In [11]:
# Load the tokenizer from the fine-tuned model directory (passed as a string
# path), requesting the fast tokenizer implementation via use_fast=True.
tokenizer = AutoTokenizer.from_pretrained(str(FINETUNED_PATH), use_fast=True)

05/06/2020 03:02:23 - INFO - transformers.configuration_utils -   loading configuration file models/output/model_out/config.json
05/06/2020 03:02:23 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "_num_labels": 6,
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_

In [12]:
# Bundle the train / validation / test CSVs from the data directory together
# with the label file and the tokenizer loaded above, in multi-label mode.
databunch = BertDataBunch(
    args.data_dir,
    LABEL_PATH,
    tokenizer=tokenizer,
    train_file='train.csv',
    val_file='val.csv',
    test_data='test.csv',
    label_file='labels.csv',
    text_col="comment_text",
    label_col=label_cols,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    multi_label=True,
    model_type=args.model_type,
)

05/06/2020 03:02:23 - INFO - root -   Loading features from cached file data/cache/cached_bert_train_multi_label_512_train.csv
05/06/2020 03:02:34 - INFO - root -   Loading features from cached file data/cache/cached_bert_dev_multi_label_512_val.csv
05/06/2020 03:02:37 - INFO - root -   Loading features from cached file data/cache/cached_bert_test_multi_label_512_test


In [13]:
# Build a learner from the fine-tuned model stored in FINETUNED_PATH.
# No metrics, logger or output directory are supplied and warmup/logging steps
# are zero — presumably because this learner is only used for prediction in
# this notebook (NOTE(review): confirm it is never used for training).
learner = BertLearner.from_pretrained_model(
            databunch,
            FINETUNED_PATH,
            metrics=[],
            device=device,
            logger=None,
            output_dir=None,
            warmup_steps=0,
            multi_gpu=False,
            is_fp16=False,
            multi_label=True,
            logging_steps=0,
        )

05/06/2020 03:02:38 - INFO - transformers.configuration_utils -   loading configuration file models/output/model_out/config.json
05/06/2020 03:02:38 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "_num_labels": 6,
  "architectures": [
    "BertForMultiLabelSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_

In [22]:
# Score every comment of the test set: read the text column from test.csv and
# hand it to the learner as a plain Python list.
test_comments = pd.read_csv(DATA_PATH/"test.csv")['comment_text'].values
output = learner.predict_batch(list(test_comments))

05/06/2020 03:12:18 - INFO - root -   Writing example 0 of 63978
05/06/2020 03:12:21 - INFO - root -   Writing example 10000 of 63978
05/06/2020 03:12:23 - INFO - root -   Writing example 20000 of 63978
05/06/2020 03:12:26 - INFO - root -   Writing example 30000 of 63978
05/06/2020 03:12:28 - INFO - root -   Writing example 40000 of 63978
05/06/2020 03:12:31 - INFO - root -   Writing example 50000 of 63978
05/06/2020 03:12:34 - INFO - root -   Writing example 60000 of 63978


In [23]:
# Persist the raw prediction result next to the input data.
# NOTE(review): judging by how `preds` is built from `output`, each element
# appears to be a sequence of (label, score) pairs, so the CSV cells would hold
# stringified pairs — confirm this is the intended on-disk format.
pd.DataFrame(output).to_csv(DATA_PATH/'output_bert.csv')

In [24]:
# Read the saved predictions back from disk.
# NOTE(review): `results` is not used by any of the visible cells.
results = pd.read_csv(DATA_PATH/'output_bert.csv')

In [25]:
# Turn each prediction (a sequence whose items carry the label at index 0 and
# its score at index 1) into a {label: score} record, then build one frame with
# a column per label.
pred_records = []
for example_pred in output:
    pred_records.append({pair[0]: pair[1] for pair in example_pred})
preds = pd.DataFrame(pred_records)

In [35]:
# Inspect the predicted scores at positions 1595-1600 (the iloc end bound is
# exclusive); the same positional slice of `test` is displayed further down.
preds.iloc[1595:1601]

Unnamed: 0,toxic,insult,obscene,identity_hate,threat,severe_toxic
1595,0.973945,0.706804,0.65024,0.012982,0.001271,0.006624
1596,0.000603,0.000252,0.000233,0.000228,0.000162,0.000158
1597,0.786447,0.066781,0.05724,0.050285,0.010325,0.004203
1598,0.989117,0.903307,0.732539,0.032157,0.003122,0.025135
1599,0.001438,0.000286,0.000374,0.000278,0.000155,0.000138
1600,0.993964,0.944056,0.980737,0.029374,0.005204,0.100472


In [27]:
# Load the test set (the same file whose comments were scored above) so that
# its rows can be viewed alongside the predictions.
test = pd.read_csv(DATA_PATH/"test.csv")

In [34]:
# Show the test rows at the same positions (1595-1600) as the `preds` slice,
# for a side-by-side look at the comments and their scores.
test.iloc[1595:1601]

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
1595,3915,06b32f1e5707afc9,U wot? U wot? U wot? U wot? U wot? U wot? U wot? U wot? I reverted it myself. No need to be such a douchebag.,1,0,1,0,1,0
1596,3920,06b52e91bc528426,""" \n : The site does not meet WP:RS. Please also see our conflict of interest policy and our page regarding single-purpose accounts. If you continue to add the links, your account will be blocked. """,0,0,0,0,0,0
1597,3921,06b59f9d648b3239,"i know she was doing some porn lesbian stuff and i think everybody knows, but i consider it shouldn't appear on her wiki page cause it is not approppriate !",0,0,0,0,0,0
1598,3925,06b6f55efea0654f,"==Somewhere== \n Why are you being retarded? Your backstory is under fire. You are a b-h, so go fk it all. Fk You!",1,0,1,0,1,0
1599,3926,06b7562f03712738,"::Well look, you don't have to preach to me about any of this... to be quite honest I'm actually on the fence with many of these changes, and there some I just flat out oppose. But that doesn't mean that this arguement shouldn't get a fair, impartial reckoning. It is clear that there are many predatory, greedy trial lawyers in the world. It's equally clear that there are many predatory, greedy corporations in the world. And caught in the middle are industrial workers, white collar workers, doctors, patients and about a million lobbies on either side that deal their cards from the sleeve. \n\n ::And the debate is not worthless, because we do live in a democracy. Our laws aren't static, and are constantly up for reevaluation. As you know, many of these kinds of changes have already been passed at the state level, which means that there are large groups of people who agree with their content. My brother is both a Democrat and an Emergency MD, and recently had cause to rethink some of his positions on this when he saw his hospital being destaffed of specialists due to several colleagues being unable to pay their premiums. It is a complex, multifaceted debate, and there are legitimate players on each side of it who want the same thing... better laws and better justice. My $0.02",0,0,0,0,0,0
1600,3927,06b75897e90137aa,"Wow, you're a funny motherfucker, you know that? Don't waste your talent here, you should be on the goddamn radio. Seriously though, vandalizing wikipedia for kicks? You need a hobby. Read a book, go outside, jerk off, just do something productive.",1,0,1,0,1,0
