<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/nlp/bert/bert_toxic_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq kaggle
!pip install -qq transformers

[K     |████████████████████████████████| 1.3MB 8.0MB/s 
[K     |████████████████████████████████| 1.1MB 41.1MB/s 
[K     |████████████████████████████████| 2.9MB 58.4MB/s 
[K     |████████████████████████████████| 890kB 54.5MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
!mkdir -p ~/.kaggle
!cp /content/gdrive/My\ Drive/kaggle/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -q -c jigsaw-unintended-bias-in-toxicity-classification -p /content/dataset
!unzip -o /content/dataset/train.csv.zip

Mounted at /content/gdrive
Archive:  /content/dataset/train.csv.zip
  inflating: train.csv               


In [3]:
import fastai
from fastai.text import *
from fastai.callbacks import *

from transformers import PreTrainedModel
from transformers import PreTrainedTokenizer
from transformers import PretrainedConfig
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer
from transformers import RobertaConfig
from transformers import AdamW
from tqdm.notebook import tqdm

fastai.__version__

'1.0.61'

In [4]:
def seed_all(seed_value):
  random.seed(seed_value)
  np.random.seed(seed_value)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  torch.manual_seed(seed_value)

seed = 777
seed_all(seed)

In [5]:
def reduce_mem_usage(df):
  for col in tqdm(df.columns):
    col_type = df[col].dtype
    
    if col_type == object:
      df[col] = df[col].astype('category')
    else:
      c_min = df[col].min()
      c_max = df[col].max() 
      if str(col_type)[:3] == 'int':
          if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
            df[col] = df[col].astype(np.int8)
          elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
            df[col] = df[col].astype(np.int16)
          elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
            df[col] = df[col].astype(np.int32)
          elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
            df[col] = df[col].astype(np.int64)  
      else:
          if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
          elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
          else:
            df[col] = df[col].astype(np.float64)

  return df

In [6]:
train_df = pd.read_csv('train.csv')
train_df = reduce_mem_usage(train_df)
train_df.head()

HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))




Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893555,haha you guys are a bunch of losers.,0.021271,0.0,0.021271,0.872559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


In [7]:
# fastai.text.transform.BaseTokenizer
class TransformerBaseTokenizer(BaseTokenizer):
  '''
  Wrapper aroud PreTrainedTokenizer to be compatible with fast.ai
  '''
  def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, 
               model_type='bert', **kwargs):
    self._pretrained_tokenizer = pretrained_tokenizer
    self.max_seq_len = pretrained_tokenizer.model_max_length
    self.model_type = model_type

  def __call__(self, *args, **kargs):
    return self

  def tokenizer(self, t:str) -> List[str]:
    '''
    Limits the maximum sequence length and add the special tokens
    ''' 
    CLS = self._pretrained_tokenizer.cls_token
    SEP = self._pretrained_tokenizer.sep_token
    if self.model_type in ['roberta']:
      tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)
      tokens = [CLS] + tokens[:self.max_seq_len - 2] + [SEP]
    else:
      tokens = self._pretrained_tokenizer.tokenize(t)
      if self.model_type in ['xlnet']:
        tokens = tokens[:self.max_seq_len - 2] + [SEP] + [CLS]
      else:
        tokens = [CLS] + tokens[:self.max_seq_len - 2] + [SEP]
    return tokens

In [8]:
use_fp16 = False
bs = 32

# transformers.tokenization_roberta.RobertaTokenizer
transformer_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
transformer_base_tokenizer = TransformerBaseTokenizer(
    pretrained_tokenizer=transformer_tokenizer, 
    model_type='roberta'
)

# fastai.text.transform.Tokenizer
fastai_tokenizer = Tokenizer(
    tok_func=transformer_base_tokenizer,
    pre_rules=[],
    post_rules=[]
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [9]:
# fastai.text.transform.Vocab
class TransformersVocab(Vocab):
  def __init__(self, tokenizer: PreTrainedTokenizer):
    super(TransformersVocab, self).__init__(itos = [])
    self.tokenizer = tokenizer
  
  def numericalize(self, t:Collection[str]) -> List[int]:
    return self.tokenizer.convert_tokens_to_ids(t)

  def textify(self, nums:Collection[int], sep=' ') -> List[str]:
    nums = np.array(nums).tolist()
    if sep is None:
      return self.tokenizer.convert_ids_to_tokens(nums)
    return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) 

  def __getstate__(self):
    return {'itos': self.itos, 'tokenizer': self.tokenizer}

  def __setstate__(self, state: dict):
    self.itos = state['itos']
    self.tokenizer = state['tokenizer']
    self.stoi = collections.defaultdict(
        int,
        {v:k for k,v in enumerate(self.itos)}
    )

In [10]:
transformer_vocab =  TransformersVocab(tokenizer=transformer_tokenizer)
# fastai.text.data.NumericalizeProcessor
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)
# fastai.text.data.TokenizeProcessor
tokenize_processor = TokenizeProcessor(
    tokenizer=fastai_tokenizer,
    include_bos=False, 
    include_eos=False
)
transformer_processor = [tokenize_processor, numericalize_processor]

In [11]:
tokens = transformer_tokenizer.tokenize('Salut c est moi, Hello it s me')
tokens

['Sal', 'ut', 'Ġc', 'Ġest', 'Ġmo', 'i', ',', 'ĠHello', 'Ġit', 'Ġs', 'Ġme']

In [12]:
ids = transformer_tokenizer.convert_tokens_to_ids(tokens)
ids

[18111, 1182, 740, 3304, 7458, 118, 6, 20920, 24, 579, 162]

In [13]:
transformer_tokenizer.convert_ids_to_tokens(ids)

['Sal', 'ut', 'Ġc', 'Ġest', 'Ġmo', 'i', ',', 'ĠHello', 'Ġit', 'Ġs', 'Ġme']

In [14]:
pad_first = False
pad_idx = transformer_tokenizer.pad_token_id
pad_idx

1

In [15]:
databunch = (TextList.from_df(
    train_df,
    cols='comment_text',
    processor=transformer_processor)
  .split_by_rand_pct(0.1, seed=seed)
  .label_from_df(cols='target')
  .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

In [16]:
databunch.show_batch()

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  idx_min = (t != self.pad_idx).nonzero().min()


text,target
"<s> Ġ"" ðĿ Ļ ı ðĿ Ļ Ŀ ðĿ Ļ ļ Ġ ðĿ Ļ Ĺ ðĿ Ļ ŀ ðĿ Ļ ľ Ġ ðĿ Ļ ¥ ðĿ Ļ § ðĿ Ļ ¤ ðĿ Ļ Ĺ ðĿ Ļ ¡ ðĿ Ļ ļ ðĿ Ļ ¢ Ġ ðĿ Ļ ŀ ðĿ Ļ ¨ Ġ ðĿ Ļ ĸ ðĿ Ļ ¡ ðĿ Ļ ¡ ðĿ Ļ ĸ ðĿ Ļ ® ðĿ Ļ ŀ ðĿ",0.0
<s> ĠA : Ġ($ 10 . 36 Ġmil Ġ( U AA ĠAthletics )) Ġ/ Ġ($ 377 Ġmil Ġ( UA ĠBudget )) Ġ= Ġ2 . 7 %. Ġ Ċ Maybe Ġyou Ġshould Ġgo Ġback Ġto Ġschool . Ċ Ċ B : ĠHere Ġare Ġmy Ġschool Ġfees . Ċ Ċ - Sch Ġof ĠBus Ġ1 ĠTime ĠSem ester ĠLab ĉ $ 34 . 00 Ċ - A th let ic / Rec,0.0
"<s> ĠLet 's Ġsee . ĠWe Ġneed Ġfunding Ġfor Ġthe Ġport . ĠLike , ĠNow . ĠHow Ġabout Ġthese : Ċ Close Ġthe ĠSp en ard ĠL IO , Ġsell Ġit Ġoff Ġfor Ġat Ġleast Ġ$ 10 Ġmillion . ĠWe 'll Ġlose Ġ$ 2 Ġmillion , Ġbut Ġit 's Ġa Ġquicker Ġsale Ġand Ġwe Ġcan Ġpenal ize Ġour Ġso - called Ġ' rep rest atives ' Ġover Ġ10 Ġyears Ġto",0.19995117
"<s> ĠWhen ĠI Ġfirst Ġread Ġabout Ġthe ĠKh adr Ġsettlement Ġyesterday , Ġthe Ġfirst Ġthing Ġthat Ġimmediately Ġcame Ġto Ġmind Ġwas Ġa Ġscene Ġfrom Ġthe Ġexcellent , Ġunderrated Ġ90 s Ġflick , ĠA ĠBronx ĠTale . Ċ In Ġit , Ġthe Ġlocal Ġmafia Ġboss , ĠSonny Ġ( Ch azz ĠPal min ter i ) Ġwatches Ġas ĠCal og ero Ġ( a . k . a ."" C ,"" Ġplayed Ġby",0.0
"<s> ĠTerr ified Ġby ĠTrump 's Ġwords Ġand Ġactions Ġand Ġthose Ġof Ġsome Ġof Ġhis Ġsupporters , Ġeh ? Ġ ĠWhat Ġabout Ġthe Ġwords Ġand Ġactions Ġof Ġthose Ġwho Ġclearly Ġdidn 't Ġsupport ĠTrump ? Ġ Ġ Ċ Ċ http :// w reg . com / 2016 / 12 / 21 / miss iss ippi - author ities - make - ar rest - in - burning - of -",0.0


In [17]:
class CustomTransformerModel(nn.Module):
  def __init__(self, transformer_model: PreTrainedModel):
    super(CustomTransformerModel, self).__init__()
    self.transformer = transformer_model

  def forward(self, input_ids, attention_mask=None):
    attention_mask = (input_ids!=pad_idx).type(input_ids.type())
    logits = self.transformer(input_ids, attention_mask=attention_mask)[0]
    return logits

In [18]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = 1
config.use_bfloat16 = use_fp16
print(config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…


RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}



In [19]:
transformer_model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base', 
    config=config
)
custom_transformer_model = CustomTransformerModel(
    transformer_model=transformer_model
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [20]:
CustomAdamW = partial(AdamW, correct_bias=False)
learner = Learner(
    databunch,
    custom_transformer_model,
    opt_func=CustomAdamW,
    metrics=[accuracy, error_rate]
)
learner.callbacks.append(ShowGraph(learner))
if use_fp16:
  learner = learner.to_fp16()

In [22]:
learner

Learner(data=TextClasDataBunch;

Train: LabelList (1624387 items)
x: TextList
<s> ĠThis Ġis Ġso Ġcool . ĠIt 's Ġlike , Ġ' would Ġyou Ġwant Ġyour Ġmother Ġto Ġread Ġthis ?? ' ĠReally Ġgreat Ġidea , Ġwell Ġdone ! </s>,<s> ĠThank Ġyou !! ĠThis Ġwould Ġmake Ġmy Ġlife Ġa Ġlot Ġless Ġanxiety - inducing . ĠKeep Ġit Ġup , Ġand Ġdon 't Ġlet Ġanyone Ġget Ġin Ġyour Ġway ! </s>,<s> ĠIs Ġthis Ġsomething ĠI 'll Ġbe Ġable Ġto Ġinstall Ġon Ġmy Ġsite ? ĠWhen Ġwill Ġyou Ġbe Ġreleasing Ġit ? </s>,<s> Ġhaha Ġyou Ġguys Ġare Ġa Ġbunch Ġof Ġlosers . </s>,<s> Ġur Ġa Ġsh * tty Ġcomment . </s>
y: FloatList
0.0,0.0,0.0,0.8935547,0.6665039
Path: .;

Valid: LabelList (180487 items)
x: TextList
<s> ĠRead Ġthe Ġdescription ... this Ġwas Ġ" pre race ". ĠOrgan izers Ġprobably Ġhad Ġno Ġcontrol . </s>,<s> ĠThere Ġis Ġno Ġplace Ġof Ġ Ġ Ġfire Ġwhere Ġpeople Ġare Ġtortured Ġafter Ġthey Ġdie . Ġ ĠThat Ġis Ġnot Ġtaught Ġin Ġthe ĠBible , Ġit Ġis Ġa Ġtwisted Ġteaching Ġto Ġscare Ġpeople Ġto Ġgive Ġmoney . </s>,<s> ĠI Ġhave Ġp

In [21]:
list_layers = [learner.model.transformer.roberta.embeddings,
              learner.model.transformer.roberta.encoder.layer[0],
              learner.model.transformer.roberta.encoder.layer[1],
              learner.model.transformer.roberta.encoder.layer[2],
              learner.model.transformer.roberta.encoder.layer[3],
              learner.model.transformer.roberta.encoder.layer[4],
              learner.model.transformer.roberta.encoder.layer[5],
              learner.model.transformer.roberta.encoder.layer[6],
              learner.model.transformer.roberta.encoder.layer[7],
              learner.model.transformer.roberta.encoder.layer[8],
              learner.model.transformer.roberta.encoder.layer[9],
              learner.model.transformer.roberta.encoder.layer[10],
              learner.model.transformer.roberta.encoder.layer[11],
              learner.model.transformer.roberta.pooler]

learner.split(list_layers)
num_groups = len(learner.layer_groups)

AttributeError: ignored

In [23]:
learner.freeze_to(-1)
learner.lr_find()
learner.recorder.plot(suggestion=True)

epoch,train_loss,valid_loss,accuracy,error_rate,time


RuntimeError: ignored

In [None]:
learner.recorder.plot(skip_end=10, suggestion=True)

In [None]:
learner.fit_one_cycle(1,max_lr=4e-05,moms=(0.8,0.7))
learner.save('stage_1')

In [None]:
learner.freeze_to(-2)
lr = 1e-5
learner.fit_one_cycle(1, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))
learner.save('stage_2')

In [None]:
learner.freeze_to(-3)
learner.fit_one_cycle(1, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))
learner.save('stage_3')

In [None]:
learner.unfreeze()
learner.fit_one_cycle(2, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))
learner.save('stage_4')