<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/nlp/bert/bert_toxic_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab setup: install the Kaggle CLI (used by the dataset download cell) and
# the Hugging Face transformers package (imported further down).
# NOTE(review): versions are not pinned, so a fresh runtime may resolve to
# newer releases than the ones this notebook was originally run against.
!pip install -qq kaggle
!pip install -qq transformers

[K     |████████████████████████████████| 1.3MB 17.4MB/s 
[K     |████████████████████████████████| 890kB 60.3MB/s 
[K     |████████████████████████████████| 1.1MB 54.3MB/s 
[K     |████████████████████████████████| 2.9MB 59.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [8]:
from google.colab import drive
# Mount Google Drive so the Kaggle API token stored there can be copied into
# this runtime.
drive.mount('/content/gdrive')
# Copy the token into ~/.kaggle and restrict it to owner read/write (600).
!mkdir -p ~/.kaggle
!cp /content/gdrive/My\ Drive/kaggle/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
# Download the competition files into /content/dataset.
!kaggle competitions download -q -c jigsaw-unintended-bias-in-toxicity-classification -p /content/dataset
# No -d option is given, so train.csv is extracted into the current working
# directory; -o overwrites an existing copy without prompting.
!unzip -o /content/dataset/train.csv.zip

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Archive:  /content/dataset/train.csv.zip
  inflating: train.csv               


In [4]:
import fastai
from fastai.text import *
from fastai.callbacks import *

from transformers import PreTrainedModel
from transformers import PreTrainedTokenizer
from transformers import PretrainedConfig
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer
from transformers import RobertaConfig
from transformers import AdamW
from tqdm.notebook import tqdm

# Display the installed fastai version so the notebook records which release
# it was run against.
fastai.__version__

'1.0.61'

In [5]:
def seed_all(seed_value):
  '''
  Seed every random number generator used by the notebook with `seed_value`.

  Covers Python's `random`, NumPy's global generator and PyTorch, and also
  puts cuDNN into deterministic mode with auto-tuning (benchmark) disabled.
  '''
  # The cuDNN flags do not depend on the seed, so they are set up front.
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

  # Same seed for each library's generator.
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed_value)

# Single fixed seed for the notebook, applied to all generators via seed_all;
# intended to make reruns reproducible.
seed = 777
seed_all(seed)

In [7]:
def reduce_mem_usage(df):
  '''
  Shrink the memory footprint of `df` by downcasting its columns in place.

  - object columns are converted to pandas `category`;
  - signed integer columns are cast to the narrowest of int8/16/32/64 that
    holds the column's min and max (limits are inclusive);
  - float columns are cast to float16, then float32, then float64, picking
    the first type whose finite range strictly contains min and max;
  - every other dtype (bool, unsigned int, datetime, category, pandas
    extension dtypes, ...) is left untouched.

  Note that the float downcast only checks the value range, not precision,
  so float16/float32 columns are rounded to what those types can represent.

  Args:
    df: the DataFrame to compact. It is modified in place.

  Returns:
    The same DataFrame object, for call chaining.
  '''
  int_candidates = (np.int8, np.int16, np.int32, np.int64)

  for col in tqdm(df.columns):
    col_type = df[col].dtype

    if col_type == object:
      df[col] = df[col].astype('category')
      continue

    # Only plain NumPy signed-int / float columns are downcast. Anything else
    # used to fall through to the float branch, which turned bools into
    # float16 and failed on datetimes.
    if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
      continue

    c_min = df[col].min()
    c_max = df[col].max()

    if col_type.kind == 'i':
      for candidate in int_candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose extreme is exactly the type limit
        # (e.g. 127 for int8) still fits. An empty column yields NaN here,
        # every comparison is False, and the column is left as is.
        if limits.min <= c_min and c_max <= limits.max:
          df[col] = df[col].astype(candidate)
          break
    else:
      # Strict bounds keep +/-inf (and all-NaN columns) out of the narrow
      # types; those end up as float64.
      if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
        df[col] = df[col].astype(np.float16)
      elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
        df[col] = df[col].astype(np.float32)
      else:
        df[col] = df[col].astype(np.float64)

  return df

In [9]:
# Read the training CSV and compact its dtypes in a single step, then preview
# the first rows.
train_df = reduce_mem_usage(pd.read_csv('train.csv'))
train_df.head()

HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))




Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893555,haha you guys are a bunch of losers.,0.021271,0.0,0.021271,0.872559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


In [21]:
# fastai.text.transform.BaseTokenizer
class TransformerBaseTokenizer(BaseTokenizer):
  '''
  Wrapper around PreTrainedTokenizer to be compatible with fast.ai.

  Attributes:
    _pretrained_tokenizer: the wrapped Hugging Face tokenizer.
    max_seq_len: taken from the wrapped tokenizer's `model_max_length`.
    model_type: architecture name; selects where the special tokens go.
  '''
  def __init__(self, pretrained_tokenizer: PreTrainedTokenizer,
               model_type='bert', **kwargs):
    # Extra keyword arguments are accepted but ignored.
    self._pretrained_tokenizer = pretrained_tokenizer
    self.max_seq_len = pretrained_tokenizer.model_max_length
    self.model_type = model_type

  def __call__(self, *args, **kargs):
    # An instance of this class is handed to fastai's Tokenizer as `tok_func`;
    # presumably fastai calls `tok_func(...)` to build a tokenizer, so calling
    # the instance simply hands back the instance itself.
    return self

  def tokenizer(self, t:str) -> List[str]:
    '''
    Tokenize `t`, limit the sequence length and add the special tokens.

    The token list is truncated to `max_seq_len - 2` so that, together with
    the two special tokens, it never exceeds `max_seq_len`.

    Returns:
      [CLS] + tokens + [SEP] for most models, or tokens + [SEP] + [CLS]
      when `model_type` is 'xlnet'.
    '''
    CLS = self._pretrained_tokenizer.cls_token
    SEP = self._pretrained_tokenizer.sep_token
    if self.model_type in ['roberta']:
      tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)
      tokens = [CLS] + tokens[:self.max_seq_len - 2] + [SEP]
    else:
      tokens = self._pretrained_tokenizer.tokenize(t)
      if self.model_type in ['xlnet']:
        tokens = tokens[:self.max_seq_len - 2] + [SEP] + [CLS]
      else:
        tokens = [CLS] + tokens[:self.max_seq_len - 2] + [SEP]
    # The token list was previously built but never returned, so every call
    # produced None.
    return tokens


In [35]:
# Notebook-level settings; neither is consumed within this cell.
use_fp16 = False
bs = 32

# transformers.tokenization_roberta.RobertaTokenizer
# Load the pretrained 'roberta-base' tokenizer and wrap it in the
# fastai-compatible adapter defined above.
transformer_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
transformer_base_tokenizer = TransformerBaseTokenizer(
    pretrained_tokenizer=transformer_tokenizer, 
    model_type='roberta'
)

# fastai.text.transform.Tokenizer
# Empty pre/post rule lists: no fastai text rules are applied around the
# wrapped tokenizer.
fastai_tokenizer = Tokenizer(
    tok_func=transformer_base_tokenizer,
    pre_rules=[],
    post_rules=[]
)

In [32]:
# fastai.text.transform.Vocab
class TransformersVocab(Vocab):
  '''
  fastai Vocab that delegates token <-> id conversion to a Hugging Face
  tokenizer instead of using its own `itos` list (which is left empty).
  '''
  def __init__(self, tokenizer: PreTrainedTokenizer):
    super(TransformersVocab, self).__init__(itos = [])
    self.tokenizer = tokenizer

  def numericalize(self, t:Collection[str]) -> List[int]:
    '''
    Convert a collection of tokens `t` to their ids.
    '''
    # Was misspelled `convert_tokenks_to_ids`, which raised AttributeError.
    return self.tokenizer.convert_tokens_to_ids(t)

  def textify(self, nums:Collection[int], sep=' ') -> List[str]:
    '''
    Convert a collection of ids `nums` back to tokens.

    Returns the tokens joined with `sep` as a single string, or the plain
    list of tokens when `sep` is None.
    '''
    # Going through np.array(...).tolist() yields plain Python ints.
    nums = np.array(nums).tolist()
    tokens = self.tokenizer.convert_ids_to_tokens(nums)
    # Previously any non-None `sep` was overwritten with '' and `sep=None`
    # crashed on `None.join`; honour the argument instead.
    return tokens if sep is None else sep.join(tokens)

  def __getstate__(self):
    # Pickle only the pieces needed to rebuild the vocab.
    return {'itos': self.itos, 'tokenizer': self.tokenizer}

  def __setstate__(self, state: dict):
    self.itos = state['itos']
    self.tokenizer = state['tokenizer']
    # Rebuild the reverse lookup from `itos`; unknown tokens map to 0.
    self.stoi = collections.defaultdict(
        int,
        {v:k for k,v in enumerate(self.itos)}
    )

In [39]:
# Vocab backed by the RoBERTa tokenizer, used for the numericalization step.
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
# fastai.text.data.NumericalizeProcessor
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)
# fastai.text.data.TokenizeProcessor
# fastai's own BOS/EOS markers are switched off; the custom tokenizer wrapper
# is responsible for adding the model's special tokens.
tokenize_processor = TokenizeProcessor(
    tokenizer=fastai_tokenizer,
    include_bos=False, 
    include_eos=False
)
# Processing order: tokenize first, then map tokens to ids.
transformer_processor = [tokenize_processor, numericalize_processor]

In [43]:
# Sanity check: run the raw RoBERTa tokenizer on a mixed French/English
# sentence. In the displayed output, a leading 'Ġ' marks a token that was
# preceded by a space in the input.
tokens = transformer_tokenizer.tokenize('Salut c est moi, Hello it s me')
tokens

['Sal', 'ut', 'Ġc', 'Ġest', 'Ġmo', 'i', ',', 'ĠHello', 'Ġit', 'Ġs', 'Ġme']

In [44]:
# Map the token strings to their vocabulary ids.
ids = transformer_tokenizer.convert_tokens_to_ids(tokens)
ids

[18111, 1182, 740, 3304, 7458, 118, 6, 20920, 24, 579, 162]

In [45]:
# Round-trip check: converting the ids back should give the token strings
# that tokenize() produced.
transformer_tokenizer.convert_ids_to_tokens(ids)

['Sal', 'ut', 'Ġc', 'Ġest', 'Ġmo', 'i', ',', 'ĠHello', 'Ġit', 'Ġs', 'Ġme']