In [1]:
!pip install bert-tensorflow
!pip install pandas
!pip install transformers # https://huggingface.co/transformers/installation.html#with-pip



In [2]:
import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

print(f"Tensorflow version: {tf.version.VERSION}")
print(f"Num GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")

Tensorflow version: 2.1.1
Num GPUs Available: 1


In [3]:
DATA_PATH =  "./toxicity-detection/jigsaw-multilingual-toxic-comment-classification"

### Load data and balance toxicity

In [4]:
train1 = pd.read_csv(os.path.join(DATA_PATH, 'jigsaw-toxic-comment-train.csv'))
train2 = pd.read_csv(os.path.join(DATA_PATH, 'jigsaw-unintended-bias-train.csv'))
train2.toxic = train2.toxic.round().astype(int)

valid = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
sub = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))

In [5]:
print(f"Train1 toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==1'))}")
print(f"Train1 non-toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==0'))}")
print(f"Proportion train1 toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==1')) / len(train1[['comment_text', 'toxic']]) * 100}")
print(f"Proportion train2 non-toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==0')) / len(train1[['comment_text', 'toxic']]) * 100}\n")

print(f"Train2 toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==1'))}")
print(f"Train2 non-toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==0'))}")
print(f"Proportion train2 toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==1')) / len(train2[['comment_text', 'toxic']]) * 100}")
print(f"Proportion train2 non-toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==0')) / len(train2[['comment_text', 'toxic']]) * 100}")

Train1 toxic comments 21384
Train1 non-toxic comments 202165
Proportion train1 toxic comments 9.565688059441108
Proportion train2 non-toxic comments 90.4343119405589

Train2 toxic comments 112226
Train2 non-toxic comments 1789968
Proportion train2 toxic comments 5.899818840770185
Proportion train2 non-toxic comments 94.1001811592298


In [6]:
# Downsample the train dataframe to 50% - 50%
train = pd.concat([
    train1[['comment_text', 'toxic']].query('toxic==1'),
    train1[['comment_text', 'toxic']].query('toxic==0').sample(n = 25000, random_state = 0),
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n = 150000, random_state = 0)
])

In [7]:
print(len(train))
print(f"Final dataset toxic proportion {len(train[['comment_text', 'toxic']].query('toxic==1')) / len(train[['comment_text', 'toxic']]) * 100}")
print(f"Final dataset non-toxic proportion {len(train[['comment_text', 'toxic']].query('toxic==0')) / len(train[['comment_text', 'toxic']]) * 100}")

308610
Final dataset toxic proportion 43.294125271378114
Final dataset non-toxic proportion 56.705874728621886


### Tokenization

In [8]:
!pip install numba



In [9]:
import numba
import warnings
warnings.filterwarnings("ignore")

@numba.jit()
def fast_encode(texts, tokenizer, chunk_size = 256, maxlen = 512):
    
    # Maximum sequence size for BERT is 512, 
    # so we wll truncate any comment that is longer than this.
    tokenizer.enable_truncation(max_length = maxlen)
    
    # Finally, we need to pad our input so it will have the 
    # same size of 512. It means that for any comment that is 
    # shorter than 512 tokens, we wll add zeros to reach 512 tokens.
    tokenizer.enable_padding(max_length = maxlen)
    
    all_ids = []   
    # tqdm progress bar: len(texts) // chunk_size
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i + chunk_size].tolist()
        # Tokenize current text chunk
        encs = tokenizer.encode_batch(text_chunk)
        # Extending the list is squeezing the list
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [10]:
# Tokenization for xlm (does not support truncation)
@numba.jit()
def encode(texts, tokenizer, maxlen = 512):
    
    enc_dict = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks = False,
        return_token_type_ids = False,
        pad_to_max_length = True,
        max_length = maxlen
    )
    
    return np.array(enc_dict['input_ids'])

In [155]:
models = []

# mBERT
distilbert_tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
models.append(distilbert_tokenizer)
print('distilbert')
# Save the loaded tokenizer locally
! mkdir toxicity-detection/tokenizers/distilbert
distilbert_model.save_pretrained('./toxicity-detection/tokenizers/distilbert')

# XLM
xlm_tokenizer = transformers.AutoTokenizer.from_pretrained('xlm-mlm-100-1280')
models.append(xlm_tokenizer)
print('xlm')
# Save the loaded tokenizer locally
! mkdir toxicity-detection/tokenizers/xlm
distilbert_model.save_pretrained('./toxicity-detection/tokenizers/xlm')

# XLM-RoBERTa
xlm_roberta_tokenizer = transformers.AutoTokenizer.from_pretrained('xlm-roberta-large')
models.append(xlm_roberta_tokenizer)
print('xlm-roberta')
# Save the loaded tokenizer locally
! mkdir toxicity-detection/tokenizers/xlmroberta
distilbert_model.save_pretrained('./toxicity-detection/tokenizers/xlmroberta')

distilbert
xlm
xlm-roberta


('./toxicity-detection/tokenizers/xlmroberta/vocab.txt',
 './toxicity-detection/tokenizers/xlmroberta/special_tokens_map.json',
 './toxicity-detection/tokenizers/xlmroberta/added_tokens.json')

In [141]:
! mkdir test
transformers.AutoTokenizer.from_pretrained('xlm-mlm-100-1280')

('./test/vocab.json',
 './test/merges.txt',
 './test/special_tokens_map.json',
 './test/added_tokens.json')

In [158]:
%%time
# 17.3 s
# x_train = fast_encode(train.comment_text.astype(str), fast_tokenizer, maxlen = MAX_LEN)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


In [159]:
%%time
# 16.8 s
# x_train = fast_encode(train.comment_text.astype(str), fast_tokenizer, maxlen = MAX_LEN)

CPU times: user 0 ns, sys: 3 µs, total: 3 µs
Wall time: 6.44 µs


In [11]:
# Configuration
MAX_LEN = 256

# Reload it with the huggingface tokenizers library
tokenizer = BertWordPieceTokenizer('./toxicity-detection/tokenizers/distilbert/vocab.txt', lowercase = False)

x_train = fast_encode(train.comment_text.astype(str), tokenizer, maxlen = MAX_LEN)
x_valid = fast_encode(valid.comment_text.astype(str), tokenizer, maxlen = MAX_LEN)
x_test = fast_encode(test.content.astype(str), tokenizer, maxlen = MAX_LEN)

y_train = train.toxic.values
y_valid = valid.toxic.values

HBox(children=(FloatProgress(value=0.0, max=1206.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [12]:
BATCH_SIZE = 16

# Prepare train dataset
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
)

# Prepare validation dataset
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
)

# Prepare test dataset
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

In [13]:
# Optimizer
def set_optimizer(type = 'adam', lr = 1e-5):
    if type == 'adam':
        return Adam(lr = lr)
    elif type == 'adadelta':
        return Adadelta(lr = lr)
    else: 
        return None

In [14]:
# Build model
def build_model(transformer, optimizer = set_optimizer('adam'), dropout_prop = 0.35, 
                loss = 'binary_crossentropy', max_len = 512):
    # Instantiate tensor with input word IDs
    input_word_ids = Input(shape = (max_len,), dtype = tf.int32, name ='input_word_ids')
    
    # Forward through transformer network
    # Hugginface transformers have multiple outputs, embeddings are at the first one
    sequence_output = transformer(input_word_ids)[0]
    # We slice the first position, the paper says it is not worse than pooling
    x = sequence_output[:, 0, :]
    
    x = Dense(512, activation = 'relu')(x)
    
    # Apply dropout to dense layer
    x = tf.keras.layers.Dropout(dropout_prop)(x)
    x = Dense(256, activation = 'relu')(x)
    
    # Apply dropout to dense layer
    x = tf.keras.layers.Dropout(dropout_prop)(x)
    x = Dense(128, activation = 'relu')(x)
    
    # Apply dropout to dense layer
    x = tf.keras.layers.Dropout(dropout_prop)(x)
    x = Dense(64, activation = 'relu')(x)
    
    # Apply dropout to dense layer
    x = tf.keras.layers.Dropout(dropout_prop)(x)
    x = Dense(32, activation = 'relu')(x)
    
    # Apply dropout to dense layer and final layer to ouput toxicity probability
    x = tf.keras.layers.Dropout(dropout_prop)(x)
    out = Dense(1, activation = 'sigmoid', name = 'toxicity_prob_head')(x)
    
    model = Model(inputs = input_word_ids, outputs = out)
    model.compile(optimizer, loss = loss, metrics = ['accuracy', tf.keras.metrics.AUC()])
    
    return model

In [15]:
# Hyperparameters
EPOCHS = 5
BATCH_SIZE = BATCH_SIZE 
OPTIMIZER = 'adam'
LR = 0.0001
DROPOUT = 0.35

In [16]:
transformer_layer = (
    transformers.TFDistilBertModel
    .from_pretrained('distilbert-base-multilingual-cased')
)

model = build_model(transformer_layer, optimizer = set_optimizer(OPTIMIZER, lr = LR), dropout_prop = DROPOUT, max_len = MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 256)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 256, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 512)               393728    
_________________________________________________________________
dropout_19 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_20 (Dropout)         (None, 256)               0    

In [17]:
checkpoint_filepath = f'./toxicity-detection/models/checkpoints'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_filepath,
    save_weights_only = True,
    monitor = 'loss',
    mode = 'max',
    save_best_only = True)

In [23]:
model_checkpoint_callback

<tensorflow.python.keras.callbacks.ModelCheckpoint at 0x7f0f3b40f320>

In [None]:
# TRAIN!
n_steps = x_train.shape[0] // BATCH_SIZE
start = time.time()
train_history = model.fit(
    train_dataset,
    steps_per_epoch = n_steps,
    validation_data = valid_dataset,
    epochs = EPOCHS,
    callbacks = [model_checkpoint_callback],
    verbose = 1
)
training_time = time.time() - start

Train for 19288 steps, validate for 500 steps
Epoch 1/5

In [22]:
train_history.history

{'loss': [0.046408504734211396,
  0.05378133220449028,
  0.053325059956694114,
  0.05243607540994543,
  0.05120615940909028],
 'accuracy': [0.98176324, 0.9805773, 0.98103094, 0.98106986, 0.98087543],
 'auc': [0.99837035, 0.99799216, 0.9980141, 0.9980736, 0.99809664],
 'val_loss': [0.8810894889831543,
  0.8000532178878784,
  0.7619827425479889,
  0.7598212015628815,
  0.7141085361242294],
 'val_accuracy': [0.15375, 0.15375, 0.15375, 0.15375, 0.15375],
 'val_auc': [0.5, 0.5, 0.5, 0.5, 0.5]}

In [19]:
! echo "$training_time" > ./toxicity-detection/models/distilbert/train.log

In [24]:
training_time/3600, print("Average train loss: ", np.average(train_history.history['loss']))

Average train loss:  0.0514314263428863


(14.959554066260656, None)

In [25]:
# VALIDATE!
start = time.time()
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch = n_steps,
    epochs = EPOCHS * 2
)
validation_time = time.time() - start

Train for 500 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
! echo "$validation_time" > ./toxicity-detection/models/distilbert/validate.log

In [29]:
print(f"Training time: {training_time}")
print(f"Average train accuracy: {np.average(train_history.history['accuracy'])}")
print(f"Validation time: {validation_time}")
print(f"Average validation accuracy: {np.average(train_history_2.history['accuracy'])}")

Training time: 53854.39463853836
Average train accuracy: 0.9810633659362793
Validation time: 2780.5180530548096
Average validation accuracy: 0.8454625010490417


In [30]:
# !mkdir -p models
print('Start saving')
model.save(f'./toxicity-detection/models/distilbert/deep_distilbert_batch{BATCH_SIZE}_epochs{EPOCHS}_maxlen{MAX_LEN}')
print('End saving')

Start saving
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./toxicity-detection/models/distilbert/deep_distilbert_batch16_epochs5_maxlen256/assets
End saving


In [None]:
sub['toxic'] = model.predict(test_dataset, verbose = 1)
sub.to_csv('submission.csv', index = False)

In [16]:
simple = f'./toxicity-detection/models/distilbert/distilbert_batch16_epochs3_maxlen192'
deep = f'./toxicity-detection/models/distilbert/deep_distilbert_batch16_epochs5_maxlen256'
simple_model = tf.keras.models.load_model(simple)
deep_model = tf.keras.models.load_model(deep)

In [18]:
simple_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist multiple                  134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T multiple                  0         
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
_________________________________________________________________
toxicity_prob_head (Dense)   multiple                  769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________


In [19]:
deep_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 256)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist multiple                  134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  393728    
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
_________________________________________________________________
dense_1 (Dense)              multiple                  131328    
_________________________________________________________________
dropout_20 (Dropout)         multiple                  0     

In [23]:
tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [21]:
def toxic(text, maxlen, model):
    word = pd.DataFrame(data = {'content': [text]})
    word_test = fast_encode(word.content.astype(str), tokenizer, maxlen = maxlen)
    word_test_dataset = (
        tf.data.Dataset
        .from_tensor_slices(word_test)
        .batch(BATCH_SIZE)
    )
    pred = model.predict(word_test_dataset, verbose = 1)
    return f'Toxic {np.round(pred[0][0] * 100, 3)}%'

In [24]:
text = 'hi'
print(toxic(text, maxlen = 192, model = simple_model))
print(toxic(text, maxlen = MAX_LEN, model = deep_model))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Toxic 12.497%


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Toxic 26.063%


In [None]:
x_test.shape

In [36]:
test

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr
...,...,...,...
63807,63807,"No, non risponderò, come preannunciato. Prefer...",it
63808,63808,"Ciao, I tecnici della Wikimedia Foundation sta...",it
63809,63809,innnazitutto ti ringrazio per i ringraziamenti...,it
63810,63810,Kaç olumlu oy gerekiyor? Şu an 7 oldu. Hayır...,tr


In [37]:
model.predict(test_dataset, verbose=1)



In [63]:
lang = 'ru'
test_try = pd.concat((test[test.lang == lang].content, sub.toxic[test.lang == lang]), axis = 1)
test_try[test_try.toxic > 0.99].content.values[6]

'Вот скажите мне дураку! Растолкуйте! ЧТО ЖЕ ВЫ ЗА МУЖИК ТО ТАКОЙ??? Что кроме Ваших модераторов никто неимеет права создавать статьи в Википедии??? Зачем была удалена статья о Алине Кизияровой??? Я хотел ещё раньше создать статью о сериале Дальнобойщики-4 у меня тоже удаляли Ваши коллеги! Наверняка Вам много платят с удовольствием бы поработал как Вы! И Легко и хорошооплачиваемо! Вот это мечта быть на Вашем месте!!! Но Вы мне растолкуете неужели нельзя никому кроме модераторов ничего создавать тут??? Денис-Кошкарёв '

In [86]:
import re
#train[re.search('not prepared',train.comment_text)]
train[train.comment_text.str.contains("your address")]

Unnamed: 0,comment_text,toxic
19045,Marcus Qwertyus \n\n Your a freeking homosexua...,1
29398,==Please read unblock request carefully\n\nThi...,0
40763,"Sorry SITUSH, there is no contact us section o...",0
68580,"Where do you live, you little shit? \n\nGive u...",1
86945,You seem to have something against me. If you ...,1
89003,Thank you for being such a politically correct...,1
97667,"""\n\nSo I will start with a criticism of the q...",0
105685,"""\n\nElon refers to BFTS as Big FALCON Test St...",0
120380,"""\n\nDiscuss this topic\nPlease type the follo...",0
121335,You're dead \n\nIt won't be hard finding your ...,1


In [88]:
train.loc[27288].values

array([["I'm not prepared for you, I promise you 13 of January \n\n                         /´¯/) \n                      ,/¯  // \n                     /    / / \n             /´¯/'   '/´¯¯`•¸ \n          /'/   /    /       /¨¯\\ \n        ('(   ´(  ´      ,~/'   ') \n         \\                 \\/    / \n             \\           _ •´ \n            \\              ( \n              \\             \\\n\nVersion 2\n                         /´¯/) \n                      ,/¯  // \n                     /    / / \n             /´¯/'   '/´¯¯`•¸ \n          /'/   /    /       /¨¯\\ \n        ('(   ´(  ´      ,~/'   ') \n         \\                 \\/    / \n             \\           _ •´ \n            \\              ( \n              \\             \\",
        0],
       ['My plan A is to see this collection of fools and their bedfellows in the other party dumped and replaced by people who appreciate the needs of working Alaskans in the cities and villages and have a different priority t

In [91]:
def clean_text(text, lang='en'):
    text = str(text)
    text = re.sub(r'[0-9"]', '', text)
    text = re.sub(r'#[\S]+\b', '', text)
    text = re.sub(r'@[\S]+\b', '', text)
    text = re.sub(r'https?\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [92]:
clean_text(train.loc[27288].values)

"[[I'm not prepared for you, I promise you of January \\n\\n /´¯/) \\n ,/¯ // \\n / / / \\n /´¯/' '/´¯¯`•¸ \\n /'/ / / /¨¯\\\\ \\n ('( ´( ´ ,~/' ') \\n \\\\ \\\\/ / \\n \\\\ _ •´ \\n \\\\ ( \\n \\\\ \\\\\\n\\nVersion \\n /´¯/) \\n ,/¯ // \\n / / / \\n /´¯/' '/´¯¯`•¸ \\n /'/ / / /¨¯\\\\ \\n ('( ´( ´ ,~/' ') \\n \\\\ \\\\/ / \\n \\\\ _ •´ \\n \\\\ ( \\n \\\\ \\\\ ] ['My plan A is to see this collection of fools and their bedfellows in the other party dumped and replaced by people who appreciate the needs of working Alaskans in the cities and villages and have a different priority than protecting the Alaskan tax haven and giveaways to the oil companies.' ]]"