In [1]:
!pip install bert-tensorflow
!pip install pandas
!pip install transformers # https://huggingface.co/transformers/installation.html#with-pip



In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

print(tf.version.VERSION)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

2.2.0
Num GPUs Available:  0


In [3]:
DATA_PATH =  "./jigsaw-multilingual-toxic-comment-classification"

In [9]:
# Training data from our first competition,
# https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
wiki_toxic_comment_data = "jigsaw-toxic-comment-train.csv"

wiki_toxic_comment_train = pandas.read_csv(os.path.join(DATA_PATH, wiki_toxic_comment_data))
wiki_toxic_comment_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Tokenizer
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    tokenizer.enable_truncation(max_length = maxlen)
    tokenizer.enable_padding(max_length = maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i + chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [96]:
# Optimizer
def set_optimizer(type = 'adam', lr = 1e-5):
    if type == 'adam':
        return Adam(lr = lr)
    elif type == 'adadelta':
        return Adadelta(lr = lr)
    else: 
        return None

In [97]:
# Build model
def build_model(transformer, optimizer = set_optimizer('adam'), dropout_prop = 0.35, 
                loss = 'binary_crossentropy', max_len = 512):
    # Instantiate tensor with input word IDs
    input_word_ids = Input(shape = (max_len,), dtype = tf.int32, name ='input_word_ids')
    
    # Forward through transformer network
    # Hugginface transformers have multiple outputs, embeddings are at the first one
    sequence_output = transformer(input_word_ids)[0]
    # We slice the first position, the paper says it is not worse than pooling
    cls_token = sequence_output[:, 0, :]
    
    # Apply dropout to dense layer and final layer to ouput toxicity probability
    x = tf.keras.layers.Dropout(dropout_prop)(cls_token)
    out = Dense(1, activation = 'sigmoid', name = 'toxicity_prob_head')(x)
    
    model = Model(inputs = input_word_ids, outputs = out)
    model.compile(optimizer, loss = loss, metrics = ['accuracy', tf.keras.metrics.AUC()])
    
    return model

In [6]:
#tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally
#tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('tokenizers/distilbert/vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

### Load data and balance toxicity

In [18]:
train1 = pd.read_csv(os.path.join(DATA_PATH, 'jigsaw-toxic-comment-train.csv'))
train2 = pd.read_csv(os.path.join(DATA_PATH, 'jigsaw-unintended-bias-train.csv'))
train2.toxic = train2.toxic.round().astype(int)

valid = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
sub = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))

In [118]:
print(f"Train1 toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==1'))}")
print(f"Train1 non-toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==0'))}")
print(f"Proportion train1 toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==1')) / len(train1[['comment_text', 'toxic']]) * 100}")
print(f"Proportion train2 non-toxic comments {len(train1[['comment_text', 'toxic']].query('toxic==0')) / len(train1[['comment_text', 'toxic']]) * 100}\n")

print(f"Train2 toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==1'))}")
print(f"Train2 non-toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==0'))}")
print(f"Proportion train2 toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==1')) / len(train2[['comment_text', 'toxic']]) * 100}")
print(f"Proportion train2 non-toxic comments {len(train2[['comment_text', 'toxic']].query('toxic==0')) / len(train2[['comment_text', 'toxic']]) * 100}")

Train1 toxic comments 21384
Train1 non-toxic comments 202165
Proportion train1 toxic comments 9.565688059441108
Proportion train2 non-toxic comments 90.4343119405589

Train2 toxic comments 112226
Train2 non-toxic comments 1789968
Proportion train2 toxic comments 5.899818840770185
Proportion train2 non-toxic comments 94.1001811592298


In [122]:
# Downsample the train dataframe to 50% - 50%
train = pd.concat([
    train1[['comment_text', 'toxic']].query('toxic==1'),
    train1[['comment_text', 'toxic']].query('toxic==0').sample(n = 25000, random_state = 0),
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n = 150000, random_state = 0)
])

In [123]:
print(len(train))
print(f"Final dataset toxic proportion {len(train[['comment_text', 'toxic']].query('toxic==1')) / len(train[['comment_text', 'toxic']]) * 100}")
print(f"Final dataset non-toxic proportion {len(train[['comment_text', 'toxic']].query('toxic==0')) / len(train[['comment_text', 'toxic']]) * 100}")

308610
Final dataset toxic proportion 43.294125271378114
Final dataset non-toxic proportion 56.705874728621886


### Tokenization

In [21]:
# Configuration
EPOCHS = 3
BATCH_SIZE = 16 
MAX_LEN = 192

x_train = fast_encode(train.comment_text.astype(str), fast_tokenizer, maxlen = MAX_LEN)
x_valid = fast_encode(valid.comment_text.astype(str), fast_tokenizer, maxlen = MAX_LEN)
x_test = fast_encode(test.content.astype(str), fast_tokenizer, maxlen = MAX_LEN)

y_train = train.toxic.values
y_valid = valid.toxic.values

HBox(children=(FloatProgress(value=0.0, max=1898.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [23]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

In [24]:
transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 192, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________


In [None]:
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Train for 30360 steps, validate for 500 steps
Epoch 1/3

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 2/3

In [None]:
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS*2
)

In [None]:
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv('submission.csv', index=False)

In [None]:
!mkdir -p models
model.save(f'models/distilbert_batch{BATCH_SIZE}_epochs{EPOCHS}_maxlen{MAX_LEN}') 

In [6]:
with tf.device('cpu:0'):
    new_model = tf.keras.models.load_model(f'models/CPU_distilbert_batch16_epochs3_maxlen192')

In [7]:
new_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [11]:
def toxic(text):
    word = pd.DataFrame(data = {'content': [text]})
    print(word)
    word_test = fast_encode(word.content.astype(str), fast_tokenizer, maxlen = 192)
    print(word_test)
    word_test_dataset = (
        tf.data.Dataset
        .from_tensor_slices(word_test)
        .batch(16)
    )
    print(word_test_dataset)
    pred = new_model.predict(word_test_dataset, verbose = 1)
    print(pred)
    return f'Toxic {np.round(pred[0][0] * 100, 3)}%'

In [12]:
toxic("Hola")

content
0    Hola



[[  101 20220 10330   102     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0

ValueError: in user code:

    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1147 predict_function  *
        outputs = self.distribute_strategy.run(
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1122 predict_step  **
        return self(x, training=False)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:71 return_outputs_and_add_losses
        outputs, losses = fn(inputs, *args, **kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:170 wrap_with_training_arg
        lambda: replace_training_and_call(False))
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/utils/tf_utils.py:65 smart_cond
        pred, true_fn=true_fn, false_fn=false_fn, name=name)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/smart_cond.py:56 smart_cond
        return false_fn()
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:170 <lambda>
        lambda: replace_training_and_call(False))
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/saving/saved_model/utils.py:165 replace_training_and_call
        return wrapped_call(*args, **kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py:580 __call__
        result = self._call(*args, **kwds)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py:627 _call
        self._initialize(args, kwds, add_initializers_to=initializers)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py:506 _initialize
        *args, **kwds))
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/function.py:2446 _get_concrete_function_internal_garbage_collected
        graph_function, _, _ = self._maybe_define_function(args, kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/function.py:2777 _maybe_define_function
        graph_function = self._create_graph_function(args, kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/function.py:2667 _create_graph_function
        capture_by_value=self._capture_by_value),
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py:981 func_graph_from_py_func
        func_outputs = python_func(*func_args, **func_kwargs)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py:441 wrapped_fn
        return weak_wrapped_fn().__wrapped__(*args, **kwds)
    /Users/Margi7/anaconda3/lib/python3.6/site-packages/tensorflow/python/saved_model/function_deserialization.py:261 restored_function_body
        "\n\n".join(signature_descriptions)))

    ValueError: Could not find matching function to call loaded from the SavedModel. Got:
      Positional arguments (3 total):
        * Tensor("inputs:0", shape=(None, 192), dtype=int64)
        * False
        * None
      Keyword arguments: {}
    
    Expected these arguments to match one of the following 4 option(s):
    
    Option 1:
      Positional arguments (3 total):
        * TensorSpec(shape=(None, 192), dtype=tf.int32, name='input_word_ids')
        * True
        * None
      Keyword arguments: {}
    
    Option 2:
      Positional arguments (3 total):
        * TensorSpec(shape=(None, 192), dtype=tf.int32, name='inputs')
        * False
        * None
      Keyword arguments: {}
    
    Option 3:
      Positional arguments (3 total):
        * TensorSpec(shape=(None, 192), dtype=tf.int32, name='input_word_ids')
        * False
        * None
      Keyword arguments: {}
    
    Option 4:
      Positional arguments (3 total):
        * TensorSpec(shape=(None, 192), dtype=tf.int32, name='inputs')
        * True
        * None
      Keyword arguments: {}


In [36]:
test

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr
...,...,...,...
63807,63807,"No, non risponderò, come preannunciato. Prefer...",it
63808,63808,"Ciao, I tecnici della Wikimedia Foundation sta...",it
63809,63809,innnazitutto ti ringrazio per i ringraziamenti...,it
63810,63810,Kaç olumlu oy gerekiyor? Şu an 7 oldu. Hayır...,tr


In [37]:
sub['toxic'] = model.predict(test_dataset, verbose=1)



In [63]:
lang = 'ru'
test_try = pd.concat((test[test.lang == lang].content, sub.toxic[test.lang == lang]), axis = 1)
test_try[test_try.toxic > 0.99].content.values[6]

'Вот скажите мне дураку! Растолкуйте! ЧТО ЖЕ ВЫ ЗА МУЖИК ТО ТАКОЙ??? Что кроме Ваших модераторов никто неимеет права создавать статьи в Википедии??? Зачем была удалена статья о Алине Кизияровой??? Я хотел ещё раньше создать статью о сериале Дальнобойщики-4 у меня тоже удаляли Ваши коллеги! Наверняка Вам много платят с удовольствием бы поработал как Вы! И Легко и хорошооплачиваемо! Вот это мечта быть на Вашем месте!!! Но Вы мне растолкуете неужели нельзя никому кроме модераторов ничего создавать тут??? Денис-Кошкарёв '

In [86]:
import re
#train[re.search('not prepared',train.comment_text)]
train[train.comment_text.str.contains("your address")]

Unnamed: 0,comment_text,toxic
19045,Marcus Qwertyus \n\n Your a freeking homosexua...,1
29398,==Please read unblock request carefully\n\nThi...,0
40763,"Sorry SITUSH, there is no contact us section o...",0
68580,"Where do you live, you little shit? \n\nGive u...",1
86945,You seem to have something against me. If you ...,1
89003,Thank you for being such a politically correct...,1
97667,"""\n\nSo I will start with a criticism of the q...",0
105685,"""\n\nElon refers to BFTS as Big FALCON Test St...",0
120380,"""\n\nDiscuss this topic\nPlease type the follo...",0
121335,You're dead \n\nIt won't be hard finding your ...,1


In [88]:
train.loc[27288].values

array([["I'm not prepared for you, I promise you 13 of January \n\n                         /´¯/) \n                      ,/¯  // \n                     /    / / \n             /´¯/'   '/´¯¯`•¸ \n          /'/   /    /       /¨¯\\ \n        ('(   ´(  ´      ,~/'   ') \n         \\                 \\/    / \n             \\           _ •´ \n            \\              ( \n              \\             \\\n\nVersion 2\n                         /´¯/) \n                      ,/¯  // \n                     /    / / \n             /´¯/'   '/´¯¯`•¸ \n          /'/   /    /       /¨¯\\ \n        ('(   ´(  ´      ,~/'   ') \n         \\                 \\/    / \n             \\           _ •´ \n            \\              ( \n              \\             \\",
        0],
       ['My plan A is to see this collection of fools and their bedfellows in the other party dumped and replaced by people who appreciate the needs of working Alaskans in the cities and villages and have a different priority t

In [91]:
def clean_text(text, lang='en'):
    text = str(text)
    text = re.sub(r'[0-9"]', '', text)
    text = re.sub(r'#[\S]+\b', '', text)
    text = re.sub(r'@[\S]+\b', '', text)
    text = re.sub(r'https?\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [92]:
clean_text(train.loc[27288].values)

"[[I'm not prepared for you, I promise you of January \\n\\n /´¯/) \\n ,/¯ // \\n / / / \\n /´¯/' '/´¯¯`•¸ \\n /'/ / / /¨¯\\\\ \\n ('( ´( ´ ,~/' ') \\n \\\\ \\\\/ / \\n \\\\ _ •´ \\n \\\\ ( \\n \\\\ \\\\\\n\\nVersion \\n /´¯/) \\n ,/¯ // \\n / / / \\n /´¯/' '/´¯¯`•¸ \\n /'/ / / /¨¯\\\\ \\n ('( ´( ´ ,~/' ') \\n \\\\ \\\\/ / \\n \\\\ _ •´ \\n \\\\ ( \\n \\\\ \\\\ ] ['My plan A is to see this collection of fools and their bedfellows in the other party dumped and replaced by people who appreciate the needs of working Alaskans in the cities and villages and have a different priority than protecting the Alaskan tax haven and giveaways to the oil companies.' ]]"