In [1]:
import os

In [2]:

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors



In [3]:
def seed_all(seed=0):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module, NumPy and TensorFlow with ``seed`` and
    exports ``PYTHONHASHSEED`` / ``TF_DETERMINISTIC_OPS`` to the environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally because the notebook's import cell does not pull in
    # the standard-library ``random`` module.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'


# Keep the seed in one place so the stored value and the applied value cannot drift.
seed = 42
seed_all(seed)

## Helper Functions

In [4]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode ``texts`` chunk by chunk with a ``tokenizers`` tokenizer.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: Sliceable collection whose slices expose ``.tolist()``
            (e.g. a NumPy array of strings).
        tokenizer: Object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``.
        chunk_size: Number of texts handed to ``encode_batch`` per call.
        maxlen: Length passed to both the truncation and padding settings.

    Returns:
        ``np.ndarray`` built from the ``ids`` of every encoding, in input order.
    """
    # Configure the tokenizer once; the settings apply to every later batch.
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    encoded_rows = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size].tolist()
        for encoding in tokenizer.encode_batch(batch):
            encoded_rows.append(encoding.ids)

    return np.array(encoded_rows)

In [5]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Encode ``texts`` in one ``batch_encode_plus`` call.

    Args:
        texts: Batch of strings forwarded unchanged to the tokenizer.
        tokenizer: Object exposing ``batch_encode_plus``.
        maxlen: Value passed as ``max_length``; padding to that length is
            requested via ``pad_to_max_length``.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)``, each an
        ``np.int32`` array built from the corresponding tokenizer output.
    """
    encode_options = dict(
        return_attention_masks=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)

    def as_int32(key):
        # Every model input is fed as a 32-bit integer array.
        return np.array(encoded[key], dtype=np.int32)

    return as_int32('input_ids'), as_int32('attention_mask'), as_int32('token_type_ids')

In [6]:
def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of ``transformer``.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    The first output of the transformer is max-pooled and average-pooled; the
    two pooled vectors are concatenated, passed through dropout and fed to a
    single sigmoid unit.

    Args:
        transformer: Callable model invoked with the tuple
            ``(input_word_ids, input_mask, segment_ids)``; element ``[0]`` of
            its result is used as the per-token output.
        max_len: Fixed sequence length of the three integer inputs.

    Returns:
        A compiled ``tf.keras`` ``Model`` with inputs ``input_word_ids``,
        ``input_mask`` and ``segment_ids`` and one sigmoid output.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    sequence_output = transformer((input_word_ids, input_mask, segment_ids))[0]

    # NOTE(review): no mask is passed to the pooling layers, so padded
    # positions presumably take part in both poolings - confirm this is intended.
    max_pooled = tf.keras.layers.GlobalMaxPooling1D()(sequence_output)
    avg_pooled = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    stack = tf.keras.layers.concatenate([max_pooled, avg_pooled], axis=1)
    stack = tf.keras.layers.Dropout(0.2)(stack)
    out = Dense(1, activation='sigmoid')(stack)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=0.2e-5),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC()],
    )

    return model

## TPU Configs

In [7]:
# Pick the tf.distribute strategy that matches the available hardware.
try:
    # Called without arguments the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # Resolution failed: continue without a TPU.
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [8]:
# Run configuration
MODEL = 'jplu/tf-xlm-roberta-large'  # pretrained checkpoint name
MAX_LEN = 192                        # sequence length used when encoding
EPOCHS = 2                           # epochs per training stage
# Global batch size: 16 examples per replica.
BATCH_SIZE = strategy.num_replicas_in_sync * 16

# Data access
#GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Let tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

## Create fast tokenizer

In [9]:
# Load the pretrained tokenizer that matches MODEL; this is the tokenizer
# handed to regular_encode when the data is encoded.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [10]:
import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(input_path)

/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-es-cleaned.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-it-cleaned.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-ru.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-fr.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-tr.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-it.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-ru-cleaned.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-tr-cleaned.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-pt-cleaned.csv
/kaggle/input/jigsaw-train-multilingual-coments-google-api/j

## Load text data into memory

In [11]:
# Original competition training files. In the cells visible here they are
# only referenced from commented-out code.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
# Round the `toxic` column of train2 and store it as integers (mutates train2 in place).
train2.toxic = train2.toxic.round().astype(int)

# Translated ("google-<lang>-cleaned") copies of the training data:
# train3=ru, train4=it, train5=pt, train6=es, train7=tr, train8=fr.
train3 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-ru-cleaned.csv")
train4 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-it-cleaned.csv")
train5 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-pt-cleaned.csv")
train6 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-es-cleaned.csv")
train7 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-tr-cleaned.csv")
train8 = pd.read_csv("/kaggle/input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-fr-cleaned.csv")
# Validation set, test set and the submission template.
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

In [12]:
#Combine train1 with a subset of train2
# train = pd.concat([
#     train1[['comment_text', 'toxic']],
#     train2[['comment_text', 'toxic']].query('toxic==1'),
#     train2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=0)
# ])

def _subsample_negatives(frame, n_negative=100000, random_state=0):
    """Keep every toxic row and a fixed-size random sample of non-toxic rows.

    Args:
        frame: DataFrame with ``comment_text`` and ``toxic`` columns.
        n_negative: Maximum number of ``toxic == 0`` rows to keep.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        DataFrame with the two columns: all ``toxic == 1`` rows followed by
        the sampled ``toxic == 0`` rows.
    """
    labelled = frame[['comment_text', 'toxic']]
    toxic_rows = labelled.query('toxic==1')
    clean_rows = labelled.query('toxic==0')
    # `sample` raises when asked for more rows than exist, so cap the request.
    n_keep = min(n_negative, len(clean_rows))
    return pd.concat([toxic_rows, clean_rows.sample(n=n_keep, random_state=random_state)])


# One subset per translated training file (see the loading cell for the mapping).
train_ru = _subsample_negatives(train3)
train_it = _subsample_negatives(train4)
train_pt = _subsample_negatives(train5)
train_es = _subsample_negatives(train6)
train_tr = _subsample_negatives(train7)
train_fr = _subsample_negatives(train8)

In [13]:
%%time 

# Each encoded x_* value is the (input_ids, attention_mask, token_type_ids)
# tuple returned by regular_encode.
# x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
(x_train_ru, x_train_it, x_train_pt, x_train_es, x_train_tr, x_train_fr) = [
    regular_encode(frame.comment_text.values, tokenizer, maxlen=MAX_LEN)
    for frame in (train_ru, train_it, train_pt, train_es, train_tr, train_fr)
]

x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)

# The test file stores its text in the `content` column.
x_test = regular_encode(test.content.values, tokenizer, maxlen=MAX_LEN)


# Labels, in the same row order as the encoded inputs.
# y_train = train.toxic.values
(y_train_ru, y_train_it, y_train_pt, y_train_es, y_train_tr, y_train_fr) = [
    frame.toxic.values
    for frame in (train_ru, train_it, train_pt, train_es, train_tr, train_fr)
]
y_valid = valid.toxic.values

CPU times: user 13min 53s, sys: 5.2 s, total: 13min 58s
Wall time: 13min 58s


## Build datasets objects

In [14]:
# train_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices((x_train, y_train))
#     .repeat()
#     .shuffle(len(train))
#     .batch(BATCH_SIZE)
#     .prefetch(AUTO)
# )

def _make_train_dataset(features, labels, shuffle_buffer):
    """Build a repeating, shuffled, batched and prefetched training dataset.

    Args:
        features: Encoded model inputs, sliced along the first axis.
        labels: Targets aligned with ``features``.
        shuffle_buffer: Buffer size handed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches of
        ``BATCH_SIZE``. It repeats without a count, so ``fit`` must be given
        ``steps_per_epoch``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.repeat()
    dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset.prefetch(AUTO)


# One training stream per language; the shuffle buffer spans the whole subset.
train_ru_dataset = _make_train_dataset(x_train_ru, y_train_ru, len(train_ru))
train_it_dataset = _make_train_dataset(x_train_it, y_train_it, len(train_it))
train_pt_dataset = _make_train_dataset(x_train_pt, y_train_pt, len(train_pt))
train_es_dataset = _make_train_dataset(x_train_es, y_train_es, len(train_es))
train_tr_dataset = _make_train_dataset(x_train_tr, y_train_tr, len(train_tr))
train_fr_dataset = _make_train_dataset(x_train_fr, y_train_fr, len(train_fr))

# Validation data: fixed order, batched, cached and prefetched.
valid_slices = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_slices.batch(BATCH_SIZE).cache().prefetch(AUTO)

# test_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices(x_test)
#     .batch(BATCH_SIZE)
# )

## Load model into the TPU

In [15]:
%%time
# Load the pretrained transformer and build/compile the classifier inside the
# distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


CPU times: user 2min 18s, sys: 46.5 s, total: 3min 5s
Wall time: 3min 33s


In [16]:
# Print the layers, output shapes and parameter counts of the compiled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 192)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 192)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 192)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 192, 1024),  559890432   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

## Train Model

First, we train on the subset of the training set that is entirely in English.
<p>The test set contains no English, so this step is optional; the cell below is left commented out.</p>

In [17]:
# n_steps = train.shape[0] // BATCH_SIZE
# train_history = model.fit(
#     train_dataset,
#     steps_per_epoch=n_steps,
#     validation_data=valid_dataset,
#     epochs=EPOCHS,
#     shuffle=False,
# )

In [18]:
# French stage. The training dataset repeats indefinitely, so the length of
# an epoch is fixed through steps_per_epoch.
n_steps = train_fr.shape[0] // BATCH_SIZE
train_fr_history = model.fit(
    train_fr_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS,  # shared config constant instead of a repeated literal
    shuffle=False,
)

Epoch 1/2


  num_elements)


Epoch 2/2


In [19]:
# Turkish stage: an epoch is as many batches as one pass over the subset.
n_steps = len(train_tr) // BATCH_SIZE
train_tr_history = model.fit(
    train_tr_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    shuffle=False,
)

Epoch 1/2
Epoch 2/2


In [20]:
# Spanish stage. The training dataset repeats indefinitely, so the length of
# an epoch is fixed through steps_per_epoch.
n_steps = train_es.shape[0] // BATCH_SIZE
train_es_history = model.fit(
    train_es_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS,  # shared config constant instead of a repeated literal
    shuffle=False,
)

Epoch 1/2
Epoch 2/2


In [21]:
# Portuguese stage. The training dataset repeats indefinitely, so the length
# of an epoch is fixed through steps_per_epoch.
n_steps = train_pt.shape[0] // BATCH_SIZE
train_pt_history = model.fit(
    train_pt_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS,  # shared config constant instead of a repeated literal
    shuffle=False,
)

Epoch 1/2
Epoch 2/2


In [22]:
# Run a garbage-collection pass between training stages; the value shown as
# the cell output is the return value of gc.collect().
import gc
gc.collect()

77483

In [23]:
# Russian stage. The training dataset repeats indefinitely, so the length of
# an epoch is fixed through steps_per_epoch.
n_steps = train_ru.shape[0] // BATCH_SIZE
train_ru_history = model.fit(
    train_ru_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS,  # shared config constant instead of a repeated literal
    shuffle=False,
)

Epoch 1/2
Epoch 2/2


In [24]:
# Italian stage. The training dataset repeats indefinitely, so the length of
# an epoch is fixed through steps_per_epoch.
n_steps = train_it.shape[0] // BATCH_SIZE
train_it_history = model.fit(
    train_it_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS,  # shared config constant instead of a repeated literal
    shuffle=False,
)

Epoch 1/2
Epoch 2/2


Now that the model has been trained on the translated training sets (French, Turkish, Spanish, Portuguese, Russian and Italian), we train it for two more epochs on the `validation` set, which is significantly smaller but contains a mixture of different languages.

In [25]:
# Final stage: fit on the validation batches themselves. The dataset is
# repeated so that it can serve every requested step.
n_steps = len(valid) // BATCH_SIZE
train_history_2 = model.fit(valid_dataset.repeat(), epochs=2, steps_per_epoch=n_steps)

Epoch 1/2
Epoch 2/2


## Submission

In [26]:
# The model ends in Dense(1), so predict yields one value per row in a
# trailing axis of size 1. Flatten it to 1-D before storing it as a column
# rather than relying on pandas to coerce a 2-D array.
sub['toxic'] = model.predict(x_test, verbose=1).ravel()
sub.to_csv('submission.csv', index=False)

