In [None]:
import os
import csv

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
import re
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

In [None]:
# Per-language translated copies of the toxic-comment training set
# (one CSV per language code, all in the same input dataset directory).
files_translated = [
    "../input/jigsaw-train-multilingual-coments-google-api/"
    f"jigsaw-toxic-comment-train-google-{code}.csv"
    for code in ("es", "fr", "it", "pt", "ru", "tr")
]

In [None]:
def _with_binary_labels(frame):
    """Return a copy of ``frame`` with usable rows only and ``toxic`` as int.

    Rows with a missing ``comment_text`` or a ``toxic`` value that cannot be
    parsed as a number are dropped. Labels are then cast with ``astype(int)``,
    which truncates towards zero.

    NOTE(review): because of the truncation, fractional toxicity scores below
    1.0 become 0. This matches the original cast; confirm it is the intended
    threshold for the unintended-bias data.
    """
    labels = pd.to_numeric(frame['toxic'], errors='coerce')
    usable = labels.notna() & frame['comment_text'].notna()
    cleaned = frame.loc[usable].copy()
    cleaned['toxic'] = labels[usable].astype(int).to_numpy()
    return cleaned


def one_train(files_translated,
              unintended_path='../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv',
              hate_path='../input/hatespeechmulti/cleaned_data_hatespeech.csv',
              output_path='train_translated.csv',
              random_state=None):
    """Assemble the multilingual training set and write it to ``output_path``.

    Sources, concatenated in this order:
      * every CSV in ``files_translated``; the language code is taken from the
        text between the last ``-`` and the following ``.`` of the path,
      * a 20% sample of the CSV at ``unintended_path``, tagged ``lang='en'``,
      * the CSV at ``hate_path``, which must already provide a ``lang`` column.

    All positive rows are kept, the negatives are down-sampled to 20%, and a
    shuffled 50% sample of the result is written with the columns
    ``comment_text``, ``lang``, ``toxic``.

    Labels from every source are normalised to integer 0/1 before filtering.
    Previously only the unintended-bias labels were converted to strings while
    the filter compared against ``'1'`` / ``'0'``, so rows whose label had been
    parsed as a number never matched and were silently dropped.

    Args:
        files_translated: iterable of paths to the translated CSV files.
        unintended_path: path of the English unintended-bias CSV.
        hate_path: path of the multilingual hate-speech CSV.
        output_path: where the assembled CSV is written.
        random_state: optional seed forwarded to every ``sample`` call; the
            default ``None`` keeps the sampling non-deterministic.

    Returns:
        None. The result is written to ``output_path``.
    """
    columns = ['comment_text', 'lang', 'toxic']
    parts = []

    for file in files_translated:
        lang = file.split('-')[-1].split('.')[0]
        translated = _with_binary_labels(pd.read_csv(file)).assign(lang=lang)
        parts.append(translated[columns])

    unintended = pd.read_csv(unintended_path).sample(frac=0.2, random_state=random_state)
    unintended = _with_binary_labels(unintended).assign(lang='en')
    parts.append(unintended[columns])

    hate = _with_binary_labels(pd.read_csv(hate_path))
    parts.append(hate[columns])

    combined = pd.concat(parts, axis=0)
    positives = combined[combined['toxic'] == 1]
    negatives = combined[combined['toxic'] == 0].sample(frac=0.2, random_state=random_state)

    # Final sample both halves the data and shuffles positives with negatives.
    sampled = pd.concat([negatives, positives], axis=0).sample(frac=0.5, random_state=random_state)
    sampled.to_csv(output_path, index=False)
    
def regular_encode(texts, tokenizer, maxlen=512):
    """Batch-encode ``texts`` and return ``(input_ids, attention_mask)``.

    ``tokenizer.batch_encode_plus`` is called once for the whole batch with
    attention masks requested, token type ids disabled, padding to the maximum
    length enabled and ``max_length=maxlen``. Both results are converted to
    numpy arrays.

    NOTE(review): the keyword names used here (``return_attention_masks``,
    ``pad_to_max_length``) follow an older tokenizer API; confirm them against
    the installed transformers version.
    """
    encode_options = {
        'return_attention_masks': True,
        'return_token_type_ids': False,
        'pad_to_max_length': True,
        'max_length': maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)

    token_ids = np.array(encoded['input_ids'])
    attention = np.array(encoded['attention_mask'])
    return token_ids, attention

In [None]:
# Build the sampled training set from the translated files and the extra
# corpora; the result is written to 'train_translated.csv'.
one_train(files_translated)

In [None]:
# Load the training set that one_train() wrote to disk.
# NOTE(review): read_csv's default NA parsing turns cells such as 'NA' or
# 'null' into NaN; confirm no comment_text ends up missing before tokenizing.
train = pd.read_csv('train_translated.csv')

In [None]:
# Display (rows, columns) of the sampled training set.
train.shape

In [None]:
# Competition files: validation split, test split and the sample submission.
# The sample submission is read twice, once into `sub` and once into
# `submission`.
valid, test, sub, submission = (
    pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/' + csv_name)
    for csv_name in (
        'validation.csv',
        'test.csv',
        'sample_submission.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    # No arguments are needed when the TPU_NAME environment variable is set,
    # which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise the TPU system before creating
    # the strategy that uses it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Autotune sentinel handed to Dataset.prefetch() in the input pipelines.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
#GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
EPOCHS = 1  # NOTE(review): reassigned to 4 in the training cell before model.fit uses it
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica
MAX_LEN = 300  # passed as maxlen to regular_encode and as max_len to build_model
MODEL = 'jplu/tf-xlm-roberta-large'  # checkpoint name given to AutoTokenizer / TFAutoModel

In [None]:
# Display the global batch size (16 x number of replicas in sync).
BATCH_SIZE

In [None]:
# Load the tokenizer for the pretrained checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# (masked spelling, restored spelling) pairs, ordered longest mask first.
# Sorting matters: a short mask such as 'ж***' or 'х**' is a substring of
# longer masks in this table and would otherwise consume part of them,
# leaving the longer entries unable to match. The sort is stable, so masks of
# equal length keep their original relative order.
_STAR_REPLACEMENTS = sorted(
    [('б**дь', 'блядь'), ('викип**оры', 'википидоры'),
     ('п**ды', 'пизды'), ('ж***', 'жопа'), ('зае**л', 'заебал'),
     ('за**ал', 'заебал'), ('c****a', 'сука'), ('чернож***е', 'черножопые'),
     ('х**', 'хуй'), ('X**', 'Xуй'), ('х**ня', 'хуйня'), ('п****', 'пизда'), ('б**ть', 'блять'), ('пи***ж', 'пиздеж'),
     ('вики*****оф', 'википидоров'), ('ё***ый', "ебаный"), ('Пид***ий', "Пидорский"),
     ('а**ели', "охуели"), ('Бл**ь', 'Блять'), ('н****й', 'нахуй'), ('ох***и', "охуели"),
     ('**нулись', "ебанулись"), ('х***ю', "хуею"), ('ПИ***ПЕДИЮ', 'ПИДОРОПЕДИЮ'), ('С**ная', "Ссаная"),
     ('c**ное', 'ссаное'), ('Пи**ец', 'Пиздец'), ('выё**вайся', "выебывайся"), ('бл***','блять'),
     ('c**zo', "cazzo"), ('c**zi', 'cazzi'), ('пи**еж', "пиздеж"), ('г***о', 'говно'),
     ('пиз**тины', "пизды"), ('нас***ь', "насрать"), ('пи**р', 'пидор'), ('б***ь', "блядь"),
     ('m****', "merde"), ('f**k', "fuck")
    ],
    key=lambda pair: len(pair[0]),
    reverse=True,
)


def replate_stars(text):
    """Replace star-masked words in ``text`` with their unmasked spelling.

    Every mask in the replacement table is substituted wherever it occurs.
    Masks are applied longest first so that a short mask cannot rewrite part
    of a longer one (for example 'х**' inside 'х***ю').

    Args:
        text: the string to clean.

    Returns:
        The string with every known mask replaced; text that contains no
        known mask is returned unchanged.
    """
    for masked, restored in _STAR_REPLACEMENTS:
        text = text.replace(masked, restored)
    return text
    

In [None]:
# Restore star-masked words in the validation and test texts before tokenizing.
# Both frames are updated in place (the text columns are overwritten).
valid['comment_text'] = valid['comment_text'].apply(replate_stars)
test['content'] = test['content'].apply(replate_stars)

In [None]:
%%time

# Encode every split into token-id and attention-mask arrays with maxlen=MAX_LEN.
# The test split keeps its text in the `content` column.
x_train, mask_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid, mask_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test, mask_test = regular_encode(test.content.values, tokenizer, maxlen=MAX_LEN)

# Labels for the two labelled splits; no labels are read for the test split.
y_train = train.toxic.values
y_valid = valid.toxic.values

In [None]:
# Training stream: repeats indefinitely, shuffles with a 2048-element buffer,
# then batches and prefetches. Each element is ((ids, mask), label).
train_dataset = tf.data.Dataset.from_tensor_slices(((x_train, mask_train), y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation stream: fixed order, batched, cached after batching, prefetched.
valid_dataset = tf.data.Dataset.from_tensor_slices(((x_valid, mask_valid), y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test stream: inputs only, wrapped in a 1-tuple, batched in the original order.
test_dataset = tf.data.Dataset.from_tensor_slices(((x_test, mask_test), )).batch(BATCH_SIZE)

In [None]:
def build_model(transformer, max_len=512):
    """
    Wrap ``transformer`` in a compiled Keras binary classifier.

    The model takes two int32 inputs of length ``max_len`` (token ids and
    attention mask) and calls ``transformer`` on them. From the first element
    of its output it takes position 0 along the second axis, applies
    Dropout(0.1) and maps the result to a single sigmoid unit. It is compiled
    with Adam(2e-6), binary cross-entropy loss and an accuracy metric.

    Reference:
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    ids_in = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    mask_in = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    transformer_outputs = transformer(ids_in, attention_mask=mask_in)
    hidden_states = transformer_outputs[0]
    first_position = hidden_states[:, 0, :]

    regularised = Dropout(0.1)(first_position)
    probability = Dense(1, activation='sigmoid', name='output')(regularised)

    classifier = Model(inputs=[ids_in, mask_in], outputs=probability)
    # A custom learning-rate schedule was previously tried here; a constant
    # rate of 2e-6 is what is actually used.
    classifier.compile(optimizer=Adam(2e-6), loss='binary_crossentropy', metrics=['accuracy'])

    return classifier

In [None]:
%%time
# Load the pretrained transformer and build the classifier inside the
# distribution strategy's scope; the summary is printed outside of it.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# Stage 1: train on the sampled training set, validating on the validation split.
# NOTE(review): this overrides the EPOCHS = 1 set in the configuration cell.
EPOCHS = 4
# train_dataset repeats indefinitely, so the epoch length is given explicitly
# as the number of full batches in the training arrays.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:
# Stage 2: continue training on the validation split itself for 5 epochs.
# No validation_data is passed here. n_steps is reused and now counts the
# full batches in the validation arrays.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=5
)

In [None]:
# Score the test set and store the model outputs in the submission frame.
# test_dataset is batched without shuffling, so rows keep the order of `test`.
sub['toxic'] = model.predict(test_dataset)

In [None]:
# Keep only the id and toxic columns; this replaces the `submission` frame
# that was read from the sample submission file earlier.
submission=sub[['id','toxic']]

In [None]:
# Write the submission file without the index column.
submission.to_csv('submission.csv',index=False)