# Toxic Comment Classification Challenge
## BERT - TensorFlow 2 & Hugging Face Transformers Library

In [None]:
#DON'T TOUCH!!
!pip install transformers
!pip install tensorflow==2.1.0
!pip install tensorflow-gpu

In [None]:
# Input directory of the first preprocessed dataset.
dataset_directory = '../input/aldon-preprocessed'

In [None]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras

## 1. Data Pipeline
- Loading the datasets from CSVs
- Preprocessing (Tokenization, Truncation & Padding)
- Creating efficient data pipelines using tf.data

In [None]:
# Locations of the input CSVs. The train/test data and the test labels live in
# the second preprocessed dataset; the sample submission lives in the first.
_preprocessed_dir = '../input/aldon-preprocessed-2'
train_path = f'{_preprocessed_dir}/preprocessed_train_data.csv'
test_path = f'{_preprocessed_dir}/preprocessed_test_data.csv'
test_labels_path = f'{_preprocessed_dir}/test_label.csv'
subm_path = '../input/aldon-preprocessed/sample_submission.csv'

In [None]:
# The six target columns; this order is the column order of the label arrays
# built from the dataframes.
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'insult', 'threat', 'identity_hate',
]

# Read the preprocessed train/test frames; the test labels are keyed by id.
df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)
df_test_labels = pd.read_csv(test_labels_path).set_index('id')

df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Check the dtype of each column of df_train.
# Asking pd.DataFrame for a column name that df_train does not have silently
# yields an all-NaN column, so the names must match the frame's real columns
# (the label column is 'identity_hate').
empDfObj = pd.DataFrame(df_train, columns=['id', 'comment_text', 'toxic', 'severe_toxic','obscene','threat','insult','identity_hate'])
print('Data type of each column of Dataframe :')
dataTypeSeries = empDfObj.dtypes

print(dataTypeSeries)

In [None]:
# Inspect the dtype of the id and comment columns of df_test.
test_columns = ['id', 'comment_text']
empDfObj = pd.DataFrame(df_test, columns=test_columns)
dataTypeSeries = empDfObj.dtypes

print('Data type of each column of Dataframe :')
print(dataTypeSeries)

In [None]:
#Initialize Bert tokenizer and masks
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Name of the pretrained checkpoint; the same name is used again when the
# BERT model itself is loaded.
bert_model_name = 'bert-base-multilingual-uncased'

# Tokenizer loaded for that checkpoint, configured to lower-case its input.
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
# Sequence length (in token ids) used for truncation and padding below.
MAX_LEN = 128

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    """Encode every sentence with `tokenizer` and collect the results.

    Args:
        sentences: Iterable of sentences to encode.
        tokenizer: Object exposing an ``encode`` method.
        max_seq_len: Value passed to ``encode`` as ``max_length``.

    Returns:
        A list with one encoded sequence per sentence, in input order.
    """
    return [
        tokenizer.encode(
            text,
            add_special_tokens=True,  # add '[CLS]' and '[SEP]'
            max_length=max_seq_len,
        )
        for text in tqdm(sentences)  # tqdm only adds a progress bar
    ]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build attention masks for padded token-id sequences.

    A position gets mask value 1 when its token id is greater than 0 and
    0 otherwise, so positions holding the padding id 0 are masked out.

    Args:
        tokenized_and_padded_sentences: Rectangular array-like of token
            ids, one row per sentence.

    Returns:
        Integer numpy array with the same shape as the input.
    """
    token_ids = np.asarray(tokenized_and_padded_sentences)
    # One vectorised comparison replaces a Python-level loop over every token.
    return (token_ids > 0).astype(int)

# Encode every training comment, pad/truncate each id sequence at the end to
# MAX_LEN using 0 as the padding value, then derive the attention masks from
# the padded ids.
input_ids = tokenize_sentences(df_train['comment_text'], tokenizer, MAX_LEN)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
attention_masks = create_attention_masks(input_ids)


In [None]:
from sklearn.model_selection import train_test_split

labels = df_train[label_cols].values
test_labels = df_test[label_cols].values

# Split ids, masks and labels in ONE call so that the three arrays are
# guaranteed to share the same shuffle, instead of relying on two separate
# calls being given the same random_state.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=0, test_size=0.1)

train_size = len(train_inputs)
validation_size = len(validation_inputs)

In [None]:
#Check if TPU is available
# Try to resolve a TPU cluster; a ValueError from the resolver is treated as
# "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None

# With a TPU: connect to the cluster and initialise it before building the
# TPU strategy. Without one: fall back to MirroredStrategy.
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.MirroredStrategy()

# NOTE(review): the later cells visible in this notebook create and train the
# model without entering strategy.scope(), so `strategy` is only used for the
# replica count printed here.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
#Create train and validation datasets
# Batch size shared by the train and validation pipelines.
BATCH_SIZE=32 
# Number of epochs handed to the training loop.
NR_EPOCHS=2
def create_dataset(data_tuple, epochs=1, batch_size=32, buffer_size=10000, train=True):
    """Turn a tuple of arrays into a batched ``tf.data.Dataset``.

    Args:
        data_tuple: Tuple of arrays sliced along their first dimension.
        epochs: Number of times the data is repeated.
        batch_size: Number of elements per batch.
        buffer_size: Shuffle buffer size (used only when `train` is true).
        train: When true the data is shuffled before repeating and the
            batched dataset prefetches one batch.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    pipeline = tf.data.Dataset.from_tensor_slices(data_tuple)
    if train:
        pipeline = pipeline.shuffle(buffer_size=buffer_size)
    pipeline = pipeline.repeat(epochs).batch(batch_size)
    return pipeline.prefetch(1) if train else pipeline

# The training loop walks each dataset from start to finish once per epoch,
# so each dataset must contain a single pass over the data. Repeating it
# NR_EPOCHS times here would make every epoch process the data NR_EPOCHS times.
train_dataset = create_dataset((train_inputs, train_masks, train_labels), epochs=1, batch_size=BATCH_SIZE)
# Validation needs no shuffling, so build it with train=False.
validation_dataset = create_dataset((validation_inputs, validation_masks, validation_labels), epochs=1, batch_size=BATCH_SIZE, train=False)

## 2. BERT Model
- Load the pretrained BERT base-model from Transformers library
- Take the pooled output of BERT (the second element of the model output, derived from the final hidden state of the [CLS] token) and feed it into a Dense layer with 6 neurons and sigmoid activation (Classifier). The outputs of this layer can be interpreted as probabilities for each of the 6 classes.

In [None]:
from transformers import TFBertModel

from tensorflow.keras.layers import Dense, Flatten

class BertClassifier(tf.keras.Model):
    """BERT encoder followed by a dense sigmoid classification head.

    Args:
        bert: The ``TFBertModel`` used as encoder.
        num_classes: Number of sigmoid outputs of the head.
    """

    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')

    @tf.function
    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        """Return the classifier scores for a batch of token ids.

        All keyword arguments are forwarded unchanged to the BERT model.
        """
        bert_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        # Element 1 of the BERT outputs is what feeds the classification head.
        return self.classifier(bert_outputs[1])
        
# Load the pretrained BERT checkpoint and attach one sigmoid output per label column.
model = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(label_cols))

## 3. Training Loop
- Use BinaryCrossentropy as the loss function (it is calculated for each of the 6 output neurons, which is like training 6 binary classification tasks at the same time)
- Use the Adam optimizer 
- AUC evaluation metrics

In [None]:
import time
from transformers import create_optimizer
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle


# Line width used when plotting.
lw = 2
# Whole batches per pass over the train / validation data.
steps_per_epoch = train_size // BATCH_SIZE
validation_steps = validation_size // BATCH_SIZE

# | Loss Function
# Reduction.NONE keeps the unreduced loss values; the Mean metrics below
# accumulate their average.
loss_object = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='test_loss')

# | Optimizer
# NOTE(review): warmup_steps and total_steps are computed but not used by the
# code visible in this notebook.
warmup_steps = steps_per_epoch // 3
total_steps = steps_per_epoch * NR_EPOCHS - warmup_steps
# `learning_rate` is the documented argument name; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# | Metrics
# One independent AUC metric per label column.
train_auc_metrics = [tf.keras.metrics.AUC() for _ in range(len(label_cols))]
validation_auc_metrics = [tf.keras.metrics.AUC() for _ in range(len(label_cols))]

@tf.function
def train_step(model, token_ids, masks, labels):
    """Run one optimisation step on a batch and update the training metrics.

    Args:
        model: Model called as ``model(token_ids, attention_mask=masks)``.
        token_ids: Batch of token ids.
        masks: Batch of attention masks matching `token_ids`.
        labels: Batch of labels, one column per entry of the AUC metric list;
            cast to float32 before use.

    Updates the module-level `train_loss` and `train_auc_metrics` and applies
    the gradients with the module-level `optimizer`. Returns nothing.
    """
    labels = tf.dtypes.cast(labels, tf.float32)

    with tf.GradientTape() as tape:
        # training=True switches train-only behaviour (such as dropout) on;
        # when the flag is omitted Keras falls back to inference behaviour.
        predictions = model(token_ids, attention_mask=masks, training=True)
        loss = loss_object(labels, predictions)
    # Gradients of the loss with respect to every trainable variable.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)

    # Column i of labels/predictions feeds the i-th AUC metric.
    for i, auc_metric in enumerate(train_auc_metrics):
        auc_metric.update_state(labels[:, i], predictions[:, i])
        
@tf.function
def validation_step(model, token_ids, masks, labels):
    """Score one validation batch and update the validation metrics.

    The model is called with ``training=False``. The module-level
    `validation_loss` and `validation_auc_metrics` are updated; nothing is
    returned.
    """
    targets = tf.dtypes.cast(labels, tf.float32)
    scores = model(token_ids, attention_mask=masks, training=False)

    validation_loss(loss_object(targets, scores))
    # Column class_idx of targets/scores feeds the matching AUC metric.
    for class_idx, metric in enumerate(validation_auc_metrics):
        metric.update_state(targets[:, class_idx], scores[:, class_idx])
        
# Per-class containers filled when the ROC curves are plotted.
fpr, tpr, roc_auc = {}, {}, {}
# Number of classes to plot.
n_class = 6

        
def train(model, train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch, epochs):
    """Train and validate `model` for `epochs` epochs, printing metrics.

    In every epoch the whole of `train_dataset` is iterated, then the whole
    of `val_dataset`, so each dataset should hold exactly one pass over its
    data.

    Args:
        model: Model passed through to `train_step` / `validation_step`.
        train_dataset: Iterable of ``(token_ids, masks, labels)`` batches.
        val_dataset: Iterable of ``(token_ids, masks, labels)`` batches.
        train_steps_per_epoch: Progress-bar total for the training pass.
        val_steps_per_epoch: Progress-bar total for the validation pass.
        epochs: Number of epochs to run.
    """
    for epoch in range(epochs):
        print('=' * 50, f"EPOCH {epoch+1}", '=' * 50)

        # Start every epoch with empty loss accumulators; otherwise the
        # reported losses are running means over all previous epochs.
        train_loss.reset_states()
        validation_loss.reset_states()

        start = time.time()

        for step, (token_ids, masks, labels) in enumerate(tqdm(train_dataset, total=train_steps_per_epoch)):
            train_step(model, token_ids, masks, labels)
            if step % 500 == 0:
                print(f'\nTrain Step: {step}, Loss: {train_loss.result()}')
                # Report the AUCs gathered since the last report, then restart them.
                for label_name, metric in zip(label_cols, train_auc_metrics):
                    print(f"{label_name} roc_auc {metric.result()}")
                    metric.reset_states()

        for token_ids, masks, labels in tqdm(val_dataset, total=val_steps_per_epoch):
            validation_step(model, token_ids, masks, labels)

        print(f'\nEpoch {epoch+1}, Validation Loss: {validation_loss.result()}, Time: {time.time()-start}\n')

        for label_name, metric in zip(label_cols, validation_auc_metrics):
            print(f"{label_name} roc_auc {metric.result()}")
            metric.reset_states()

        print('\n')
    


# Run the training/validation loop for NR_EPOCHS epochs.
train(model, train_dataset, validation_dataset, steps_per_epoch, validation_steps, NR_EPOCHS)


## 4. Run predictions on test-set & save submission

In [None]:
# Apply the same preparation to the test comments as to the training comments:
# encode, pad/truncate at the end to MAX_LEN with 0, then build the masks.
test_input_ids = tokenize_sentences(df_test['comment_text'], tokenizer, MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

In [None]:
# Metrics
labelstest = np.array(labels)
# One independent AUC metric per label column.
test_auc_metrics = [tf.keras.metrics.AUC() for _ in range(len(label_cols))]

TEST_BATCH_SIZE = 32
# Whole batches in the test set; used as the progress-bar total.
test_steps = len(df_test) // TEST_BATCH_SIZE

# train=False: no shuffling, so batch k holds rows k*TEST_BATCH_SIZE onwards of df_test.
test_dataset = create_dataset((test_input_ids, test_attention_masks, test_labels), batch_size=TEST_BATCH_SIZE, train=False, epochs=1)

df_submission = pd.read_csv(subm_path, index_col='id')

# The loop variables are deliberately NOT called `labels` or `auc`: at notebook
# level those names would overwrite the training label array and the `auc`
# function imported from sklearn.metrics, which the plotting code calls.
for batch_idx, (token_ids, masks, batch_labels) in enumerate(tqdm(test_dataset, total=test_steps)):
    # Ids of the df_test rows that make up this batch.
    sample_ids = df_test.iloc[batch_idx*TEST_BATCH_SIZE:(batch_idx+1)*TEST_BATCH_SIZE]['id']
    predictions = model(token_ids, attention_mask=masks, training=False).numpy()
    for class_idx, auc_metric in enumerate(test_auc_metrics):
        auc_metric.update_state(batch_labels[:, class_idx], predictions[:, class_idx])

    # Write the scores into the submission rows with matching ids.
    df_submission.loc[sample_ids, label_cols] = predictions

In [None]:
# Per-label test ROC AUC values, in label_cols order.
label_acc = [0] * len(label_cols)
print("Testing Accuracy : ")
for i, label_name in enumerate(label_cols):
    print(f"{label_name} roc_auc {test_auc_metrics[i].result()}")
    label_acc[i] = test_auc_metrics[i].result()
    test_auc_metrics[i].reset_states()


labelnames = np.array(label_acc)
# Pick the predicted rows by the ids of df_test rather than by a hard-coded
# row count, so predictions and true labels stay aligned row for row whatever
# the size of the test set.
df_submission2 = df_submission.loc[df_test['id']]
labelspred = df_submission2[label_cols].values
labelstrue = df_test[label_cols].values

In [None]:
# Write the submission frame (indexed by id) to submission.csv in the working directory.
df_submission.to_csv('submission.csv')

In [None]:
def show_graph(labels, predictions, label):
    """Plot the ROC curve of every class on one figure.

    Args:
        labels: 2-D array of true labels, one column per class.
        predictions: 2-D array of predicted scores, laid out like `labels`.
        label: Per-class values appended to the legend entries.

    The module-level `fpr`, `tpr` and `roc_auc` dicts are filled as a side
    effect; the figure is shown with ``plt.show()``.
    """
    # Import under a private alias: other notebook cells use `auc` as a loop
    # variable, which would replace the sklearn function at module level.
    from sklearn.metrics import auc as _sk_auc

    # One class per label column, so label_cols[i] below is always valid.
    num_classes = len(label_cols)

    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(labels[:, i], predictions[:, i])
        roc_auc[i] = _sk_auc(fpr[i], tpr[i])

    # Plot all ROC curves
    plt.figure(1)
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'slategrey', 'firebrick', 'springgreen'])
    for i, color in zip(range(num_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class ' + label_cols[i] + ' : ' + str(label[i]))

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves of all 6 classes')
    plt.legend(loc="lower right")
    plt.show()
    


In [None]:
# Plot the test-set ROC curves; the per-label AUC values go into the legend.
show_graph(labelstrue, labelspred, labelnames)