# Setups

In [None]:
!pip install tensorflow transformers pandas scikit-learn spacy wordcloud gensim
!pip install matplotlib seaborn
!pip install line_profiler

In [None]:
import re
import os
import sys
from pathlib import Path

import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc

import matplotlib.pyplot as plt
%matplotlib inline

## Google Colab

In [None]:
def google_colab_info():
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
        print('and then re-execute this cell.')
    else:
        print(gpu_info)

    from psutil import virtual_memory
    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    if ram_gb < 20:
        print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
        print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
        print('re-execute this cell.')
    else:
        print('You are using a high-RAM runtime!')

In [None]:
# Detect Google Colab by trying to import its helper package. Only the import is
# guarded: a failure while printing the runtime info or while mounting Google Drive
# must surface as an error instead of silently falling back to the local prompt.
try:
    import google.colab
except ImportError:
    # Not on Colab: ask the user where the data archive lives.
    IN_GOOGLE_COLAB = False
    DATA_PATH = input("Enter the data archive path")
else:
    IN_GOOGLE_COLAB = True
    google_colab_info()

    # The archive is read from Google Drive, which is mounted under /content/drive.
    DATA_PATH = "/content/drive/MyDrive/data/jigsaw-toxic-comment-classification-challenge.zip"
    google.colab.drive.mount('/content/drive')

## Pandas

In [None]:
import pandas as pd
# Show up to 1000 characters per cell when displaying DataFrames.
pd.set_option("display.max_colwidth", 1000)

## Transformers

In [None]:
from transformers import (
    PreTrainedModel,
    DistilBertTokenizerFast,
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments
)

In [None]:
class SavePretrainedCallback(tf.keras.callbacks.Callback):
    """Keras callback that calls the model's `save_pretrained()` after each epoch.

    Hugging Face models expose `save_pretrained()`, which stores the weights together
    with the metadata needed to reload them later as a pretrained model.
    """

    def __init__(self, output_dir, **kwargs):
        """Remember the directory to save into.

        Args:
            output_dir: directory handed to `save_pretrained()` at each epoch end.
            **kwargs: accepted but neither used nor forwarded to the base class.
        """
        super(SavePretrainedCallback, self).__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        """Save the model attached to this callback into `output_dir`."""
        target_dir = self.output_dir
        self.model.save_pretrained(target_dir)


In [None]:
class Runner:
    """Fine-tune a pretrained DistilBERT sequence classifier on 0/1 labels.

    The training and validation texts are tokenized and wrapped into
    `tf.data.Dataset` objects at construction time. `train()` fine-tunes the
    model with plain Keras by default, or with the Hugging Face `TFTrainer`
    when the class attribute `USE_HF_TRAINER` is True.
    """
    # ================================================================================
    # Class
    # ================================================================================
    USE_HF_TRAINER = False
    _model_name = 'distilbert-base-cased'
    # Shared by all instances; loaded when the class body is executed.
    _tokenizer = DistilBertTokenizerFast.from_pretrained(_model_name)

    # ================================================================================
    # Instance
    # ================================================================================
    # --------------------------------------------------------------------------------
    # Instance properties
    # --------------------------------------------------------------------------------
    @property
    def batch_size(self):
        """Batch size used for training, validation and evaluation"""
        assert self._batch_size > 0
        return self._batch_size

    @property
    def X(self):
        """Training DataSet"""
        return self._X

    @property
    def V(self):
        """Validation DataSet"""
        return self._V

    @property
    def model_name(self):
        """HuggingFace pretrained model name"""
        return self._model_name

    @property
    def model(self):
        """Model"""
        return self._model

    @property
    def learning_rate(self):
        """Learning rate handed to the optimizer / trainer"""
        return self._learning_rate

    @property
    def num_epochs(self):
        """Number of training epochs"""
        return self._num_epochs

    @property
    def tokenizer(self):
        """Tokenizer matching the pretrained model (shared at class level)"""
        return self._tokenizer

    @property
    def max_sequence_length(self):
        """Maximum number of tokens per text; longer inputs are truncated"""
        return self._max_sequence_length

    @property
    def trainer(self):
        """HuggingFace TFTrainer instance, or None until _hf_train() has run"""
        return self._trainer

    @property
    def output_directory(self):
        """Directory to save models, etc"""
        return self._output_directory

    # --------------------------------------------------------------------------------
    # Instance initialization
    # --------------------------------------------------------------------------------
    def __init__(
            self,
            training_data,
            training_label,
            validation_data,
            validation_label,
            max_sequence_length=256,
            batch_size=16,
            learning_rate=5e-5,
            num_epochs=3,
            output_directory="./output"
    ):
        """Load the pretrained model and build the training/validation datasets.

        Args:
            training_data: texts to train on, in a form the tokenizer accepts
                (the notebook passes a list of strings).
            training_label: labels for training_data; every value must be 0 or 1.
            validation_data: texts used for validation during training.
            validation_label: labels for validation_data; every value must be 0 or 1.
            max_sequence_length: token limit per text, must be within [128, 512].
            batch_size: batch size (> 0) for training, validation and evaluation.
            learning_rate: learning rate (> 0).
            num_epochs: number of training epochs (> 0).
            output_directory: default directory for save()/load(); created if missing.

        Raises:
            AssertionError: if any argument violates the constraints above.
        """
        # --------------------------------------------------------------------------------
        # Validate every argument first so that bad input fails before the (slow)
        # model loading and tokenization start.
        # --------------------------------------------------------------------------------
        assert learning_rate > 0.0
        assert num_epochs > 0
        assert batch_size > 0
        assert 128 <= max_sequence_length <= 512
        assert np.all(np.isin(training_label, [0, 1]))
        assert np.all(np.isin(validation_label, [0, 1]))

        # --------------------------------------------------------------------------------
        # Keras Model
        # --------------------------------------------------------------------------------
        self._learning_rate = learning_rate
        self._num_epochs = num_epochs
        self._batch_size = batch_size

        self._output_directory = output_directory
        Path(self.output_directory).mkdir(parents=True, exist_ok=True)

        # --------------------------------------------------------------------------------
        # HuggingFace
        # --------------------------------------------------------------------------------
        self._model = TFDistilBertForSequenceClassification.from_pretrained(self.model_name)
        self._trainer = None
        self._max_sequence_length = max_sequence_length

        # --------------------------------------------------------------------------------
        # TensorFlow DataSet
        # --------------------------------------------------------------------------------
        self._X = self._to_dataset(training_data, training_label)
        self._V = self._to_dataset(validation_data, validation_label)

    # --------------------------------------------------------------------------------
    # Instance methods
    # --------------------------------------------------------------------------------
    def _tokenize(self, data):
        """Tokenize text(s) into a dict of TensorFlow tensors (padded, truncated)."""
        return dict(self.tokenizer(
            data,
            truncation=True,
            padding=True,
            max_length=self.max_sequence_length,
            return_tensors="tf"
        ))

    def _to_dataset(self, data, label):
        """Build an unbatched tf.data.Dataset of (tokenized features, label) pairs."""
        return tf.data.Dataset.from_tensor_slices((self._tokenize(data), label))

    def _hf_train(self):
        """Train with the HuggingFace TFTrainer using the instance hyper-parameters."""
        self._training_args = TFTrainingArguments(
            output_dir='./results',             # output directory
            num_train_epochs=self.num_epochs,   # total number of training epochs
            per_device_train_batch_size=self.batch_size,     # batch size per device during training
            per_device_eval_batch_size=self.batch_size,      # batch size for evaluation
            learning_rate=self.learning_rate,   # initial learning rate
            warmup_steps=500,                   # number of warmup steps for learning rate scheduler
            weight_decay=0.01,                  # strength of weight decay
            logging_dir='./logs',               # directory for storing logs
            logging_steps=10,
        )

        # with self._training_args.strategy.scope():
        #     self._model = TFDistilBertForSequenceClassification.from_pretrained(self.model_name)

        self._trainer = TFTrainer(
            model=self.model,
            args=self._training_args,   # training arguments
            train_dataset=self.X,       # training dataset
            eval_dataset=self.V         # evaluation dataset
        )
        self.trainer.train()

    def _keras_train(self):
        """Compile the model and train it with Keras `fit()`."""
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        self.model.compile(
            optimizer=optimizer,
            # loss=self.model.compute_loss,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=["accuracy"]
        )
        self.model.summary()
        # The datasets are batched here, so `batch_size` must not be passed to fit().
        # Only the training data is shuffled; shuffling validation data has no benefit.
        self.model.fit(
            self.X.shuffle(1000).batch(self.batch_size).prefetch(1),
            epochs=self.num_epochs,
            validation_data=self.V.batch(self.batch_size).prefetch(1),
            # callbacks=[SavePretrainedCallback(output_dir=self.output_directory)],
        )

    def train(self):
        """Train the model with the backend selected by USE_HF_TRAINER."""
        if self.USE_HF_TRAINER:
            self._hf_train()
        else:
            self._keras_train()

    def evaluate(self, data, label):
        """Evaluate the model on labelled texts.

        Args:
            data: texts to evaluate on.
            label: labels for data; every value must be 0 or 1.

        Returns:
            Whatever `model.evaluate()` returns. After _keras_train() this is
            presumably [loss, accuracy], matching the compile() call.
        """
        assert np.all(np.isin(label, [0, 1]))
        test_dataset = self._to_dataset(data, label)
        # No shuffling: the order of the samples is irrelevant for evaluation.
        evaluation = self.model.evaluate(
            test_dataset.batch(self.batch_size).prefetch(1)
        )
        return evaluation

    def predict(self, data):
        """Return the softmax of the model logits for the given text(s)."""
        tokens = self._tokenize(data)
        logits = self.model.predict(tokens)["logits"]
        return tf.nn.softmax(logits)

    def save(self, path_to_dir=None):
        """Save the model into path_to_dir (default: output_directory).

        The directory is created when missing. The trainer is used for saving only
        when USE_HF_TRAINER is set and a trainer exists (i.e. after training);
        otherwise the model's own `save_pretrained()` is used.
        """
        if path_to_dir is None or len(path_to_dir) == 0:
            path_to_dir = self.output_directory
        Path(path_to_dir).mkdir(parents=True, exist_ok=True)
        if self.USE_HF_TRAINER and self.trainer is not None:
            self.trainer.save_model(path_to_dir)
        else:
            self.model.save_pretrained(path_to_dir)

    def load(self, path_to_dir=None):
        """Replace the model with the one saved in path_to_dir (default: output_directory).

        Nothing happens when the path is not a readable directory.
        """
        if path_to_dir is None or len(path_to_dir) == 0:
            path_to_dir = self.output_directory
        if os.path.isdir(path_to_dir) and os.access(path_to_dir, os.R_OK):
            self._model = TFDistilBertForSequenceClassification.from_pretrained(path_to_dir)


---
# Data

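First, upload the data archive (`jigsaw-toxic-comment-classification-challenge.zip`) to `/content/drive/MyDrive/data/` on Google Drive, or provide its local path when prompted.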

In [None]:
!unzip -o $DATA_PATH
!unzip -o train.csv.zip
!unzip -o test.csv.zip
!unzip -o test_labels.csv.zip

In [None]:
# Load the extracted CSV files from the working directory.
raw_train = pd.read_csv("./train.csv")
raw_test_data = pd.read_csv("./test.csv")
raw_test_label = pd.read_csv("./test_labels.csv")
# Attach the labels to the test comments by joining both frames on their 'id' column.
raw_test = raw_test_data.merge(raw_test_label, on='id', how='inner')

## Training (Raw)

In [None]:
# Preview the first rows of the training data.
raw_train.head()

In [None]:
# Summary statistics of the numeric columns of the training data.
raw_train.describe()

In [None]:
# Show the first 5 training rows whose 'toxic' label is positive.
raw_train[raw_train['toxic'] > 0].head(5)

## Test (Raw)
Per the dataset description quoted below, a label value of -1 marks rows that were not used for scoring. Remove the rows where the value is -1.

> test_labels.csv - labels for the test data; value of -1 indicates it was not used for scoring

In [None]:
# Drop the rows that were not used for scoring (label == -1) and keep both the
# 0 and 1 labelled rows. Checking the 'toxic' column alone is presumably sufficient
# because -1 is expected to be set on all labels of such a row - TODO confirm.
raw_test = raw_test[raw_test['toxic'] >= 0]

In [None]:
# Summary statistics of the numeric columns of the filtered test data.
raw_test.describe()

In [None]:
# Preview the first rows of the filtered test data.
raw_test.head()

---
# BERT Fine Tuning

* [Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification](https://github.com/huggingface/transformers/issues/5421#issuecomment-652582854)


```
Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
```

> This is expected, and tells you that you won't have good performance with your BertForSequenceClassification model before you fine-tune it 🙂.



In [None]:
# Hyper-parameters shared by all per-category training runs.
BATCH_SIZE = 32
MAX_SEQUENCE_LENGTH = 256
# Misspelled original name, kept as an alias for the code that still refers to it.
MAX_SEQUENTH_LENGTH = MAX_SEQUENCE_LENGTH

In [None]:
def run(category, random_state=42):
    """Train, save and evaluate a binary classifier for one label column.

    The training data is split 80/20 into training and validation sets. The
    trained Runner is stored in the global `runners` dict and its evaluation on
    `raw_test` in the global `evaluations` dict, both keyed by category.

    Args:
        category: name of the label column in `raw_train` / `raw_test`.
        random_state: seed of the train/validation split, for reproducible runs.
    """
    print("--------------------------------------------------------------------------------")
    print(f"{category}")
    print("--------------------------------------------------------------------------------")

    data = raw_train['comment_text'].tolist()
    label = raw_train[category].tolist()

    # Stratify so that training and validation sets keep the same label ratio,
    # which matters when positive labels are rare.
    train_data, validation_data, train_label, validation_label = train_test_split(
        data,
        label,
        test_size=.2,
        shuffle=True,
        stratify=label,
        random_state=random_state
    )

    # Google Drive is only mounted on Colab; fall back to a local directory elsewhere.
    output_root = "/content/drive/MyDrive/data" if IN_GOOGLE_COLAB else "./output"
    runner = Runner(
        training_data=train_data,
        training_label=train_label,
        validation_data=validation_data,
        validation_label=validation_label,
        batch_size=BATCH_SIZE,
        max_sequence_length=MAX_SEQUENTH_LENGTH,
        output_directory="{}/model_C{}_B{}_L{}".format(
            output_root, category.upper(), BATCH_SIZE, MAX_SEQUENTH_LENGTH
        )
    )

    runner.train()
    runner.save()
    runners[category] = runner

    test_data = raw_test['comment_text'].tolist()
    test_label = raw_test[category].tolist()
    evaluation = runner.evaluate(test_data, test_label)
    evaluations[category] = evaluation
    # With the Keras training path the result holds the loss followed by the accuracy.
    print(f"Evaluation (loss, accuracy): {evaluation}")

In [None]:
# Label columns to train on; one binary classifier is trained per column.
categories = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Filled in by run(): the trained Runner and its evaluation result per category.
runners = dict()
evaluations = dict()

for category in categories:
    run(category)


In [None]:
# For every category, run its classifier on one randomly picked test comment.
categories = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
test_data = raw_test['comment_text'].tolist()

for category in categories:
    # Random position in [0, len(test_data)).
    index = np.random.randint(len(test_data))
    data = test_data[index]

    classifier = runners[category]
    prediction = classifier.predict(data)
    print(f"category: {category} prediction: {prediction}\ndata: {data}")