## Download and Import Dependencies

In [3]:
pip install transformers 

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |▏                               | 10kB 23.9MB/s eta 0:00:01[K     |▎                               | 20kB 30.7MB/s eta 0:00:01[K     |▌                               | 30kB 20.6MB/s eta 0:00:01[K     |▋                               | 40kB 24.2MB/s eta 0:00:01[K     |▉                               | 51kB 19.7MB/s eta 0:00:01[K     |█                               | 61kB 16.8MB/s eta 0:00:01[K     |█▏                              | 71kB 15.3MB/s eta 0:00:01[K     |█▎                              | 81kB 16.1MB/s eta 0:00:01[K     |█▍                              | 92kB 15.2MB/s eta 0:00:01[K     |█▋                              | 102kB 16.4MB/s eta 0:00:01[K     |█▊                              | 112kB 16.4MB/s eta 0:00:01[K     |██                              | 

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from tqdm import tqdm_notebook

from transformers import AutoTokenizer,BertTokenizer,TFBertModel,TFOpenAIGPTModel,OpenAIGPTTokenizer,DistilBertTokenizer, TFDistilBertModel,XLMTokenizer, TFXLMModel, TFAutoModel, AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from sklearn.metrics import confusion_matrix, accuracy_score, auc, classification_report, f1_score
from sklearn.model_selection import train_test_split, KFold

import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

import warnings
# Suppress every warning for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")

# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [5]:
# Read the spreadsheet into a DataFrame, using its first column as the index.
# The 'tweets_train' column and the label columns are consumed further down.
full_clean_df = pd.read_excel("../data/full_clean_df.xlsx", index_col=0)

In [6]:
# Label columns of full_clean_df; their order here is the column order of the target matrix.
labels_name_list = ['NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']

## Tokenize Data

In [7]:
# Truncation is an encode-time option: passing it to from_pretrained does not
# activate it (the encoding run still warned that truncation was not explicitly
# enabled), so it is not passed here.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [8]:
def single_encoding_function(text, tokenizer, max_seq_length=30, name='BERT'):
    """Encode every sentence to a sequence of exactly ``max_seq_length`` token ids.

    Parameters
    ----------
    text : iterable of str
        Sentences to encode, one per element.
    tokenizer
        Object exposing ``encode`` (and ``eos_token`` for 'Transformer XL').
        Its ``pad_token`` attribute is overwritten in place according to ``name``.
    max_seq_length : int, default 30
        Length every encoded sentence is truncated or padded to.
    name : str, default 'BERT'
        Model family used to choose the padding token. One of 'BERT',
        'DistilBert', 'OPENAIGPT2', 'Transformer XL'; any other value leaves
        the tokenizer's padding token unchanged.

    Returns
    -------
    list
        One entry per sentence, each being the value returned by
        ``tokenizer.encode`` for that sentence.
    """
    # Choose the padding token for the model family.
    if name in ('BERT', 'DistilBert'):
        tokenizer.pad_token = '[PAD]'
    elif name == 'OPENAIGPT2':
        tokenizer.pad_token = '<unk>'
    elif name == 'Transformer XL':
        tokenizer.pad_token = tokenizer.eos_token

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
    # `truncation=True` makes the previously implicit truncation explicit.
    return [
        tokenizer.encode(
            sentence,
            max_length=max_seq_length,
            padding='max_length',
            truncation=True,
        )
        for sentence in tqdm(text)
    ]

In [9]:
# X=np.array(single_encoding_function(full_clean_df['tweets_train'].values.tolist(),tokenizer,name="BERT"))
# y=np.array(full_clean_df[labels_name_list])

In [10]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=12)

In [11]:
# Encode every tweet to fixed-length token ids, then hold out 33% as the test set.
tweet_texts = full_clean_df['tweets_train'].values.tolist()
encoded_tweets = np.array(single_encoding_function(tweet_texts, tokenizer, name="BERT"))
label_matrix = np.array(full_clean_df[labels_name_list])

X, X_test, y, y_test = train_test_split(
    encoded_tweets,
    label_matrix,
    test_size=0.33,
    random_state=12,
)

  0%|          | 0/143277 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 143277/143277 [00:35<00:00, 4005.40it/s]


In [12]:
# Second split: carve the validation set out of the non-test portion (X, y),
# leaving X_test / y_test untouched.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=12)

In [13]:
# Sanity check: (number of training rows, token ids per row).
X_train.shape

(64316, 30)

## TensorFlow BERT: Train-Test Iteration

In [14]:
EPOCHS=4
LEARNING_RATE=1e-5
# val_loss gets better as it decreases, so the callback has to track the
# minimum; with mode='max' a rising loss would be counted as an improvement
# and the weights kept as "best" would be the worst ones.
# NOTE(review): patience (10) is larger than EPOCHS (4), so this callback can
# never stop the run early as configured — confirm that is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    mode='min',
    restore_best_weights=True)

In [15]:
BATCH_SIZE=32
# Whole batches in one pass over X_train (floor division).
steps_per_epoch = X_train.shape[0] // BATCH_SIZE

In [16]:
def make_data(X_train, y_train, X_val, y_val, batch_size):
    """Build the tf.data pipelines used for fitting and for validation.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets, sliced along their first axis.
    X_val, y_val
        Validation inputs and targets, sliced along their first axis.
    batch_size : int
        Batch size applied to both pipelines.

    Returns
    -------
    tuple
        ``(train, val)``. ``train`` repeats indefinitely, is shuffled with a
        2048-element buffer (seed 123), batched and prefetched, so the caller
        must bound each epoch with ``steps_per_epoch``. ``val`` is a single
        batched, cached and prefetched pass over the validation data.
    """
    # Resolved locally so the function does not depend on a global defined in
    # another cell.
    autotune = tf.data.experimental.AUTOTUNE

    train = (
        tf.data.Dataset
        .from_tensor_slices((X_train, y_train))
        .repeat()
        .shuffle(2048, seed=123)
        .batch(batch_size)
        .prefetch(autotune))

    # Built from the validation arguments: reading the global X_test / y_test
    # here would make the model validate on the held-out test set.
    val = (
        tf.data.Dataset
        .from_tensor_slices((X_val, y_val))
        .batch(batch_size)
        .cache()
        .prefetch(autotune)
    )

    return train, val

In [17]:
# Sentinel handed to Dataset.prefetch so tf.data chooses the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE
# Distribution strategy currently in effect; compile_model builds the model inside its scope.
strategy = tf.distribute.get_strategy()

In [18]:
def build_model(transformer_layer, max_len=30, num_labels=6):
    """Stack a small sigmoid classification head on top of a transformer encoder.

    Parameters
    ----------
    transformer_layer
        Callable applied to the token-id input; element ``[0]`` of its result
        is used as the per-token output and must be indexable as
        ``[:, 0, :]`` (i.e. rank 3).
    max_len : int, default 30
        Number of token ids per input row.
    num_labels : int, default 6
        Width of the output layer, one independent sigmoid unit per label.

    Returns
    -------
    tf.keras.Model
        Uncompiled model mapping ``input_word_ids`` to ``num_labels`` scores.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer_layer(input_word_ids)[0]

    # Only the vector at sequence position 0 feeds the classification head.
    cls_token = sequence_output[:, 0, :]
    hidden = tf.keras.layers.Dense(256, activation='sigmoid')(cls_token)
    # Sigmoid rather than softmax: each label is scored independently.
    out = tf.keras.layers.Dense(num_labels, activation='sigmoid')(hidden)

    return tf.keras.Model(inputs=input_word_ids, outputs=out)

In [19]:
def compile_model(name, max_seq_length=30, LEARNING_RATE=LEARNING_RATE):
    """Load a pretrained transformer, attach the classification head and compile it.

    Parameters
    ----------
    name : str
        Pretrained checkpoint identifier. Supported values:
        'bert-base-uncased', 'openai-gpt', 'distilbert-base-cased',
        'xlm-mlm-en-2048', 'jplu/tf-xlm-roberta-large'.
    max_seq_length : int, default 30
        Number of token ids per input row, forwarded to ``build_model``.
    LEARNING_RATE : float
        Adam learning rate; defaults to the module-level ``LEARNING_RATE``
        as it was when this function was defined.

    Returns
    -------
    tf.keras.Model
        Model compiled with binary cross-entropy and binary accuracy,
        precision and recall metrics.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported checkpoints.
    """
    # Checkpoint identifier -> class able to load it. The XLM checkpoint is
    # loaded with TFXLMModel, not TFBertModel, so the architecture matches.
    loaders = {
        'bert-base-uncased': TFBertModel,
        'openai-gpt': TFOpenAIGPTModel,
        'distilbert-base-cased': TFDistilBertModel,
        'xlm-mlm-en-2048': TFXLMModel,
        'jplu/tf-xlm-roberta-large': TFAutoModel,
    }
    if name not in loaders:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported model name {name!r}; expected one of {sorted(loaders)}")

    # Variables (metrics, weights, optimizer) are created under the strategy scope.
    with strategy.scope():
        METRICS = [
          tf.keras.metrics.BinaryAccuracy(name='accuracy'),
          tf.keras.metrics.Precision(name='precision'),
          tf.keras.metrics.Recall(name='recall')]
        transformer_layer = loaders[name].from_pretrained(name)
        model = build_model(transformer_layer, max_len=max_seq_length)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
            loss='binary_crossentropy',
            metrics=METRICS)
    return model

In [20]:
# Build the input pipelines and the compiled DistilBERT-based classifier.
# NOTE(review): the second dataset is named `test` but is passed below as validation data.
train, test = make_data(X_train, y_train, X_val, y_val, BATCH_SIZE)
model = compile_model('distilbert-base-cased')

# `train` repeats indefinitely, so steps_per_epoch decides where each epoch ends.
history = model.fit(
    train, steps_per_epoch=steps_per_epoch,
    epochs=EPOCHS, callbacks=[early_stopping], validation_data=test)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=354041576.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [21]:
# Score the validation rows, then binarise the scores in place at a 0.5 threshold.
y_predict=model.predict(X_val, verbose=1)
y_predict[ y_predict> 0.5] = 1
y_predict[y_predict <= 0.5] = 0



In [22]:
# Macro F1: unweighted mean of the per-label F1 scores on the validation split.
print(f1_score(y_val, y_predict, average='macro'))

0.5024776711681103


In [23]:
# Per-label precision/recall/F1 on the validation split; rows 0-5 follow the order of labels_name_list.
print(classification_report(y_val, y_predict))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96     28950
           1       0.57      0.30      0.39      9837
           2       0.59      0.30      0.40      4348
           3       0.71      0.65      0.68      2388
           4       0.62      0.07      0.13       468
           5       0.70      0.34      0.46      4664

   micro avg       0.85      0.71      0.77     50655
   macro avg       0.69      0.44      0.50     50655
weighted avg       0.80      0.71      0.74     50655
 samples avg       0.89      0.78      0.79     50655



## TensorFlow BERT 5-Fold Cross-Validation

In [None]:
# def one_fold(X_train, y_train, X_test, y_test, BATCH_SIZE):
#     train, test = make_data(X_train, y_train, X_test, y_test, BATCH_SIZE)
#     model = compile_model('distilbert-base-cased')

#     history = model.fit(
#         train, steps_per_epoch=steps_per_epoch,
#         epochs=EPOCHS, callbacks=[early_stopping], validation_data=test)
#     y_predict=model.predict(X_test, verbose=1)
#     y_predict[ y_predict> 0.5] = 1
#     y_predict[y_predict <= 0.5] = 0
#     score = f1_score(y_test, y_predict, average='macro')
#     print(f'Average f1 macro non-empty is {score}')
#     return score

In [None]:
# kf = KFold(n_splits=2, random_state=123)
# kf.get_n_splits(X)
# scores = []
# for train_index, test_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     scores.append(one_fold(X_train, y_train, X_test, y_test, BATCH_SIZE))