In [1]:
!pip install transformers[sentencepiece]

Collecting transformers[sentencepiece]
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 7.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 52.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 52.3MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963

In [2]:
from google.colab import drive

import numpy as np
import pandas as pd
from tqdm import *

import tensorflow as tf
from tensorflow.keras.metrics import AUC
from tensorflow import keras
from keras import backend as K  #for f1

from transformers import TFDistilBertModel, DistilBertConfig
from transformers import DistilBertTokenizerFast

import matplotlib.pyplot as plt

from sklearn.metrics import classification_report

In [3]:
drive.mount('/content/drive', force_remount = True)
root_dir = '/content/drive/MyDrive/Toxic_comment_classification_Maggio_Monti/dataset/'

Mounted at /content/drive


In [4]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [5]:
params = {'MAX_LENGTH': 128,
          'EPOCHS': 6,
          'LEARNING_RATE': 5e-5,
          'FT_EPOCHS': 6,
          'OPTIMIZER': 'adam',
          'FT_LEARNING_RATE': 2e-5,
          'BATCH_SIZE': 64,
          'NUM_STEPS': 64,
          'DISTILBERT_DROPOUT': 0.2,
          'DISTILBERT_ATT_DROPOUT': 0.2,
          'LAYER_DROPOUT': 0.2,
          'KERNEL_INITIALIZER': 'GlorotNormal',
          'BIAS_INITIALIZER': 'zeros',
          'POS_PROBA_THRESHOLD': 0.5,          
          'ADDED_LAYERS': 'Dense 256, Dense 32, Dropout 0.2',
          'LR_SCHEDULE': '5e-5 for 6 epochs, Fine-tune w/ adam for 6 epochs @2e-5',
          'FREEZING': 'All DistilBERT layers frozen for 6 epochs, then unfrozen for 6',
          'CALLBACKS': '[early_stopping monitoring val_loss w/ patience=0]',
          'RANDOM_STATE':42
          }

In [6]:
def build_model(transformer, max_length=params['MAX_LENGTH']):
    """""""""
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.
    
    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
    """""""""
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=params['RANDOM_STATE']) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, which is located
    # at index 0.  Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    D1 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(cls_token)
    
    X = tf.keras.layers.Dense(256,
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer='zeros'
                              )(D1)
    
    D2 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)
    
    X = tf.keras.layers.Dense(32,
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer='zeros'
                              )(D2)
    
    D3 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(6, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  # CONSIDER USING CONSTRAINT
                                   bias_initializer='zeros'
                                   )(D3)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    auc_score = AUC(multi_label=True)
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=params['LEARNING_RATE']), 
                  loss='binary_crossentropy',
                  metrics=[auc_score, f1_m])
    
    return model

In [None]:
# The bare, pretrained DistilBERT transformer model outputting raw hidden-states 
# and without any specific head on top.
config = DistilBertConfig(dropout=params['DISTILBERT_DROPOUT'], 
                          attention_dropout=params['DISTILBERT_ATT_DROPOUT'], 
                          output_hidden_states=True)
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Build model
model = build_model(distilBERT)
model.load_weights('/content/drive/MyDrive/Toxic_comment_classification_Maggio_Monti/models/model_freeze_w.h5')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.




  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

import pandas as pd
test = pd.read_csv(root_dir + 'dataset_clean/test_clean.csv', index_col=0)

In [None]:
test

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text
0,0001ea8717f6de06,0,0,0,0,0,0,thank you for understanding i think very highl...
1,000247e83dcc1211,0,0,0,0,0,0,dear god this site is horrible
2,0002f87b16116a7f,0,0,0,0,0,0,somebody will invariably try to add religion r...
3,0003e1cccfd5a40a,0,0,0,0,0,0,it says it right there that it is a type the t...
4,00059ace3e3e9a53,0,0,0,0,0,0,before adding a new product to the list make s...
...,...,...,...,...,...,...,...,...
63973,fff8f64043129fa2,0,0,0,0,0,0,jerome i see you never got around to this im n...
63974,fff9d70fe0722906,0,0,0,0,0,0,lucky bastard http wikimediafoundation org wik...
63975,fffa8a11c4378854,0,0,0,0,0,0,shame on you all you want to speak about gays ...
63976,fffac2a094c8e0e2,1,0,1,0,1,0,mel gibson is a nazi bitch who makes shitty mo...


In [None]:
x_test = test['comment_text'].values
y_test = test.drop(["comment_text", 'id'] , axis=1)


In [None]:
x_test

array(['thank you for understanding i think very highly of you and would not revert without discussion ',
       'dear god this site is horrible ',
       'somebody will invariably try to add religion really you mean the way people have invariably kept adding religion to the samuel beckett infobox and why do you bother bringing up the long dead completely non existent influences issue youre just flailing making up crap on the fly for comparison the only explicit acknowledgement in the entire amos oz article that he is personally jewish is in the categories ',
       ...,
       'shame on you all you want to speak about gays and not about romanians ',
       'mel gibson is a nazi bitch who makes shitty movies he has so much buttsex that his asshole is now big enough to be considered a country ',
       'unicorn lair discovery supposedly a unicorn lair has been discovered in pyongyang north korea the lair is supposedly associated with king dongmyeong of goguryeo who supposedly rode a uni

In [None]:
padded_ids_test = []
mask_ids_test = []
for i in tqdm(range(len(x_test))):
  encoding = tokenizer(str(x_test[i]), max_length=128 , padding ="max_length", truncation=True)
  input_ids , attention_id = encoding["input_ids"] , encoding["attention_mask"] 
  padded_ids_test.append(input_ids)
  mask_ids_test.append(attention_id)

100%|██████████| 63978/63978 [00:16<00:00, 3838.14it/s]


In [None]:
test_id = np.array(padded_ids_test)
test_mask = np.array(mask_ids_test)

test_id = np.squeeze(test_id) 
test_mask =  np.squeeze(test_mask) 

In [None]:
y_pred = model.predict([test_id, test_mask])



In [None]:
y_pred = y_pred >= 0.5

In [None]:
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.63      0.59      0.61      6090
           1       0.34      0.34      0.34       367
           2       0.67      0.57      0.62      3691
           3       1.00      0.00      0.00       211
           4       0.65      0.46      0.54      3427
           5       1.00      0.00      0.00       712

   micro avg       0.64      0.51      0.57     14498
   macro avg       0.72      0.33      0.35     14498
weighted avg       0.66      0.51      0.55     14498
 samples avg       0.96      0.95      0.92     14498



In [None]:
model.evaluate([test_id, test_mask], y_test, return_dict=True)



{'auc_1': 0.9462966918945312,
 'f1_m': 0.48706507682800293,
 'loss': 0.07981007546186447}

# Fine Tune

In [None]:
# Build model
model = build_model(distilBERT)
model.load_weights('/content/drive/MyDrive/Toxic_comment_classification_Maggio_Monti/models/model_UNfreeze_w.h5')



  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
y_pred = model.predict([test_id, test_mask]) > 0.5



In [None]:
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.54      0.89      0.68      6090
           1       0.37      0.47      0.42       367
           2       0.60      0.81      0.69      3691
           3       0.56      0.54      0.55       211
           4       0.66      0.72      0.69      3427
           5       0.60      0.61      0.60       712

   micro avg       0.58      0.80      0.67     14498
   macro avg       0.56      0.67      0.60     14498
weighted avg       0.58      0.80      0.67     14498
 samples avg       0.92      0.98      0.91     14498



In [None]:
model.evaluate([test_id, test_mask], y_test, return_dict=True)



{'auc_2': 0.9731385707855225,
 'f1_m': 0.61812424659729,
 'loss': 0.07336844503879547}