## *Note*: you need to turn on the GPU in this notebook due to the large number of parameters in BERT

In [None]:
# clone our github repository
!git clone https://github.com/Azzam-Radman/Toxic-Spans-Detection.git

In [1]:
# import standard libraries
import pandas as pd
import numpy as np
# prevent truncation of long sentences during displaying
pd.set_option('display.max_colwidth', None)

In [2]:
# Read the two annotated dataset files.
# Prefer the local copies that exist when the repository has been cloned
# into /content; otherwise download the same files from GitHub.
try:
    # will be used if the repo is cloned
    df1 = pd.read_excel('/content/Toxic-Spans-Detection/src/dataset/tokenized_1.xlsx')
    df2 = pd.read_excel('/content/Toxic-Spans-Detection/src/dataset/tokenized_ready.xlsx')
except FileNotFoundError:
    # Only a missing local file triggers the fallback. Any other failure
    # (missing Excel engine, corrupt file, interrupt) is raised as-is instead
    # of being hidden behind a second, unrelated network attempt.
    df1 = pd.read_excel('https://github.com/Azzam-Radman/Toxic-Spans-Detection/blob/main/src/dataset/tokenized_1.xlsx?raw=true')
    df2 = pd.read_excel('https://github.com/Azzam-Radman/Toxic-Spans-Detection/blob/main/src/dataset/tokenized_ready.xlsx?raw=true')

In [3]:
import tensorflow as tf

In [4]:
print(tf.__version__)

2.7.0


In [5]:
!pip install -U tensorflow-text==2.7.0

Collecting tensorflow-text==2.7.0
  Downloading tensorflow_text-2.7.0-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.0 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.0


In [6]:
import tensorflow_hub as hub
import tensorflow_text as tf_text

In [7]:
!pip install openpyxl



In [8]:
# Row position at which the dataset switches from the first file to the second:
# rows [0, SPLIT_ROW) are taken from df1 and rows [SPLIT_ROW, end) from df2.
SPLIT_ROW = 1798
# Slice straight into the combined frame WITHOUT reassigning df1/df2, so that
# re-running this cell does not slice the already-sliced df2 a second time.
df = pd.concat([df1.iloc[:SPLIT_ROW, :], df2.iloc[SPLIT_ROW:, :]], axis=0).reset_index(drop=True)
# display the head of the dataset
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142
0,اربد,فيها,جامعات,اكثر,من,عمان,...,وفيها,قد,عمان,ونص,لعيبه,المنتخب,منها,...,و,80,%,من,مطربين,الاردن,منها,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,الحلو,انكم,بتحكوا,على,اساس,انو,الاردن,ما,فيه,فساد,سرقات,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
3,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,كله,رائع,بجد,ربنا,يكرمك,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad


In [9]:
# function to remove the 'pad' cells in the tokens rows
# and remove the corresponding NaNs in the labels rows
def remove_pad_nan(word_row, label_row):
    """
    Strip the padding from one (tokens, labels) pair of rows.

    args:
    word_row: the row containing words (tokens) with "pad" tokens to remove
    label_row: the row containing labels (0 or 1) with NaNs to remove
    returns a two-element list [words, labels] without pads and NaNs.

    """

    word_list = word_row.tolist()
    label_list = label_row.tolist()

    word_list_cleaned = [word for word in word_list if word != 'pad']
    # pd.isna() recognises every missing value. An identity test against
    # np.nan only matches the np.nan singleton and therefore misses NaNs that
    # were materialised as fresh float objects (e.g. from a float64 row).
    label_list_cleaned = [label for label in label_list if not pd.isna(label)]

    return [word_list_cleaned, label_list_cleaned]

In [10]:
pairs = []  # cleaned [tokens, labels] pairs, one per sentence
# Token rows sit at even positions and their label rows directly below them,
# so walk the frame two rows at a time and clean each couple of rows.
for token_row_idx in range(0, len(df), 2):
    token_row = df.iloc[token_row_idx, :]
    label_row = df.iloc[token_row_idx + 1, :]
    pairs.append(remove_pad_nan(token_row, label_row))

In [11]:
# Interleave a ' ' separator after every token (and after every label, so the
# two lists stay aligned) in preparation for rebuilding the sentences.
pairs_with_spaces = []  # one [tokens_with_spaces, labels_with_spaces] entry per sentence

for tokens, labels in pairs:
    spaced_tokens = []  # tokens of this sentence, each followed by ' '
    spaced_labels = []  # labels of this sentence, each followed by ' '

    for position in range(len(tokens)):
        spaced_tokens += [tokens[position], ' ']
        spaced_labels += [labels[position], ' ']

    pairs_with_spaces.append([spaced_tokens, spaced_labels])

In [12]:
# Remove the trailing ' ' separator that was added after the last token and
# after the last label of every sentence.
for pair in pairs_with_spaces:
    for sequence in (pair[0], pair[1]):
        # Only strip an actual trailing ' ' separator: this keeps the cell safe
        # to re-run (a second run no longer removes a real token/label) and
        # tolerates a sentence that has no tokens at all.
        if sequence and isinstance(sequence[-1], str) and sequence[-1] == ' ':
            sequence.pop()

In [13]:
# For every pair in pairs_with_spaces compute its toxic span: the character
# offsets, within the space-joined sentence, of every token whose label is
# toxic (i.e. neither 0 nor the ' ' separator).
for pair in pairs_with_spaces:
    toxic_spans_list = []  # character offsets of all toxic tokens in this sentence
    counter = 0  # character offset at which the current element starts

    for token, word_label in zip(pair[0], pair[1]):
        # The sentence is rebuilt with str(token), so measure the token the
        # same way. Counting a non-string token as a single character would
        # shift every following offset (e.g. the number 80 is 2 characters).
        len_word = len(str(token))

        if word_label != 0 and word_label != ' ':
            # toxic token: record the offsets of all of its characters
            toxic_spans_list.extend(range(counter, counter + len_word))

        # advance past this element (a ' ' separator advances by exactly 1)
        counter += len_word

    # Store the spans as the third element of the pair. Slice assignment
    # replaces any spans left by a previous run of this cell instead of
    # stacking another copy behind them.
    pair[2:] = [toxic_spans_list]

In [14]:
# Reduce every pair to [sentence, toxic spans]:
#  - the sentence is the concatenation of the pair's first list, with each
#    element converted to a string first
#  - the toxic spans are the last list stored in the pair
last_pairs = [
    [''.join(str(piece) for piece in pair[0]), pair[-1]]
    for pair in pairs_with_spaces
]

In [15]:
# Build the final dataframe directly from the [sentence, spans] pairs.
# Creating it in a single call gives both columns a suitable dtype from the
# start, instead of writing strings and lists cell-by-cell into a frame that
# was initialised with float zeros (an implicit dtype change that newer pandas
# versions deprecate).
train_df = pd.DataFrame(last_pairs, columns=['Sentence', 'Spans'])

In [16]:
def get_setence_embedding(sentences):
  """
  Run the sentences through the BERT preprocessing layer and then through
  the BERT encoder (both are notebook-level globals).

  args:
  sentences: the batch of raw sentences to encode
  return: the encoder's 'pooled_output' entry for the batch
  (presumably one 768-length vector per sentence; confirm against the encoder)
  """
  encoder_outputs = bert_encoder(bert_preprocessing(sentences))
  return encoder_outputs['pooled_output']

In [17]:
# TensorFlow Hub handles of the BERT preprocessing model and the BERT encoder
BERT_PREPROCESS_HANDLE = 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3'
BERT_ENCODER_HANDLE = 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4'

# wrap both Hub models as Keras layers: raw text -> preprocessing -> encoder
bert_preprocessing = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)

In [18]:
from tqdm import tqdm

In [24]:
train_df.head()

Unnamed: 0,Sentence,Spans
0,اربد فيها جامعات اكثر من عمان ... وفيها قد عمان ونص لعيبه المنتخب منها ... و 80 % من مطربين الاردن منها,[]
1,الحلو انكم بتحكوا على اساس انو الاردن ما فيه فساد سرقات,[]
2,كله رائع بجد ربنا يكرمك,[]
3,لسانك قذر يا قمامه,"[6, 7, 8, 13, 14, 15, 16, 17]"
4,انا داشره وغير متزوجه ولدي علاقات مشبوه واحشش واحيانا اهرب مخدرات و اجيد التسليك احب ان انكب نفسي وعلاقتي بالمنزل متوتره جد,"[4, 5, 6, 7, 8, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 73, 74, 75, 76, 77, 78, 79, 88, 89, 90, 91, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 106, 107, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119]"


In [28]:
# embed the sentences one at a time and collect the results in order
embeddings_list = []
for row_idx in tqdm(range(len(train_df))):
  single_sentence_batch = [train_df.iloc[row_idx, 0]]  # column 0 holds the sentence text
  embeddings_list.append(get_setence_embedding(single_sentence_batch))

100%|██████████| 1799/1799 [04:20<00:00,  6.91it/s]


In [29]:
# pre-allocate a zero-filled frame with one row per embedded sentence and
# 768 columns named '0' ... '767'
n_sentences = len(embeddings_list)
bert_embeddings_df = pd.DataFrame(
    np.zeros((n_sentences, 768)),
    columns=[str(col) for col in range(768)],
)
bert_embeddings_df.shape

(1799, 768)

In [30]:
# copy every sentence's embedding into the matching row of the frame
for row_idx in range(len(bert_embeddings_df)):
  # each list entry was computed for a one-sentence batch; take its first row
  sentence_vector = embeddings_list[row_idx][0].numpy()
  bert_embeddings_df.iloc[row_idx, :] = sentence_vector

In [33]:
# sentence-level target: 0 when the sentence has no toxic spans, 1 otherwise
train_df['is_toxic'] = train_df['Spans'].map(lambda spans: int(spans != []))
train_df.head()

Unnamed: 0,Sentence,Spans,is_toxic
0,اربد فيها جامعات اكثر من عمان ... وفيها قد عمان ونص لعيبه المنتخب منها ... و 80 % من مطربين الاردن منها,[],0
1,الحلو انكم بتحكوا على اساس انو الاردن ما فيه فساد سرقات,[],0
2,كله رائع بجد ربنا يكرمك,[],0
3,لسانك قذر يا قمامه,"[6, 7, 8, 13, 14, 15, 16, 17]",1
4,انا داشره وغير متزوجه ولدي علاقات مشبوه واحشش واحيانا اهرب مخدرات و اجيد التسليك احب ان انكب نفسي وعلاقتي بالمنزل متوتره جد,"[4, 5, 6, 7, 8, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 73, 74, 75, 76, 77, 78, 79, 88, 89, 90, 91, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 106, 107, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 119]",1


In [35]:
# define the model to create the 128 vectors
def my_model():
  """
  Build the small classifier that sits on top of the 768-length input vectors.

  Architecture: Input(768) -> Dense(128, relu), named 'custom_embeddings'
  -> Dense(1, sigmoid).
  return: the (uncompiled) tf.keras.Model
  """
  layers = tf.keras.layers
  bert_vector = layers.Input(shape=(768,))
  # this layer's activations are what is later exported as the 128-d embeddings
  hidden_128 = layers.Dense(128, activation='relu', name='custom_embeddings')(bert_vector)
  toxic_probability = layers.Dense(1, activation='sigmoid')(hidden_128)
  return tf.keras.Model(inputs=bert_vector, outputs=toxic_probability)

In [38]:
# define, compile and train the model
# Seed Python, NumPy and TensorFlow before the model is created, so that the
# weight initialisation and batch shuffling (and therefore the exported
# embeddings) are repeatable between runs, as far as the hardware allows.
tf.keras.utils.set_random_seed(42)
model = my_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              # the model already ends in a sigmoid, so the loss receives probabilities
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.AUC()])
# validation_split=0.1 holds out the last 10% of the rows for validation
model.fit(bert_embeddings_df, train_df['is_toxic'], epochs=20, batch_size=64, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4644989790>

In [40]:
# Build a sub-model that shares the trained model's input but stops at the
# layer named "custom_embeddings", then run it over the BERT embeddings to
# obtain that layer's 128-dimensional outputs.
layer_name = 'custom_embeddings'
embedding_layer = model.get_layer(layer_name)
intermediate_model = tf.keras.Model(inputs=model.input, outputs=embedding_layer.output)
embeddings_128 = intermediate_model.predict(bert_embeddings_df, batch_size=64)

In [41]:
# save the embeddings to a CSV file: one column per embedding dimension
# (named '0', '1', ...) and no index column
column_names = [str(dim) for dim in range(embeddings_128.shape[1])]
embeddings_128_df = pd.DataFrame(embeddings_128, columns=column_names)
embeddings_128_df.to_csv('bert_embeddings_128.csv', index=False)