In [None]:
# Clone the project's GitHub repository so the dataset files are available locally
# (shell command executed through IPython's `!` escape).
!git clone https://github.com/Azzam-Radman/Toxic-Spans-Detection.git

Cloning into 'Toxic-Spans-Detection'...


In [1]:
# import standard libraries
import pandas as pd
import numpy as np
# remove the column-width limit so long sentences are shown in full
# (not truncated) when a DataFrame is displayed
pd.set_option('display.max_colwidth', None)

In [2]:
# read the two dataset files
try:
    # used when the repository has been cloned next to the notebook (Colab layout)
    df1 = pd.read_excel('/content/Toxic-Spans-Detection/src/dataset/tokenized_1.xlsx')
    df2 = pd.read_excel('/content/Toxic-Spans-Detection/src/dataset/tokenized_ready.xlsx')
except FileNotFoundError:
    # The local copy does not exist, so read the datasets directly from the repo.
    # Only a missing file triggers the fallback: a bare `except:` would also hide
    # keyboard interrupts and genuine read failures (missing Excel engine,
    # corrupt file) behind a second, slower attempt.
    df1 = pd.read_excel('https://github.com/Azzam-Radman/Toxic-Spans-Detection/blob/main/src/dataset/tokenized_1.xlsx?raw=true')
    df2 = pd.read_excel('https://github.com/Azzam-Radman/Toxic-Spans-Detection/blob/main/src/dataset/tokenized_ready.xlsx?raw=true')

In [3]:
# Row position at which the dataset switches files: rows before it are taken
# from the first file, rows from it onwards are taken from the second file.
SPLIT_ROW = 1798

# Slice into new names instead of overwriting df1/df2. Overwriting df2 with
# df2.iloc[SPLIT_ROW:] made the cell non-repeatable: every re-run dropped
# another SPLIT_ROW rows from the already-sliced frame.
df1_part = df1.iloc[:SPLIT_ROW, :]
df2_part = df2.iloc[SPLIT_ROW:, :]
# concatenate the two splits of the dataset and renumber the rows from 0
df = pd.concat([df1_part, df2_part], axis=0).reset_index(drop=True)
# display the head of the dataset
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,133,134,135,136,137,138,139,140,141,142
0,اربد,فيها,جامعات,اكثر,من,عمان,...,وفيها,قد,عمان,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
1,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2,الحلو,انكم,بتحكوا,على,اساس,انو,الاردن,ما,فيه,فساد,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
3,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4,كله,رائع,بجد,ربنا,يكرمك,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad


In [4]:
# function to remove the 'pad' cells in the tokens rows
# and remove the corresponding NaNs in the labels rows
def remove_pad_nan(word_row, label_row):
    """
    Strip the padding from one tokens row and from its matching labels row.

    args:
    word_row: the row containing words (tokens) with "pad" tokens to remove
    label_row: the row containing labels (0 or 1) with NaNs to remove
    returns a two-element list [words, labels] without pads and NaNs.

    """

    def _is_nan(value):
        # NaN must be detected by value, not by identity: `value is np.nan` is
        # only true for the np.nan singleton itself, while the NaNs coming out
        # of a float-dtype row are distinct float objects and would be kept.
        return isinstance(value, (float, np.floating)) and np.isnan(value)

    word_list = word_row.tolist()
    label_list = label_row.tolist()

    word_list_cleaned = [word for word in word_list if word != 'pad']
    label_list_cleaned = [label for label in label_list if not _is_nan(label)]

    pair = [word_list_cleaned, label_list_cleaned]
    return pair

In [5]:
# collect one cleaned [tokens, labels] pair per sentence
pairs = []
# every even-positioned row is passed as the tokens row and the row right after
# it as the labels row, so walk the frame two rows at a time
for token_row_idx in range(0, len(df), 2):
    token_row = df.iloc[token_row_idx, :]
    label_row = df.iloc[token_row_idx + 1, :]
    pairs.append(remove_pad_nan(token_row, label_row))

In [6]:
# sanity check: each tokens list must be exactly as long as its labels list
# every pair that violates this is printed, followed by the number of violations
mismatched_pairs = [pair for pair in pairs if len(pair[0]) != len(pair[1])]
for pair in mismatched_pairs:
    print(pair)
counter = len(mismatched_pairs)
print(counter)
# a printed value of 0 means every pair is consistent

0


In [7]:
# display the first pair: [tokens, labels] of the first sentence
pairs[0]

[['اربد',
  'فيها',
  'جامعات',
  'اكثر',
  'من',
  'عمان',
  '...',
  'وفيها',
  'قد',
  'عمان',
  'ونص',
  'لعيبه',
  'المنتخب',
  'منها',
  '...',
  'و',
  80,
  '%',
  'من',
  'مطربين',
  'الاردن',
  'منها'],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [8]:
# put a single-space item after every token, and after every label, so that the
# lists line up item-for-item with the pieces of the reconstructed sentence
pairs_with_spaces = []

for pair in pairs:
    tokens, labels = pair[0], pair[1]
    spaced_tokens = []  # tokens of this sentence, each followed by ' '
    spaced_labels = []  # labels of this sentence, each followed by ' '

    # walk the tokens by position and pick the label stored at the same position
    for position, token in enumerate(tokens):
        spaced_tokens += [token, ' ']
        spaced_labels += [labels[position], ' ']

    # keep the two spaced lists together as one new pair
    pairs_with_spaces.append([spaced_tokens, spaced_labels])

In [9]:
# sanity check: after adding the spaces, each tokens list must still be exactly
# as long as its labels list; count the pairs for which that does not hold
counter = sum(1 for pair in pairs_with_spaces if len(pair[0]) != len(pair[1]))

print(counter)
# a printed value of 0 means every pair is consistent

0


In [10]:
# remove the trailing space that was added after the last token and after the last label
for pair in pairs_with_spaces:
    for items in (pair[0], pair[1]):
        # Pop only when the last item really is the space separator. An
        # unconditional pop deleted a real token/label every time the cell was
        # re-run and raised IndexError on an empty list.
        if items and isinstance(items[-1], str) and items[-1] == ' ':
            items.pop()

In [11]:
# sanity check: after dropping the trailing space, each tokens list must still be
# exactly as long as its labels list; count the pairs for which that does not hold
counter = sum(1 for pair in pairs_with_spaces if len(pair[0]) != len(pair[1]))

print(counter)
# a printed value of 0 means every pair is consistent

0


In [12]:
# display the first pair: tokens and labels interleaved with ' ' separators
pairs_with_spaces[0]

[['اربد',
  ' ',
  'فيها',
  ' ',
  'جامعات',
  ' ',
  'اكثر',
  ' ',
  'من',
  ' ',
  'عمان',
  ' ',
  '...',
  ' ',
  'وفيها',
  ' ',
  'قد',
  ' ',
  'عمان',
  ' ',
  'ونص',
  ' ',
  'لعيبه',
  ' ',
  'المنتخب',
  ' ',
  'منها',
  ' ',
  '...',
  ' ',
  'و',
  ' ',
  80,
  ' ',
  '%',
  ' ',
  'من',
  ' ',
  'مطربين',
  ' ',
  'الاردن',
  ' ',
  'منها'],
 [0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0]]

In [13]:
# loop over each pair of the pairs_with_spaces list
# if the label is 1 (toxic) extract its span
# the span is the list of character indexes the token occupies in the sentence
for pair in pairs_with_spaces:
    tokens, labels = pair[0], pair[1]
    # character indexes of every toxic token in this sentence
    toxic_spans_list = []
    # character offset of the current item from the start of the sentence
    counter = 0
    # walk the tokens and their labels side by side
    for token, word_label in zip(tokens, labels):
        # Measure the token by its string form, because the sentence is rebuilt
        # with str(token): a numeric token such as 80 occupies 2 characters.
        # Counting it as 1 shifted every later offset in the sentence.
        len_word = len(str(token))

        if word_label == 0:
            counter += len_word # non-toxic token: just move past its characters
        elif isinstance(word_label, str) and word_label == ' ':
            counter += 1 # space separator: move past one character
        else:
            # toxic token: record the index of each of its characters
            toxic_spans_list.extend(range(counter, counter + len_word))
            counter += len_word # then move past its characters

    # Store the spans as the third element of the pair. Replacing everything
    # after the two lists (instead of appending) keeps the pair at exactly
    # [tokens, labels, spans] even when this cell is run more than once.
    pair[2:] = [toxic_spans_list]

In [14]:
# display the first pair
pairs_with_spaces[0]

[['اربد',
  ' ',
  'فيها',
  ' ',
  'جامعات',
  ' ',
  'اكثر',
  ' ',
  'من',
  ' ',
  'عمان',
  ' ',
  '...',
  ' ',
  'وفيها',
  ' ',
  'قد',
  ' ',
  'عمان',
  ' ',
  'ونص',
  ' ',
  'لعيبه',
  ' ',
  'المنتخب',
  ' ',
  'منها',
  ' ',
  '...',
  ' ',
  'و',
  ' ',
  80,
  ' ',
  '%',
  ' ',
  'من',
  ' ',
  'مطربين',
  ' ',
  'الاردن',
  ' ',
  'منها'],
 [0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0,
  ' ',
  0],
 []]

In [15]:
# rebuild every sentence as one string and keep it together with its toxic spans:
# - the sentence is the concatenation of the string form of each item in the tokens list
# - the toxic spans are the last list stored in the pair
last_pairs = [
    [''.join(str(item) for item in pair[0]), pair[-1]]
    for pair in pairs_with_spaces
]

In [16]:
# Pre-allocate the final dataframe with one row per sentence (the source frame
# holds two rows per sentence: tokens and labels). The columns are created with
# object dtype because they will hold Python strings and lists; a float frame
# of zeros forces pandas to change the column dtype on assignment, which recent
# pandas versions warn about.
train_df = pd.DataFrame(index=range(len(df) // 2), columns=['Sentence', 'Spans'], dtype=object)

In [17]:
# Make both columns object dtype before filling them: 'Sentence' receives Python
# strings and 'Spans' receives Python lists. Writing a string into a float
# column relies on an implicit dtype change that pandas has deprecated, and a
# string dtype is not a valid container for lists.
train_df = train_df.astype({'Sentence': object, 'Spans': object})
for i, (sentence, spans) in enumerate(last_pairs): # loop over each pair and populate the dataframe
    train_df.iat[i, 0] = sentence
    train_df.iat[i, 1] = spans

In [18]:
# show some examples of the final dataset (rows at positions 1200 up to, not including, 1220)
train_df[1200: 1220]

Unnamed: 0,Sentence,Spans
1200,لا تحزن الدنيا ما فيها اشي تحزن عليه,[]
1201,لا تحزن ان الله معك,[]
1202,لا تحزن على دنا فانيه بل احزن على اخره باقيه,[]
1203,لا تخسر قيمتك بكلمه ، ولا تفقد احترامك بزله ، ولا تجعل همك في الدنيا هو حب الناس لك ، فالناس قلوبهم متقلبه قد تحبك اليوم و تكرهك غدا,[]
1204,لا تعليق,[]
1205,لا تفهمي غلط حبيبتي,[]
1206,لا تقنطوا من رحمه الله فهي وسعت كل شيء مهما عصينا فلنا رب كبير رحيم ورحمن,[]
1207,لا تياسوا من روح الله ، الله تعالى لا يعجزه شيء ، فكم من مريض شفاه ، وكم من فقير اغناه ، وكم من مشرف على الهلاك نجاه ، وكم من ضال هداه,[]
1208,لا حول ولا قوه الا بالله اذا اخذ ما اوهب اسقط ما اوجب,[]
1209,لا حول ولا قوه الا بالله العظيم لم اكن اعلم ان نسبه ذكاء العرب منخفضه جدا الى هذا الحد,"[52, 53, 54, 55, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 70, 71, 72, 74, 75, 76, 78, 79, 80, 82, 83, 84, 85]"


In [19]:
# save the dataset to a CSV file named 'train_ready.csv' (relative path);
# index=False leaves the row index out of the file
train_df.to_csv('train_ready.csv', index=False)