In [7]:
# clone our repository
!git clone https://github.com/Azzam-Radman/Toxic-Spans-Detection.git

fatal: destination path 'Toxic-Spans-Detection' already exists and is not an empty directory.


In [1]:
# import libraries
from tqdm import tqdm # to show progress bar in for loops
import pandas as pd # data reading and preprocessing library
import numpy as np # math operations liabrary
from nltk.tokenize import TweetTokenizer # tokenizer
pd.set_option('display.max_colwidth', None) # to prevent the truncation of a cell content during display

In [2]:
# read the data
try:
    # implemented if the repo is cloned
    df = pd.read_csv('/content/Toxic-Spans-Detection/src/dataset/AJGT.csv')
except:
    # data read from the repo directly
    df = pd.read_csv(r'https://raw.githubusercontent.com/Azzam-Radman/Toxic-Spans-Detection/main/src/dataset/AJGT.csv')
df.head()

Unnamed: 0,ID,Feed,Sentiment
0,1,اربد فيها جامعات اكثر من عمان ... وفيها قد عمان ونص لعيبه المنتخب منها ... و 80 % من مطربين الاردن منها,Positive
1,2,الحلو انكم بتحكوا على اساس انو الاردن ما فيه فساد سرقات,Negative
2,3,كله رائع بجد ربنا يكرمك,Positive
3,4,لسانك قذر يا قمامه,Negative
4,5,​انا داشره وغير متزوجه ولدي علاقات مشبوه واحشش واحيانا اهرب مخدرات و اجيد التسليك احب ان انكب نفسي وعلاقتي بالمنزل متوتره جد,Negative


In [3]:
# show count values of the label
df['Sentiment'].value_counts()

Positive    900
Negative    900
Name: Sentiment, dtype: int64

In [4]:
# show shape of the dataset
df.shape

(1800, 3)

In [5]:
# construct a tokenizer 
tknzr2 = TweetTokenizer()

# function to tokenize a text in the form of a string
def custom_tokenizer(text_data):
    return tknzr2.tokenize(text_data)

In [6]:
# try tokening the first sentence in the dataset
custom_tokenizer(df['Feed'][0])

['اربد',
 'فيها',
 'جامعات',
 'اكثر',
 'من',
 'عمان',
 '...',
 'وفيها',
 'قد',
 'عمان',
 'ونص',
 'لعيبه',
 'المنتخب',
 'منها',
 '...',
 'و',
 '80',
 '%',
 'من',
 'مطربين',
 'الاردن',
 'منها']

In [7]:
# iterate over sentences to see the length of the longest sentence
max_ = 0 # initialize max_ with 0
for i in range(df.shape[0]):
    len_ = len(custom_tokenizer(df['Feed'][i])) # the lenght of the sentence of row i
    if len_ > max_:
        max_ = len_

# print the length of the longest sentence
print(max_)

143


In [8]:
# construct a placeholder ----> a dataframe with all zeors (# rows = the original dataset number of rows,
# number of columnst = the longest sentence)
tokenized_df = pd.DataFrame(np.zeros((df.shape[0], 143)), columns=[f"{i}" for i in range(143)])
tokenized_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,133,134,135,136,137,138,139,140,141,142
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# loop over each sentence, tokenize it and pad the remaining of the max lenght (143) with a "pad" token
for i in tqdm(range(df.shape[0])):
    len_ = len(custom_tokenizer(df['Feed'][i]))
    list_of_tokens = custom_tokenizer(df['Feed'][i])
    list_of_tokens += ['pad'] * (143 - len_)
    tokenized_df.iloc[i, :] = list_of_tokens

100%|███████████████████████████████████████| 1800/1800 [00:37<00:00, 47.74it/s]


In [10]:
# show the head of the dataset
tokenized_df.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,133,134,135,136,137,138,139,140,141,142
0,اربد,فيها,جامعات,اكثر,من,عمان,...,وفيها,قد,عمان,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
1,الحلو,انكم,بتحكوا,على,اساس,انو,الاردن,ما,فيه,فساد,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
2,كله,رائع,بجد,ربنا,يكرمك,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
3,لسانك,قذر,يا,قمامه,pad,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad


In [11]:
# set a new index ---> to the rows
# this benificial in adding a new empty row under each original row 
# this empty row will be used for hand labelling
new_idx = list(range(0, 1800*2, 2))
tokenized_df.index = new_idx
# reindex and add a new empty row after each original row
tokenized_df_2 = tokenized_df.reindex(range(0, 3601, 1))
tokenized_df_2 = tokenized_df_2.fillna('')

In [12]:
tokenized_df_2.head(8)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,133,134,135,136,137,138,139,140,141,142
0,اربد,فيها,جامعات,اكثر,من,عمان,...,وفيها,قد,عمان,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
1,,,,,,,,,,,...,,,,,,,,,,
2,الحلو,انكم,بتحكوا,على,اساس,انو,الاردن,ما,فيه,فساد,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
3,,,,,,,,,,,...,,,,,,,,,,
4,كله,رائع,بجد,ربنا,يكرمك,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
5,,,,,,,,,,,...,,,,,,,,,,
6,لسانك,قذر,يا,قمامه,pad,pad,pad,pad,pad,pad,...,pad,pad,pad,pad,pad,pad,pad,pad,pad,pad
7,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# save the dataset as a CSV file
tokenized_df_2.to_csv('tokenized.csv')