In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Tweets plus their label columns; the CSV is decoded as latin-1.
data = pd.read_csv(
    '../../../../dataset/id-multi-label-hate-speech-and-abusive-language-detection/re_dataset.csv',
    encoding='latin-1',
)

# Header-less slang dictionary: positional columns 0 and 1 are given
# readable names right after loading.
alay_dict = pd.read_csv(
    '../../../../dataset/id-multi-label-hate-speech-and-abusive-language-detection/new_kamusalay.csv',
    encoding='latin-1',
    header=None,
).rename(columns={0: 'original', 1: 'replacement'})

In [3]:
def lowercase(text):
    """Return ``text`` with every cased character converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text, whole_tokens=False):
    """Strip tweet placeholders, URLs and escaped emoji bytes from ``text``.

    Parameters
    ----------
    text : str
        Text to clean. The placeholder patterns are lower case, so callers
        are expected to lowercase the text first.
    whole_tokens : bool, default False
        Controls how the placeholders 'rt', 'user' and 'url' are matched.
        False keeps the legacy behaviour: the letters are removed wherever
        they occur, including inside ordinary words ('tertindas' becomes
        'te indas'). True removes them only when they stand alone as a
        complete token, leaving ordinary words intact.

    Returns
    -------
    str
        The cleaned text with runs of two or more spaces collapsed to one.
        Leading/trailing single spaces are not stripped.
    """
    text = re.sub('\n', ' ', text)  # Remove every '\n'

    # Retweet marker, username placeholder and URL placeholder.
    for token in ('rt', 'user', 'url'):
        pattern = r'\b' + token + r'\b' if whole_tokens else token
        text = re.sub(pattern, ' ', text)

    # Literal URLs. Raw string: same regex as before, without relying on
    # invalid string escapes for the backslash sequences.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    # Runs of tokens shaped like 'xf0 x9f ...' (emoji bytecode).
    text = re.sub(r'\b(?:x[a-fA-F0-9]{2}\s*)+\b', '', text)
    text = re.sub('  +', ' ', text)  # Remove extra spaces
    return text
    
def remove_nonaplhanumeric(text):
    """Replace each run of characters outside 0-9, a-z, A-Z with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

# Lookup table built from the dictionary frame: 'original' -> 'replacement'.
alay_dict_map = {
    original: replacement
    for original, replacement in zip(alay_dict['original'], alay_dict['replacement'])
}
def normalize_alay(text):
    """Swap every word found in ``alay_dict_map`` for its replacement.

    The text is split on single spaces, so consecutive spaces produce empty
    "words" that pass through unchanged and the spacing survives the join.
    """
    normalized_words = []
    for word in text.split(' '):
        normalized_words.append(alay_dict_map.get(word, word))
    return ' '.join(normalized_words)

def preprocess(text):
    """Run the complete cleaning pipeline on one piece of text.

    Steps, in order: lowercase, replace non-alphanumeric runs with spaces,
    strip placeholders / emoji bytecode, then apply the slang dictionary.

    NOTE(review): because non-alphanumeric characters are replaced before
    remove_unnecessary_char runs, the newline and literal-URL patterns in
    that step have nothing left to match by the time they execute.
    """
    steps = (
        lowercase,
        remove_nonaplhanumeric,
        remove_unnecessary_char,
        normalize_alay,
    )
    for step in steps:
        text = step(text)
    return text

In [4]:
# Replace the raw tweets with their cleaned form.
data['Tweet'] = data['Tweet'].apply(preprocess)

# Hold out 20% for validation with a fixed seed, then order the training
# rows by tweet text.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
train_data = train_data.sort_values(by=['Tweet'])

# Every column after the first one is used as a label column.
train_labels = train_data.columns[1:]
val_labels = val_data.columns[1:]

# Extract features and labels for training and validation
X_train, y_train = train_data['Tweet'].values, train_data[train_labels].values
X_val, y_val = val_data['Tweet'].values, val_data[val_labels].values

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

print(X_train[-3:])

# Previously acquired data, ordered by its text column.
acquired_data = pd.read_csv(
    '../acquired_data/hsd-coreset-1-data-10535.csv'
).sort_values(by=['processed_text'])

acquired_label_columns = acquired_data.columns[1:]
acq_X_train = acquired_data['processed_text'].values
acq_y_train = acquired_data[acquired_label_columns].values

print(acq_X_train.shape, acq_y_train.shape)
print(acq_X_train[-3:])

(10535,) (10535, 12)
(2634,) (2634, 12)
['yusril kelompok islam te indas di era jokowi fakta 2019 ganti presiden'
 'zaman now segalanya jelas dan data sudah jelas jika klaimnya benar kasih jika klaimnya tidak benar tidak usah repot jangan ganti presiden 2019 nanti juga ketahuan makhluk makhluk yang jago melintir fakta haiz hidup kok mencari penderitaan kerja woi biar dapur bisa dan'
 'zaman susilo bambang yudhoyono biaya makan sekeluarga dalam sehari cukup rupiah 50 000 zaman sekarang rupiah 120 000 sekolah dasar rupiah 150 000 baru cukup gaji pegawai negeri sipil sudah 4 tahun tidak naik kasihan nasib rakyat yang gajinya rata rata dua juta sekolah dasar 3 juta mumet mereka mungkin mereka berdoa oh lamanya']
(10535,) (10535, 12)
['yusril halang halang i pembubaran hati mau lengserkan jokowi '
 'yusril kelompok islam te indas di era jokowi fakta 2019 ganti presiden'
 'zaman now segalanya jelas dan data sudah jelas jika klaimnya benar kasih jika klaimnya tidak benar tidak usah repot jang

In [5]:
def verify(ori_x, acquired_x):
    """Print a comparison report between two 1-D collections of values.

    The report covers three things:
    1. whether both inputs hold the same elements once sorted,
    2. how many distinct elements appear in only one of the inputs,
    3. the positions where the inputs differ element by element, in the
       order given (only when both inputs have the same shape).

    Parameters
    ----------
    ori_x, acquired_x : array-like
        Sequences to compare. Lists are accepted; both are converted to
        NumPy arrays so that fancy indexing works for the mismatch report.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    ori_x = np.asarray(ori_x)
    acquired_x = np.asarray(acquired_x)

    ori_x_sorted = np.sort(ori_x)
    acquired_x_sorted = np.sort(acquired_x)

    if np.array_equal(ori_x_sorted, acquired_x_sorted):
        print("ori_x and acquired_x contain the same elements (ignoring order).")
    else:
        print("ori_x and acquired_x have different elements.")

    # Set differences ignore duplicates, so these counts are over distinct values.
    set_ori_x = set(ori_x)
    set_acquired_x = set(acquired_x)

    print("Elements in ori_x but not in acquired_x:", len(set_ori_x - set_acquired_x))
    print("Elements in acquired_x but not in ori_x:", len(set_acquired_x - set_ori_x))

    # An element-wise comparison is only defined for equally shaped inputs;
    # comparing arrays of different lengths either raises or collapses to a
    # single boolean depending on the NumPy version.
    if ori_x.shape != acquired_x.shape:
        print("Shape mismatch:", ori_x.shape, "vs", acquired_x.shape,
              "- skipping positional comparison.")
        return

    diff_indices = np.where(ori_x != acquired_x)[0]  # Get indices where elements differ
    print("Mismatched indices:", diff_indices)
    print("ori_x mismatches:", ori_x[diff_indices])

In [6]:
# Compare the preprocessed training tweets with the 'processed_text' values
# loaded from the acquired data file.
verify(X_train, acq_X_train)

ori_x and acquired_x have different elements.
Elements in ori_x but not in acquired_x: 110
Elements in acquired_x but not in ori_x: 0
Mismatched indices: [   99   100   101 ... 10532 10533 10534]
ori_x mismatches: [' abu gosok janda sudah pak dungu lagi yang begini jadi pengikutnya presiden malu memalukan saja jokowi saja '
 ' acara tidak mutu ini ulah mantan aktivis bagaimana nih organisasi masyarakat mahasiswa gembong partai komunis indonesia yang jadi musuh bebuyutan himpunan mahasiswa islam '
 ' acting nya murahan jadi enggak laku di pasaran cebong ya begitu dungu nya permanen haha kebanyakan minum air kencing bagong tumpul itu otak '
 ...
 'yusril kelompok islam te indas di era jokowi fakta 2019 ganti presiden'
 'zaman now segalanya jelas dan data sudah jelas jika klaimnya benar kasih jika klaimnya tidak benar tidak usah repot jangan ganti presiden 2019 nanti juga ketahuan makhluk makhluk yang jago melintir fakta haiz hidup kok mencari penderitaan kerja woi biar dapur bisa dan'
 '