In [1]:
import os
import tensorflow as tf
import malaya
import json

In [2]:
import re

tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [malaya.stem.naive(w) for w in tokenized]
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

def clean_label(label):
    string = re.sub('[^A-Za-z\- ]+', ' ', label)
    return re.sub(r'[ ]+', ' ', string.lower()).strip()

In [3]:
import pandas as pd

df = pd.read_csv('toxic-bm.csv')
df = df.dropna()
df.shape

(40911, 7)

In [11]:
df.head()

Unnamed: 0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,penjelasan mengapa pengeditan yang dibuat di b...,0,0,0,0,0,0
1,d&#39;aww dia sepadan dengan warna latar belak...,0,0,0,0,0,0
2,hey man saya tidak cuba untuk mengedit peperan...,0,0,0,0,0,0
3,lebih tidak boleh membuat apa-apa cadangan seb...,0,0,0,0,0,0
4,anda tuan adalah wira saya apa-apa peluang and...,0,0,0,0,0,0


In [13]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y = df[list_classes].values.tolist()
X = df['text'].tolist()
len(Y), len(X)

(40911, 40911)

In [10]:
files = [i for i in os.listdir(os.getcwd()) if 'json' in i]
files

['toxic5.json',
 'toxic7.json',
 'toxic0.json',
 'toxic3.json',
 'toxic2.json',
 'toxic6.json',
 'toxic4.json',
 'toxic1.json']

In [14]:
for file in files:
    with open(file) as fopen:
        data = json.load(fopen)
    for x, y in data:
        X.append(x)
        Y.append(y)

In [15]:
len(Y), len(X)

(192029, 192029)

In [16]:
from tqdm import tqdm

for i in tqdm(range(len(X))):
    X[i] = preprocessing(X[i])

100%|██████████| 192029/192029 [01:51<00:00, 1719.26it/s]


In [None]:
with open('tokenized.json', 'w') as fopen:
    json.dump({'x':X, 'y': Y}, fopen)