In [1]:
import re
from unidecode import unidecode
import malaya
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import itertools

In [2]:
# Tokenizer callable used by preprocessing(). Note that _SocialTokenizer has a
# leading underscore, i.e. it is a private malaya name and may move between releases.
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
# Lookup consulted with .get(word, word) in preprocessing(): a token found here is
# replaced by the mapped value, anything else passes through unchanged.
# NOTE(review): also a private malaya module (_tatabahasa) - confirm it exists in the pinned version.
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
# Fragments that disqualify a token in preprocessing(). The check there is a
# substring test (`r not in w`), so any token *containing* one of these is dropped.
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

# Raw string: "\d" and "\." are not valid *string* escapes, so the non-raw form
# triggers an invalid-escape warning. Compiled once instead of on every call.
_DECIMAL_RE = re.compile(r"^\d+\.\d+$")


def is_number_regex(s):
    """Return True if `s` looks like a number.

    A string counts as a number when it is either a decimal of the form
    ``<digits>.<digits>`` or consists solely of digit characters
    (``str.isdigit``). Anything else, including the empty string, ``"1."``
    and ``".5"``, yields False.

    Parameters
    ----------
    s : str
        Token to classify.

    Returns
    -------
    bool
    """
    if _DECIMAL_RE.match(s) is not None:
        return True
    # Not a decimal: fall back to the plain all-digits test.
    return s.isdigit()

def detect_money(word):
    """Return True when `word` is the prefix 'rm' followed by a number.

    The part after the first two characters is judged by is_number_regex, so
    'rm10' and 'rm2.50' qualify while a bare 'rm' does not.
    """
    return word.startswith('rm') and is_number_regex(word[2:])

def preprocessing(string):
    """Turn one raw text into a list of cleaned tokens.

    Stages, in order:
      1. transliterate with unidecode and cap every run of a repeated
         character at two occurrences;
      2. split with the module-level `tokenizer`;
      3. pass each token through malaya.stem.naive;
      4. drop tokens of length <= 1 and lower-case the rest;
      5. drop tokens containing any fragment listed in `rejected`;
      6. replace tokens found in `rules_normalizer`;
      7. replace numbers with '<NUM>' and 'rm'-prefixed amounts with '<MONEY>'.

    Returns the resulting list of strings.
    """
    # NOTE(review): stemming (stage 3) runs before lower-casing (stage 4);
    # confirm malaya.stem.naive is not case-sensitive.
    squeezed_runs = []
    for _, run in itertools.groupby(unidecode(string)):
        squeezed_runs.append(''.join(run)[:2])
    string = ''.join(squeezed_runs)

    stems = [malaya.stem.naive(token) for token in tokenizer(string)]
    # The length filter looks at the stem as produced, before lower-casing.
    lowered = [stem.lower() for stem in stems if len(stem) > 1]
    kept = [word for word in lowered if not any(bad in word for bad in rejected)]
    normalized = [rules_normalizer.get(word, word) for word in kept]
    with_numbers = ['<NUM>' if is_number_regex(word) else word for word in normalized]
    return ['<MONEY>' if detect_money(word) else word for word in with_numbers]

In [3]:
import os

# Gather every entry of the working directory whose name contains 'translated'.
# os.listdir returns entries in arbitrary order, so the load order below
# follows whatever order the filesystem reports.
files = []
for entry in os.listdir(os.getcwd()):
    if 'translated' in entry:
        files.append(entry)
files

['translated-joy',
 'translated-love',
 'translated-fear',
 'translated-sadness',
 'translated-surprise',
 'translated-anger']

In [4]:
# Accumulators shared with the JSON-loading cell further down: texts[i] is
# labelled by labels[i].
texts, labels = [], []
for file in files:
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # default, so the corpus loads identically on every machine.
    with open(file, encoding='utf-8') as fopen:
        # One sample per line; readlines() keeps the trailing newline.
        dataset = fopen.readlines()
    print(len(dataset))
    texts.extend(dataset)
    # The label is the second '-'-separated field of the file name
    # (e.g. 'translated-joy' -> 'joy'), repeated once per sample.
    labels.extend([file.split('-')[1]] * len(dataset))

19587
15232
19058
16053
9712
18873


In [5]:
# Re-point `files` at the entries of the working directory whose name contains
# 'malaysia.json' (order is whatever os.listdir reports).
files = list(filter(lambda name: 'malaysia.json' in name, os.listdir(os.getcwd())))
files

['sadness-twitter-malaysia.json',
 'surprise-twitter-malaysia.json',
 'anger-twitter-malaysia.json',
 'love-twitter-malaysia.json',
 'fear-twitter-malaysia.json',
 'joy-twitter-malaysia.json']

In [6]:
for file in files:
    # JSON interchange text is UTF-8; state it explicitly instead of relying
    # on the platform's locale default.
    with open(file, encoding='utf-8') as fopen:
        dataset = json.load(fopen)
    print(len(dataset))
    # Appends to the same accumulators filled from the 'translated-*' files.
    texts.extend(dataset)
    # Here the label is the first '-'-separated field of the file name
    # (e.g. 'joy-twitter-malaysia.json' -> 'joy').
    labels.extend([file.split('-')[0]] * len(dataset))

83264
37778
55723
63107
18895
63234


In [7]:
# Sanity check: both accumulators should report the same number of entries.
(len(texts), len(labels))

(420516, 420516)

In [8]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Fit the encoder once and read the class names back from it, so that
# unique_labels[i] is by construction the label encoded as integer i
# (rather than relying on a separate np.unique call sorting the same way).
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
unique_labels = label_encoder.classes_.tolist()
unique_labels

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [9]:
# Convert the encoded labels to a plain list of Python ints so that json.dump
# can serialise them later on.
labels = [int(code) for code in labels]

In [10]:
from tqdm import tqdm

# Replace every raw text, in place, by its token list from preprocessing().
# Re-running this cell without reloading `texts` would feed token lists back
# into preprocessing(), so reload first.
pbar = tqdm(texts)
for position, raw_text in enumerate(pbar):
    texts[position] = preprocessing(raw_text)

100%|██████████| 420516/420516 [01:29<00:00, 4693.06it/s]


In [11]:
# Persist the tokenised texts and their integer labels side by side.
payload = {'texts': texts, 'labels': labels}
with open('tokenization.json', 'w') as sink:
    json.dump(payload, sink)