In [1]:
import re
from unidecode import unidecode
import malaya
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Bound `tokenize` method of a fresh malaya `_SocialTokenizer` instance;
# preprocessing() calls it to split a string into tokens.
# NOTE(review): `_SocialTokenizer` and `_tatabahasa` are underscore-prefixed
# (private) malaya names -- confirm they exist in the installed version.
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
# Lookup object queried per token with `.get(w, w)` in preprocessing(), so
# tokens without an entry pass through unchanged.
# Presumably a dict of spelling-normalisation rules -- verify against malaya.
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer

def is_number_regex(s):
    """Return True if `s` looks like an unsigned number.

    Two shapes are accepted:
      * a decimal with digits on both sides of a single dot, e.g. "12.5";
      * a string made only of digits, e.g. "100" (checked via `str.isdigit`).

    Anything else (including "", "12." and ".5") yields False.

    Args:
        s: the token to test (a `str`).

    Returns:
        bool: whether the token is treated as a number.
    """
    # Raw string literal: in a plain literal "\d" and "\." are invalid escape
    # sequences, which emit a DeprecationWarning (SyntaxWarning on 3.12+).
    if re.match(r"^\d+?\.\d+?$", s) is None:
        # Not a decimal; fall back to the integer-only check.
        return s.isdigit()
    return True

def detect_money(word):
    """Tell whether `word` is an amount written as 'rm' followed by a number.

    The first two characters must be exactly 'rm' (case-sensitive) and the
    remainder must satisfy `is_number_regex`.

    Args:
        word: the token to test.

    Returns:
        bool: True for tokens such as 'rm10' or 'rm2.50', False otherwise.
    """
    # Guard clause: without the 'rm' prefix there is nothing more to check.
    if word[:2] != 'rm':
        return False
    return bool(is_number_regex(word[2:]))

def preprocessing(string):
    """Turn a raw string into a list of cleaned tokens.

    Per-token pipeline, in this order:
      1. transliterate the whole string with `unidecode`, then tokenize it;
      2. stem each token with `malaya.stem.naive`;
      3. drop stemmed tokens of length 0 or 1, lower-case the rest;
      4. replace the token by its `rules_normalizer` entry, if one exists;
      5. replace numeric tokens with '<NUM>';
      6. replace 'rm<number>' tokens with '<MONEY>'.

    Args:
        string: the text to process.

    Returns:
        list: the processed tokens, in their original order.
    """
    cleaned = []
    for raw_token in tokenizer(unidecode(string)):
        stemmed = malaya.stem.naive(raw_token)
        # The length filter is applied to the stemmed form, before lower-casing.
        if len(stemmed) <= 1:
            continue
        token = stemmed.lower()
        token = rules_normalizer.get(token, token)
        if is_number_regex(token):
            token = '<NUM>'
        # Checked after the number test, exactly as in the staged version.
        if detect_money(token):
            token = '<MONEY>'
        cleaned.append(token)
    return cleaned

In [4]:
def _read_nonblank_lines(path):
    """Return the non-blank lines of the UTF-8 text file at `path`.

    Splitting on '\n' leaves an empty string behind when the file ends with a
    newline; such blank entries would otherwise become empty, labelled
    samples, so they are dropped here.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as fopen:
        return [line for line in fopen.read().split('\n') if line.strip()]


# Label 0: lines from the "negative" file; label 1: lines from the "positive" file.
negative_texts = _read_nonblank_lines('subjectivity-negative-translated.txt')
positive_texts = _read_nonblank_lines('subjectivity-positive-translated.txt')

# Negative samples come first, then positive ones; labels follow the same order.
texts = negative_texts + positive_texts
labels = [0] * len(negative_texts) + [1] * len(positive_texts)

assert len(labels) == len(texts)

In [5]:
from tqdm import tqdm

# Replace every entry of `texts` with its token list, showing a progress bar.
# The isinstance guard keeps this cell safe to re-run (or to resume after an
# interruption): entries that are already token lists are left alone instead
# of being passed to preprocessing() a second time, which expects a string.
pbar = tqdm(range(len(texts)))
for i in pbar:
    if isinstance(texts[i], str):
        texts[i] = preprocessing(texts[i])

100%|██████████| 9962/9962 [00:02<00:00, 4343.28it/s]


In [6]:
# Sanity check: show the first ten labels. The 0-labelled entries are added
# to `labels` first, so these are expected to be 0.
labels[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [8]:
# Eyeball the first two entries of `texts`; once the preprocessing loop has
# run, each entry is the token list returned by preprocessing().
texts[:2]

[['filem',
  'mula',
  'pada',
  'masa',
  'lalu',
  'mana',
  'orang',
  'budak',
  'lelaki',
  'nama',
  'sama',
  'cuba',
  'yelamat',
  'celebi',
  'dari',
  'mburu'],
 ['yang',
  'muncul',
  'dari',
  'jiwa',
  'manusia',
  'unjuk',
  'ciri',
  'ciri',
  'abstrak',
  'expressionism',
  'abstrak',
  'yingkir',
  'grafiti',
  'konstruktivisme',
  'russi',
  'te',
  'uat',
  'tempat',
  'dalam',
  'jarah',
  'ini',
  'moden',
  'tika',
  'cipta',
  'oleh',
  'artis',
  'yang',
  'tidak',
  'dar',
  'ri',
  'deng',
  'capai',
  'seni',
  'reka']]

In [7]:
# Persist the processed texts and their labels together as one JSON object.
with open('tokenization.json', 'w') as out_file:
    payload = {'texts': texts, 'labels': labels}
    json.dump(payload, out_file)