In [4]:
import re
from unidecode import unidecode
import malaya
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [5]:
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(unidecode(string))
    tokenized = [malaya.stem.naive(w) for w in tokenized]
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = [rules_normalizer.get(w, w) for w in tokenized]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

In [6]:
df = pd.read_csv('sentiment-data-v2.csv')
Y = LabelEncoder().fit_transform(df.label)
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [7]:
import json
with open('bm-amazon.json') as fopen:
    amazon = json.load(fopen)
    
with open('bm-imdb.json') as fopen:
    imdb = json.load(fopen)
    
with open('bm-yelp.json') as fopen:
    yelp = json.load(fopen)
    
texts += amazon['negative']
labels += [0] * len(amazon['negative'])
texts += amazon['positive']
labels += [1] * len(amazon['positive'])

texts += imdb['negative']
labels += [0] * len(imdb['negative'])
texts += imdb['positive']
labels += [1] * len(imdb['positive'])

texts += yelp['negative']
labels += [0] * len(yelp['negative'])
texts += yelp['positive']
labels += [1] * len(yelp['positive'])

In [8]:
import os
for i in [i for i in os.listdir('negative') if 'Store' not in i]:
    with open('negative/'+i) as fopen:
        a = json.load(fopen)
        texts += a
        labels += [0] * len(a)

In [9]:
for i in [i for i in os.listdir('positive') if 'Store' not in i]:
    with open('positive/'+i) as fopen:
        a = json.load(fopen)
        texts += a
        labels += [1] * len(a)

In [10]:
len(texts),len(labels)

(675023, 675023)

In [11]:
from tqdm import tqdm

In [12]:
pbar = tqdm(range(len(texts)))
for i in pbar:
    texts[i] = preprocessing(texts[i])

100%|██████████| 675023/675023 [01:42<00:00, 6568.58it/s]


In [13]:
texts[-10:]

[['ada',
  'anda',
  'yiar',
  'tentang',
  'gatur',
  'tidur',
  'yang',
  'baru',
  'tidak',
  'lama',
  'lagi',
  'yusun',
  'mula',
  'ada',
  'yang',
  'baik',
  'ntiasa',
  'asa',
  'begitu',
  'jaya',
  'lepas',
  'itu'],
 ['lamat', 'pagi', 'dunia'],
 ['sini', 'rumah', 'pupu', 'saya'],
 ['mungkin', 'ini', 'lebih', 'anda'],
 ['ima', 'kasih', 'saya', 'erlukan'],
 ['neraka',
  'windows',
  'luar',
  'dari',
  'julat',
  'harga',
  'saya',
  'cuali',
  'jika',
  'tidak'],
 ['neah',
  'saya',
  'harap',
  'ha',
  'enang',
  'mbali',
  'catat',
  'dalam',
  'tweet',
  'akhir',
  'saya'],
 ['aww',
  'saya',
  'benar',
  'benar',
  'minta',
  'maaf',
  'tentang',
  'itu',
  'tidak',
  'ada',
  'apa',
  'apa',
  'yang',
  'sa',
  'deng',
  'jadi',
  'asi',
  'tidak',
  'anda',
  'lihat',
  'quot',
  'quot',
  'malam',
  'kido'],
 ['saya',
  'tidak',
  'sabar',
  'untuk',
  'lihat',
  'apa',
  'kara',
  'yang',
  'akjub',
  'yang',
  'anda',
  'datang',
  'tidak',
  'nah',
  'lupa',
  'an

In [14]:
labels[-10:]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [15]:
with open('tokenization.json','w') as fopen:
    json.dump({'texts':texts, 'labels':labels}, fopen)