In [1]:
import re
from unidecode import unidecode
import malaya
import json
import pandas as pd
import itertools
import collections
import glob
import pandas as pd

In [2]:
# Tokenizer callable taken from malaya's (private) social-media tokenizer class.
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
# Lookup table used via `.get(w, w)` in `preprocessing`; presumably maps
# informal spellings to normalised forms — TODO confirm against malaya.
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer
# Any token that contains one of these substrings is dropped by `preprocessing`.
rejected = ['wkwk', 'http', 'https', 'lolol', 'hahaha']

def is_number_regex(s):
    """Return True when `s` is an unsigned integer or decimal literal.

    A decimal needs digits on both sides of the dot ("3.14"); anything else
    falls back to `str.isdigit()`, so "42" is accepted while "", "1." and
    ".5" are rejected.

    Parameters
    ----------
    s : str
        Candidate token.

    Returns
    -------
    bool
    """
    # Raw string: "\d" / "\." in a normal literal are invalid escape sequences.
    # fullmatch (instead of match + "$") also rejects a trailing newline, which
    # keeps the decimal branch consistent with the isdigit() branch.
    if re.fullmatch(r"\d+\.\d+", s) is not None:
        return True
    return s.isdigit()

def detect_money(word):
    """Tell whether `word` is the literal prefix 'rm' followed by a number.

    The part after the first two characters is checked with
    `is_number_regex`. Returns a bool.
    """
    prefix, amount = word[:2], word[2:]
    if prefix != 'rm':
        return False
    return True if is_number_regex(amount) else False

def preprocessing(string):
    """Turn one raw text into a list of cleaned tokens.

    Steps, applied per token in this order:
      1. `unidecode` the input and squeeze every run of a repeated character
         down to at most two occurrences.
      2. Tokenize, then pass each token through `malaya.stem.naive`.
      3. Drop tokens of length <= 1 and lowercase the rest.
      4. Drop tokens containing any substring listed in `rejected`.
      5. Replace the token via `rules_normalizer` when it has an entry.
      6. Replace numbers with '<NUM>' and 'rm'-prefixed numbers with '<MONEY>'.

    Returns
    -------
    list of str
    """
    ascii_text = unidecode(string)
    squeezed = ''.join(
        ''.join(run)[:2] for _, run in itertools.groupby(ascii_text)
    )

    cleaned = []
    for token in tokenizer(squeezed):
        token = malaya.stem.naive(token)
        # Length is checked on the stemmed token, before lowercasing.
        if len(token) <= 1:
            continue
        token = token.lower()
        if any(fragment in token for fragment in rejected):
            continue
        token = rules_normalizer.get(token, token)
        if is_number_regex(token):
            token = '<NUM>'
        if detect_money(token):
            token = '<MONEY>'
        cleaned.append(token)
    return cleaned

def build_dataset(words, n_words, atleast = 2):
    """Build a vocabulary from `words` and encode `words` as integer ids.

    Seven reserved tokens always occupy ids 0-6 ([PAD], [UNK], [CLS], [SEP],
    [MASK], <S>, <T>). They are followed by the `n_words` most frequent words
    that occur at least `atleast` times, in descending frequency order.

    Parameters
    ----------
    words : list of str
        Flat token stream.
    n_words : int
        Upper bound on how many distinct words are considered.
    atleast : int, default 2
        Minimum frequency for a word to enter the vocabulary.

    Returns
    -------
    data : list of int
        `words` encoded as ids; out-of-vocabulary words get the [UNK] id.
    count : list
        Reserved entries followed by `(word, frequency)` pairs. The second
        field of the [UNK] entry is overwritten with the number of
        out-of-vocabulary tokens found in `words`.
    dictionary : dict
        word -> id.
    reversed_dictionary : dict
        id -> word.
    """
    count = [['[PAD]', 0], ['[UNK]', 1], ['[CLS]', 2], ['[SEP]', 3], ['[MASK]', 4],
            ['<S>', 5], ['<T>', 6]]
    frequent = [
        pair
        for pair in collections.Counter(words).most_common(n_words)
        if pair[1] >= atleast
    ]
    count.extend(frequent)

    dictionary = {}
    for word, _ in count:
        # A corpus token equal to a reserved token must keep the reserved id;
        # re-assigning it would make two words share one id.
        if word not in dictionary:
            dictionary[word] = len(dictionary)

    # Unknown words map to [UNK] (the original used 3, which is [SEP]).
    unk_index = dictionary['[UNK]']
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word, unk_index)
        if index == unk_index:
            unk_count += 1
        data.append(index)

    # Reserved entries sit at the position equal to their id, so this updates
    # the [UNK] row rather than the [PAD] row.
    count[unk_index][1] = unk_count
    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [3]:
sentiment = []

# Twitter sentiment: the parsed content of each JSON file is appended item by item.
for path in glob.glob('Malaya-Dataset/twitter-sentiment/*/*.json'):
    with open(path) as handle:
        sentiment += json.load(handle)

# Multidomain sentiment: each file carries 'negative' and 'positive' entries;
# both are pooled because only the text is used here.
for path in glob.glob('Malaya-Dataset/multidomain-sentiment/*.json'):
    with open(path) as handle:
        payload = json.load(handle)
        sentiment += payload['negative'] + payload['positive']

# News sentiment: only the 'text' column of the CSV is kept.
sentiment += pd.read_csv('Malaya-Dataset/news-sentiment/sentiment-data-v2.csv')['text'].tolist()

In [4]:
# Number of items gathered into `sentiment`.
len(sentiment)

664429

In [5]:
emotion = []

# JSON files: parsed content is appended item by item.
for path in glob.glob('Malaya-Dataset/emotion/*.json'):
    with open(path) as handle:
        emotion += json.load(handle)

# 'translated*' files are read as plain text, one item per non-empty line.
# NOTE(review): if any of these also end in .json they match both globs — confirm.
for path in glob.glob('Malaya-Dataset/emotion/translated*'):
    with open(path) as handle:
        emotion += [line for line in handle.read().split('\n') if line]

len(emotion)

420516

In [6]:
fake_news = []

# Both folders are loaded the same way, 'negative' first and then 'positive'.
for pattern in ('negative/*.json', 'positive/*.json'):
    for path in glob.glob(pattern):
        with open(path) as handle:
            fake_news += json.load(handle)

len(fake_news)

42023

In [7]:
news = []
files = glob.glob('news/*.json')
for path in files:
    with open(path) as handle:
        articles = json.load(handle)
    # Keep the text of every article not tagged 'ENGLISH' whose text is
    # longer than 20 characters.
    for article in articles:
        if article['language'] != 'ENGLISH' and len(article['text']) > 20:
            news.append(article['text'])
len(news)

9673

In [9]:
# Keep only lines with at least two spaces; the trailing newline of each
# line is left in place.
with open('clean-wiki.txt') as handle:
    wiki = [line for line in handle if line.count(' ') > 1]
len(wiki)

1663373

In [10]:
# Pool every corpus into one list, in this fixed order.
texts = list(itertools.chain(news, fake_news, emotion, sentiment, wiki))
len(texts)

2800014

In [11]:
from tqdm import tqdm

# Each raw string in `texts` is replaced in place by its token list.
# Re-running this cell on already-tokenized entries would pass lists to
# `preprocessing`, so rebuild `texts` first.
pbar = tqdm(texts)
for i, raw_text in enumerate(pbar):
    texts[i] = preprocessing(raw_text)

100%|██████████| 2800014/2800014 [27:12<00:00, 1715.28it/s]


In [12]:
import itertools

# Flatten the per-document token lists into one token stream.
# chain.from_iterable avoids unpacking every document as a call argument.
concat = list(itertools.chain.from_iterable(texts))
vocabulary_size = len(set(concat))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
# build_dataset lists its 7 reserved tokens first in `count`; skip them so the
# printout shows real words only. Keep this in sync with build_dataset.
n_reserved = 7
print('Most common words', count[n_reserved:n_reserved + 6])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
# Fraction of distinct tokens that made it into the vocabulary.
len(dictionary) / vocabulary_size

vocab from size: 932618
Most common words [('yang', 1762593), ('saya', 902073), ('pada', 899529), ('untuk', 777441), ('deng', 624771), ('ini', 601432)]
Sample data [17578, 708, 1782, 50, 593, 15, 11, 72, 337, 811] ['pergera', 'tenaga', 'akademik', 'malaysia', 'gerak', 'te', 'ada', 'beberapa', 'siri', 'bincang']


0.43447263509818596

In [13]:
# Vocabulary size, counting the reserved tokens added by build_dataset.
len(dictionary)

405197

In [14]:
# JSON object keys are always strings, so the integer ids used as keys in
# `rev_dictionary` come back as str when this file is loaded.
serialized = json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary})
with open('dictionary.json', 'w') as handle:
    handle.write(serialized)

In [15]:
# Persist the tokenized corpus: `texts` holds one token list per document
# after the preprocessing cell has run.
with open('texts.json', 'w') as fopen:
    json.dump(texts, fopen)

In [None]:
# `texts` holds one token list per document after preprocessing, so joining it
# directly with '\n' raises TypeError. Write one document per line with its
# tokens separated by single spaces.
with open('texts.txt', 'w') as fopen:
    fopen.write('\n'.join(' '.join(tokens) for tokens in texts))