In [1]:
import re
from unidecode import unidecode
from malaya.text.rules import normalized_chars

def filter_news(string):
    """
    Tell whether `string` contains one of the known boilerplate markers.

    The text is lowercased first, so the substring search is
    case-insensitive. Returns True as soon as any marker is found,
    otherwise False.
    """
    lowered = string.lower()
    markers = (
        'javascript is disabled',
        'requires javascript',
        'javascript',
        'président',
        'register in order to view',
        'view this article',
    )
    return any(marker in lowered for marker in markers)

def make_cleaning(s, c_dict):
    """
    Return `s` with the translation table `c_dict` applied via `str.translate`.
    """
    return s.translate(c_dict)

def transformer_textcleaning(string):
    """
    Normalize raw text before it is tokenized for a transformer model.

    Steps, in order:
    1. pass the text through `unidecode`;
    2. split on whitespace and apply the `normalized_chars` translation
       table to every word;
    3. rewrite the literal `(dot)` as `.`;
    4. if the first `<a ...>` tag carries an `href`, delete that tag's
       attribute text (leaving a bare `<a>`);
    5. replace `scheme://host/path` style URLs with a space;
    6. collapse whitespace, drop words starting with `@`, and title-case
       every word whose first character is uppercase.

    Returns the cleaned words joined by single spaces.
    """
    string = unidecode(string)
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    # Raw string: '\(' is an invalid escape sequence in a normal literal.
    string = re.sub(r'\(dot\)', '.', string)
    anchor = re.search(r'<a (.*?)>', string)
    if anchor and 'href' in anchor.group(1):
        # Remove the attributes with a literal replace. They are data, not a
        # pattern: characters such as '?', '(' or '+' inside an href would
        # otherwise be interpreted as regex syntax and either fail to match
        # or raise re.error.
        string = string.replace(' ' + anchor.group(1), '')
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = string.replace('\n', ' ')
    words = re.sub(r'[ ]+', ' ', string).strip().split()
    # split() never yields empty strings, so indexing w[0] is safe.
    words = [w for w in words if w[0] != '@']
    words = [w.title() if w[0].isupper() else w for w in words]
    return ' '.join(words)

# Separator characters searched for near the start of a text by `cleaning`.
chars = ',:-–—'

def cleaning(string, filter_chars = True):
    """
    Flatten whitespace and optionally cut off a leading prefix.

    Newlines become spaces, runs of spaces collapse to one, and the result
    is stripped. When `filter_chars` is true, the first 35 characters are
    searched for the right-most occurrence of any character in `chars`;
    if one is found, everything up to and including it is dropped and the
    remainder is flattened again (without a second prefix search).
    """
    flattened = re.sub(r'[ ]+', ' ', string.replace('\n', ' ')).strip()
    if not filter_chars:
        return flattened
    cut = max(flattened[:35].rfind(mark) for mark in chars)
    if cut < 0:
        return flattened
    return cleaning(flattened[cut + 1:], False)

In [2]:
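# Data preparation commands, kept commented out so that re-running the notebook
# does not repeat them (presumably they fill the `news/` directory that is
# globbed in the next cell; confirm before relying on this):
# !unzip Archive.zip
# !cp ../malaya/Malaya/misc/crawl/*.json news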

In [3]:
import malaya
import operator
from glob import glob
from tqdm import tqdm
import json

# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# the articles are collected in the same order on every run and machine.
news = sorted(glob('news/*.json'))
len(news)

884

In [4]:
# Load the three sentiment transformers, in the same order as the names on the
# left-hand side.
xlnet, bert, alxlnet = (
    malaya.sentiment.transformer(model = name)
    for name in ('xlnet', 'bert', 'alxlnet')
)







In [5]:
# Language detector from malaya; its predictions are compared against 'eng'
# in the ingestion cells below to drop English articles.
fast_text = malaya.language_detection.fasttext()




In [6]:
# `before` collects cleaned article bodies, `after` the matching cleaned titles;
# the two lists are always appended to together so they stay index-aligned.
before, after = [], []
for n in tqdm(news):
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(n, encoding = 'utf-8') as fopen:
        data = json.load(fopen)
    for i in data:
        # An article with an empty body or title is never kept, so skip it
        # before paying for language detection.
        if not i['text'] or not i['title']:
            continue
        lang_title, lang_text = fast_text.predict([i['title'], i['text']])
        if not filter_news(i['text']) and lang_title != 'eng' and lang_text != 'eng':
            before.append(cleaning(i['text']))
            after.append(cleaning(i['title']))

100%|██████████| 884/884 [02:38<00:00,  5.59it/s]


In [7]:
# Append the articles from news-30k.json to the existing `before` / `after`
# lists. The lists are not reset here, so re-running this cell on its own
# adds the same articles a second time.
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open('news-30k.json', encoding = 'utf-8') as fopen:
    data = json.load(fopen)

for i in data:
    # An article with an empty body or title is never kept, so skip it
    # before paying for language detection.
    if not i['text'] or not i['title']:
        continue
    lang_title, lang_text = fast_text.predict([i['title'], i['text']])
    if not filter_news(i['text']) and lang_title != 'eng' and lang_text != 'eng':
        before.append(cleaning(i['text']))
        after.append(cleaning(i['title']))

In [8]:
# Sanity check: bodies and titles are appended in pairs, so both counts must match.
len(before), len(after)

(156727, 156727)

In [9]:
# Persist the cleaned corpus: article bodies under 'text', titles under 'title'.
with open('populate-news.json', 'w') as fopen:
    json.dump(
        dict(text = before, title = after),
        fopen,
    )

In [10]:
# Sentences containing any of these substrings are left out of the excerpt
# built by `headline`.
rejected = ['saya', 'awak', 'kami']

def headline(string, length = 700):
    """
    Build an excerpt from the opening sentences of `string`.

    The text is split into sentences, and every sentence containing an entry
    of `rejected` (case-sensitive substring test) is skipped. The remaining
    sentences are taken in order for as long as the characters taken so far
    do not exceed `length`, so the excerpt may run past `length` by up to
    one sentence. The chosen sentences are joined with single spaces.
    """
    kept, taken = [], 0
    for sentence in malaya.text.function.split_into_sentences(string):
        if any(word in sentence for word in rejected):
            continue
        if taken > length:
            break
        kept.append(sentence)
        taken += len(sentence)
    return ' '.join(kept)

In [11]:
batch_size = 10

# One entry per article, index-aligned with `before`.
overall_sentiment = []
headline_sentiment = []
headlines = []

for i in tqdm(range(0, len(before), batch_size)):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch_x = before[i: i + batch_size]
    batch_headline = [headline(text) for text in batch_x]

    # For each text keep the label with the highest stacked score:
    # first for the full article, then for its excerpt.
    overall_sentiment.extend(
        max(row.items(), key = operator.itemgetter(1))[0]
        for row in malaya.stack.predict_stack([xlnet, bert, alxlnet], batch_x)
    )
    headline_sentiment.extend(
        max(row.items(), key = operator.itemgetter(1))[0]
        for row in malaya.stack.predict_stack([xlnet, bert, alxlnet], batch_headline)
    )

    headlines.extend(batch_headline)

100%|██████████| 15673/15673 [5:37:50<00:00,  1.29s/it]  


In [12]:
# Persist the per-article labels and excerpts produced by the sentiment loop.
with open('populate-news-sentiment.json', 'w') as fopen:
    json.dump(
        dict([
            ('text-sentiment', overall_sentiment),
            ('headline-sentiment', headline_sentiment),
            ('headline', headlines),
        ]),
        fopen,
    )