In [1]:
# !wget https://huggingface.co/datasets/mesolitica/semisupervised-abstractive-summarization-ms-news/resolve/main/populate-news.json.semisupervised

In [2]:
import os
# Set before `malaya` is imported in the next cell: an empty value hides all
# CUDA devices from this process, so CUDA-aware libraries loaded afterwards
# should fall back to CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
import json
import malaya
from tqdm import tqdm
from unidecode import unidecode
import random

In [4]:
import re

def cleaning(string):
    """Turn a text into a single-line numbered list of its sentences.

    The text is split with ``malaya.text.function.split_into_sentences``; when
    that yields nothing, the text is split on ``.`` instead. If the first
    sentence starts with ``-``, every ``'- '`` occurrence in it is removed.
    Each sentence is then prefixed with its 1-based position.

    Parameters
    ----------
    string : str
        Raw text to convert.

    Returns
    -------
    str
        ``'1. <sentence> 2. <sentence> ...'``, or ``''`` when no sentence
        could be extracted from ``string``.
    """
    splitted = malaya.text.function.split_into_sentences(string)
    if not len(splitted):
        # The fallback must stay a list of sentences. Joining the pieces into
        # one string made the code below index and enumerate single characters.
        splitted = [k.strip() for k in string.split('.') if len(k.strip())]
    if not splitted:
        # Nothing to number (e.g. empty or punctuation-only input).
        return ''
    # `startswith` is safe on an empty first sentence, unlike `splitted[0][0]`.
    if splitted[0].startswith('-'):
        splitted[0] = splitted[0].replace('- ', '')
    return ' '.join(f'{no + 1}. {s}' for no, s in enumerate(splitted))

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from malaya.text.vectorizer import SkipGramCountVectorizer, SkipGramTfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

# Stop-word list provided by malaya; passed as `stop_words` to the vectorizers
# constructed in the following cells.
stopwords = malaya.text.function.get_stopwords()

In [6]:
# Count-based extractive summarizer: a skip-gram count vectorizer (1- to
# 3-grams, skip=2, malaya stop words) and a 30-component TruncatedSVD are
# handed to malaya's sklearn-based extractive summarization interface.
vectorizer = SkipGramCountVectorizer(
    max_df=0.95,
    min_df=1,
    ngram_range=(1, 3),
    stop_words=stopwords,
    skip=2,
)
svd = TruncatedSVD(n_components=30)
model = malaya.summarization.extractive.sklearn(svd, vectorizer)

In [7]:
# Tf-idf counterpart of the count-based summarizer: same vectorizer options
# and SVD size, but with SkipGramTfidfVectorizer.
# Note: this rebinds the names `vectorizer` and `svd` used by the previous cell.
vectorizer = SkipGramTfidfVectorizer(
    max_df=0.95,
    min_df=1,
    ngram_range=(1, 3),
    stop_words=stopwords,
    skip=2,
)
svd = TruncatedSVD(n_components=30)
model_tfidf = malaya.summarization.extractive.sklearn(svd, vectorizer)

In [43]:
# Lower-case tokens that disqualify a RAKE key phrase in the collection loop.
# Despite the name, the set holds more than months: English and Malay month
# names with some abbreviations, plus a few news-outlet / domain tokens.
months = {
    # month names and abbreviations (English / Malay)
    'january', 'jan', 'januari',
    'february', 'feb', 'februari',
    'march', 'mac',
    'april', 'apr',
    'may', 'mei',
    'june', 'jun',
    'july', 'julai',
    'august', 'ogos', 'aug',
    'september', 'sep',
    'october', 'oktober', 'oct',
    'november', 'nov',
    'december', 'disember', 'dec',
    # news-outlet and domain tokens
    'utusan', 'malaysiakini', 'astroawani', 'bernama', 'com',
}

In [48]:
def simple_cleaning(string):
    """Normalise a phrase to single-spaced ASCII text.

    The input is transliterated with ``unidecode``; newlines, ``--`` and ``/``
    are each replaced by a space; runs of spaces are collapsed to one and the
    result is stripped of leading/trailing whitespace.
    """
    ascii_text = unidecode(string)
    # Applied in this order, one after the other, on the transliterated text.
    for separator in ('\n', '--', '/'):
        ascii_text = ascii_text.replace(separator, ' ')
    collapsed = re.sub(r'[ ]+', ' ', ascii_text)
    return collapsed.strip()

In [55]:
# Fresh state for this cell; re-running the cell resets all of it.
# `before`, `after`, `rejected`, `languages` and `para` are initialised here
# but not used in the cells visible in this part of the notebook.
before = []
after = []
rejected = []
languages = []
para = []

# (summary, article text) pairs collected by the loop at the end of this cell.
accepted = []
# Number of articles that got past the text filters in that loop.
count = 0
# Outlet identifiers that `reject` treats as Malaysian news sources.
malaysian_news = {
    'kosmo', 'hmetro', 'malaymail', 'projekmm', 'bharian',
    'utusan', 'astroawani', 'themalaysianinsight', 'malaysiakini', 'bernama',
}


def reject(data):
    """Decide whether a crawled news record should be discarded.

    Parameters
    ----------
    data : dict
        Record providing the keys ``'news'``, ``'top-image'``, ``'url'`` and
        ``'language'``.

    Returns
    -------
    bool
        ``False`` (keep) when the source is in ``malaysian_news``, when an
        outlet name occurs in the top-image or page URL, when the top-image
        URL contains ``'com.my'``, or when the language is ``'malay'``.
        ``True`` (reject) in every other case.
    """
    source = data['news']
    if source in malaysian_news:
        return False

    image_url = data['top-image']
    if any(outlet in image_url for outlet in malaysian_news):
        return False

    page_url = data['url']
    if any(outlet in page_url for outlet in malaysian_news):
        return False

    if 'com.my' in image_url or data['language'] == 'malay':
        return False

    # These explicit checks are currently redundant: the fall-through below
    # returns True as well. They are kept so behaviour stays unchanged.
    if 'Siaran Pers' in source or '.id' in source:
        return True

    return True

# Build (target summary, source article) pairs from the JSON-lines dump.
# Targets come from two places:
#   1. every semi-supervised summary longer than 20 words, and
#   2. a pseudo-summary made by joining de-duplicated RAKE key phrases.
# NOTE: `top_k` is drawn from the global `random` module, which is not seeded
# in the cells visible here, so the keyword-based pairs differ between runs.
# An extractive variant based on `model_tfidf.sentence_level` was tried at the
# end of this loop and is currently disabled.
#
# The encoding is explicit because the articles contain non-ASCII characters;
# without it, decoding depends on the machine's locale.
with open('populate-news.json.semisupervised', encoding='utf-8') as fopen:
    for line in tqdm(fopen):
        data = json.loads(line)
        text = re.sub(r'[ ]+', ' ', data['text']).strip()
        lowered = text.lower()
        # Drop pages whose text looks like registration-wall or
        # disabled-JavaScript boilerplate rather than an article.
        if 'kindly register' in lowered or 'disabled in your browser' in lowered:
            continue
        # Drop articles with fewer than 30 whitespace-separated words.
        if len(text.split()) < 30:
            continue

        summaries = [
            malaya.text.function.remove_empty_parenthesis(s)
            for s in data['semisupervised-summaries']
        ]
        for s in summaries:
            if len(s.split()) > 20:
                accepted.append((s, data['text']))

        keywords_rake = malaya.keyword.extractive.rake(
            data['text'],
            top_k=random.randint(25, 50),
        )
        # The phrase is read from index 1 of each RAKE result (presumably
        # (score, phrase) tuples -- confirm against malaya's API).
        # Keep phrases with at least 3 words and more than 20 characters that
        # contain no token from `months`.
        keywords_rake = [
            simple_cleaning(k[1])
            for k in keywords_rake
            if len(k[1].split()) >= 3
            and len(k[1]) > 20
            and len(set(k[1].lower().replace('-', '').split()) & months) == 0
        ]

        # Case-insensitive de-duplication that keeps first-seen order.
        already = set()
        filtered = []
        for k in keywords_rake:
            key = k.lower()
            if key in already:
                continue
            already.add(key)
            filtered.append(k)
        keywords_rake = filtered

        # Only use the key phrases as a target when more than 5 survive.
        if len(keywords_rake) > 5:
            accepted.append(('. '.join(keywords_rake), data['text']))

        count += 1

81717it [08:35, 158.52it/s]


In [57]:
# Total number of (summary, article text) pairs collected by the loop above.
len(accepted)

219612

In [58]:
# Persist the collected pairs as a single JSON array in `news.json`; the json
# module writes each (summary, article text) tuple as a two-element array.
with open('news.json', 'w') as fopen:
    json.dump(accepted, fopen)

In [59]:
# Spot-check: display the most recently appended (summary, article text) pair.
accepted[-1]

('wilayah Okayama mencipta pelbagai haiwan. Jepun mencipta hasil seni menarik. "Gambar kucing menangkap ikan. menghadiri pelbagai acara. menerima tawaran kerja. popular seekor kucing',
 'Seorang barista di Jepun mencipta hasil seni menarik dan comel hanya menggunakan buih susu dalam latte dihidangkannya.\n\nKazuki Yamamoto, 29, dari wilayah Okayama mencipta pelbagai haiwan kecil yang nampak realistik ketika bekerja di sebuah restoran, sebelum hasil seninya itu mula mendapat perhatian di seluruh dunia.\n\nKerja tangannya yang paling popular seekor kucing yang cuba masuk dari satu cawan ke cawan lain untuk menangkap ikan.\n\nDia yang kini pengurus sebuah kafe di Harajyuku juga membuat buih berbentuk arnab dan kucing yang lain, dianggap pelanggannya sebagai ‘terlalu comel sehingga tidak tergamak untuk meminumnya.’\n\n“Gambar kucing menangkap ikan menjadi popular selepas tersebar di Internet sehingga saya mendapat tawaran untuk menghadiri pelbagai acara dan bekerja di luar negara.\n\n“Sele