In [1]:
import pandas as pd
import os
# Set CUDA_VISIBLE_DEVICES to an empty string before `malaya` is imported in the next cell.
# NOTE(review): presumably this keeps the whole run on CPU — confirm that is the intent.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import json
import malaya
from tqdm import tqdm
from unidecode import unidecode
from bs4 import BeautifulSoup

In [3]:
import re

def simple_cleaning(string):
    """Normalise a piece of text into a single, space-separated line.

    The text is passed through `unidecode`, then every newline, double
    hyphen, forward slash and carriage return is replaced by a space (in
    that order). Runs of spaces are collapsed to one space and the result
    is stripped of leading/trailing whitespace.

    Only literal spaces are collapsed; other whitespace inside the text
    (for example tabs) is left as it is.
    """
    text = unidecode(string)
    # Order matters only in that it mirrors the original replacement chain.
    for token in ('\n', '--', '/', '\r'):
        text = text.replace(token, ' ')
    collapsed = re.sub(r'[ ]+', ' ', text)
    return collapsed.strip()

In [5]:
from malaya.text.rouge import postprocess_summary, filter_rouge, _get_word_ngrams, _rouge_clean, cal_rouge
from malaya.text.function import split_into_sentences

# Word-count cutoff used by the train.csv / test.csv loops below: an example is skipped
# when its cleaned source text has `maxlen` or more whitespace-separated tokens.
maxlen = 800

def filtering_rouge(summary, contents, start = 0.15, increment = 0.05, break_at = 800, n = 1):
    """Shrink `contents` with `filter_rouge` until it has at most `break_at` words.

    Text that is already short enough is returned unchanged and
    `filter_rouge` is never called. Otherwise each pass calls
    `filter_rouge(summary, text, n=n, threshold=...)` where the threshold
    is `start` on the first pass and grows by `increment` on every
    following pass.

    Parameters
    ----------
    summary : reference text handed to `filter_rouge`.
    contents : text to shrink; its length is measured with `str.split()`.
    start : threshold used for the first pass.
    increment : amount added to the threshold after each pass.
    break_at : maximum number of whitespace-separated words allowed.
    n : forwarded to `filter_rouge` as `n`.

    Returns
    -------
    The (possibly filtered) text.

    NOTE(review): termination relies on `filter_rouge` eventually returning
    a text of at most `break_at` words as the threshold rises — confirm
    against the malaya implementation.
    """
    remaining = contents[:]
    step = 0
    while True:
        if len(remaining.split()) <= break_at:
            return remaining
        remaining = filter_rouge(summary, remaining, n = n, threshold = start + increment * step)
        step += 1

def clean_keyword(string):
    """Clean a comma-separated keyword string and count its keywords.

    Each comma-separated piece is normalised with `simple_cleaning`.
    Pieces that are empty after cleaning are dropped, so stray commas
    (`"a,,b"`, `"a,b,"`) or an empty input no longer inflate the count:
    `"".split(",")` is `[""]`, which previously produced an empty target
    reported as one keyword.

    Parameters
    ----------
    string : comma-separated keywords.

    Returns
    -------
    (keywords, count) : the cleaned keywords re-joined with ',' and the
    number of non-empty keywords.
    """
    cleaned = (simple_cleaning(piece) for piece in string.split(','))
    keywords = [keyword for keyword in cleaned if keyword]
    return ','.join(keywords), len(keywords)

In [6]:
# Load the training split and keep only the rows whose `type` column is 'KEYWORD'.
df = pd.read_csv('train.csv')
is_keyword = df['type'] == 'KEYWORD'
df = df[is_keyword]

In [7]:
# Preview the first rows of the filtered frame to check the available columns.
df.head()

Unnamed: 0,dataset,file_id,text,summary,type
0,www,13534577,Crosslanguage blog mining and trend visualisat...,"blog mining,crosslanguage,miscellaneous,trend ...",KEYWORD
1,kdd,1370349,Mining a stream of transactions for customer p...,"approximate queries,customer profiles,dynamic ...",KEYWORD
3,kdd,6365307,A generative probabilistic approach to visuali...,"em algorithm,hidden markov model,information s...",KEYWORD
7,fao780,faobetf_fb29fe,<title>Better freshwater fish farming: The Pon...,"FISH PONDS,FISH CULTURE,FRESHWATER FISHES,EXTE...",KEYWORD
8,kdd,4295479,Pattern lattice traversal by selective jumps N...,nonnumerical algorithms and problems,KEYWORD


In [8]:
from glob import glob

# Collect the translated xwikis keyword shards in lexicographic order and display the list.
xwikis = glob('xwikis-keywords.jsonl*.splitted.translated')
xwikis.sort()
xwikis

['xwikis-keywords.jsonl00.splitted.translated',
 'xwikis-keywords.jsonl01.splitted.translated',
 'xwikis-keywords.jsonl02.splitted.translated',
 'xwikis-keywords.jsonl03.splitted.translated',
 'xwikis-keywords.jsonl04.splitted.translated',
 'xwikis-keywords.jsonl05.splitted.translated']

In [9]:
# Output file for every training example, one JSON object per line.
# The handle stays open on purpose: the following cells write to it and a later
# cell calls train.close().
train = open('train.json', 'w')

In [14]:
# Turn every KEYWORD row of train.csv into one JSON line in `train`.
# Rows whose cleaned text has `maxlen` or more words are left out.
for raw_text, raw_keywords in zip(df['text'], df['summary']):
    src = simple_cleaning(raw_text)
    # The summary is cleaned as a whole first, then per keyword inside clean_keyword.
    tgt, tgt_len = clean_keyword(simple_cleaning(raw_keywords))

    too_long = len(src.split()) >= maxlen
    if not too_long:
        d = {"translation": {"src": src, "tgt": tgt, 'prefix': f'{tgt_len} kata kunci: '}}
        train.write(json.dumps(d) + '\n')

In [15]:
# Each line of an xwikis shard is a JSON object with 'ms', 'en' and 'keyword' fields.
# Two examples are written per line — first the 'ms' text, then the 'en' text — and
# both share the same cleaned keywords and prefix.
for f in xwikis:
    with open(f) as fopen:
        for line in fopen:
            data = json.loads(line)
            ms = simple_cleaning(data['ms'])
            en = simple_cleaning(data['en'])
            tgt, tgt_len = clean_keyword(data['keyword'])
            prefix = f'{tgt_len} kata kunci: '

            for src in (ms, en):
                d = {"translation": {"src": src, "tgt": tgt, 'prefix': prefix}}
                train.write(json.dumps(d) + '\n')

In [16]:
!tail -n 10 train.json

{"translation": {"src": "George Philip Farran was born on November 21, 1876. He attended Trinity College in Dublin in 1876 for law. Later he graduated in Natural Science winning the Gold Medal of the subject for his class. Later in his life, Farran connected with The Department of Agriculture and Technical Instruction in 1900. He enjoyed his time here and was even promoted to be the Chief Inspector of Fisheries. Farran was a widely published scientist and had his work published in many journals, a few of them include; The Royal Irish Academy, The linnean Society, Exploration of the Sea, and The Zoological Society. However, Most of his work was published in the Scientific Investigations of the Fisheries Branch of the Department of Agriculture and Technical Instruction.", "tgt": "Personal life and education,Studies,George Philip Farran", "prefix": "3 kata kunci: "}}
{"translation": {"src": "Menurut tradisi, Mark dan Marcellian adalah saudara kembar dari keluarga yang terkenal. Mereka ti

In [19]:
# List the news dumps matching 'berita-*' in the working directory (unsorted, as returned by glob).
glob('berita-*')

['berita-sukan.json.nested',
 'berita-hiburan.json.nested',
 'berita-teknologi.json.nested',
 'berita-dunia.json.nested',
 'berita-bisnes.json.nested',
 'berita-politik.json.nested',
 'berita-malaysia.json.nested',
 'berita-english.json']

In [20]:
# News dump files, ordered lexicographically so they are processed in a stable order.
astro = glob('berita-*')
astro.sort()

In [21]:
# Convert every news article into a keyword-generation example.
# Articles are best-effort: a record that is malformed or has no tags is skipped,
# but skips are counted and reported per file instead of being silently dropped.
for f in astro:
    print(f)
    with open(f) as fopen:
        data = json.load(fopen)

    skipped = 0
    for article in data:
        try:
            soup = BeautifulSoup(article['r']['response']['articleBody'], "lxml")
            # The extracted text is parsed a second time.
            # NOTE(review): presumably the bodies contain escaped markup — confirm.
            text = simple_cleaning(BeautifulSoup(soup.text, 'lxml').text)
            tags = article['original']['tags']
            if not tags:
                # No tags would give an empty target labelled as one keyword.
                skipped += 1
                continue
            tgt, tgt_len = clean_keyword(','.join(tags))
        except Exception:
            # Missing keys / unexpected shapes: keep going, but remember the loss.
            skipped += 1
            continue

        # Written outside the try block so that I/O failures are not swallowed.
        record = {"translation": {"src": text, "tgt": tgt, 'prefix': f'{tgt_len} kata kunci: '}}
        train.write(f'{json.dumps(record)}\n')

    if skipped:
        print(f'{f}: skipped {skipped} of {len(data)} records')

berita-bisnes.json.nested
berita-dunia.json.nested
berita-english.json
berita-hiburan.json.nested
berita-malaysia.json.nested
berita-politik.json.nested
berita-sukan.json.nested
berita-teknologi.json.nested


In [22]:
!tail -n 10 train.json

{"translation": {"src": "Kata kunci iPhone 6 terus trending di Twitter dari hari ke hari. Peranti yang bakal diperkenalkan 12 tengah malam Selasa waktu Malaysia membuatkan orang ramai dari seluruh dunia tidak sabar untuk melihat dan memilikinya. Majlis pelancaran ini boleh disaksikan menerusi penstriman langsung di apple.com. Sebelum ini, kami telah melaporkan bahawa sudah ada beberapa individu yang telah mula beratur di apple store New York demi untuk memiliki iPhone terbaru keluaran apple itu. Namun di media sosial, ada juga peminat iPhone yang merancang tidak akan membeli iPhone 6 kerana saiznya yang agak besar berbanding keluaran iPhone sebelum ini. Walaupun iPhone 6 akan dilancarkan hari ini, ia hanya akan mula dijual di pasaran pada 19 September, sembilan hari selepas ia diperkenalkan. Selain iPhone 6, Apple juga bakal memperkenalkan sistem operasi baru, OS X Yosemite, IOS 8 dan juga iWatch.", "tgt": "kini trending,gadget,iphone", "prefix": "3 kata kunci: "}}
{"translation": {"s

In [23]:
# All sources have been written; close the handle so train.json is complete on disk
# before it is counted and shuffled in the cells below.
train.close()

In [24]:
# Number of lines in train.json; each example was written as one line.
!wc -l train.json

1128280 train.json


In [25]:
# Build test.json from the KEYWORD rows of test.csv, using the same cleaning,
# length cutoff and record layout as the train.csv export.
df = pd.read_csv('test.csv')
df = df[df['type'] == 'KEYWORD']

# The context manager guarantees the file is flushed and closed even if a row
# raises part-way through the loop.
with open('test.json', 'w') as test:
    for raw_text, raw_keywords in zip(df['text'], df['summary']):
        src = simple_cleaning(raw_text)
        tgt = simple_cleaning(raw_keywords)
        tgt, tgt_len = clean_keyword(tgt)

        # Skip examples whose cleaned text has `maxlen` or more words.
        if len(src.split()) >= maxlen:
            continue

        d = {"translation": {"src": src, "tgt": tgt, 'prefix': f'{tgt_len} kata kunci: '}}
        test.write(f'{json.dumps(d)}\n')

In [26]:
!wc -l test.json

948 test.json


In [27]:
# Write line-shuffled copies of both splits to new files; the originals are kept.
# NOTE(review): no random source is passed to `shuf`, so the order presumably differs
# between runs — confirm whether a reproducible shuffle is needed.
!shuf train.json > shuffled-train.json
!shuf test.json > shuffled-test.json