In [None]:
%%capture
!pip install datasets

!pip install nltk

!pip install pyarrow
!pip install fastparquet
!pip install zstandard

In [None]:
%%capture
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
import re
import unicodedata

import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')

import os
import io
import json
import zstandard as zstd

from datasets import load_dataset

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Google Credentials

In [None]:
# You can find the full reference here: https://cloud.google.com/storage/docs/gsutil/commands/cp
from google.colab import auth

auth.authenticate_user()

project_id = 'XXXX'
bucket_name = 'XXXX'

!gcloud config set project {project_id}

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
file_name = str(input("Input filename to move: "))
!gsutil cp /content/{file_name} gs://{bucket_name}/

# Download datasets

In [None]:
%%capture
!kaggle datasets download -d thonyyy/data-new
!unzip /content/data-new.zip

In [None]:
!kaggle datasets download -d thonyyy/indonesian-news-article-2017 -f indonesian_news_article_2017_kaggle_v4.parquet
!unzip /content/indonesian_news_article_2017_kaggle_v4.parquet.zip

Downloading indonesian_news_article_2017_kaggle_v4.parquet.zip to /content
 97% 393M/406M [00:04<00:00, 114MB/s]
100% 406M/406M [00:04<00:00, 99.7MB/s]
Archive:  /content/indonesian_news_article_2017_kaggle_v4.parquet.zip
  inflating: indonesian_news_article_2017_kaggle_v4.parquet  


In [None]:
%%capture
!kaggle datasets download -d thonyyy/document-summary-pair
!unzip /content/document-summary-pair.zip

# Preprocessing Pretrain Data


In [None]:
def remove_news_headline(text,delim):
    """Drop the lead-in that precedes the first ``delim`` in ``text``.

    The text is split on every occurrence of ``delim``. When at least one
    occurrence exists, the first piece is discarded and the remaining pieces
    are glued back together with a single space (so later delimiters turn
    into a space). Text that does not contain ``delim`` is returned unchanged.

    Args:
        text: String to process.
        delim: Separator marking the end of the lead-in, e.g. ``'- '``.

    Returns:
        The text after the first delimiter, or ``text`` itself when the
        delimiter is absent.
    """
    lead_in, *content_parts = text.split(delim)
    if content_parts:  # delimiter found: throw away the part that is not content
        return " ".join(content_parts)
    return lead_in

def text_cleaning(input_string, is_news = True):
    """Normalise raw article text into lowercase ASCII made of "long" sentences.

    Steps, in order:
      1. lowercase the input;
      2. delete anything that looks like a URL / domain and unescape ``&amp;``;
      3. drop whole lines containing a bullet (``•``) or ``baca juga:``;
      4. fold to ASCII (NFKD, then discard every non-ASCII code point);
      5. if ``is_news``, strip the lead-in before the first ``'- '`` and then
         before the first ``'– '`` (en dash) via ``remove_news_headline``;
      6. empty out ``(...)`` / ``|...|`` groups, replace remaining punctuation
         (everything except word characters, ``.``, ``,`` and whitespace) with
         a space, and remove dots adjacent to digits;
      7. collapse whitespace, keep only the ``.``-separated pieces longer than
         10 characters, re-join them with ``.`` and delete underscores.

    Args:
        input_string: Raw text to clean.
        is_news: When true, apply the headline/dateline stripping of step 5.

    Returns:
        The cleaned text; an empty string when no piece is long enough.
    """
    en_dash = '\u2013'

    def ascii_fold(fragment):
        # NFKD splits accented letters into base letter + combining mark; the
        # ASCII round trip then discards whatever is still non-ASCII.
        return unicodedata.normalize('NFKD', fragment).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    lowercase = input_string.lower()
    remove_link = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)', '', lowercase).replace("&amp;","&")
    remove_bullet = "\n".join(line for line in remove_link.split('\n') if '•' not in line and "baca juga:" not in line)

    if is_news:
        # ASCII folding deletes the en dash, so folding the whole string first
        # (as the code used to) meant the '– ' delimiter could never match.
        # Fold each en-dash-separated piece on its own to keep the delimiter
        # alive until the headline has been stripped.
        folded = en_dash.join(ascii_fold(piece) for piece in remove_bullet.split(en_dash))
        y = remove_news_headline(folded, '- ')
        y = remove_news_headline(y, en_dash + ' ')
        # Leftover en dashes are deleted, exactly as plain ASCII folding would do.
        y = y.replace(en_dash, '')
    else:
        y = ascii_fold(remove_bullet)

    # Raw strings: the previous plain literals relied on invalid escape
    # sequences (\( \| \g) that newer Python versions warn about.
    remove_parentheses = re.sub(r"([\(\|]).*?([\)\|])", r"\g<1>\g<2>", y)
    remove_punc = re.sub(r"[^\w\d.,\s]+", ' ', remove_parentheses)
    # "1.000" -> "1000": dots touching a digit are thousands/decimal marks, not sentence ends.
    remove_num_dot = re.sub(r"(?<=\d)\.|\.(?=\d)|(?<=#)\.", "", remove_punc)
    remove_extra_whitespace = re.sub(r'\s+', ' ', remove_num_dot).strip()
    # Pieces of 10 characters or fewer (after trimming) are treated as noise and dropped.
    return ".".join(s for s in remove_extra_whitespace.split('.') if len(s.strip())>10).replace("_","")

## CC News ID

In [None]:
directory_path = '/content/ccnews-id/ccnews-id'
directory_files = os.listdir(directory_path)

In [None]:
# Stream every zstd-compressed JSON-lines shard in `directory_path` and collect
# the cleaned `text` field of each record.
ccnews_text = []
for shard_name in directory_files:
    # A dedicated name is used here so the notebook-wide `file_name` (consumed
    # by the gsutil upload cells) is not overwritten by this loop.
    shard_path = os.path.join(directory_path, shard_name)
    with open(shard_path, 'rb') as fh:
        # max_window_size is raised to 2 GiB (2147483648 bytes) for the decompressor.
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
        stream_reader = dctx.stream_reader(fh)
        text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8')
        for line in text_stream:
            if not line.strip():
                # json.loads would raise on an empty line; there is nothing to keep anyway.
                continue
            obj = json.loads(line)
            # Records whose text is null are skipped.
            if obj['text'] is not None:
                ccnews_text.append(text_cleaning(obj['text']))

In [None]:
# Deduplicate the cleaned articles while keeping first-seen order. dict keys
# preserve insertion order, whereas iterating a set of strings yields an order
# that changes from one kernel to the next (string hashing is randomised), which
# made the row order of df_cc differ between runs.
df_cc = pd.DataFrame({'text': list(dict.fromkeys(ccnews_text))})
# Free the raw list; it is large and no longer needed once the frame exists.
del ccnews_text
df_cc

Unnamed: 0,text
0,
1,setiap kampus dan fakultas perlu memiliki unit...
2,ketua dewan pimpinan daerah partai gerindra ac...
3,polda nusa tenggara barat masih memburu gembon...
4,berita labuan bajo panitia lokal di mabar untu...
...,...
2970616,valentino rossi jadi nomor wahid hasil latihan...
2970617,"komisioner komnas ham, beka ulung hapsara meny..."
2970618,napoli kembali ke jalur kemenangan setelah men...
2970619,akun instagram ustadz maaher hilang usai kriti...


In [None]:
# Whitespace-delimited token count per article.
df_cc['tokens'] = df_cc['text'].apply(lambda x: len(x.split()))
# Number of articles within the 500-token budget. Series.sum() counts the True
# values in one vectorised pass; the builtin sum() walked the Series row by row.
(df_cc['tokens'] <= 500).sum()

In [None]:
df_cc['sent'] = df_cc['text'].apply(lambda x: len(nltk.sent_tokenize(x)))
df_cc[df_cc['sent']<3]

Unnamed: 0,text,tokens,sent
0,,0,0
51,ekonomi pemerintah fokus membantu nelayan keci...,37,1
100,toilet tembus pandang di taman kota tokyo,7,1
156,ridwan kamil kunjungi muhamad fahri assidiq bo...,141,1
235,"belinyu, mulkan boleh saja berwacana, tapi",6,1
...,...,...,...
2970212,jumlah penduduk seoul untuk pertama kalinya di...,59,2
2970227,"dan bukan karena kami adalah juara tahun lalu,...",16,1
2970560,15 april 2021 10 00 14 april 2021 13 42 11 mar...,58,1
2970606,jajaran koramil 410 06 kdt melaksanakan sholat...,55,2


In [None]:
df_cc_clean = df_cc[(df_cc['tokens']<=500)&(df_cc['sent']>2)][['text']]

In [None]:
from math import ceil

# Rows handed to fastparquet per write call.
chunk_rows = int(1e6)
file_path = "ccnews-id.parquet.gzip"

n = ceil(len(df_cc_clean) / chunk_rows)
for i in range(n):
    chunk = df_cc_clean.iloc[i * chunk_rows:(i + 1) * chunk_rows]
    # The first chunk (re)creates the file and every later chunk appends to it.
    # Deciding this from the chunk index rather than from os.path.isfile() keeps
    # the cell idempotent: previously a re-run found the old file on disk and
    # appended the whole corpus to it a second time.
    chunk.to_parquet(file_path, engine='fastparquet', compression='gzip', append=i > 0)

In [None]:
file_name = "ccnews-id.parquet.gzip"
!gsutil cp /content/{file_name} gs://{bucket_name}/data_new/

## Indonesian News Article (2017)
Special thanks to Joshia for cleaning the data

In [None]:
df_2 = pd.read_parquet("/content/indonesian_news_article_2017_kaggle_v4.parquet")
df_2

Unnamed: 0,title,content_html,source
0,"Pantau Tahun Baru, Kapolda Metro Jaya Bonceng ...",kapolda metro jaya irjen mochamad iriawan mem...,kompas.com
1,"Rayakan Malam Pergantian Tahun, Warga Ambon Pa...",warga dari berbagai penjuru di kota ambon tum...,kompas.com
2,"Klakson ""Telolet"" Ramaikan Malam Tahun Baru di...",detik-detik pergantian tahun tidak hanya dira...,kompas.com
3,Taufik: Saya Baru Kali Ini Dampingi Gubernur J...,wakil ketua dprd dki jakarta m taufik mengata...,kompas.com
4,Hidayat Nur Wahid Nilai Kegiatan Zikir Nasiona...,wakil ketua mpr ri hidayat nur wahid mengapre...,kompas.com
...,...,...,...
455621,"Jelang Tutup Tahun, Registrasi Kartu Prabayar ...","menjelang tahun baru, registrasi kartu prabaya...",beritasatu.com
455622,"Dekat dengan JK, Agus Gumiwang Dinilai Punya P...","anggota dpr fraksi pks, mardani ali sera menil...",beritasatu.com
455623,Khofifah-Emil Buka Posko Pemenangan Pilgub di ...,bakal pasangan calon gubernur dan wagub jatim ...,beritasatu.com
455624,Deklarasi Marthen-Ryan Akan Dihadiri Ketum Par...,deklarasi pasangan marthen taha-ryan kono guna...,beritasatu.com


In [None]:
# Clean the 2017 articles without headline stripping (is_news=False), then
# derive the two length features used by the filter in the next cell.
df_2['text'] = df_2['content_html'].map(lambda raw: text_cleaning(raw, is_news =False))
# Number of whitespace-separated tokens.
df_2['tokens'] = df_2['text'].str.split().str.len()
# Number of '.'-separated pieces, i.e. one more than the number of periods.
df_2['sent'] = df_2['text'].str.count(r'\.') + 1

In [None]:
df_2_truncated = df_2[['text']].loc[(df_2['tokens'] <= 500) & (df_2['sent']>2)]
df_2_truncated.to_parquet('news-2017-clean.parquet')

In [None]:
file_name = 'news-2017-clean.parquet' #str(input("Input filename to move: "))
!gsutil cp /content/{file_name} gs://{bucket_name}/data_new/

## File for Training Tokenizer

In [None]:
# Write the corpus to a .txt file (one cleaned article per line) for training
# the SentencePiece tokenizer.
# NOTE: mode 'a' is kept from the original, so re-running this cell appends the
# corpus again; delete the file first when regenerating it from scratch.
with open('training_tokenizer_new.txt', 'a', encoding='utf-8') as f:
    f.writelines(line + "\n" for line in df_cc_clean['text'])

In [None]:
file_name = 'training_tokenizer_new.txt' #str(input("Input filename to move: "))
!gsutil cp /content/{file_name} gs://{bucket_name}/

# Preprocessing Finetune Data

In [None]:
def remove_news_headline(text,delim):
    """Return ``text`` without the lead-in that precedes the first ``delim``.

    This re-defines (and therefore shadows) the function of the same name from
    the pretraining section; the behaviour is the same.

    Args:
        text: String to process.
        delim: Separator marking the end of the lead-in, e.g. ``': '``.

    Returns:
        Everything after the first ``delim`` with each further ``delim``
        replaced by one space, or ``text`` unchanged when ``delim`` is absent.
    """
    _lead_in, found, content = text.partition(delim)
    if not found:
        return text
    # Discard the non-content lead-in; later delimiters become plain spaces.
    return content.replace(delim, " ")

def text_cleaning(input_string, is_news = True):
    """Normalise raw article text into lowercase ASCII made of "long" sentences.

    This re-defines (and therefore shadows) the ``text_cleaning`` of the
    pretraining section; the only intended difference is that ``': '`` is also
    treated as a headline delimiter.

    Steps, in order:
      1. lowercase the input;
      2. delete anything that looks like a URL / domain and unescape ``&amp;``;
      3. drop whole lines containing a bullet (``•``) or ``baca juga:``;
      4. fold to ASCII (NFKD, then discard every non-ASCII code point);
      5. if ``is_news``, strip the lead-in before the first ``'- '``, then
         before the first ``'– '`` (en dash), then before the first ``': '``,
         each via ``remove_news_headline``;
      6. empty out ``(...)`` / ``|...|`` groups, replace remaining punctuation
         (everything except word characters, ``.``, ``,`` and whitespace) with
         a space, and remove dots adjacent to digits;
      7. collapse whitespace, keep only the ``.``-separated pieces longer than
         10 characters, re-join them with ``.`` and delete underscores.

    Args:
        input_string: Raw text to clean.
        is_news: When true, apply the headline/dateline stripping of step 5.

    Returns:
        The cleaned text; an empty string when no piece is long enough.
    """
    en_dash = '\u2013'

    def ascii_fold(fragment):
        # NFKD splits accented letters into base letter + combining mark; the
        # ASCII round trip then discards whatever is still non-ASCII.
        return unicodedata.normalize('NFKD', fragment).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    lowercase = input_string.lower()
    remove_link = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)', '', lowercase).replace("&amp;","&")
    remove_bullet = "\n".join(line for line in remove_link.split('\n') if '•' not in line and "baca juga:" not in line)

    if is_news:
        # ASCII folding deletes the en dash, so folding the whole string first
        # (as the code used to) meant the '– ' delimiter could never match.
        # Fold each en-dash-separated piece on its own to keep the delimiter
        # alive until the headline has been stripped.
        folded = en_dash.join(ascii_fold(piece) for piece in remove_bullet.split(en_dash))
        y = remove_news_headline(folded, '- ')
        y = remove_news_headline(y, en_dash + ' ')
        # Leftover en dashes are deleted, exactly as plain ASCII folding would do.
        y = y.replace(en_dash, '')
        y = remove_news_headline(y, ': ')
    else:
        y = ascii_fold(remove_bullet)

    # Raw strings: the previous plain literals relied on invalid escape
    # sequences (\( \| \g) that newer Python versions warn about.
    remove_parentheses = re.sub(r"([\(\|]).*?([\)\|])", r"\g<1>\g<2>", y)
    remove_punc = re.sub(r"[^\w\d.,\s]+", ' ', remove_parentheses)
    # "1.000" -> "1000": dots touching a digit are thousands/decimal marks, not sentence ends.
    remove_num_dot = re.sub(r"(?<=\d)\.|\.(?=\d)|(?<=#)\.", "", remove_punc)
    remove_extra_whitespace = re.sub(r'\s+', ' ', remove_num_dot).strip()
    # Pieces of 10 characters or fewer (after trimming) are treated as noise and dropped.
    return ".".join(s for s in remove_extra_whitespace.split('.') if len(s.strip())>10).replace("_","")

## Liputan6

In [None]:
liputan6 = load_dataset("id_liputan6", 'canonical', data_dir="/content/liputan6_data/liputan6_data")
liputan6

Downloading builder script:   0%|          | 0.00/6.73k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

Downloading and preparing dataset id_liputan6/canonical to /root/.cache/huggingface/datasets/id_liputan6/canonical-c353f159bc1956b5/1.0.0/1902328d520c71779d871e7d0af752c16a6bb3ee7525aa02b4c9f518a8ed62a8...


Generating validation split:   0%|          | 0/10972 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10972 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/193883 [00:00<?, ? examples/s]

Dataset id_liputan6 downloaded and prepared to /root/.cache/huggingface/datasets/id_liputan6/canonical-c353f159bc1956b5/1.0.0/1902328d520c71779d871e7d0af752c16a6bb3ee7525aa02b4c9f518a8ed62a8. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'],
        num_rows: 10972
    })
    test: Dataset({
        features: ['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'],
        num_rows: 10972
    })
    train: Dataset({
        features: ['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'],
        num_rows: 193883
    })
})

In [None]:
[x[:50] for x in liputan6['test']['clean_article'][:10]]

['Liputan6.com, Jakarta: Kepolisian Daerah Riau bert',
 'Liputan6.com, Jakarta: Bank Indonesia dinilai masi',
 'Liputan6.com, Jakarta: Berbagai kendala menghambat',
 'Liputan6.com, Jakarta: Penghapusan beberapa pasal ',
 'Liputan6.com, Jakarta: Operasi Sadar Jaya yang dil',
 'Liputan6.com, Tangerang: Empat warga negara asing ',
 'Liputan6.com, Jakarta: Sejumlah pasal dalam Rancan',
 'Liputan6.com, Jakarta: Kepolisian Metro Jaya menyi',
 'Liputan6.com, Bantul: Pusat desa kerajinan atau Pa',
 'Liputan6.com, Jakarta: Pengawasan Pertamina terhad']

In [None]:
def cleaning_liputan6(examples):
    """Batched ``map`` callback that cleans Liputan6 article/summary pairs.

    Reads the ``clean_article`` and ``clean_summary`` columns of the batch and
    adds two columns:
      * ``labels`` - the cleaned summary prefixed with one space;
      * ``input``  - the cleaned article.

    Both texts go through ``text_cleaning`` with its default ``is_news=True``.
    NOTE(review): that means summaries are headline-stripped too, so a summary
    containing a headline delimiter loses the text before it - confirm this is
    intended.

    Args:
        examples: Mapping of column name to list of values for one batch.

    Returns:
        The same ``examples`` object with ``labels`` and ``input`` added.
    """
    articles = examples['clean_article']
    cleaned_summaries = []
    cleaned_articles = []

    for position, article in enumerate(articles):
        summary = examples['clean_summary'][position]
        cleaned_summaries.append(" " + text_cleaning(summary))
        cleaned_articles.append(text_cleaning(article))

    examples['labels'] = cleaned_summaries
    examples['input'] = cleaned_articles
    return examples

In [None]:
liputan6_clean = liputan6.map(cleaning_liputan6, batched = True, remove_columns = ['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'])

Map:   0%|          | 0/10972 [00:00<?, ? examples/s]

Map:   0%|          | 0/10972 [00:00<?, ? examples/s]

Map:   0%|          | 0/193883 [00:00<?, ? examples/s]

In [None]:
# Gather the cleaned columns of the selected splits into flat lists. Only the
# test split is exported here; add 'train' / 'validation' to include them.
liputan6_input, liputan6_labels = [], []
for split in ('test',):
    cleaned_split = liputan6_clean[split]
    liputan6_input += cleaned_split['input']
    liputan6_labels += cleaned_split['labels']

In [None]:
df_liputan6 = pd.DataFrame({'input':liputan6_input,'labels':liputan6_labels})
df_liputan6

Unnamed: 0,input,labels
0,kepolisian daerah riau bertekad memberantas pe...,kapolda riau baru brigjen pol. johny yodjana ...
1,bank indonesia dinilai masih akan menghadapi s...,kendati bank sentral as menurunkan suku bunga...
2,berbagai kendala menghambat pendapatan negara ...,pemerintah bermaksud akan lebih mengandalkan ...
3,penghapusan beberapa pasal menyangkut hak buru...,"revisi kepmennaker nomor 78 tahun 2001, dinil..."
4,operasi sadar jaya yang dilancarkan selasa mal...,polisi menangkap 32 pengunjung diskotik milen...
...,...,...
10967,"kejaksaan agung, hingga kini, masih menunggu p...",kejagung masih menunggu putusan hakim pn jaks...
10968,dewan pimpinan pusat partai kebangkitan bangsa...,ketua dpp pkb khofifah indarparawansa menuduh...
10969,"warga kampung melayu, jakarta timur, menggelar...","menyambut tahun baru 2002, warga kampung mela..."
10970,"romadhani alias roban, penjahat kelas kakap, t...",romadhani alias roban tewas ditembak polisi s...


In [None]:
df_liputan6.loc[0,'input']

'kepolisian daerah riau bertekad memberantas pelaku penyelundupan kayu yang kerap terjadi di riau. selain itu, polda setempat juga akan memberangus menipulasi dana reboisasi dan iuran hasil hutan. demikian ditegaskan kepala polda riau brigadir jenderal polisi johny yodjana, seusai dilantik menjadi kapolda riau oleh kepala polri jenderal polisi suroyo bimantoro, di jakarta, baru baru ini. menurut johny, pelaku tindak kriminal yang kerap menjarah kayu di riau akan ditindak tegas. saya tak akan pandang bulu, janji johny. selain itu, ia bertekad menyelidiki dugaan manipulasi dana reboisasi dan iuran hasil hutan sebesar rp 680 miliar yang dilakukan sebuah perusahaan kayu di riau. sementara itu, selain melantik johny yodyana, kapolri juga melantik inspektur jenderal polisi firman gani menjadi kapolda sulawesi selatan dan brigjen pol. eddy darnadi menjadi kapolda maluku. selain itu, bimantoro juga melantik komisaris besar pol. totok soenarjo menjadi kapolda jambi, brigjen pol. sugiri menjadi 

In [None]:
df_liputan6.to_parquet("liputan6_test.parquet",compression='gzip')

In [None]:
file_name = "liputan6_test.parquet" #str(input("Input filename to move: "))
!gsutil cp /content/{file_name} gs://{bucket_name}/data_newest/

## IndoSum

In [None]:
# Load the five IndoSum folds of every split from the extracted JSON-lines
# files; the validation split is stored under the "dev" file prefix.
indosum = load_dataset(
    "json",
    data_files={
        split_name: [f'/content/indosum/indosum/{file_prefix}.0{i}.jsonl' for i in range(1, 6)]
        for split_name, file_prefix in (('train', 'train'), ('test', 'test'), ('validation', 'dev'))
    },
)
indosum

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-d46cd27741103f59/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-d46cd27741103f59/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['category', 'gold_labels', 'id', 'paragraphs', 'source', 'source_url', 'summary'],
        num_rows: 71353
    })
    test: Dataset({
        features: ['category', 'gold_labels', 'id', 'paragraphs', 'source', 'source_url', 'summary'],
        num_rows: 18774
    })
    validation: Dataset({
        features: ['category', 'gold_labels', 'id', 'paragraphs', 'source', 'source_url', 'summary'],
        num_rows: 3743
    })
})

In [None]:
[" ".join(x[0][0]) for x in indosum['test']['paragraphs'][:10]]

['Jakarta , CNN Indonesia - - Dilansir AFP , seorang warga Mesir yang dipercaya sebagai wanita terberat di dunia masuk sebuah rumah sakit di Abu Dhabi Kamis ( 5 / 5 ) , dimana ia akan melanjutkan perawatan setelah menjalankan operasi menurunkan badan di India .',
 'Menteri Pertahanan Ryamizard Ryacudu menyambut baik usulan pembelian drone dari China untuk dikembangkan di Indonesia .',
 'Jakarta , CNN Indonesia - - Meski sudah hampir 12 tahun berlalu , film Mean Girls merupakan salah satu film remaja sepanjang masa yang sulit untuk dilupakan .',
 'Usai melaksanakan ibadah haji , Eggi Sudjana akhirnya mendatangi kantor Bareskrim Polri , Jakarta , Senin ( 18 / 9 ) , untuk memberikan klarifikasi terkait pencatutan namanya di struktur organisasi Saracen .',
 'Banyak cara untuk memberikan pengajaran kepada anak , termasuk melalui cara yang atraktif dan kreatif .',
 'Jakarta , CNN Indonesia - - Sebuah lagu misterius mendadak muncul di iTunes akhir pekan lalu .',
 'Kementerian Sosial sudah men

In [None]:
def cleaning_indosum(examples):
    """Batched ``map`` callback that cleans IndoSum article/summary pairs.

    ``summary`` holds, per document, a list of tokenised sentences and
    ``paragraphs`` a list of paragraphs, each a list of tokenised sentences.
    Tokens are joined with spaces, sentences and paragraphs likewise, and the
    result is passed through ``text_cleaning(..., is_news=False)``. Only the
    very first sentence of each document is headline-stripped (on ``'- '``).

    Two columns are added to the batch:
      * ``labels`` - the cleaned summary prefixed with one space;
      * ``input``  - the cleaned article.

    Args:
        examples: Mapping of column name to list of values for one batch.

    Returns:
        The same ``examples`` object with ``labels`` and ``input`` added.
    """
    labels = []
    inputs = []

    for summary, document in zip(examples['summary'], examples['paragraphs']):
        summary_text = " ".join(" ".join(sentence) for sentence in summary)
        # The space is prepended AFTER cleaning. It used to be added before,
        # where text_cleaning's whitespace trimming removed it again, leaving
        # these labels without the leading space that cleaning_liputan6 emits.
        labels.append(" " + text_cleaning(summary_text, is_news=False))

        headline_pending = True  # only the document's first sentence carries the dateline
        paragraph_texts = []
        for paragraph in document:
            sentence_texts = []
            for sentence in paragraph:
                sentence_text = " ".join(sentence)
                if headline_pending:
                    sentence_text = remove_news_headline(sentence_text, "- ")
                    headline_pending = False
                sentence_texts.append(sentence_text)
            paragraph_texts.append(" ".join(sentence_texts))
        inputs.append(text_cleaning(" ".join(paragraph_texts), is_news=False))

    examples['labels'] = labels
    examples['input'] = inputs
    return examples

In [None]:
indosum_clean = indosum.map(cleaning_indosum, batched = True,remove_columns = ['category', 'gold_labels', 'id', 'paragraphs', 'source', 'source_url', 'summary'])

Map:   0%|          | 0/71353 [00:00<?, ? examples/s]

Map:   0%|          | 0/18774 [00:00<?, ? examples/s]

Map:   0%|          | 0/3743 [00:00<?, ? examples/s]

In [None]:
# Gather the cleaned columns of the selected splits into flat lists; only the
# test split is exported here.
indosum_input, indosum_labels = [], []
for split in ('test',):
    cleaned_split = indosum_clean[split]
    indosum_input += cleaned_split['input']
    indosum_labels += cleaned_split['labels']

In [None]:
df_indosum = pd.DataFrame({'input':indosum_input,'labels':indosum_labels})
df_indosum

Unnamed: 0,input,labels
0,"dilansir afp , seorang warga mesir yang diperc...",eman ahmed abd el aty memiliki berat badan men...
1,menteri pertahanan ryamizard ryacudu menyambut...,menteri pertahanan ryamizard ryacudu menyambut...
2,"meski sudah hampir 12 tahun berlalu , film mea...",rumah produksi film yang dibintangi lindsay lo...
3,"usai melaksanakan ibadah haji , eggi sudjana a...",eggi sudjana akhirnya mendatangi kantor baresk...
4,banyak cara untuk memberikan pengajaran kepada...,game permainan kartu muslim . menggunakan basi...
...,...,...
18769,ketua umum dpp partai amanat nasional zulkifli...,ketua umum dpp partai amanat nasional zulkifli...
18770,"winger timnas indonesia , andik vermansah , di...","winger timnas indonesia , andik vermansah , di..."
18771,"an , tampilan kaskus saat ini sudah jauh berbe...",kaskus saat ini sudah jauh berbeda karena tida...
18772,"kabut asap menyelimuti kota meulaboh , aceh ba...","kabut asap menyelimuti kota meulaboh , aceh ba..."


In [None]:
df_indosum.to_parquet("indosum_test.parquet",compression='gzip')

In [None]:
file_name = "indosum_test.parquet"
!gsutil cp /content/{file_name} gs://{bucket_name}/data_newest/