In [1]:
import pandas as pd
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import json
from tqdm import tqdm
from unidecode import unidecode

In [3]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/parsing/indonesia-conll/Indonesian/id-common_crawl-000.conllu.jsonl
# !wget https://f000.backblazeb2.com/file/malay-dataset/parsing/indonesia-conll/Indonesian/id-common_crawl-001.conllu.jsonl
# !wget https://f000.backblazeb2.com/file/malay-dataset/parsing/indonesia-conll/Indonesian/id-wikipedia-000.conllu.jsonl
# !wget https://f000.backblazeb2.com/file/malay-dataset/parsing/indonesia-conll/Indonesian/id-wikipedia-001.conllu.jsonl
# !wget https://f000.backblazeb2.com/file/malay-dataset/parsing/indonesia-conll/Indonesian/id-wikipedia-002.conllu.jsonl

In [4]:
# Peek at the first two records. As the output below shows, each line is a JSON
# list of raw CoNLL-U lines for one sentence: a leading "# text = ..." comment
# followed by one tab-separated row per token.
!head -n 2 id-common_crawl-000.conllu.jsonl

["# text = Hotel di dekat Cadogan Hall\n", "1\tHotel\t_\tNOUN\t_\t_\t0\troot\t_\t_\n", "2\tdi\t_\tADP\t_\t_\t4\tcase\t_\t_\n", "3\tdekat\t_\tADJ\t_\t_\t4\tamod\t_\t_\n", "4\tCadogan\t_\tPROPN\t_\t_\t1\tnmod\t_\t_\n", "5\tHall\t_\tPROPN\t_\t_\t4\tflat\t_\t_\n"]
["# text = Ini bisa menjadi masalah hardware atau masalah perangkat lunak.\n", "1\tIni\t_\tDET\t_\t_\t3\tnsubj\t_\t_\n", "2\tbisa\t_\tADV\t_\t_\t3\tadvmod\t_\t_\n", "3\tmenjadi\t_\tVERB\t_\t_\t0\troot\t_\t_\n", "4\tmasalah\t_\tNOUN\t_\t_\t3\tobj\t_\t_\n", "5\thardware\t_\tNOUN\t_\t_\t4\tcompound\t_\t_\n", "6\tatau\t_\tCCONJ\t_\t_\t7\tcc\t_\t_\n", "7\tmasalah\t_\tNOUN\t_\t_\t4\tconj\t_\t_\n", "8\tperangkat\t_\tNOUN\t_\t_\t7\tcompound\t_\t_\n", "9\tlunak\t_\tADJ\t_\t_\t8\tamod\t_\tSpaceAfter=No\n", "10\t.\t_\tPUNCT\t_\t_\t3\tpunct\t_\t_\n"]


In [5]:
# Sentence counts per input file (one JSON record per line).
# Note: id-common_crawl-001.conllu.jsonl is not counted here, although the
# glob-based conversion further down does process it.
!wc -l id-common_crawl-000.conllu.jsonl
!wc -l id-wikipedia-000.conllu.jsonl
!wc -l id-wikipedia-001.conllu.jsonl
!wc -l id-wikipedia-002.conllu.jsonl

853816 id-common_crawl-000.conllu.jsonl
1451051 id-wikipedia-000.conllu.jsonl
1788269 id-wikipedia-001.conllu.jsonl
1765019 id-wikipedia-002.conllu.jsonl


In [6]:
# Dependency-relation vocabulary. A label's id is its position in this list, so
# the order must not change (ids are baked into the generated dataset).
# 'PAD' and 'X' occupy ids 0 and 1 -- presumably padding / placeholder labels
# used by the downstream model; confirm against the training code.
_DEPENDENCY_LABELS = [
    'PAD', 'X',
    'nsubj', 'cop', 'det', 'root', 'nsubj:pass', 'acl', 'case', 'obl',
    'flat', 'punct', 'appos', 'amod', 'compound', 'advmod', 'cc', 'obj',
    'conj', 'mark', 'advcl', 'nmod', 'nummod', 'dep', 'xcomp', 'ccomp',
    'parataxis', 'compound:plur', 'fixed', 'aux', 'csubj', 'iobj',
    'csubj:pass',
]

# Maps relation label -> integer id (0..32).
tag2idx = {label: index for index, label in enumerate(_DEPENDENCY_LABELS)}

In [10]:
def get_train(group):
    """Turn the token rows of one sentence into three parallel lists.

    Parameters
    ----------
    group : iterable of str
        Tab-separated token rows. Column 1 is read as the word form, column 6
        as the head index and column 7 as the dependency-relation label.

    Returns
    -------
    tuple of (list, list, list)
        ``texts`` -- word forms passed through ``unidecode``;
        ``arcs``  -- head indices converted with ``int``;
        ``tags``  -- relation labels looked up in the module-level ``tag2idx``.

    Raises
    ------
    IndexError
        If a row has fewer than eight tab-separated columns.
    ValueError
        If a head column is not an integer literal.
    KeyError
        If a relation label is missing from ``tag2idx``.
    """
    # Convert row by row so a malformed row fails at the same point, with the
    # same exception, as a plain sequential loop would.
    parsed = [
        (unidecode(columns[1]), int(columns[6]), tag2idx[columns[7]])
        for columns in (row.split('\t') for row in group)
    ]
    # Transpose rows -> columns; an empty sentence yields three empty lists.
    texts, arcs, tags = [list(column) for column in zip(*parsed)] or [[], [], []]
    return texts, arcs, tags

In [8]:
from glob import glob

# Every *.conllu.jsonl file in the current working directory, in sorted order
# so the files are always processed in the same sequence.
files = sorted(glob('*.conllu.jsonl'))
files

['id-common_crawl-000.conllu.jsonl',
 'id-common_crawl-001.conllu.jsonl',
 'id-wikipedia-000.conllu.jsonl',
 'id-wikipedia-001.conllu.jsonl',
 'id-wikipedia-002.conllu.jsonl']

In [11]:
from tqdm import tqdm

# Convert every input file into one JSON-lines training file.
#
# Each input line is a JSON list of raw CoNLL-U lines for one sentence. For
# every sentence that passes the filters below, one line of the form
#   {"translation": {"texts": [...], "arcs": [...], "tags": [...]}}
# is appended to train.json.

# Filter thresholds (token counts exclude '#' comment lines).
MAX_TOKENS = 200            # sentences longer than this are always dropped
MAX_TOKEN_CHARS = 50        # a token longer than this counts as oversized
LONG_SENTENCE_TOKENS = 120  # see the combined filter below

# Explicit UTF-8 so reading the corpus does not depend on the locale default.
with open('train.json', 'w', encoding='utf-8') as train:
    for path in files:
        print(path)
        count = 0
        with open(path, encoding='utf-8') as source:
            for line in tqdm(source):
                # Keep only token rows. `startswith` (rather than indexing
                # [0]) also tolerates an empty string in the list.
                # `data` is intentionally left bound after the loop: the next
                # cell displays it to inspect the last sentence read.
                data = [d for d in json.loads(line) if not d.startswith('#')]

                # Skip sentences with no token rows (they would become empty
                # training records) and over-long sentences.
                if not data or len(data) > MAX_TOKENS:
                    continue

                try:
                    texts, arcs, tags = get_train(data)
                except (IndexError, KeyError, ValueError) as e:
                    # Best-effort conversion: report rows get_train cannot
                    # parse (missing columns, non-integer head, unknown
                    # relation label) and move on. Other errors, e.g. a failed
                    # write, are no longer swallowed.
                    print(f'{type(e).__name__}: {e}')
                    print(data)
                    continue

                # Drop sentences that are both long (> LONG_SENTENCE_TOKENS)
                # and contain an oversized token.
                # NOTE(review): the original condition joins these with `and`;
                # if the intent was to drop on either condition this should be
                # `or` -- left unchanged pending confirmation.
                if len(data) > LONG_SENTENCE_TOKENS and any(
                    len(t) > MAX_TOKEN_CHARS for t in texts
                ):
                    continue

                record = {'translation': {
                    'texts': texts,
                    'arcs': arcs,
                    'tags': tags,
                }}
                train.write(f'{json.dumps(record)}\n')
                count += 1

        print(f'kept {count} sentences from {path}')

id-common_crawl-000.conllu.jsonl


853816it [00:18, 46091.10it/s]


id-common_crawl-001.conllu.jsonl


828815it [00:18, 45351.89it/s]


id-wikipedia-000.conllu.jsonl


1451051it [00:23, 61154.67it/s]


id-wikipedia-001.conllu.jsonl


1788269it [00:25, 70645.21it/s] 


id-wikipedia-002.conllu.jsonl


1765019it [00:24, 71442.17it/s]


In [12]:
# `data` is the variable left over from the conversion loop above, i.e. the
# token rows of the last sentence it read. This cell only works after that
# loop has been run in the same kernel.
data

['1\t31\t_\tNUM\t_\tNumType=Card\t2\tnummod\t_\t_\n',
 '2\tJeff\t_\tPROPN\t_\t_\t0\troot\t_\t_\n',
 '3\tBurton\t_\tPROPN\t_\t_\t2\tflat\t_\t_\n',
 '4\tRichard\t_\tPROPN\t_\t_\t3\tflat\t_\t_\n',
 '5\tChildress\t_\tPROPN\t_\t_\t4\tflat\t_\t_\n',
 '6\tRacing\t_\tPROPN\t_\t_\t5\tflat\t_\t_\n',
 '7\tChevrolet\t_\tPROPN\t_\t_\t6\tflat\t_\t_\n',
 '8\t46.686\t_\tNUM\t_\tNumType=Card\t7\tnummod\t_\t_\n',
 '9\t192.777\t_\tNUM\t_\tNumType=Card\t7\tnummod\t_\t_\n']

In [13]:
# Shuffle the full dataset, then copy its first 1000 lines into test.json.
# NOTE(review): `shuf` is run without a fixed random source, so the order (and
# therefore the test set) differs on every run.
# NOTE(review): the 1000 test lines are copied, not removed, so they are still
# present in shuffled-train.json -- if that file is used for training as-is,
# train and test overlap. Confirm how the training code consumes these files.
!shuf train.json > shuffled-train.json
!head -n 1000 shuffled-train.json > test.json

In [14]:
!wc -l shuffled-train.json

6686644 shuffled-train.json


In [15]:
!head -n 100 shuffled-train.json

{"translation": {"texts": ["Penyakit", "ini", "dinamai", "dari", "Moritz", "Kaposi", "(", "1837-1902", ")", ",", "seorang", "ahli", "ilmu", "penyakit", "kulit", "Hongaria", "yang", "pertama", "kali", "menjelaskan", "gejala", "penyakit", "ini", "pada", "tahun", "1872", "."], "arcs": [3, 1, 0, 5, 3, 5, 8, 5, 8, 12, 12, 5, 12, 13, 14, 15, 18, 12, 18, 3, 20, 21, 21, 25, 20, 25, 3], "tags": [6, 4, 5, 8, 9, 10, 11, 12, 11, 11, 4, 18, 14, 14, 14, 10, 2, 13, 15, 20, 17, 14, 4, 8, 9, 22, 11]}}
{"translation": {"texts": ["\"", "Secret", "of", "Success", "\""], "arcs": [2, 0, 2, 3, 2], "tags": [11, 5, 21, 21, 11]}}
{"translation": {"texts": ["Pada", "tanggal", "27", "Oktober", ",", "Laksamana", "Inggris-Seymour", "(", "Michael", "Seymour", ")", "mengirim", "surat", "kepada", "Ye", "Ming", "Chen", "dengan", "mengulangi", "permintaan", "untuk", "memasuki", "kota", ";", "karena", "tidak", "mendapat", "jawaban", "dari", "Ye", "Ming", "Chen", ",", "Kantor", "Gubernur", "dibom", "sekali", "dalam", "5