# Adil Dataset Generator

In [142]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [143]:
# Directory scanned for the source ".txt" files; each file name (up to its
# first dot) is later used as the document id.
TXT_DIR = "../../adil-dataset/txt_fixed"
# Directory the generated CSV dataset is written into.
CSV_DIR = "../../adil-dataset/csv"
# NOTE: despite the "_DIR" suffix this is the path of a single JSON file that
# is loaded with pd.read_json, not a directory.
EXTRA_DATA_DIR = "../peraturan_wide_mongo.json"

In [144]:
# Read every ".txt" file in TXT_DIR and split it into paragraphs (chunks
# separated by a blank line).
#   word_dataset - one list of paragraphs per file
#   indexes      - parallel list repeating the file id (file name up to its
#                  first dot) once per paragraph of that file
word_dataset = []
indexes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the generated dataset stable between runs and machines.
for filename in sorted(os.listdir(TXT_DIR)):
    if not filename.endswith(".txt"):
        continue
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the corpus is UTF-8 encoded - confirm.
    with open(os.path.join(TXT_DIR, filename), "r", encoding="utf-8") as f:
        content_split = f.read().split("\n\n")
    word_dataset.append(content_split)
    indexes.append([filename.split(".")[0]] * len(content_split))

In [145]:
# Flatten the per-file lists into two parallel 1-D arrays: one entry per
# paragraph, and the id of the file that paragraph came from.
np_word_dataset = np.concatenate(word_dataset)
np_indexes = np.concatenate(indexes)

# Print the shape and both lengths so they can be compared by eye.
print(np_word_dataset.shape)
for flattened in (np_word_dataset, np_indexes):
    print(len(flattened))

(125213,)
125213
125213


In [146]:
# Normalise whitespace in every paragraph: squeeze each run of spaces down to
# a single space, then trim leading and trailing whitespace.
multi_space = re.compile(" +")
np_word_dataset = np.char.strip(
    [multi_space.sub(" ", c) for c in np_word_dataset]
)

In [147]:
# One row per paragraph: `id` is the source file id and `content` the
# whitespace-normalised paragraph text.
df = pd.DataFrame({'id': np_indexes, 'content': np_word_dataset})
df

Unnamed: 0,id,content
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri...."
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...
3,11e44c4eb26bdc80ad4e313231363031,TENTANG
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI..."
...,...,...
125208,11e796c4c9968eb8a97e313434323339,"2017, No.182 \n-413-"
125209,11e796c4c9968eb8a97e313434323339,www.peraturan.go.id
125210,11e796c4c9968eb8a97e313434323339,"2017, No.182 -414-"
125211,11e796c4c9968eb8a97e313434323339,www.peraturan.go.id


In [160]:
# Drop rows whose text matches any of the FORBIDDEN alternatives:
#   ^[0-9]+[0-9]\,        starts with two or more digits followed by a comma
#   -\s+[0-9]{1,2}\s+-    "- 12 -" style marker (dash, spaces, 1-2 digits)
#   ^$                    empty paragraph
#   ^www\.                starts with "www."
#   http                  contains "http" anywhere
#   -[0-9]{1,2}-          "-12-" style marker
FORBIDDEN = r"^[0-9]+[0-9]\,|-\s+[0-9]{1,2}\s+-|^$|^www\.|http|-[0-9]{1,2}-"
forbidden_re = re.compile(FORBIDDEN)

# Collect the index labels of the offending rows, then remove them in one go.
indexes = [
    index
    for index, values in df["content"].items()
    if forbidden_re.search(values)
]

df = df.drop(index=indexes)

In [164]:
# Sanity check: print every remaining paragraph that still begins with
# "http". Nothing is printed when no such row is left.
leftover_links = (text for text in df['content'] if text.startswith("http"))
for text in leftover_links:
    print(text)

In [165]:
# Drop very short rows (fewer than 7 characters) unless they begin with one
# of the structural keywords below (matched case-insensitively by lowering
# the text first).
#
# The pattern is deliberately written as one alternation on a single line.
# Continuing a raw string with a trailing backslash does NOT join the lines:
# the backslash, the newline and the indentation of the next line all become
# part of the regex, which made the "cukup" alternative impossible to match.
reg_section = r"^(?:pasal|ayat|bab|undang-undang|huruf|cukup|umum)"
section_re = re.compile(reg_section)

# Index labels of the rows that are both short and not a structural keyword.
indexes2 = [
    index
    for index, values in df["content"].items()
    if len(values) < 7 and not section_re.search(values.lower())
]

df = df.drop(indexes2)

In [166]:
df

Unnamed: 0,id,content
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri...."
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...
3,11e44c4eb26bdc80ad4e313231363031,TENTANG
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI..."
...,...,...
124977,11e796c4c9968eb8a97e313434323339,Diundangkan di Jakarta
124978,11e796c4c9968eb8a97e313434323339,pada tanggal 16 Agustus 2017
124979,11e796c4c9968eb8a97e313434323339,MENTERI HUKUM DAN HAK ASASI MANUSIA
124980,11e796c4c9968eb8a97e313434323339,"REPUBLIK INDONESIA,"


In [167]:
# Renumber the rows 0..n-1 after the drops above; the old index labels are
# discarded rather than kept as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,id,content
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri...."
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...
3,11e44c4eb26bdc80ad4e313231363031,TENTANG
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI..."
...,...,...
96226,11e796c4c9968eb8a97e313434323339,Diundangkan di Jakarta
96227,11e796c4c9968eb8a97e313434323339,pada tanggal 16 Agustus 2017
96228,11e796c4c9968eb8a97e313434323339,MENTERI HUKUM DAN HAK ASASI MANUSIA
96229,11e796c4c9968eb8a97e313434323339,"REPUBLIK INDONESIA,"


## Add Label Column

In [168]:
# Heuristic labelling of every paragraph; the first matching rule wins:
#   title    - starts with "PERATURAN" or "UNDANG-UNDANG" and contains "NOMOR"
#   subtitle - the paragraph directly after a "TENTANG" row of the same file
#   section  - starts with "pasal" or "bab" (case-insensitive)
#   body     - everything else
reg_section = r"^(pasal)|^(bab)"
section_re = re.compile(reg_section)

labels = []
# The previous row is tracked while iterating, so the lookup is positional:
# it does not depend on the index labels of `df` being 0..n-1 and avoids a
# per-row Series lookup. Comparing ids keeps a trailing "TENTANG" in one file
# from turning the first paragraph of the next file into a subtitle.
prev_id, prev_text = None, None
for doc_id, value in zip(df["id"], df["content"]):
    if value.startswith(("PERATURAN", "UNDANG-UNDANG")) and "NOMOR" in value:
        labels.append("title")
    elif prev_text == "TENTANG" and prev_id == doc_id:
        labels.append("subtitle")
    elif section_re.search(value.lower()):
        labels.append("section")
    else:
        labels.append("body")
    prev_id, prev_text = doc_id, value

len(labels)

96231

In [169]:
# Wrap the label list in a Series (default 0..n-1 index) named "labels".
label_series = pd.Series(labels, name="labels")

In [170]:
# Attach the labels as a "labels" column. The values are assigned by position
# (they were produced in row order), so a length mismatch raises immediately
# instead of producing NaN rows through index alignment, and re-running the
# cell overwrites the column rather than appending a duplicate one.
df = df.assign(labels=label_series.to_numpy())
df

Unnamed: 0,id,content,labels
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA,body
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri....",body
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...,title
3,11e44c4eb26bdc80ad4e313231363031,TENTANG,body
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI...",subtitle
...,...,...,...
96226,11e796c4c9968eb8a97e313434323339,Diundangkan di Jakarta,body
96227,11e796c4c9968eb8a97e313434323339,pada tanggal 16 Agustus 2017,body
96228,11e796c4c9968eb8a97e313434323339,MENTERI HUKUM DAN HAK ASASI MANUSIA,body
96229,11e796c4c9968eb8a97e313434323339,"REPUBLIK INDONESIA,",body


In [171]:
# Show only the rows that were labelled as section headings.
df.loc[df["labels"].eq("section")]

Unnamed: 0,id,content,labels
18,11e44c4eb26bdc80ad4e313231363031,BAB I,section
20,11e44c4eb26bdc80ad4e313231363031,Pasal 1,section
29,11e44c4eb26bdc80ad4e313231363031,BAB II,section
32,11e44c4eb26bdc80ad4e313231363031,Pasal 2,section
35,11e44c4eb26bdc80ad4e313231363031,Pasal 3,section
...,...,...,...
96204,11e796c4c9968eb8a97e313434323339,Pasal 569,section
96206,11e796c4c9968eb8a97e313434323339,Pasal 570,section
96211,11e796c4c9968eb8a97e313434323339,Pasal 571,section
96217,11e796c4c9968eb8a97e313434323339,Pasal 572,section


## Create Additional Data

In [172]:
# Load the extra regulation metadata; the cells below use its "_id", "judul"
# and "tentang" columns.
additional_df = pd.read_json(EXTRA_DATA_DIR)

In [173]:
additional_df

Unnamed: 0,_id,judul,tentang
0,11e55063664fdf98942c313133393135,Peraturan Menteri 27/PMK.05/2015 2015,TARIF LAYANAN BADAN LAYANAN UMUM UNIVERSITAS S...
1,11e4a10d2ec42b70bdca303833313131,Peraturan Presiden 83 2014,MAJELIS PERTIMBANGAN TENAGA NUKLIR
2,11e44c50fac7c8c0afa7313233323231,Peraturan Menteri 14/PERMEN-KP/2013 2013,PERUBAHAN ATAS PERATURAN MENTERI KELAUTAN DAN ...
3,11e44c50be1573a08ca1313233303339,Peraturan Menteri 10 2014,PERUBAHAN ATAS PERATURAN MENTERI KOMUNIKASI DA...
4,11e555142d260eb2a68a313035343434,Peraturan Menteri 41/PMK.05/2015 2015,TARIF LAYANAN BADAN LAYANAN UMUM UNIVERSITAS N...
...,...,...,...
9682,11e56d91c8d97264be4b313435343231,Peraturan Lembaga Pemerintah Non Kementerian 5...,PENANGANAN BENTURAN KEPENTINGAN
9683,11e576e91422f1c280b2313231313534,Peraturan Lembaga Pemerintah Non Kementerian 4...,PEDOMAN SISTEM PELAPORAN PELANGGARAN
9684,11e56d924c8a8fb2992d313435383032,Peraturan Lembaga Pemerintah Non Kementerian 4...,SISTEM PENGENDALIAN GRATIFIKASI DI LINGKUNGAN
9685,11e92035123a1126a7d3303730363332,Peraturan Lembaga Pemerintah Non Kementerian 1...,Produk Hukum di Komisi Pemberantasan Korupsi


In [174]:
# "judul" column: used below as additional examples of the "title" label.
title = additional_df["judul"]

In [175]:
# One "title" label per entry of `title`. The Series name given here is
# discarded by the ignore_index=True concat that builds title_ids_df.
title_label = ["title"] * len(title)
title_label_series = pd.Series(title_label, name="label")
title_label_series

0       title
1       title
2       title
3       title
4       title
        ...  
9682    title
9683    title
9684    title
9685    title
9686    title
Name: label, Length: 9687, dtype: object

In [176]:
# "_id" column: reused as the `id` of both the extra title and subtitle rows.
ids = additional_df["_id"]

In [177]:
# Put id, title text and label side by side with the final column names.
# `names=` of pd.concat only names the levels of a hierarchical index built
# from `keys`; it never renames columns, so each Series is renamed explicitly.
title_ids_df = pd.concat(
    [
        ids.rename("id"),
        title.rename("content"),
        title_label_series.rename("labels"),
    ],
    axis=1,
)

In [178]:
# Give the positional columns 0, 1, 2 their dataset names.
title_ids_df = title_ids_df.rename(
    columns={0: "id", 1: 'content', 2: 'labels'}
)

In [179]:
title_ids_df

Unnamed: 0,id,content,labels
0,11e55063664fdf98942c313133393135,Peraturan Menteri 27/PMK.05/2015 2015,title
1,11e4a10d2ec42b70bdca303833313131,Peraturan Presiden 83 2014,title
2,11e44c50fac7c8c0afa7313233323231,Peraturan Menteri 14/PERMEN-KP/2013 2013,title
3,11e44c50be1573a08ca1313233303339,Peraturan Menteri 10 2014,title
4,11e555142d260eb2a68a313035343434,Peraturan Menteri 41/PMK.05/2015 2015,title
...,...,...,...
9682,11e56d91c8d97264be4b313435343231,Peraturan Lembaga Pemerintah Non Kementerian 5...,title
9683,11e576e91422f1c280b2313231313534,Peraturan Lembaga Pemerintah Non Kementerian 4...,title
9684,11e56d924c8a8fb2992d313435383032,Peraturan Lembaga Pemerintah Non Kementerian 4...,title
9685,11e92035123a1126a7d3303730363332,Peraturan Lembaga Pemerintah Non Kementerian 1...,title


In [180]:
# "tentang" column: used below as additional examples of the "subtitle" label.
about = additional_df["tentang"]
about

0       TARIF LAYANAN BADAN LAYANAN UMUM UNIVERSITAS S...
1                      MAJELIS PERTIMBANGAN TENAGA NUKLIR
2       PERUBAHAN ATAS PERATURAN MENTERI KELAUTAN DAN ...
3       PERUBAHAN ATAS PERATURAN MENTERI KOMUNIKASI DA...
4       TARIF LAYANAN BADAN LAYANAN UMUM UNIVERSITAS N...
                              ...                        
9682                      PENANGANAN BENTURAN KEPENTINGAN
9683                 PEDOMAN SISTEM PELAPORAN PELANGGARAN
9684        SISTEM PENGENDALIAN GRATIFIKASI DI LINGKUNGAN
9685         Produk Hukum di Komisi Pemberantasan Korupsi
9686    Organisasi dan Tata Kerja Komisi Pemberantasan...
Name: tentang, Length: 9687, dtype: object

In [181]:
# One "subtitle" label per entry of `about`; print the count and a sample.
about_label = ['subtitle'] * len(about)
print(len(about_label))
print(about_label[:5])

9687
['subtitle', 'subtitle', 'subtitle', 'subtitle', 'subtitle']


In [182]:
# Wrap the subtitle labels in a Series (default 0..n-1 index) named "labels".
about_label_series = pd.Series(about_label, name="labels")
about_label_series

0       subtitle
1       subtitle
2       subtitle
3       subtitle
4       subtitle
          ...   
9682    subtitle
9683    subtitle
9684    subtitle
9685    subtitle
9686    subtitle
Name: labels, Length: 9687, dtype: object

In [183]:
# Put id, subtitle text and label side by side; ignore_index=True yields the
# positional columns 0, 1, 2, which are then given their dataset names.
abouts_df = (
    pd.concat([ids, about, about_label_series], axis=1, ignore_index=True)
    .rename(columns={0: "id", 1: 'content', 2: 'labels'})
)
abouts_df

Unnamed: 0,id,content,labels
0,11e55063664fdf98942c313133393135,TARIF LAYANAN BADAN LAYANAN UMUM UNIVERSITAS S...,subtitle
1,11e4a10d2ec42b70bdca303833313131,MAJELIS PERTIMBANGAN TENAGA NUKLIR,subtitle
2,11e44c50fac7c8c0afa7313233323231,PERUBAHAN ATAS PERATURAN MENTERI KELAUTAN DAN ...,subtitle
3,11e44c50be1573a08ca1313233303339,PERUBAHAN ATAS PERATURAN MENTERI KOMUNIKASI DA...,subtitle
4,11e555142d260eb2a68a313035343434,TARIF LAYANAN BADAN LAYANAN UMUM UNIVERSITAS N...,subtitle
...,...,...,...
9682,11e56d91c8d97264be4b313435343231,PENANGANAN BENTURAN KEPENTINGAN,subtitle
9683,11e576e91422f1c280b2313231313534,PEDOMAN SISTEM PELAPORAN PELANGGARAN,subtitle
9684,11e56d924c8a8fb2992d313435383032,SISTEM PENGENDALIAN GRATIFIKASI DI LINGKUNGAN,subtitle
9685,11e92035123a1126a7d3303730363332,Produk Hukum di Komisi Pemberantasan Korupsi,subtitle


In [184]:
# Stack the extra title rows on top of the extra subtitle rows and renumber
# the combined frame 0..n-1.
fixed_new_df = pd.concat([title_ids_df, abouts_df], ignore_index=True)
fixed_new_df

Unnamed: 0,id,content,labels
0,11e55063664fdf98942c313133393135,Peraturan Menteri 27/PMK.05/2015 2015,title
1,11e4a10d2ec42b70bdca303833313131,Peraturan Presiden 83 2014,title
2,11e44c50fac7c8c0afa7313233323231,Peraturan Menteri 14/PERMEN-KP/2013 2013,title
3,11e44c50be1573a08ca1313233303339,Peraturan Menteri 10 2014,title
4,11e555142d260eb2a68a313035343434,Peraturan Menteri 41/PMK.05/2015 2015,title
...,...,...,...
19369,11e56d91c8d97264be4b313435343231,PENANGANAN BENTURAN KEPENTINGAN,subtitle
19370,11e576e91422f1c280b2313231313534,PEDOMAN SISTEM PELAPORAN PELANGGARAN,subtitle
19371,11e56d924c8a8fb2992d313435383032,SISTEM PENGENDALIAN GRATIFIKASI DI LINGKUNGAN,subtitle
19372,11e92035123a1126a7d3303730363332,Produk Hukum di Komisi Pemberantasan Korupsi,subtitle


In [185]:
# Mix the title and subtitle rows. A fixed random_state (arbitrary value)
# makes the row order, and therefore the saved dataset, reproducible.
fixed_new_df = shuffle(fixed_new_df, random_state=42)

In [186]:
# Renumber the shuffled rows 0..n-1, discarding the pre-shuffle index labels.
fixed_new_df = fixed_new_df.reset_index(drop=True)
fixed_new_df

Unnamed: 0,id,content,labels
0,11e44c51b9085ae0ac5a313233373430,PEDOMAN AKREDITASI LEMBAGA PENYELENGGARA PENDI...,subtitle
1,11e7d414de790b349f36313531393335,Peraturan Lembaga Pemerintah Non Kementerian 1...,title
2,11e723f70ddcbc32a8e3313232323435,Komisi Nasional Pengkajian Sumber Daya Ikan,subtitle
3,11e44c502835c560b15f313232363238,Peraturan Pemerintah 43 1965,title
4,11e44c4ee7bb23c094d2313231373330,PENYELENGGARAAN PENANAMAN MODAL DALAM RANGKA P...,subtitle
...,...,...,...
19369,11e6f7f0d949fdaea1d5313134373239,Peraturan Menteri 12 2016,title
19370,11e44c4f14f3baf0a708313231383436,PERUBAHAN KEDUA ATAS PERATURAN PEMERINTAH NOMO...,subtitle
19371,11e44c4fed61a670bd6f313232343439,PERIZINAN INSTALASI NUKLIR DAN PEMANFAATAN BAH...,subtitle
19372,11e44c50f930d4a0a3ff313233323138,Peraturan Lembaga Pemerintah Non Kementerian 5...,title


## Add Data to Dataframe

In [187]:
# Append the extra title/subtitle rows below the labelled paragraphs and
# renumber the result 0..n-1.
# NOTE: `df` is rebuilt from itself, so running this cell a second time
# appends the extra rows again.
df = pd.concat([df, fixed_new_df], ignore_index=True)
df

Unnamed: 0,id,content,labels
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA,body
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri....",body
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...,title
3,11e44c4eb26bdc80ad4e313231363031,TENTANG,body
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI...",subtitle
...,...,...,...
115600,11e6f7f0d949fdaea1d5313134373239,Peraturan Menteri 12 2016,title
115601,11e44c4f14f3baf0a708313231383436,PERUBAHAN KEDUA ATAS PERATURAN PEMERINTAH NOMO...,subtitle
115602,11e44c4fed61a670bd6f313232343439,PERIZINAN INSTALASI NUKLIR DAN PEMANFAATAN BAH...,subtitle
115603,11e44c50f930d4a0a3ff313233323138,Peraturan Lembaga Pemerintah Non Kementerian 5...,title


## Reduce Body & Section Label Data

In [188]:
# Down-sample the dominant "body" class: drop a random 80% of its rows.
# A fixed random_state (arbitrary value) makes the selection reproducible.
body_rows = df[df['labels'] == 'body']
df = df.drop(body_rows.sample(frac=.8, random_state=42).index)

In [189]:
# Down-sample the "section" class: drop a random 50% of its rows.
# A fixed random_state (arbitrary value) makes the selection reproducible.
section_rows = df[df['labels'] == 'section']
df = df.drop(section_rows.sample(frac=.5, random_state=42).index)

In [190]:
# Renumber the remaining rows 0..n-1 after the down-sampling drops.
df = df.reset_index(drop=True)

In [191]:
df

Unnamed: 0,id,content,labels
0,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...,title
1,11e44c4eb26bdc80ad4e313231363031,TENTANG,body
2,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI...",subtitle
3,11e44c4eb26bdc80ad4e313231363031,DENGAN RAHMAT TUHAN YANG MAHA ESA\nPRESIDEN RE...,body
4,11e44c4eb26bdc80ad4e313231363031,Menimbang : a. bahwa berdasarkan Undang-Undang...,body
...,...,...,...
44131,11e6f7f0d949fdaea1d5313134373239,Peraturan Menteri 12 2016,title
44132,11e44c4f14f3baf0a708313231383436,PERUBAHAN KEDUA ATAS PERATURAN PEMERINTAH NOMO...,subtitle
44133,11e44c4fed61a670bd6f313232343439,PERIZINAN INSTALASI NUKLIR DAN PEMANFAATAN BAH...,subtitle
44134,11e44c50f930d4a0a3ff313233323138,Peraturan Lembaga Pemerintah Non Kementerian 5...,title


## Save Data To CSV

In [192]:
# Write the final dataset without the index column. The output directory is
# created first so a fresh checkout does not fail on a missing folder, and
# the path is built with os.path.join instead of string concatenation.
os.makedirs(CSV_DIR, exist_ok=True)
df.to_csv(os.path.join(CSV_DIR, "data_v2.csv"), index=False)