# Adil Dataset Generator

In [2]:
import os
import re
import numpy as np
import pandas as pd

In [3]:
# Dataset locations, given relative to the notebook's working directory.
# TXT_DIR is scanned for the input ".txt" documents;
# CSV_DIR is where the final "data.csv" is written.
TXT_DIR = "../../adil-dataset/txt-fixed"
CSV_DIR = "../../adil-dataset/csv"

In [50]:
# Read every ".txt" document in TXT_DIR and split it into chunks separated by
# a blank line.  word_dataset[k] holds the chunks of the k-th file and
# indexes[k] repeats that file's id once per chunk, so both lists stay
# position-aligned when they are flattened later.
word_dataset = []
indexes = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the generated dataset reproducible across runs and machines.
for filename in sorted(os.listdir(TXT_DIR)):
    if not filename.endswith(".txt"):
        continue
    # Explicit encoding instead of the locale-dependent default.
    # NOTE(review): assumes the txt-fixed files are UTF-8 - confirm.
    with open(os.path.join(TXT_DIR, filename), "r", encoding="utf-8") as f:
        content = f.read()
    content_split = content.split("\n\n")
    # The document id is the part of the file name before the first dot.
    doc_id = filename.split(".")[0]
    word_dataset.append(content_split)
    indexes.append([doc_id] * len(content_split))

In [51]:
# Flatten the per-file lists into two flat arrays: one text chunk per element
# and, at the same position, the id of the file that chunk came from.
np_indexes = np.hstack(indexes)
np_word_dataset = np.hstack(word_dataset)

# Print the sizes so the two arrays can be compared by eye.
for size in (np_word_dataset.shape, len(np_word_dataset), len(np_indexes)):
    print(size)

(125208,)
125208
125208


In [66]:
# Collapse every run of spaces into a single space, then trim leading and
# trailing whitespace from each chunk.
collapsed_chunks = [re.sub(" +", " ", chunk) for chunk in np_word_dataset]
np_word_dataset = np.char.strip(collapsed_chunks)

In [95]:
# One row per text chunk: the source file id next to the cleaned chunk text.
columns = {"id": np_indexes, "content": np_word_dataset}
df = pd.DataFrame(columns)
df

Unnamed: 0,id,content
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri...."
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...
3,11e44c4eb26bdc80ad4e313231363031,TENTANG
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI..."
...,...,...
125203,11e796c4c9968eb8a97e313434323339,"2017, No.182 \n-413-"
125204,11e796c4c9968eb8a97e313434323339,www.peraturan.go.id
125205,11e796c4c9968eb8a97e313434323339,"2017, No.182 -414-"
125206,11e796c4c9968eb8a97e313434323339,www.peraturan.go.id


In [96]:
# Drop rows whose content matches FORBIDDEN: numeric page headers, page
# numbers, empty chunks and URLs.
#
# The pattern is assembled from adjacent raw-string literals.  A trailing
# backslash inside a raw string is NOT a line continuation: the backslash, the
# newline and the next line's indentation all end up inside the regex, which
# made the "^http://" alternative impossible to match.
FORBIDDEN = (
    r"^[0-9]+[0-9]\,"        # starts with 2+ digits and a comma, e.g. "2017, No.182"
    r"|-\s+[0-9]{1,2}\s+-"   # spaced page number, e.g. "- 12 -"
    r"|^$"                   # empty chunk
    r"|^www\."               # starts with "www."
    r"|^http:\/\/"           # starts with "http://"
    r"|-[0-9]{1,2}-"         # compact page number, e.g. "-12-"
)
forbidden_re = re.compile(FORBIDDEN)

# A dedicated name is used for the row labels so that the `indexes` list of
# file ids built while loading the documents is not overwritten.
forbidden_rows = [
    row_label
    for row_label, text in df["content"].items()
    if forbidden_re.search(text)
]

df = df.drop(forbidden_rows)

In [97]:
# Display the frame after the rows matching FORBIDDEN were dropped.
df

Unnamed: 0,id,content
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri...."
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...
3,11e44c4eb26bdc80ad4e313231363031,TENTANG
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI..."
...,...,...
124973,11e796c4c9968eb8a97e313434323339,pada tanggal 16 Agustus 2017
124974,11e796c4c9968eb8a97e313434323339,MENTERI HUKUM DAN HAK ASASI MANUSIA
124975,11e796c4c9968eb8a97e313434323339,"REPUBLIK INDONESIA,"
124976,11e796c4c9968eb8a97e313434323339,ttd


In [98]:
# Drop rows that are too short to carry content, unless they start with one of
# the structural keywords listed in reg_section (matched case-insensitively by
# lower-casing the text first).
#
# The pattern is assembled from adjacent raw-string literals.  A trailing
# backslash inside a raw string is NOT a line continuation: it would put the
# backslash, the newline and the indentation into the regex and make the
# "^(cukup)" alternative impossible to match.
MIN_CONTENT_LENGTH = 7  # rows shorter than this many characters are dropped
reg_section = (
    r"^(pasal)|^(ayat)|^(bab)|^(undang-undang)|^(huruf)"
    r"|^(cukup)|^(umum)"
)
keyword_re = re.compile(reg_section)

indexes2 = [
    row_label
    for row_label, text in df["content"].items()
    if len(text) < MIN_CONTENT_LENGTH and not keyword_re.search(text.lower())
]

df = df.drop(indexes2)

In [99]:
# Display the frame after the short rows were dropped.
df

Unnamed: 0,id,content
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri...."
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...
3,11e44c4eb26bdc80ad4e313231363031,TENTANG
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI..."
...,...,...
124972,11e796c4c9968eb8a97e313434323339,Diundangkan di Jakarta
124973,11e796c4c9968eb8a97e313434323339,pada tanggal 16 Agustus 2017
124974,11e796c4c9968eb8a97e313434323339,MENTERI HUKUM DAN HAK ASASI MANUSIA
124975,11e796c4c9968eb8a97e313434323339,"REPUBLIK INDONESIA,"


In [101]:
# Give the frame a fresh 0..n-1 index; drop=True discards the old row labels
# instead of keeping them as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,id,content
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri...."
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...
3,11e44c4eb26bdc80ad4e313231363031,TENTANG
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI..."
...,...,...
98712,11e796c4c9968eb8a97e313434323339,Diundangkan di Jakarta
98713,11e796c4c9968eb8a97e313434323339,pada tanggal 16 Agustus 2017
98714,11e796c4c9968eb8a97e313434323339,MENTERI HUKUM DAN HAK ASASI MANUSIA
98715,11e796c4c9968eb8a97e313434323339,"REPUBLIK INDONESIA,"


In [111]:
# Assign one label per row, first matching rule wins:
#   "title"    - starts with "PERATURAN" or "UNDANG-UNDANG" and contains "NOMOR"
#   "subtitle" - the row directly before it is exactly "TENTANG"
#   "section"  - starts with "pasal" or "bab" (case-insensitive via lower())
#   "body"     - everything else
labels = []
reg_section = r"^(pasal)|^(bab)"
section_re = re.compile(reg_section)

contents = df["content"].tolist()
# previous_contents[k] is the row positioned right before contents[k] (None for
# the first row).  Pairing rows by position keeps the "subtitle" rule correct
# even when the frame's index labels are not a contiguous 0..n-1 range, which a
# label lookup such as df["content"][i-1] silently depends on.
previous_contents = [None] + contents[:-1]

for value, previous in zip(contents, previous_contents):
    if value.startswith(("PERATURAN", "UNDANG-UNDANG")) and "NOMOR" in value:
        labels.append("title")
    elif previous == "TENTANG":
        labels.append("subtitle")
    elif section_re.search(value.lower()):
        labels.append("section")
    else:
        labels.append("body")

len(labels)

98717

In [112]:
# Wrap the label list in a Series; its name becomes the column header.
label_series = pd.Series(data=labels, name="labels")

In [113]:
# Attach the labels as an extra column; rows are paired by index label.
df = pd.concat(objs=[df, label_series], axis="columns")
df

Unnamed: 0,id,content,labels
0,11e44c4eb26bdc80ad4e313231363031,LEMBARAN NEGARA\nREPUBLIK INDONESIA,body
1,11e44c4eb26bdc80ad4e313231363031,"No.41, 2013 POLITIK. PEMILU. Pengunduran Diri....",body
2,11e44c4eb26bdc80ad4e313231363031,PERATURAN PEMERINTAH REPUBLIK INDONESIA\nNOMOR...,title
3,11e44c4eb26bdc80ad4e313231363031,TENTANG,body
4,11e44c4eb26bdc80ad4e313231363031,"TATA CARA PENGUNDURAN DIRI KEPALA DAERAH, WAKI...",subtitle
...,...,...,...
98712,11e796c4c9968eb8a97e313434323339,Diundangkan di Jakarta,body
98713,11e796c4c9968eb8a97e313434323339,pada tanggal 16 Agustus 2017,body
98714,11e796c4c9968eb8a97e313434323339,MENTERI HUKUM DAN HAK ASASI MANUSIA,body
98715,11e796c4c9968eb8a97e313434323339,"REPUBLIK INDONESIA,",body


In [115]:
# Show only the rows labelled "section".
df.loc[df["labels"].eq("section")]

Unnamed: 0,id,content,labels
18,11e44c4eb26bdc80ad4e313231363031,BAB I,section
20,11e44c4eb26bdc80ad4e313231363031,Pasal 1,section
29,11e44c4eb26bdc80ad4e313231363031,BAB II,section
32,11e44c4eb26bdc80ad4e313231363031,Pasal 2,section
35,11e44c4eb26bdc80ad4e313231363031,Pasal 3,section
...,...,...,...
98690,11e796c4c9968eb8a97e313434323339,Pasal 569,section
98692,11e796c4c9968eb8a97e313434323339,Pasal 570,section
98697,11e796c4c9968eb8a97e313434323339,Pasal 571,section
98703,11e796c4c9968eb8a97e313434323339,Pasal 572,section


In [116]:
# Write the labelled dataset to disk; index=False leaves the row index out of
# the file.
output_path = f"{CSV_DIR}/data.csv"
df.to_csv(output_path, index=False)