# Filtering IndoSum

# Importing Libraries

In [1]:
import os
import random
import json
import numpy as np
from pathlib import Path

# Seed both the standard-library and NumPy global generators with the same
# value so the shuffles performed further down are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Downloading Dataset

In [2]:
# Download indosum.tar.gz.
os.system('gdown https://drive.google.com/uc?id=1jPHVCx33-nseIKdLV8lBu6SCwESwl0te')
# Extracting indosum.tar.gz.
! tar -xvf indosum.tar.gz
! rm indosum.tar.gz

indosum/
indosum/test.02.jsonl
indosum/README.txt
indosum/train.05.jsonl
indosum/train.04.jsonl
indosum/test.04.jsonl
indosum/train.01.jsonl
indosum/test.05.jsonl
indosum/test.03.jsonl
indosum/test.01.jsonl
indosum/dev.04.jsonl
indosum/dev.01.jsonl
indosum/dev.02.jsonl
indosum/train.02.jsonl
indosum/dev.03.jsonl
indosum/dev.05.jsonl
indosum/CHANGELOG.txt
indosum/train.03.jsonl


# Helper Functions

In [3]:
def countToken(jsonObj):
  """
  Return the total number of tokens in a document.

  Walks jsonObj["paragraphs"] as paragraphs -> sentences and adds up the
  length of every sentence. Returns 0 when there are no sentences.
  """
  return sum(
    len(sentence)
    for paragraph in jsonObj["paragraphs"]
    for sentence in paragraph
  )

# Get unique documents and filter them.

In [4]:
#####################################################
# Get unique data, filter them, and classify them. #
#####################################################
folds = 5
# Inclusive token-count bounds; documents outside this range are dropped.
MIN_TOKENS, MAX_TOKENS = 210, 460
categoryDict = {"tajuk utama":{}, "teknologi":{}, "hiburan":{}, "olahraga":{}, "showbiz":{}, "inspirasi":{}}
for k in range(1, folds+1): # Loop over k.
  # The loop variable is called `split` (not `type`) so the built-in `type`
  # is not left rebound to a string for every later cell in the kernel.
  for split in ["train", "dev", "test"]: # Loop over splits.
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(f"indosum/{split}.0{k}.jsonl", encoding="utf-8") as lines:
      for line in lines:
        jsonObj = json.loads(line)
        tokenCount = countToken(jsonObj)

        # Filter based on number of tokens.
        if tokenCount < MIN_TOKENS or tokenCount > MAX_TOKENS:
          continue

        # Pool the data into a dictionary keyed by document id, so a document
        # read again from another fold file overwrites itself instead of
        # being counted twice.
        categoryDict[jsonObj["category"]][jsonObj["id"]] = jsonObj

# Count the pooled documents across all categories.
total = sum(len(documents) for documents in categoryDict.values())
print(f"Number of Unique: {total}")

Number of Unique: 13448


# Subclassify based on number of tokens.

In [5]:
##############################################
# Bucket every document by its token count. #
##############################################
subCategoryDict = {"tajuk utama":{}, "teknologi":{}, "hiburan":{}, "olahraga":{}, "showbiz":{}, "inspirasi":{}}
# Inclusive upper bound of each bucket, in ascending order. The first bucket
# starts at 210 (inclusive); each later one starts just above the previous bound.
upperBounds = (("sub1", 260), ("sub2", 310), ("sub3", 360), ("sub4", 410), ("sub5", 460))
for documents in categoryDict.values(): # Loop over categories.
  for doc in documents.values(): # Loop over documents.
    nTokens = countToken(doc) # Count tokens.

    # Take the first bucket whose upper bound is not exceeded.
    # `sub` stays None for counts outside 210..460.
    sub = None
    if nTokens >= 210:
      for label, upper in upperBounds:
        if nTokens <= upper:
          sub = label
          break

    # Create the bucket list on first use, then add the document.
    subCategoryDict[doc["category"]].setdefault(sub, []).append(doc)

# Sanity check: bucketing must not lose or duplicate documents.
total = sum(
  len(docList)
  for buckets in subCategoryDict.values()
  for docList in buckets.values()
)
print(f"Number of Unique: {total}")

Number of Unique: 13448


# Shuffle the data within each subclass and split it into k folds.

In [6]:
###################################################################
# Shuffle each (category, subclass) list and cut it into k folds. #
###################################################################
folds = 5
finalCategoryDict = {"tajuk utama":{}, "teknologi":{}, "hiburan":{}, "olahraga":{}, "showbiz":{}, "inspirasi":{}}
for category, buckets in subCategoryDict.items(): # Loop over categories.
  for sub in buckets: # Loop over subcategories.
    docList = buckets[sub]
    # In-place shuffle: the list stored in subCategoryDict is reordered too.
    random.shuffle(docList)
    # np.array_split returns a list of `folds` NumPy sub-arrays of
    # near-equal length.
    finalCategoryDict[category][sub] = np.array_split(docList, folds)

# Sanity check: splitting must not lose or duplicate documents.
total = sum(
  len(chunk)
  for buckets in finalCategoryDict.values() # Loop over categories.
  for chunks in buckets.values() # Loop over subcategories.
  for chunk in chunks # Loop over folds.
)
print(f"Number of Unique: {total}")

Number of Unique: 13448


# Save into files.

In [7]:
#####################
# Save into files. #
###################
! rm -rf filtered_indosum # Be careful! Don't accidentally remove all your data.
folder = "filtered_indosum"
Path(folder).mkdir(exist_ok=True)

folds = 5
for k in range(1, folds+1): # Loop over k.
  trainData = []
  valData = []
  testData = []

  for dictionary in finalCategoryDict.values(): # Loop over categories.
    for jList in dictionary.values(): # Loop over subcategories.
      jsonList = jList.copy() # Clone the data. jList and jsonList contain 5 splits.
      testList = jsonList.pop(k-1) # Take (k-1)th split. Pop and remove data at a time.
      random.shuffle(testList) # Shuffle the data.

      testLength = len(testList)
      valLength = int(testLength / 4)

      # Pool the training data.
      jsonList = [jsonObj for jsonSubList in jsonList for jsonObj in jsonSubList]
      trainData.extend(jsonList)
      # Pool the validation data.
      valData.extend(testList[0:valLength]) # Taken from 1/4 of testList.
      # Pool the testing data.
      testData.extend(testList[valLength:testLength]) # Taken from  3/4 of testList.

  # Shuffle again before saving.
  random.shuffle(trainData)
  random.shuffle(valData)
  random.shuffle(testData)

  # Save into files.
  with open(f"{folder}/train.0{k}.jsonl", "w") as outfile:
    for jsonObj in trainData:
      json.dump(jsonObj, outfile)
      outfile.write('\n')
  print(f"{folder}/train.0{k}.jsonl")

  with open(f"{folder}/dev.0{k}.jsonl", "w") as outfile:
    for jsonObj in valData:
      json.dump(jsonObj, outfile)
      outfile.write('\n')
  print(f"{folder}/dev.0{k}.jsonl")

  with open(f"{folder}/test.0{k}.jsonl", "w") as outfile:
    for jsonObj in testData:
      json.dump(jsonObj, outfile)
      outfile.write('\n')
  print(f"{folder}/test.0{k}.jsonl")

filtered_indosum/train.01.jsonl
filtered_indosum/dev.01.jsonl
filtered_indosum/test.01.jsonl
filtered_indosum/train.02.jsonl
filtered_indosum/dev.02.jsonl
filtered_indosum/test.02.jsonl
filtered_indosum/train.03.jsonl
filtered_indosum/dev.03.jsonl
filtered_indosum/test.03.jsonl
filtered_indosum/train.04.jsonl
filtered_indosum/dev.04.jsonl
filtered_indosum/test.04.jsonl
filtered_indosum/train.05.jsonl
filtered_indosum/dev.05.jsonl
filtered_indosum/test.05.jsonl


# Compress the folder.

In [8]:
# Disabled alternative, kept for reference: run the same command from
# drive/MyDrive/ (presumably a mounted Google Drive folder - confirm before enabling).
# ! cd drive/MyDrive/ && tar -czvf filtered_indosum.tar.gz filtered_indosum
# Pack the filtered_indosum folder into filtered_indosum.tar.gz (gzip-compressed) in the current directory.
! tar -czvf filtered_indosum.tar.gz filtered_indosum

filtered_indosum/
filtered_indosum/test.05.jsonl
filtered_indosum/test.02.jsonl
filtered_indosum/train.02.jsonl
filtered_indosum/train.04.jsonl
filtered_indosum/train.05.jsonl
filtered_indosum/test.03.jsonl
filtered_indosum/train.01.jsonl
filtered_indosum/dev.04.jsonl
filtered_indosum/test.01.jsonl
filtered_indosum/dev.05.jsonl
filtered_indosum/dev.01.jsonl
filtered_indosum/dev.02.jsonl
filtered_indosum/dev.03.jsonl
filtered_indosum/test.04.jsonl
filtered_indosum/train.03.jsonl
