# Simple EDA for IndoSum dataset

## Import everything needed

In [None]:
import pandas as pd
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

In [None]:
# Global plotting configuration.

# Select seaborn's 'ticks' axes style.
sns.set_style('ticks')

## Finding files

In [None]:
# Detect train, dev, and test files
DATASET_ROOT = '/kaggle/input/indosum/indosum'

# os.listdir() returns entries in arbitrary order; sorting keeps the split
# lists (and anything indexed or concatenated from them) reproducible.
files_id_dir = sorted(os.listdir(DATASET_ROOT))
train_files = []
dev_files = []
test_files = []

# Route each file to a split by the marker contained in its name. The checks
# run in this order, and names without any marker are ignored.
for filename in files_id_dir:
    if 'train' in filename:
        train_files.append(filename)
    elif 'dev' in filename:
        dev_files.append(filename)
    elif 'test' in filename:
        test_files.append(filename)

# Display the detected files per split.
train_files, dev_files, test_files

## Pre-processing Data

### Function to load a file into a list of JSON objects

In [None]:
def load_file_to_json_list(filename):
    """Load a JSON Lines file from DATASET_ROOT into a list of parsed objects.

    Args:
        filename: Name of the file, relative to DATASET_ROOT.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file = os.path.join(DATASET_ROOT, filename)
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding, which differs between systems.
    with open(file, 'r', encoding='utf-8') as f:
        # Read all lines up front so the progress bar knows the total.
        json_lines = f.readlines()

    data = []
    for json_str in tqdm(json_lines, desc=f'Loading data {filename}'):
        # Tolerate blank lines instead of failing inside json.loads().
        if not json_str.strip():
            continue
        data.append(json.loads(json_str))
    return data

In [None]:
# Smoke test only: load the first detected dev file, then show the number of
# records and the keys of the first record.
json_list = load_file_to_json_list(dev_files[0])
len(json_list), json_list[0].keys()

### Converter from raw data to more readable data

In [None]:
def label_to_dict_str(label_list):
    """Serialize labels as a JSON object keyed by their position.

    Args:
        label_list: Iterable of JSON-serializable label entries (one entry
            per paragraph, going by the key naming used in this notebook).

    Returns:
        A ``(json_str, num)`` tuple: the JSON text of ``{index: entry}``
        (json.dumps turns the integer indices into string keys) and the
        number of entries.
    """
    # key = paragraph_id : value = label list
    label_dict = dict(enumerate(label_list))
    return json.dumps(label_dict), len(label_dict)

In [None]:
# Smoke test only: convert the gold labels of the first loaded record.
label_to_dict_str(json_list[0]['gold_labels'])

In [None]:
def paragraph_to_dict_str(paragraph_list):
    """Serialize paragraphs as a JSON object keyed by paragraph position.

    Each paragraph is an iterable of sentences and each sentence an iterable
    of string tokens; the tokens of a sentence are joined with single spaces.

    Returns:
        A ``(json_str, num)`` tuple: the JSON text mapping paragraph index to
        its list of sentence strings, and the number of paragraphs.
    """
    # key = paragraph_id : value = list of sentence strings
    sentences_by_index = {
        index: [' '.join(tokens) for tokens in sentences]
        for index, sentences in enumerate(paragraph_list)
    }
    serialized = json.dumps(sentences_by_index)
    return serialized, len(sentences_by_index)

In [None]:
# Smoke test only: convert the paragraphs of the first loaded record.
paragraph_list = json_list[0]['paragraphs']
paragraph_to_dict_str(paragraph_list)

In [None]:
def paragraph_to_text(raw_paragraph_list):
    """Flatten tokenized paragraphs into one space-separated string.

    Tokens are joined into sentences, sentences into paragraphs, and
    paragraphs into the final text, always with a single space.
    """
    paragraph_texts = []
    for sentences in raw_paragraph_list:
        sentence_texts = (' '.join(tokens) for tokens in sentences)
        paragraph_texts.append(' '.join(sentence_texts))
    return ' '.join(paragraph_texts)

In [None]:
# Smoke test only: flatten the paragraphs of the first loaded record to text.
pl = json_list[0]['paragraphs']
paragraph_to_text(pl)

In [None]:
def summary_to_dict_str(summary_list):
    """Serialize summary sentences as a JSON object keyed by position.

    Each summary entry is an iterable of string tokens, joined with single
    spaces into one sentence string.

    Returns:
        A ``(json_str, num)`` tuple: the JSON text mapping sentence index to
        sentence string, and the number of summary sentences.
    """
    # key = summary_id : value = summary sentence
    sentence_by_index = {
        index: ' '.join(tokens) for index, tokens in enumerate(summary_list)
    }
    serialized = json.dumps(sentence_by_index)
    return serialized, len(sentence_by_index)

In [None]:
# Smoke test only: convert the summary of the first loaded record.
summary_to_dict_str(json_list[0]['summary'])

In [None]:
def summary_to_text(raw_summary_list):
    """Flatten tokenized summary sentences into one space-separated string."""
    sentence_texts = [' '.join(tokens) for tokens in raw_summary_list]
    return ' '.join(sentence_texts)

In [None]:
# Smoke test only: flatten the summary of the first loaded record to text.
summary_to_text(json_list[0]['summary'])

### Altering raw data to a new format

In [None]:
def alter_json_data(json_list_data, filename=''):
    """Convert raw records into the flattened format used for analysis.

    Every record is shallow-copied, so the input dicts keep their original
    values. In the copy, 'gold_labels', 'paragraphs' and 'summary' are
    replaced by JSON strings, and 'news_text', 'num_of_paragraphs',
    'summary_text' and 'num_of_summary' are added.

    Args:
        json_list_data: Iterable of raw record dicts.
        filename: Label shown in the progress bar description.

    Returns:
        A list with the converted copies, in input order.
    """
    def _convert(raw_record):
        record = raw_record.copy()

        labels_json, _ = label_to_dict_str(record['gold_labels'])
        record['gold_labels'] = labels_json

        # Build the plain text before 'paragraphs' is replaced by its JSON form.
        record['news_text'] = paragraph_to_text(record['paragraphs'])
        paragraphs_json, paragraph_count = paragraph_to_dict_str(record['paragraphs'])
        record['paragraphs'] = paragraphs_json
        record['num_of_paragraphs'] = paragraph_count

        # Same ordering constraint for 'summary'.
        record['summary_text'] = summary_to_text(record['summary'])
        summary_json, summary_count = summary_to_dict_str(record['summary'])
        record['summary'] = summary_json
        record['num_of_summary'] = summary_count
        return record

    progress = tqdm(json_list_data, desc=f'Altering json data {filename}')
    return [_convert(raw_record) for raw_record in progress]

In [None]:
# Smoke test only: convert the loaded dev records and show how many came out.
json_list_alter = alter_json_data(json_list)
len(json_list_alter)

### Create pandas DataFrame

In [None]:
def create_dataset(jsonl):
    """Turn a list of dict records into a header plus a list of row lists.

    The header is the key list of the first record; every row holds that
    record's values in header order.

    Args:
        jsonl: List of dict records.

    Returns:
        A ``(header, dataset_list)`` tuple. Both are empty lists when
        ``jsonl`` is empty.

    Raises:
        KeyError: If a record lacks one of the header keys.
    """
    # An empty input has no first record to take the header from.
    if not jsonl:
        return [], []

    header = list(jsonl[0])
    dataset_list = [[record[key] for key in header] for record in jsonl]
    return header, dataset_list

In [None]:
def create_dataset_from_files(file_list):
    """Load, convert and concatenate several dataset files into one DataFrame.

    Each file is read with load_file_to_json_list, converted with
    alter_json_data and flattened with create_dataset. Column names come from
    the first non-empty file; rows of later files are reordered to match them.

    Args:
        file_list: Iterable of file names understood by
            load_file_to_json_list.

    Returns:
        A DataFrame with one row per record. It is empty when ``file_list``
        is empty or every file is empty.

    Raises:
        ValueError: If a file's columns differ from those of the first
            non-empty file.
    """
    df_header = None
    dataset_list = []
    for filename in file_list:
        json_l = load_file_to_json_list(filename)
        new_json_l = alter_json_data(json_l, filename)
        if not new_json_l:
            # An empty file contributes no rows and no header.
            continue
        header, dataset_part = create_dataset(new_json_l)

        if df_header is None:
            df_header = header
        elif header != df_header:
            if set(header) != set(df_header):
                raise ValueError(
                    f'Columns of {filename} do not match the first file: '
                    f'{header} != {df_header}'
                )
            # Same columns in another order: realign the values so they end
            # up under the right column names.
            positions = [header.index(name) for name in df_header]
            dataset_part = [[row[pos] for pos in positions] for row in dataset_part]
        dataset_list.extend(dataset_part)

    # Name the columns from the header that actually describes the rows.
    return pd.DataFrame.from_records(dataset_list, columns=df_header)
        

In [None]:
# Build one DataFrame per split, in train / dev / test order.
df_train, df_dev, df_test = map(
    create_dataset_from_files,
    (train_files, dev_files, test_files),
)

In [None]:
# Preview the first rows of the training DataFrame.
df_train.head()

## EDA

### Histogram of the Number of Paragraphs in News

In [None]:
# Set the figure size through rcParams: sns.set(rc=...) would also reset the
# seaborn theme to its defaults and discard the 'ticks' style chosen in the
# global config.
plt.rcParams['figure.figsize'] = (15, 4)
f, axes = plt.subplots(1, 3)
f.suptitle("Histogram from Number of Paragraph in News")

# One histogram per split: unit-wide bins and a log-scaled y axis.
hist0 = sns.histplot(data=df_train['num_of_paragraphs'], binwidth=1, ax=axes[0]).set(
    title='Train Dataset',
    xlabel='Number of paragraphs',
    ylabel='Number of news',
    yscale='log'
)
hist1 = sns.histplot(data=df_dev['num_of_paragraphs'], binwidth=1, ax=axes[1]).set(
    title='Dev Dataset',
    xlabel='Number of paragraphs',
    ylabel='Number of news',
    yscale='log'
)
hist2 = sns.histplot(data=df_test['num_of_paragraphs'], binwidth=1, ax=axes[2]).set(
    title='Test Dataset',
    xlabel='Number of paragraphs',
    ylabel='Number of news',
    yscale='log'
)

# Run the layout pass last, once titles and axis labels exist.
f.tight_layout()

### Histogram of Summary Lengths

In [None]:
# Set the figure size through rcParams: sns.set(rc=...) would also reset the
# seaborn theme to its defaults and discard the 'ticks' style chosen in the
# global config.
plt.rcParams['figure.figsize'] = (15, 4)
f, axes = plt.subplots(1, 3)
f.suptitle("Histogram from Number of Summary Sentences")

# One histogram per split: unit-wide bins and a log-scaled y axis.
hist0 = sns.histplot(data=df_train['num_of_summary'], binwidth=1, ax=axes[0]).set(
    title='Train Dataset',
    xlabel='Number of summary',
    ylabel='Number of news',
    yscale='log'
)
hist1 = sns.histplot(data=df_dev['num_of_summary'], binwidth=1, ax=axes[1]).set(
    title='Dev Dataset',
    xlabel='Number of summary',
    ylabel='Number of news',
    yscale='log'
)
hist2 = sns.histplot(data=df_test['num_of_summary'], binwidth=1, ax=axes[2]).set(
    title='Test Dataset',
    xlabel='Number of summary',
    ylabel='Number of news',
    yscale='log'
)

# Run the layout pass last, once titles and axis labels exist.
f.tight_layout()

### Histogram of the Number of Characters in News

In [None]:
# Add the length of 'news_text' as a new column on each split.
for _split_df in (df_train, df_dev, df_test):
    _split_df['paragraphs_text_len'] = _split_df['news_text'].str.len()

In [None]:
# Set the figure size through rcParams: sns.set(rc=...) would also reset the
# seaborn theme to its defaults and discard the 'ticks' style chosen in the
# global config.
plt.rcParams['figure.figsize'] = (15, 4)
f, axes = plt.subplots(1, 3)
f.suptitle("Histogram from Number Characters in News")

# One histogram per split: 20 bins and a log-scaled y axis.
hist0 = sns.histplot(data=df_train['paragraphs_text_len'], bins=20, ax=axes[0]).set(
    title='Train Dataset',
    xlabel='Number of characters',
    ylabel='Number of news',
    yscale='log'
)
hist1 = sns.histplot(data=df_dev['paragraphs_text_len'], bins=20, ax=axes[1]).set(
    title='Dev Dataset',
    xlabel='Number of characters',
    ylabel='Number of news',
    yscale='log'
)
hist2 = sns.histplot(data=df_test['paragraphs_text_len'], bins=20, ax=axes[2]).set(
    title='Test Dataset',
    xlabel='Number of characters',
    ylabel='Number of news',
    yscale='log'
)

# Run the layout pass last, once titles and axis labels exist.
f.tight_layout()

### Histogram of the Number of Characters in Summary

In [None]:
# Add the length of 'summary_text' as a new column on each split.
for _split_df in (df_train, df_dev, df_test):
    _split_df['summary_text_len'] = _split_df['summary_text'].str.len()

In [None]:
# Set the figure size through rcParams: sns.set(rc=...) would also reset the
# seaborn theme to its defaults and discard the 'ticks' style chosen in the
# global config.
plt.rcParams['figure.figsize'] = (15, 4)
f, axes = plt.subplots(1, 3)
f.suptitle("Histogram from Number Characters in Summary")

# One histogram per split: 20 bins and a log-scaled y axis.
hist0 = sns.histplot(data=df_train['summary_text_len'], bins=20, ax=axes[0]).set(
    title='Train Dataset',
    xlabel='Number of characters',
    ylabel='Number of news',
    yscale='log'
)
hist1 = sns.histplot(data=df_dev['summary_text_len'], bins=20, ax=axes[1]).set(
    title='Dev Dataset',
    xlabel='Number of characters',
    ylabel='Number of news',
    yscale='log'
)
hist2 = sns.histplot(data=df_test['summary_text_len'], bins=20, ax=axes[2]).set(
    title='Test Dataset',
    xlabel='Number of characters',
    ylabel='Number of news',
    yscale='log'
)

# Run the layout pass last, once titles and axis labels exist.
f.tight_layout()