In [16]:
import datasets
import pathlib
from collections import defaultdict
from gensim.parsing.preprocessing import STOPWORDS
from tqdm.notebook import tqdm
import pandas as pd

In [12]:
# Reference: https://medium.com/swlh/text-summarization-guide-exploratory-data-analysis-on-text-data-4e22ce2dd6ad

def count_sentences(examples):
    """Count the entries of every document and summary in a batch.

    Args:
        examples: batch mapping with "document" and "summary" columns; each
            column holds one sized container per example (the loader in this
            notebook declares them as sequences of strings).

    Returns:
        dict with "document_sentence_count" and "summary_sentence_count",
        each a list holding ``len`` of the corresponding example.
    """
    document_lengths = list(map(len, examples["document"]))
    summary_lengths = list(map(len, examples["summary"]))
    return {
        "document_sentence_count": document_lengths,
        "summary_sentence_count": summary_lengths,
    }

def count_words(examples):
    """Count whitespace-separated words per document and per summary.

    Args:
        examples: batch mapping with "document" and "summary" columns, each
            a list of examples where an example is an iterable of strings.

    Returns:
        dict with "document_word_count" and "summary_word_count"; every
        value is the total number of ``str.split()`` tokens over all strings
        of the example.
    """
    def _total_words(sentences):
        total = 0
        for sentence in sentences:
            total += len(sentence.split())
        return total

    return {
        "document_word_count": [_total_words(doc) for doc in examples["document"]],
        "summary_word_count": [_total_words(summ) for summ in examples["summary"]],
    }

def count_chars(examples):
    """Count non-whitespace characters per document and per summary.

    Args:
        examples: batch mapping with "document" and "summary" columns, each
            a list of examples where an example is an iterable of strings.

    Returns:
        dict with "document_char_count" and "summary_char_count"; every
        value is the summed length of all ``str.split()`` tokens of the
        example, i.e. whitespace is not counted.
    """
    def _total_chars(sentences):
        # Summing token lengths equals the length of the tokens glued together.
        return sum(len(token) for sentence in sentences for token in sentence.split())

    return {
        "document_char_count": [_total_chars(doc) for doc in examples["document"]],
        "summary_char_count": [_total_chars(summ) for summ in examples["summary"]],
    }

def sentence_density(examples):
    """Compute sentences-per-word ratios for documents and summaries.

    Args:
        examples: batch mapping that already contains the columns
            "document_sentence_count", "document_word_count",
            "summary_sentence_count" and "summary_word_count".

    Returns:
        dict with "document_sentence_density" and "summary_sentence_density";
        each value is sentence count divided by word count, where a falsy
        word count (e.g. 0) is replaced by 1 to avoid dividing by zero.
    """
    def _density(sentence_count, word_count):
        return sentence_count / (word_count or 1)

    document_pairs = zip(examples["document_sentence_count"], examples["document_word_count"])
    summary_pairs = zip(examples["summary_sentence_count"], examples["summary_word_count"])
    return {
        "document_sentence_density": [_density(n_sent, n_words) for n_sent, n_words in document_pairs],
        "summary_sentence_density": [_density(n_sent, n_words) for n_sent, n_words in summary_pairs],
    }



def _count_stopwords(text, stopwords=STOPWORDS):
    ''' Return the number of stopwords in the text
        Input:
            - text: iterable of strings (e.g. the sentences of one document);
              a single string is also accepted and treated as one sentence
            - stopwords: collection of lowercase strings supporting ``in``,
              containing the stopwords
        Output:
            - int, number of whitespace-separated tokens of text whose
              lowercase form is in stopwords
    '''
    # A bare string would otherwise be joined character by character, so
    # every token would be a single letter and the count would be meaningless.
    if isinstance(text, str):
        text = [text]

    return sum(1 for token in "\n".join(text).split() if token.lower() in stopwords)

def count_stopwords(examples):
    """Count stopwords per document and per summary in a batch.

    Args:
        examples: batch mapping with "document" and "summary" columns; every
            example is handed as-is to ``_count_stopwords``.

    Returns:
        dict with "document_stopword_count" and "summary_stopword_count",
        one count per example.
    """
    document_counts = list(map(_count_stopwords, examples["document"]))
    summary_counts = list(map(_count_stopwords, examples["summary"]))
    return {
        "document_stopword_count": document_counts,
        "summary_stopword_count": summary_counts,
    }


def load_dataset_stats(dataset):
    """Load one JSON dataset and replace its text columns with statistics.

    Args:
        dataset: pair ``(dataset_name, splits)``; ``dataset_name`` is passed
            to ``datasets.load_dataset`` as the configuration name and
            ``splits`` as its ``data_files`` argument.

    Returns:
        The loaded dataset after applying, in order, ``count_sentences``,
        ``count_words``, ``count_chars``, ``sentence_density`` and
        ``count_stopwords`` as batched maps. The columns present right after
        loading are dropped by the last map, so only the computed statistics
        remain.
    """
    dataset_name, splits = dataset

    def _string_sequence():
        # Variable-length list of strings.
        return datasets.Sequence(feature=datasets.Value(dtype='string', id=None), length=-1, id=None)

    features = datasets.features.Features({
        'document': _string_sequence(),
        'summary': _string_sequence(),
    })

    loaded = datasets.load_dataset("json", dataset_name, data_files=splits, features=features)

    # Union of the column names over all splits, captured before any
    # statistic column is added.
    removed_cols = list({
        column_name
        for split in loaded.values()
        for column_name in split.column_names
    })

    datasets_info = loaded
    for stat_fn in (count_sentences, count_words, count_chars, sentence_density):
        datasets_info = datasets_info.map(stat_fn, batched=True)

    # The last step also removes the original columns.
    return datasets_info.map(count_stopwords, batched=True, remove_columns=removed_cols)


def load_data_splits_from_dir(directory, ext="json.gz"):
    """Index the data files below ``directory`` by dataset and split.

    Every file matching ``**/*.{ext}`` is recorded under the name of its
    parent directory (the dataset) and the part of its file name before the
    first dot (the split).

    Args:
        directory: root directory to search recursively.
        ext: file extension to match, without the leading dot.

    Returns:
        dict mapping dataset directory name -> {split name -> absolute path
        string}. Empty when no file matches.
    """
    splits = defaultdict(dict)
    # Sorting makes the result independent of filesystem listing order; if
    # two files map to the same split key, the last one in sorted order wins.
    for file_path in sorted(pathlib.Path(directory).glob(f"**/*.{ext}")):
        data_split = file_path.stem.split(".")[0]
        # Use the full directory name: ``.stem`` would cut a dotted directory
        # name such as "news.v2" down to "news", which no longer matches the
        # directory on disk.
        splits[file_path.parent.name][data_split] = str(file_path.resolve())
    return dict(splits)


In [13]:
# Root directory that is searched recursively for raw dataset files.
RAW_SUMMARIZATION_DATASETS_DIR = "../datasets/raw/supervised/summarization"
data_splits = load_data_splits_from_dir(RAW_SUMMARIZATION_DATASETS_DIR)

# For every dataset, compute the per-example statistics and write one
# "<split>_info.csv" per split into the dataset's own directory.
for dataset in tqdm(data_splits.items()):
    ## TODO Write Dataset loading script
    # https://huggingface.co/docs/datasets/v1.12.0/dataset_script.html
    dataset_name, _ = dataset
    dataset_info = load_dataset_stats(dataset)
    for split_name, split_info in dataset_info.items():
        outfile = (
            pathlib.Path(RAW_SUMMARIZATION_DATASETS_DIR)
            / dataset_name
            / f"{split_name}_info.csv"
        ).resolve()
        split_df = split_info.to_pandas()
        split_df.to_csv(outfile, index=False, index_label=False)


  0%|          | 0/8 [00:00<?, ?it/s]

Using custom data configuration ami-1844cbad6c43206f
Reusing dataset json (/Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50)
100%|██████████| 3/3 [00:00<00:00, 149.50it/s]
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50/cache-77fe406a0c73db71.arrow
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50/cache-01f102143ae717bf.arrow
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50/cache-32388734c206c20d.arrow
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df

Unnamed: 0,Unnamed: 1,Unnamed: 2,count,mean,std,min,25%,50%,75%,max
mediasumm,valid,document_char_count,10000.0,7041.132,6208.918967,161.0,3169.0,5354.0,8992.0,78403.0
mediasumm,valid,document_sentence_count,10000.0,29.2756,33.793657,1.0,12.0,21.0,35.0,684.0
mediasumm,valid,document_sentence_density,10000.0,0.019765,0.010223,0.000598,0.01263,0.017873,0.024441,0.112346
mediasumm,valid,document_stopword_count,10000.0,785.718,722.740281,13.0,344.0,600.0,988.0,9122.0
mediasumm,valid,document_word_count,10000.0,1529.7229,1369.522284,31.0,681.0,1168.0,1943.0,17304.0
mediasumm,valid,summary_char_count,10000.0,77.844,66.247955,15.0,37.0,52.0,91.25,915.0
mediasumm,valid,summary_sentence_count,10000.0,1.187,0.53578,1.0,1.0,1.0,1.0,11.0
mediasumm,valid,summary_sentence_density,10000.0,0.115985,0.055903,0.007519,0.068966,0.111111,0.166667,0.4
mediasumm,valid,summary_stopword_count,10000.0,3.9978,5.246003,0.0,1.0,2.0,4.0,72.0
mediasumm,valid,summary_word_count,10000.0,14.2033,12.539461,5.0,7.0,9.0,16.0,185.0


In [45]:
# Build one summary table from the per-split "<split>_info.csv" files and
# render it. The computation lives in this cell (rather than relying on a
# `descriptions` variable left over in the kernel) so the notebook survives
# Restart & Run All; the display helpers are imported here for the same reason.
from IPython.display import HTML, display

info_splits = load_data_splits_from_dir(RAW_SUMMARIZATION_DATASETS_DIR, ext="csv")
description_frames = []
for name, splits in info_splits.items():
    for split, fname in splits.items():
        data_info = pd.read_csv(fname)
        # File stems look like "<split>_info"; keep only the split part.
        split_name = split.split('_')[0]
        # describe().T yields one row per statistic column.
        description = data_info.describe().T
        # Index each row by (dataset, split, column) so the frames can be stacked.
        description = description.set_index(
            pd.MultiIndex.from_tuples([(name, split_name, col) for col in description.index])
        )
        description_frames.append(description)
descriptions = pd.concat(description_frames, axis=0)
display(HTML(descriptions.to_html()))

Unnamed: 0,Unnamed: 1,Unnamed: 2,count,mean,std,min,25%,50%,75%,max
ami,test,document_char_count,13.0,18087.076923,9478.357149,4770.0,9311.0,15164.0,27179.0,32859.0
ami,test,document_sentence_count,13.0,619.230769,343.820242,215.0,345.0,506.0,774.0,1450.0
ami,test,document_sentence_density,13.0,0.134519,0.016588,0.109982,0.12662,0.135725,0.141104,0.162142
ami,test,document_stopword_count,13.0,2702.307692,1466.848117,827.0,1468.0,2157.0,3942.0,5496.0
ami,test,document_word_count,13.0,4698.769231,2481.55476,1326.0,2445.0,3952.0,6903.0,9014.0
ami,test,summary_char_count,13.0,7275.153846,4277.15082,2269.0,3670.0,6161.0,10341.0,14079.0
ami,test,summary_sentence_count,13.0,142.307692,81.197685,47.0,82.0,126.0,199.0,314.0
ami,test,summary_sentence_density,13.0,0.076755,0.009574,0.059656,0.070284,0.07709,0.082566,0.095238
ami,test,summary_stopword_count,13.0,979.153846,603.339021,289.0,485.0,809.0,1265.0,2105.0
ami,test,summary_word_count,13.0,1908.538462,1116.730616,578.0,950.0,1626.0,2729.0,3803.0


In [44]:
from IPython.display import HTML