In [8]:
import datasets
import pathlib
from collections import defaultdict
from gensim.parsing.preprocessing import STOPWORDS
from tqdm.notebook import tqdm

In [9]:
# Reference: https://medium.com/swlh/text-summarization-guide-exploratory-data-analysis-on-text-data-4e22ce2dd6ad

def count_sentences(examples):
    ''' Report how many entries each document and each summary of a batch holds.
        Every document/summary is a sequence whose items are treated as
        sentences, so its length is used as the sentence count.
        Input:
            - examples: batch mapping with a "document" and a "summary" key,
              each holding a list of sequences
        Output:
            - dict with "document_sentence_count" and "summary_sentence_count",
              each a list of int in the same order as the input batch
    '''
    document_lengths = list(map(len, examples["document"]))
    summary_lengths = list(map(len, examples["summary"]))

    return {
        "document_sentence_count": document_lengths,
        "summary_sentence_count": summary_lengths,
    }

def count_words(examples):
    ''' Count whitespace-separated words in each document and summary of a batch.
        Input:
            - examples: batch mapping with a "document" and a "summary" key,
              each holding a list of sequences of strings
        Output:
            - dict with "document_word_count" and "summary_word_count",
              each a list of int in the same order as the input batch
    '''
    def _total_words(sentences):
        # Words are whatever str.split() yields, summed over all sentences.
        total = 0
        for sentence in sentences:
            total += len(sentence.split())
        return total

    return {
        "document_word_count": [_total_words(doc) for doc in examples["document"]],
        "summary_word_count": [_total_words(summ) for summ in examples["summary"]],
    }

def count_chars(examples):
    ''' Count the non-whitespace characters of each document and summary.
        Input:
            - examples: batch mapping with a "document" and a "summary" key,
              each holding a list of sequences of strings
        Output:
            - dict with "document_char_count" and "summary_char_count",
              each a list of int in the same order as the input batch
    '''
    def _non_space_chars(sentences):
        # Whitespace is dropped by splitting; only the token lengths are summed.
        return sum(
            sum(len(token) for token in sentence.split())
            for sentence in sentences
        )

    return {
        "document_char_count": [_non_space_chars(doc) for doc in examples["document"]],
        "summary_char_count": [_non_space_chars(summ) for summ in examples["summary"]],
    }

def sentence_density(examples):
    ''' Compute the sentences-per-word ratio of each document and summary.
        The ratio is sentence count divided by word count; a word count of 0
        is replaced by 1 so the division never fails.
        Input:
            - examples: batch mapping holding the "document_sentence_count",
              "document_word_count", "summary_sentence_count" and
              "summary_word_count" lists
        Output:
            - dict with "document_sentence_density" and
              "summary_sentence_density", each a list of float
    '''
    def _ratios(sentence_counts, word_counts):
        ratios = []
        for n_sentences, n_words in zip(sentence_counts, word_counts):
            # Fall back to 1 when there are no words to avoid dividing by zero.
            ratios.append(n_sentences / (n_words or 1))
        return ratios

    return {
        "document_sentence_density": _ratios(
            examples["document_sentence_count"], examples["document_word_count"]
        ),
        "summary_sentence_density": _ratios(
            examples["summary_sentence_count"], examples["summary_word_count"]
        ),
    }



def _count_stopwords(text, stopwords=STOPWORDS):
    ''' Return the number of stopwords in the text
        Words are obtained by whitespace splitting and lower-cased before the
        membership test; punctuation attached to a word is not stripped.
        Input:
            - text: string, or iterable of strings (e.g. a list of sentences)
            - stopwords: container of lower-case strings supporting `in`,
              containing the stopwords
        Output:
            - int, number of stopwords in the text argument
    '''
    # A bare string must be wrapped: iterating it directly would yield single
    # characters, so every letter would be tested as if it were a word.
    if isinstance(text, str):
        text = [text]

    return sum(
        1
        for sentence in text
        for word in sentence.split()
        if word.lower() in stopwords
    )

def count_stopwords(examples):
    ''' Count the stopwords of each document and summary of a batch.
        Input:
            - examples: batch mapping with a "document" and a "summary" key,
              each holding a list of texts accepted by _count_stopwords
        Output:
            - dict with "document_stopword_count" and "summary_stopword_count",
              each a list of int in the same order as the input batch
    '''
    document_counts = list(map(_count_stopwords, examples["document"]))
    summary_counts = list(map(_count_stopwords, examples["summary"]))

    return {
        "document_stopword_count": document_counts,
        "summary_stopword_count": summary_counts,
    }


def load_dataset_stats(dataset):
    ''' Load one JSON dataset and replace its text columns by count statistics.
        Input:
            - dataset: pair (dataset_name, splits); dataset_name is passed as
              the second positional argument of datasets.load_dataset and
              splits is passed as its data_files argument
        Output:
            - the loaded dataset after the sentence, word, character,
              sentence-density and stopword maps, with every column that was
              present right after loading removed
    '''
    # Both text columns are declared as variable-length sequences of strings.
    schema = datasets.features.Features({
        column: datasets.Sequence(
            feature=datasets.Value(dtype='string', id=None), length=-1, id=None
        )
        for column in ('document', 'summary')
    })
    dataset_name, splits = dataset

    loaded = datasets.load_dataset("json", dataset_name, data_files=splits, features=schema)

    # Union of the column names over all splits; these are dropped at the end
    # so that only the computed statistics remain.
    source_columns = set()
    for split in loaded.values():
        source_columns.update(split.column_names)
    removed_cols = list(source_columns)

    with_sentences = loaded.map(count_sentences, batched=True)
    with_words = with_sentences.map(count_words, batched=True)
    with_chars = with_words.map(count_chars, batched=True)
    # sentence_density reads the sentence and word counts added above.
    with_density = with_chars.map(sentence_density, batched=True)
    datasets_info = with_density.map(
        count_stopwords, batched=True, remove_columns=removed_cols
    )

    return datasets_info


def load_data_splits_from_dir(directory):
    ''' Find every "*.json.gz" file below a directory, grouped by parent folder.
        Input:
            - directory: string or path, root folder searched recursively
        Output:
            - dict mapping the name of each file's parent folder to a dict
              {split name: absolute file path as string}; the split name is
              the part of the file name before its first "."
    '''
    splits = defaultdict(dict)
    # Sorting makes the traversal order reproducible, so when two files map to
    # the same split name it is always the same one that wins.
    directory_files = sorted(pathlib.Path(directory).glob("**/*.json.gz"))
    for file_path in directory_files:
        data_split = file_path.stem.split(".")[0]
        # Use .name rather than .stem: .stem would cut a folder such as
        # "cnn.dailymail" down to "cnn", and the key would no longer match the
        # folder that actually exists on disk.
        splits[file_path.parent.name][data_split] = str(file_path.resolve())
    return dict(splits)


In [10]:
# Root folder searched (recursively) for the raw "*.json.gz" summarization files.
RAW_SUMMARIZATION_DATASETS_DIR = "../datasets/raw/supervised/summarization"
# Maps each dataset folder name to its {split name: file path} dictionary.
data_splits = load_data_splits_from_dir(RAW_SUMMARIZATION_DATASETS_DIR)
# Each `dataset` is a (dataset_name, splits) pair, the form load_dataset_stats unpacks.
for dataset in tqdm(data_splits.items()):
    # TODO: write a dedicated dataset loading script, see
    # https://huggingface.co/docs/datasets/v1.12.0/dataset_script.html
    dataset_info = load_dataset_stats(dataset)
    dataset_name, _ = dataset
    for split_name,split_info in dataset_info.items():
        # One CSV of statistics per split, written to "<root>/<dataset_name>/<split>_info.csv".
        outfile = pathlib.Path(RAW_SUMMARIZATION_DATASETS_DIR).joinpath(dataset_name, f"{split_name}_info.csv").resolve()
        split_df = split_info.to_pandas()
        # index=False keeps the pandas row index out of the CSV file.
        split_df.to_csv(outfile, index=False, index_label=False)



  0%|          | 0/8 [00:00<?, ?it/s]

Using custom data configuration ami-1844cbad6c43206f
Reusing dataset json (/Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50)
100%|██████████| 3/3 [00:00<00:00, 423.97it/s]
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50/cache-77fe406a0c73db71.arrow
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50/cache-01f102143ae717bf.arrow
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50/cache-32388734c206c20d.arrow
Loading cached processed dataset at /Users/bebop/.cache/huggingface/datasets/json/ami-1844cbad6c43206f/0.0.0/d75ead8d5cfcbe67495df