## Read all files from `../corpus/`

In [7]:
import os
import json
from collections import defaultdict

In [8]:
# Relative path of the directory that holds the scraped corpus JSON files;
# it is passed to corpus_reader() below.
corpus_path = "../corpus/"

In [10]:
def corpus_reader(path):
    """
    Given a path to a directory containing JSON files of the scraped corpus
    documents and their metadata, load them all into a dict{list[dicts]}
    keyed by reading level, such that:
    {
        "A1": [{"source": "...", "content": "...", ...}, {...}],
        "A2": [...],
        ...
    }

    Each JSON file must contain a list of document dicts, and every document
    must carry a "level" key, which is used as the grouping key.

    Only regular files whose name ends in ".json" (case-insensitive) are
    read. Sub-directories and other stray entries (e.g. the
    ".ipynb_checkpoints" folder Jupyter creates, or ".DS_Store") are skipped
    instead of crashing the load. Files are processed in sorted name order so
    that the order of the levels and of the texts within a level is
    reproducible across runs and machines.

    path: (str) the path of the directory containing the JSON files

    corpus: (defaultdict{list[dicts]}) a dictionary of texts arranged by
    reading level (a text is a single cohesive piece of reading material, be
    it a short story, a poem, song lyrics, a book chapter, etc.). It is empty
    when the directory holds no JSON files.

    Raises KeyError if a document has no "level" key, and whatever
    os.listdir / open / json.load raise for a missing directory or a
    malformed JSON file.
    """

    corpus = defaultdict(list)
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Skip anything that is not a plain ".json" file.
        if not name.lower().endswith(".json") or not os.path.isfile(full_path):
            continue
        # Keep the file open only for as long as it takes to parse it.
        with open(full_path, "r", encoding="utf-8") as f:
            doc_list = json.load(f)
        for d in doc_list:
            corpus[d["level"]].append(d)
    return corpus

In [12]:
# Load every corpus document into memory, grouped by its "level" value.
corpus = corpus_reader(corpus_path)

In [15]:
# Show which reading levels are present in the loaded corpus.
print(corpus.keys())
print()
# Report how many texts were collected for each reading level.
for level, texts in corpus.items():
    print(f"Number of {level}-level texts: {len(texts)}")

dict_keys(['A1', 'A2', 'B', 'B1/B2', 'A2/B1'])

Number of A1-level texts: 51
Number of A2-level texts: 51
Number of B-level texts: 21
Number of B1/B2-level texts: 89
Number of A2/B1-level texts: 31
