In [1]:
from bs4 import BeautifulSoup, NavigableString
from urllib.request import urlopen
import re
import time
import random

# python -m spacy download es_core_news_md
import spacy
from spacy.lang.es.examples import sentences


import os
import json
from collections import defaultdict

In [2]:
# read all texts
def read_corpus(path="../corpus/"):
    """
    Load every scraped-corpus JSON file in a directory and group the
    documents by reading level:
    {
        "A1": [{"source": "...", "content": "...", ...}, {...}],
        "A2": [...],
        ...
    }
    Each JSON file must contain a list of document dicts, and every document
    must have a "level" key. Documents tagged "A2/B1" are filed under "B1";
    the document's own "level" field is NOT rewritten, so it still reads
    "A2/B1" afterwards.

    Only files whose name ends in ".json" are read, and they are processed in
    sorted filename order so the resulting lists are the same on every run.

    path: (str) the path of the directory containing the JSON files
    return: (defaultdict{list[dicts]}) a dictionary of texts arranged by
    reading level; looking up a level that was never seen yields an empty list
    (a text is a single cohesive piece of reading material, be it a short
    story, a poem, song lyrics, a book chapter, etc.)
    """

    corpus = defaultdict(list)
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(path)):
        # Match on the extension: a plain substring test would also pick up
        # unrelated files such as "json_notes.txt" and crash in json.load.
        if not file.endswith(".json"):
            continue
        with open(os.path.join(path, file), "r", encoding="utf-8") as f:
            doc_list = json.load(f)
        for d in doc_list:
            level = d["level"]
            if level == "A2/B1":
                level = "B1"
            corpus[level].append(d)
    return corpus

In [3]:
# Load the corpus and merge the fine-grained levels into two coarse classes.
corpus = read_corpus()
corpus_A = corpus['A1'] + corpus['A2']
# NOTE(review): 'B' is presumably a tag used by sources that do not
# distinguish B1 from B2 -- confirm against the scraped data.
corpus_B = corpus['B1'] + corpus['B']

# shuffle with a fixed seed, so that (for the same load order) the train/val
# split taken from these lists is identical after Restart & Run All
SEED = 42
random.seed(SEED)
random.shuffle(corpus_A)
random.shuffle(corpus_B)

In [4]:
# Report the number of texts per coarse level ...
print(
    f'number of A level texts: {len(corpus_A)}',
    f'number of B level texts: {len(corpus_B)}',
    sep='\n',
)

# ... and how many texts a 10% hold-out of each level amounts to.
print(
    f'eval size for A level: {round(len(corpus_A) * 0.1)}',
    f'eval size for B level: {round(len(corpus_B) * 0.1)}',
    sep='\n',
)

number of A level texts: 156
number of B level texts: 152
eval size for A level: 16
eval size for B level: 15


In [5]:
# Hold out 10% of each level for validation. The sizes are derived from the
# current corpus rather than hard-coded, so the split stays correct when texts
# are added or removed (156 / 152 texts give 16 / 15, as before).
VAL_FRACTION = 0.1
n_val_A = round(len(corpus_A) * VAL_FRACTION)
n_val_B = round(len(corpus_B) * VAL_FRACTION)

train = corpus_A[n_val_A:] + corpus_B[n_val_B:]
val = corpus_A[:n_val_A] + corpus_B[:n_val_B]

In [6]:
# Split each set into texts (X) and coarse labels (y). The label is the first
# letter of the reading level: 'A' or 'B'.
#
# read_corpus files "A2/B1" texts under "B1" but leaves their own "level"
# field as "A2/B1". Taking level[0] directly would therefore label those
# texts 'A' although they were placed in corpus_B, so apply the same remap
# before deriving the label.
LEVEL_REMAP = {'A2/B1': 'B1'}

X_train = [text['content'] for text in train]
y_train = [LEVEL_REMAP.get(text['level'], text['level'])[0] for text in train]

X_val = [text['content'] for text in val]
y_val = [LEVEL_REMAP.get(text['level'], text['level'])[0] for text in val]

In [7]:
# Dump each split (the complete document dicts) to its own JSON file.
for split_path, split_docs in (
    ('../data/train.json', train),
    ('../data/val.json', val),
):
    with open(split_path, 'w', encoding='utf-8') as fout:
        json.dump(split_docs, fout)

In [8]:
def to_file(split, filename, mode='X', out_dir='../data'):
    '''
    Write a list of texts or a list of labels to `<out_dir>/<filename>.txt`.

    The items are joined with a separator that depends on `mode`; no separator
    is written after the last item, and an empty list produces an empty file.

    split: (list[str]) list of texts or list of labels
    filename: (str) name for the output file, without the ".txt" extension
    mode: (str) 'X' - items are separated by 20 '#' characters,
                'y' - items are separated by a newline
    out_dir: (str) directory the file is written to; it must already exist
    raises: ValueError if mode is neither 'X' nor 'y' (previously such a call
    silently wrote nothing)
    '''
    if mode == 'X':
        separator = '#' * 20
    elif mode == 'y':
        separator = '\n'
    else:
        raise ValueError(f"mode must be 'X' or 'y', got {mode!r}")

    # Build the payload before opening the file, so a bad item (non-str)
    # fails without leaving a truncated file behind.
    payload = separator.join(split)

    with open(f'{out_dir}/{filename}.txt', 'w', encoding='utf-8') as fout:
        fout.write(payload)
        

In [9]:
# Export every split: texts in 'X' mode, labels in 'y' mode.
for split_items, split_name, split_mode in (
    (X_train, 'X_train', 'X'),
    (y_train, 'y_train', 'y'),
    (X_val, 'X_val', 'X'),
    (y_val, 'y_val', 'y'),
):
    to_file(split_items, split_name, split_mode)