In [1]:
from bs4 import BeautifulSoup, NavigableString
from urllib.request import urlopen
import re
import time
import random
import os
import json
from collections import defaultdict
from utils import read_corpus

In [2]:
corpus = read_corpus()
for k, v in corpus.items():
    print(f"Number of {k}-level texts: {len(v)}")
corpus_A1 = corpus["A1"]
corpus_A2 = corpus["A2"]
corpus_B = corpus["B"]
print(f"total number of texts: {len(corpus_A1) + len(corpus_A2) + len(corpus_B)}")

# shuffle
random.shuffle(corpus_A1)
random.shuffle(corpus_A2)
random.shuffle(corpus_B)

Number of A1-level texts: 94
Number of A2-level texts: 62
Number of B-level texts: 152
total number of texts: 308


In [3]:
A1_train_split = round(len(corpus_A1) * 0.8)
A2_train_split = round(len(corpus_A2) * 0.8)
B_train_split = round(len(corpus_B) * 0.8)
A1_val_split = round(len(corpus_A1) * 0.1)
A2_val_split = round(len(corpus_A2) * 0.1)
B_val_split = round(len(corpus_B) * 0.1)

print(f"Train size for A1 level: {A1_train_split}")
print(f"Train size for A2 level: {A2_train_split}")
print(f"Train size for B level: {B_train_split}")
print(f"Validation size for A1 level: {A1_val_split}")
print(f"Validation size for A2 level: {A2_val_split}")
print(f"Validation size for B level: {B_val_split}")
print(f"Test size for A1 level: {A1_val_split}")
print(f"Test size for A2 level: {A2_val_split}")
print(f"Test size for B level: {B_val_split}")

Train size for A1 level: 75
Train size for A2 level: 50
Train size for B level: 122
Validation size for A1 level: 9
Validation size for A2 level: 6
Validation size for B level: 15
Test size for A1 level: 9
Test size for A2 level: 6
Test size for B level: 15


In [4]:
val = corpus_A1[:A1_val_split] + corpus_A2[:A2_val_split] + corpus_B[:B_val_split]
test = (
    corpus_A1[A1_val_split : A1_val_split * 2]
    + corpus_A2[A2_val_split : A2_val_split * 2]
    + corpus_B[B_val_split : B_val_split * 2]
)
train = (
    corpus_A1[A1_val_split * 2 :]
    + corpus_A2[A2_val_split * 2 :]
    + corpus_B[B_val_split * 2 :]
)

# shuffle
random.shuffle(train)
random.shuffle(val)
random.shuffle(test)

# verify
print(f"Total train size: {len(train)}")
print(f"Total validation size: {len(val)}")
print(f"Total test size: {len(test)}")
print(f"Sum of sizes: {len(train) + len(val) + len(test)}")

Total train size: 248
Total validation size: 30
Total test size: 30
Sum of sizes: 308


In [5]:
X_train = []
y_train = []
for text in train:
    X_train.append(text["content"])
    y_train.append(text["level"])

X_val = []
y_val = []
for text in val:
    X_val.append(text["content"])
    y_val.append(text["level"])

X_test = []
y_test = []
for text in test:
    X_test.append(text["content"])
    y_test.append(text["level"])

In [6]:
# write to file
with open("../data/train.json", "w", encoding="utf-8") as fout:
    json.dump(train, fout)
with open("../data/val.json", "w", encoding="utf-8") as fout:
    json.dump(val, fout)
with open("../data/test.json", "w", encoding="utf-8") as fout:
    json.dump(test, fout)

In [7]:
def to_file(split, filename):
    """
    given a train or validation split, write to file

    split: (list) list of texts or list of labels
    filename: (str) name for the output file
    """
    tofile = ""
    mode = filename[0]
    if mode == "X":
        for text in split:
            tofile += text + "#" * 20
        tofile = tofile[:-20]

        with open(f"../data/{filename}.txt", "w", encoding="utf-8") as fout:
            fout.write(tofile)

    elif mode == "y":
        for label in split:
            tofile += label + "\n"
        tofile = tofile[:-1]

        with open(f"../data/{filename}.txt", "w", encoding="utf-8") as fout:
            fout.write(tofile)

In [8]:
to_file(X_train, "X_train")
to_file(y_train, "y_train")
to_file(X_val, "X_val")
to_file(y_val, "y_val")
to_file(X_test, "X_test")
to_file(y_test, "y_test")