## Build and write feature matrices

In [1]:
import json
from tqdm import tqdm
from features import feature_pipeline
from collections import defaultdict

In [2]:
# Load the training and validation splits from disk.
# The encoding is passed explicitly so the read does not depend on the platform's
# default locale encoding; it matches the encoding="utf-8" already used by the
# cell that writes the feature files.
# NOTE(review): assumes the input JSON files are UTF-8 encoded — confirm.
with open("../data/train.json", "r", encoding="utf-8") as f:
    train = json.load(f)
with open("../data/val.json", "r", encoding="utf-8") as f:
    val = json.load(f)

In [3]:
# Instantiate the project's feature pipeline once; this same object is reused
# for both the training and the validation loops below.
pipe = feature_pipeline()

#### Build Feature Matrices

In [4]:
# Column-oriented accumulator: each key maps to a list holding one entry per
# training example (preprocessed text, every extracted feature, and the label).
training_data = defaultdict(list)
for example in tqdm(train, desc="Building Training Data"):
    cleaned_text = pipe.preprocess(example["content"])
    training_data["preprocessed_text"].append(cleaned_text)
    # feature_extractor() is called with no arguments right after preprocess();
    # presumably it works on the text just preprocessed — TODO confirm in features.py.
    extracted = pipe.feature_extractor()
    for feature_name in extracted:
        training_data[feature_name].append(extracted[feature_name])
    training_data["level"].append(example["level"])

Building Training Data: 100%|████████████████████████████████████████████| 277/277 [07:46<00:00,  1.68s/it]


In [5]:
# Same column-oriented layout as the training matrix: one list per key, with one
# entry appended per validation example.
validation_data = defaultdict(list)
for example in tqdm(val, desc="Building Validation Data"):
    cleaned_text = pipe.preprocess(example["content"])
    validation_data["preprocessed_text"].append(cleaned_text)
    # feature_extractor() takes no arguments here; presumably it reads state left
    # behind by the preceding preprocess() call — TODO confirm in features.py.
    extracted = pipe.feature_extractor()
    for feature_name in extracted:
        validation_data[feature_name].append(extracted[feature_name])
    validation_data["level"].append(example["level"])

Building Validation Data: 100%|████████████████████████████████████████████| 31/31 [01:17<00:00,  2.50s/it]


#### Writing Feature Matrices to File

In [9]:
# Persist both feature matrices as JSON, one file per split.
# (A stray bare `training_data` expression used to sit at the top of this cell;
# it was not the cell's last statement, so it displayed nothing and was removed.)
with open("../data/train_features.json", "w", encoding="utf-8") as fout:
    json.dump(training_data, fout)
with open("../data/val_features.json", "w", encoding="utf-8") as fout:
    json.dump(validation_data, fout)