## Build and write feature matrices

In [1]:
import json
import os
from collections import defaultdict

from tqdm import tqdm

from features import feature_pipeline

#### Load data files

In [2]:
# Load the train / validation / test splits from their JSON files.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding (e.g. cp1252 on Windows), which can garble or
# reject non-ASCII characters. JSON interchange files are UTF-8, and the
# feature files written at the end of this notebook use UTF-8 as well.
with open("../data/train.json", "r", encoding="utf-8") as f:
    train = json.load(f)
with open("../data/val.json", "r", encoding="utf-8") as f:
    val = json.load(f)
with open("../data/test.json", "r", encoding="utf-8") as f:
    test = json.load(f)

#### Initialize pipeline object and start CoreNLP client

In [3]:
# Classpath of the CoreNLP jars handed to the pipeline's dependency parser.
# Set the CORENLP_CLASSPATH environment variable to point at your own
# installation; the fallback is the original, machine-specific path, so
# without the variable this cell behaves exactly as before.
corenlp_classpath = os.environ.get(
    "CORENLP_CLASSPATH", "C:/Users/rsss9/stanza_corenlp/*"
)

pipe = feature_pipeline(
    dep_parse_flag=True,
    dep_parse_classpath=corenlp_classpath,
    # CHANGE THIS PATH TOO IF IT DOES NOT MATCH YOUR SYSTEM!!
    result_root="../wordnet_spa",
)
# Start the CoreNLP client now; it is stopped again once all three splits
# have been processed.
pipe.corenlp_client.start()

2021-06-12 17:35:43 INFO: Using CoreNLP default properties for: spanish.  Make sure to have spanish models jar (available for download here: https://stanfordnlp.github.io/CoreNLP/) in CLASSPATH
2021-06-12 17:35:46 INFO: Starting server with command: java -Xmx5G -cp C:/Users/rsss9/stanza_corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties spanish -annotators depparse -preload -outputFormat serialized


#### Build Feature Matrices

In [4]:
# Column-oriented feature matrix for the training split: every key maps to a
# list holding one entry per example.
training_data = defaultdict(list)
train_progress = tqdm(train, desc="Building Training Data", dynamic_ncols=True)
for example in train_progress:
    # preprocess() has to run before feature_extractor(): the extractor takes
    # no arguments, so it presumably works on the text most recently passed
    # to preprocess() -- verify in features.py before reordering.
    cleaned_text = pipe.preprocess(example["content"])
    training_data["preprocessed_text"].append(cleaned_text)
    for feature_name, feature_value in pipe.feature_extractor().items():
        training_data[feature_name].append(feature_value)
    # The label is stored alongside the features under "level".
    training_data["level"].append(example["level"])

Building Training Data: 100%|████████████████████████████████████████████| 257/257 [14:17<00:00,  3.34s/it]


In [5]:
# Column-oriented feature matrix for the validation split: every key maps to
# a list holding one entry per example.
validation_data = defaultdict(list)
val_progress = tqdm(val, desc="Building Validation Data", dynamic_ncols=True)
for example in val_progress:
    # preprocess() has to run before feature_extractor(): the extractor takes
    # no arguments, so it presumably works on the text most recently passed
    # to preprocess() -- verify in features.py before reordering.
    cleaned_text = pipe.preprocess(example["content"])
    validation_data["preprocessed_text"].append(cleaned_text)
    for feature_name, feature_value in pipe.feature_extractor().items():
        validation_data[feature_name].append(feature_value)
    # The label is stored alongside the features under "level".
    validation_data["level"].append(example["level"])

Building Validation Data: 100%|████████████████████████████████████████████| 32/32 [01:37<00:00,  3.03s/it]


In [6]:
# Column-oriented feature matrix for the test split: every key maps to a list
# holding one entry per example.
testing_data = defaultdict(list)
test_progress = tqdm(test, desc="Building Testing Data", dynamic_ncols=True)
for example in test_progress:
    # preprocess() has to run before feature_extractor(): the extractor takes
    # no arguments, so it presumably works on the text most recently passed
    # to preprocess() -- verify in features.py before reordering.
    cleaned_text = pipe.preprocess(example["content"])
    testing_data["preprocessed_text"].append(cleaned_text)
    for feature_name, feature_value in pipe.feature_extractor().items():
        testing_data[feature_name].append(feature_value)
    # The label is stored alongside the features under "level".
    testing_data["level"].append(example["level"])

Building Testing Data: 100%|███████████████████████████████████████████████| 32/32 [01:36<00:00,  3.00s/it]


#### Stop CoreNLP client

In [7]:
# Feature extraction for all three splits is finished, so stop the CoreNLP
# client that was started right after the pipeline was created.
corenlp_client = pipe.corenlp_client
corenlp_client.stop()

#### Writing Feature Matrices to File

In [8]:
# Serialize the training feature matrix to JSON next to the input data.
train_features_path = "../data/train_features.json"
with open(train_features_path, "w", encoding="utf-8") as out_file:
    json.dump(obj=training_data, fp=out_file)

In [9]:
# Serialize the validation feature matrix to JSON next to the input data.
val_features_path = "../data/val_features.json"
with open(val_features_path, "w", encoding="utf-8") as out_file:
    json.dump(obj=validation_data, fp=out_file)

In [10]:
# Serialize the test feature matrix to JSON next to the input data.
test_features_path = "../data/test_features.json"
with open(test_features_path, "w", encoding="utf-8") as out_file:
    json.dump(obj=testing_data, fp=out_file)