# Preprocess

In [1]:
import shutil
import os
import glob
import random

# Fixed seed for the `random` module so that the random.shuffle-based file
# sampling in preprocess() draws from the same random sequence on every run.
rnd_seed = 1234
random.seed(rnd_seed)

from tqdm import tqdm
import spacy

In [2]:
def clean_dir(dirname):
    """Ensure that `dirname` exists and is empty.

    Any existing directory tree at `dirname` is deleted, after which a fresh,
    empty directory is created. Missing parent directories are created too,
    so nested paths such as 'data/tagged/oorlog' also work when the
    intermediate directories do not exist yet.

    Args:
        dirname: Path of the directory to (re)create.
    """
    try:
        shutil.rmtree(dirname)
    except FileNotFoundError:
        # Nothing to remove: the directory did not exist yet.
        pass
    # makedirs (unlike mkdir) also creates missing intermediate directories.
    os.makedirs(dirname)

# NOTE(review): this (re)creates ./tagged, but the preprocess() calls in this
# notebook write to data/tagged/... -- confirm whether 'data/tagged' was meant.
clean_dir('tagged')

In [3]:
# Part-of-speech tags that preprocess() keeps; tokens with any other tag are dropped.
allowed = {'ADJ', 'NOUN', 'VERB'}

In [4]:
# Load the Dutch ('nl') spaCy pipeline with the 'parser' and 'ner' components
# disabled; preprocess() only reads POS tags, is_alpha and is_stop.
nlp = spacy.load('nl', disable=['parser', 'ner'])
# Raise spaCy's input-length limit so that long texts can be passed to nlp()
# in a single call.
nlp.max_length = 10000000

In [5]:
def preprocess(indir, outdir, max_num=None, encoding='utf-8'):
    """Write a POS-filtered, lower-cased copy of every .txt file in `indir`.

    `outdir` is emptied (or created) first. Each input file is run through
    the module-level spaCy pipeline `nlp`; only tokens whose POS tag is in
    `allowed`, that are purely alphabetic and that are not stop words are
    kept. The kept tokens are lower-cased and written, each followed by a
    single space, to a file with the same base name in `outdir`. Files for
    which spaCy yields no tokens are skipped (no output file is written).

    Args:
        indir: Directory containing the input '*.txt' files.
        outdir: Directory that receives the filtered files; existing
            contents are removed.
        max_num: If truthy, process only a random sample of at most this
            many files. None (or 0) processes all files.
        encoding: Text encoding used for reading the input and writing the
            output, so the result does not depend on the platform locale.
    """
    clean_dir(outdir)

    # glob returns files in filesystem-dependent order; sort them so the
    # seeded shuffle below selects the same sample on every machine.
    f_ins = sorted(glob.glob(f'{indir}/*.txt'))
    if max_num:
        random.shuffle(f_ins)
        f_ins = f_ins[:max_num]

    for f_in in tqdm(f_ins):
        with open(f_in, encoding=encoding) as f:
            text = f.read()
        tokens = nlp(text)
        if not tokens:
            continue
        words = [
            t.text.lower()
            for t in tokens
            if t.pos_ in allowed and t.is_alpha and not t.is_stop
        ]
        new_fn = os.path.join(outdir, os.path.basename(f_in))
        with open(new_fn, 'w', encoding=encoding) as f:
            # One batched write; every word keeps its trailing space so the
            # output bytes match the previous token-by-token writes.
            f.write(''.join(f'{w} ' for w in words))

In [6]:
# Filter every .txt file under data/orig/Oorlogsromans into data/tagged/oorlog
# (max_num=None: no sampling, all files are processed).
preprocess(indir='data/orig/Oorlogsromans',
           outdir='data/tagged/oorlog',
           max_num=None)

100%|██████████| 483/483 [35:31<00:00,  4.41s/it] 


In [None]:
# Filter every .txt file under data/orig/Streekromans into data/tagged/streek
# (max_num=None: no sampling, all files are processed).
preprocess(indir='data/orig/Streekromans',
           outdir='data/tagged/streek',
           max_num=None)

 15%|█▌        | 171/1105 [14:11<1:20:52,  5.20s/it]