In [1]:
import re
from pathlib import Path
import os
import time

In [2]:
# Input directory handed to context_collector; every file found in it is scanned.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
corpus_in = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-clean/files")

In [10]:
# Paradigm file read by read_paradigm; the relative path is resolved against
# the current working directory.
paradigm_path = Path("utils/dwts.paradigm")

In [4]:
# Output directory handed to context_collector; one output file is written per
# input file, under the same file name.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
corpus_out = Path("/home/max/Corpora/flashback-pol-time/yearly/contexts/files")

In [14]:
def read_util(file_path):
    """Read a utility file and return its non-empty entries.

    Each line is cut at the first ``#`` (everything from there on is treated
    as a comment) and its newline is removed. Entries that are empty or
    consist only of whitespace are dropped; the entries that are kept are
    returned unmodified, so inner and trailing spaces are preserved.

    Note: because the cut happens at the first ``#``, an entry cannot itself
    contain a literal ``#``.

    Args:
        file_path: Path (str or path-like) of the file to read.

    Returns:
        list[str]: The remaining entries, in file order.
    """
    # Explicit encoding so the result does not depend on the locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        terms = [line.strip("\n").split("#")[0] for line in f]
    # Whitespace-only leftovers (e.g. an indented comment line) are not
    # entries and must not reach the callers as bogus rows.
    return [term for term in terms if term.strip() != ""]

In [6]:
def read_paradigm(file_path):
    """Load a paradigm file as a list of column tuples.

    Every entry returned by ``read_util`` is split on the literal separator
    ``" -> "``; a single trailing space, if present, is removed from each
    resulting column.

    Args:
        file_path: Location of the paradigm file.

    Returns:
        list[tuple[str, ...]]: One tuple of columns per entry, in file order.
    """
    rows = []
    for entry in read_util(file_path):
        columns = entry.split(" -> ")
        # Only one space directly before the end of the column is removed.
        rows.append(tuple(re.sub(r" \Z", r"", column) for column in columns))
    return rows

In [39]:
def parser(line, hit, paradigm):
    """Label a line with the paradigm classes whose patterns occur in it.

    Args:
        line: The text line to classify.
        hit: Match object of the search that selected the line; its matched
            text is only consulted when no paradigm pattern matches.
        paradigm: Iterable of ``(pos, regex, lemma)`` triples.

    Returns:
        str: Three tab-separated fields: the labels joined by ``"; "``, the
        number of labels, and the line itself. A label is ``pos_lemma`` for
        every triple whose regex is found in the line, or the single
        fallback ``X_`` followed by the text matched by ``hit`` when none is.
    """
    labels = [
        f"{pos}_{lemma}"
        for pos, pattern, lemma in paradigm
        if re.compile(pattern).search(line) is not None
    ]

    # Fall back to the raw matched text only when no pattern applied.
    if not labels:
        labels = [f"X_{hit.group()}"]

    joined = "; ".join(labels)
    count = len(labels)
    return f"{joined}\t{count}\t{line}"


In [35]:
def context_collector(corpus_in, corpus_out, full_paradigm, stop=None):
    """Extract the lines that contain a paradigm root from every corpus file.

    For each file in ``corpus_in`` (processed in sorted name order) a file of
    the same name is (re)created in ``corpus_out``. Every input line in which
    one of the paradigm roots is found is passed to ``parser`` and the result
    is written as one output line. Progress and the elapsed time are printed.

    Args:
        corpus_in: Directory containing the input files.
        corpus_out: Directory receiving the output files; created if missing.
        full_paradigm: Path of the paradigm file. Each entry is unpacked into
            the four columns root, pos, regex and lemma.
        stop: Optional 1-based file number at which to stop. The function
            returns as soon as that file is reached, i.e. before processing
            it and without printing the final summary.

    Raises:
        ValueError: If the paradigm file yields no entries.
    """
    t0 = time.time()
    corpus_in = Path(corpus_in)
    corpus_out = Path(corpus_out)
    corpus_out.mkdir(parents=True, exist_ok=True)
    files = sorted(os.listdir(corpus_in))

    entries = read_paradigm(full_paradigm)
    if not entries:
        raise ValueError(f"no paradigm entries found in {full_paradigm}")
    roots, pos, regex, lemmas = zip(*entries)

    paradigm = list(zip(pos, regex, lemmas))
    # dict.fromkeys de-duplicates while keeping file order. Alternation order
    # decides which root matches first, so a plain set would make the
    # fallback label vary from run to run.
    roots = re.compile(r"(" + "|".join(dict.fromkeys(roots)) + ")")

    for k, file in enumerate(files, start=1):

        if stop is not None and k == stop:
            return

        # Mode "w" truncates output left over from a previous run.
        with open(corpus_in / file, "r", encoding="utf-8") as f_in, \
                open(corpus_out / file, "w", encoding="utf-8") as f_out:

            for i, line in enumerate(f_in):

                if i % 10000 == 0:
                    print(f"PROCESSED INPUT: {file} {k} / {len(files)}: {i:>10}", end="\r")

                hit = roots.search(line)
                if hit is None:
                    continue

                # get lemma(s), line "LEM    EXAMPLE"
                line = line.strip("\n")
                f_out.write(parser(line, hit, paradigm) + "\n")

    # Split the elapsed time into whole minutes and the remaining seconds.
    m, s = divmod(int(time.time() - t0), 60)

    print()
    print("Done!", f"({m} m, {s} s)")

In [41]:
# Run the extraction over every input file; stop=None disables the early stop.
context_collector(corpus_in, corpus_out, paradigm_path, stop=None)

PROCESSED INPUT: 2022.txt 23 / 23:    3610000
Done! (2 m, 2 s)
