In [1]:
import re
from pathlib import Path
import os
import time


Variation ...
* N1_kulturberikaren  --  behövs kanske inte
* N1_kulturberikare (lemma)
* N1C_globalistmaffia -- behövs nog inte heller
* N1C_globalistX
* N1C_globalist maffia 


##### Original
0. sv_compounds (original-original)
1. en_compounds (original-enc)
##### Lemmatized
2. sv_compounds (lem-original)
3. en_compounds (lem-enc)
    

In [2]:
def read_util(file_path):
    """Read a text file and return its non-empty entries as a list of strings.

    For every line the newline characters are stripped and everything from
    the first ``#`` onwards is discarded. Entries that end up as the empty
    string are dropped; other whitespace is left untouched.
    """
    kept = []
    with open(file_path, "r") as handle:
        for raw in handle:
            # keep only the text in front of the first "#"
            content, _, _ = raw.strip("\n").partition("#")
            if content:
                kept.append(content)
    return kept

In [3]:
def read_paradigm(file_path):
    """Parse a paradigm file into a list of tuples of column strings.

    Each entry returned by ``read_util`` is split on ``" -> "``; a single
    trailing space, if present, is removed from every column.
    """
    trailing_space = re.compile(r" \Z")
    rows = []
    for entry in read_util(file_path):
        columns = entry.split(" -> ")
        rows.append(tuple(trailing_space.sub(r"", column) for column in columns))
    return rows

In [4]:
def enc(line, paradigm):
    """Insert a space between the two captured parts of matching patterns.

    ``paradigm`` is an iterable of ``(pos, regex, lemma)`` triples. Only
    entries whose lemma ends with ``"X"`` are applied (see `.param` file).
    Returns the (possibly rewritten) line.
    """
    for _pos, raw_regex, lemma in paradigm:
        if not lemma.endswith("X"):
            continue
        pattern = re.compile(raw_regex)
        # substitute only when the pattern actually occurs in the line
        if pattern.search(line) is None:
            continue
        # the replacement template assumes the pattern looks like `(aaa)(bbb)`
        line = pattern.sub(r"\1 \2", line)
    return line
    
def dwt_lem(line, paradigm):
    """Replace every pattern match in ``line`` with ``"<pos>_<lemma>"``.

    ``paradigm`` is an iterable of ``(pos, regex, lemma)`` triples that are
    applied in order. The entry with lemma ``hjälpa_X_på_plats`` is skipped.
    Returns the (possibly rewritten) line.
    """
    skipped_lemma = "hjälpa_X_på_plats"
    for pos, raw_regex, lemma in paradigm:
        if lemma != skipped_lemma:
            replacement = f"{pos}_{lemma}"
            line = re.compile(raw_regex).sub(replacement, line)
    return line

In [5]:
def dwt_transformer(line, paradigm, mode):
    """Apply the transformation selected by ``mode`` to a single line.

    Parameters
    ----------
    line : str
        The line to transform.
    paradigm : iterable of (pos, regex, lemma)
        Passed on unchanged to ``enc`` / ``dwt_lem``.
    mode : int
        1 -- ``enc`` only; 2 -- ``dwt_lem`` only; 3 -- ``enc`` followed by
        ``dwt_lem``.

    Returns
    -------
    str
        The transformed line.

    Raises
    ------
    ValueError
        If ``mode`` is not 1, 2 or 3. Returning ``None`` here would only
        surface later as a ``TypeError`` when the caller writes the result.
    """
    if mode == 1:
        return enc(line, paradigm)
    if mode == 2:
        return dwt_lem(line, paradigm)
    if mode == 3:
        return dwt_lem(enc(line, paradigm), paradigm)
    raise ValueError(f"unsupported mode {mode!r}; expected 1, 2 or 3")

In [6]:
def preprocess_dwt(corpus_in, corpus_out, full_paradigm, log_dir, mode=1, stop=None):
    """Rewrite every file of a corpus directory line by line and log the changes.

    Each entry of ``corpus_in`` is written to a same-named file in
    ``corpus_out``. Lines matching none of the paradigm's root patterns are
    copied verbatim; every other line is passed through ``dwt_transformer``
    and recorded (together with its rewritten form, if it changed) in a
    per-file log inside ``log_dir``.

    Parameters
    ----------
    corpus_in : str or Path
        Directory with the input files; entries are processed in sorted order.
    corpus_out : str or Path
        Directory receiving the output files. It is not created here, and
        existing files of the same name are overwritten.
    full_paradigm : str or Path
        Paradigm file read with ``read_paradigm``; every row must have the
        four columns root, pos, regex, lemma.
    log_dir : str or Path
        Directory receiving one ``<name>.log`` per input file. It is not
        created here.
    mode : int
        Forwarded to ``dwt_transformer``.
    stop : int or None
        If given, the function returns as soon as the 1-based file counter
        equals ``stop`` (so ``stop - 1`` files are processed) and the final
        timing summary is not printed.
    """
    t0 = time.time()
    corpus_in = Path(corpus_in)
    corpus_out = Path(corpus_out)
    log_dir = Path(log_dir)  # accept plain strings, like the other directories
    files = sorted(os.listdir(corpus_in))

    roots, pos, regex, lemmas = zip(*read_paradigm(full_paradigm))

    paradigm = list(zip(pos, regex, lemmas))
    # One alternation over the distinct roots serves as a pre-filter, so the
    # per-entry regexes only run on lines that contain at least one root.
    roots = re.compile(r"(" + "|".join(set(roots)) + ")")

    for k, file in enumerate(files, start=1):

        if stop is not None and k == stop:
            return

        # Remove only a literal ".txt" suffix. str.strip(".txt") would strip
        # any of the characters ".", "t", "x" from both ends of the name.
        stem = file[:-len(".txt")] if file.endswith(".txt") else file
        log_file = stem + ".log"

        log_counter = 1

        # Mode "w" truncates the output file, so no separate reset is needed.
        with open(corpus_in / file, "r") as f_in, \
                open(corpus_out / file, "w") as f_out, \
                open(log_dir / log_file, "w") as log:

            log.write(f"\n---  {file}  ---\n")

            for i, line in enumerate(f_in):

                if i % 10000 == 0:
                    print(f"{file} {k} / {len(files)}: {i}    ", end="\r")

                if roots.search(line) is None:
                    f_out.write(line)
                    continue

                line_pp = dwt_transformer(line, paradigm, mode)

                log.write(f"============ {log_counter}\n")
                log.write(line)

                if line != line_pp:
                    log.write("----------->\n")
                    log.write(line_pp)

                f_out.write(line_pp)

                log_counter += 1

    # Split the elapsed time into whole minutes and the remaining seconds.
    m, s = divmod(int(time.time() - t0), 60)

    print()
    print("Done!", f"({m} m, {s} s)")

In [7]:
def out_template(x):
    """Return the flashback ``fb-pt-radical<x>`` output directory as a string."""
    path = f"/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical{x}/files"
    return path
def log_template(log_dir, x):
    """Return the ``radical<x>`` entry below ``log_dir``.

    ``log_dir`` is combined with the ``/`` operator, so it has to be a
    path object rather than a plain string.
    """
    subdir = f"radical{x}/"
    return log_dir / subdir

In [8]:
# Input corpus directory; preprocess_dwt processes every entry found in it.
# The flashback corpus is kept below as a commented-out alternative input.
#corpus_in = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-clean-sgns/files/")
corpus_in = Path("/home/max/Corpora/familjeliv-smh-time/yearly/fm-sh-clean-sgns/files/")

In [9]:
# Paradigm file; handed to preprocess_dwt as `full_paradigm` and parsed there by read_paradigm.
paradigm = Path("utils/dwts.paradigm")

In [10]:
# Base directory for the logs; log_template appends the per-mode "radical<mode>" entry.
log_dir = Path("../../pp-log-dir")

In [12]:
# Mode 3 applies enc followed by dwt_lem to each matching line (see dwt_transformer).
my_mode=3
preprocess_dwt(
    corpus_in = corpus_in, 
    #corpus_out = f"/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical{my_mode}/files", 
    corpus_out = f"/home/max/Corpora/familjeliv-smh-time/yearly/fm-sh-radical{my_mode}/files",
    full_paradigm = paradigm, 
    log_dir = log_template(log_dir, my_mode), 
    mode=my_mode, 
    stop=None  # None: process every file in corpus_in
)

2022.txt 20 / 20: 110000     
Done! (1 m, 1 s)


In [None]:
# for mode in [1, 2, 3]:
# #for mode in [3]:
#     print()
#     print("MODE =", mode)
#     print()
#     preprocess_dwt(corpus_in, 
#                    out_template(mode), 
#                    paradigm, 
#                    log_template(log_dir, mode), 
#                    mode=mode, 
#                    stop=None)