In [1]:
import re
import os
import json
#import csv
from pathlib import Path
import time
import json

In [2]:
# Directory that receives the JSON export of the counts (machine-specific absolute path).
output_dir = Path("/home/max/tmp")

In [3]:
# Corpus directory handed to dwt_counter, which reads every file in it line by line
# (machine-specific absolute path).
corpus = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3/files")

In [4]:
# Seed-term list, one term per line; parsed with read_util.
dwt_seeds = Path("utils/dwts.txt")

In [5]:
def read_util(file_path):
    """Read a utility file and return its non-empty entries.

    Everything from the first ``#`` on a line is treated as a comment and
    dropped. Surrounding whitespace is removed from what remains, so an inline
    comment such as ``term # note`` yields ``"term"`` rather than ``"term "``,
    and lines that are blank or contain only whitespace/comments are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        The entries in file order.
    """
    terms = []
    # The files contain Swedish characters; read them as UTF-8 explicitly so
    # the result does not depend on the locale's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            term = line.split("#")[0].strip()
            if term:
                terms.append(term)
    return terms

In [6]:
# Display the seed terms parsed from the seed file.
read_util(dwt_seeds)

['förortsgäng',
 'återvandr',
 'berika',
 'kulturberika',
 'ordning_och_reda_i_flyktingpolitiken',
 'globalist',
 'hjälpa_på_plats',
 'självständig_utrikespolitik']

In [7]:
def read_paradigm(file_path):
    """Parse a paradigm file into a list of column tuples.

    The entries returned by ``read_util`` are each split on ``" -> "``.
    A single trailing space, if present, is removed from every column, and
    the columns of one entry are returned as a tuple.
    """
    rows = []
    for entry in read_util(file_path):
        columns = entry.split(" -> ")
        # Drop exactly one space at the very end of a column, nothing more.
        rows.append(tuple(col[:-1] if col.endswith(" ") else col for col in columns))
    return rows

In [8]:
# Display the parsed paradigm rows; dwt_counter unpacks each row as (root, pos, regex, lemma).
read_paradigm("utils/dwts.paradigm")

[('förortsgäng', 'N1', '\\bförortsgäng(|et|en|s|ets|ens)\\b', 'förortsgäng'),
 ('förortsgäng',
  'N1C',
  '\\b(förortsgäng)s?([^(\\s|et|en|s|ets|ens)].*?)\\b',
  'förortsgängX'),
 ('återvandr',
  'N1',
  '\\båtervandring(|en|ar|s|ens|ars|arna|arnas)\\b',
  'återvandring'),
 ('återvandr', 'N2', '\\båtervandrar(e|en|es|ens|na|nas)\\b', 'återvandrare'),
 ('återvandr', 'N3', '\\båtervandrande(|t|ts|n|ns|rnas)', 'återvandrande'),
 ('återvandr',
  'N1C',
  '\\b(återvandring)s?([^(\\s|en|ar|s|ens|ars|arna|arnas)].*?)\\b',
  'återvandringsX'),
 ('återvandr', 'N2C', '\\b(återvandrar)([^(\\s|e)].*?)\\b', 'återvandrarX'),
 ('återvandr', 'V1', '\\båtervandra(|r|d|de|t)\\b', 'återvandra'),
 ('berika', 'N1', '\\bberikar(e|en|na|es|ens|nas)\\b', 'berikare'),
 ('berika', 'N1C', '\\b(berikar)([^(\\s|en|na|s|ens|nas)].*?)\\b', 'berikarX'),
 ('berika', 'V1', '\\bberik(a|ar|ad|ade|at|ande)\\b', 'berika'),
 ('berika', 'N1', '\\bkulturberikar(e|en|na|es|ens|nas)\\b', 'kulturberikare'),
 ('berika',
  'N1C',


In [9]:
def _year_from_filename(filename):
    """Return ``filename`` without a trailing ``.txt`` extension."""
    suffix = ".txt"
    # str.strip(".txt") would remove any of the characters '.', 't', 'x' from
    # BOTH ends of the name (e.g. "text.txt" -> "e"), so cut the suffix explicitly.
    if filename.endswith(suffix):
        return filename[: -len(suffix)]
    return filename


def dwt_counter(corpus, full_paradigm):
    """Count matched word forms per corpus file and lemma.

    Every file in ``corpus`` is read line by line. A line is first checked
    against a combined regex built from all paradigm roots; only lines that
    pass this cheap filter are tested against the individual paradigm regexes.
    For each paradigm regex, at most the FIRST match on a line is counted.

    Parameters
    ----------
    corpus : str or path-like
        Directory whose files are scanned. The file name minus a trailing
        ``.txt`` is used as the top-level key of the result.
    full_paradigm : iterable of 4-tuples
        ``(root, pos, regex, lemma)`` rows, e.g. as returned by
        ``read_paradigm``. Roots are joined unescaped into one alternation.

    Returns
    -------
    dict
        ``{year: {lemma: {matched_text: count}}}`` with an entry for every
        file and every lemma, including those without matches.

    Raises
    ------
    ValueError
        If ``full_paradigm`` is empty (the rows cannot be unpacked).
    """
    t0 = time.time()
    corpus = Path(corpus)
    files = sorted(os.listdir(corpus))
    years = [_year_from_filename(name) for name in files]
    roots, _pos_tags, patterns, lemmas = zip(*full_paradigm)
    counter = {year: {lemma: {} for lemma in lemmas} for year in years}

    # Compile once up front instead of once per paradigm row per matching line.
    compiled = [(re.compile(pattern), lemma) for pattern, lemma in zip(patterns, lemmas)]
    root_filter = re.compile(r"(" + "|".join(set(roots)) + ")")

    hits = 0

    for k, (file, year) in enumerate(zip(files, years), start=1):
        year_counts = counter[year]
        # Explicit UTF-8 so decoding does not depend on the locale default.
        with open(corpus / file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):

                if i % 10000 == 0:
                    print(f"{file} {k} / {len(files)}: {i}; hits = {hits}    ", end="\r")

                # Cheap pre-filter: skip lines that contain none of the roots.
                if root_filter.search(line) is None:
                    continue

                hits += 1

                for pattern, lemma in compiled:
                    payload = pattern.search(line)
                    if payload is None:
                        continue
                    hit = payload.group(0)
                    forms = year_counts[lemma]
                    forms[hit] = forms.get(hit, 0) + 1

    delta_t = time.time() - t0
    m = int(delta_t // 60)
    s = int(delta_t % 60)  # remaining seconds, not the minutes a second time

    # Finish the "\r" progress line so the summary is not printed on top of it.
    print()
    print("Done!", f"({m} m, {s} s)")

    return counter

In [10]:
# Run the count over the whole corpus; the result is nested as year -> lemma -> matched form -> count.
dd = dwt_counter(corpus, read_paradigm("utils/dwts.paradigm"))

Done! (3 m, 3 s): 3610000; hits = 75689    


In [11]:
# Inspect the full nested result (includes lemmas with no matches as empty dicts).
dd

{'2000': {'förortsgäng': {},
  'förortsgängX': {},
  'återvandring': {},
  'återvandrare': {},
  'återvandrande': {},
  'återvandringsX': {},
  'återvandrarX': {},
  'återvandra': {},
  'berikare': {},
  'berikarX': {},
  'berika': {},
  'kulturberikare': {},
  'kulturberikarX': {},
  'kulturberika': {},
  'ordning_och_reda_i_flyktingpolitiken': {},
  'globalist': {},
  'globalistX': {},
  'globalistisk': {},
  'hjälpa_på_plats': {},
  'hjälpa_X_på_plats': {'hjälpa folket på plats': 1},
  'självständig_utrikespolitik': {}},
 '2001': {'förortsgäng': {},
  'förortsgängX': {},
  'återvandring': {},
  'återvandrare': {},
  'återvandrande': {},
  'återvandringsX': {},
  'återvandrarX': {},
  'återvandra': {},
  'berikare': {},
  'berikarX': {},
  'berika': {},
  'kulturberikare': {},
  'kulturberikarX': {},
  'kulturberika': {},
  'ordning_och_reda_i_flyktingpolitiken': {},
  'globalist': {},
  'globalistX': {},
  'globalistisk': {},
  'hjälpa_på_plats': {},
  'hjälpa_X_på_plats': {},
  'sj

In [None]:
# Persist the nested counts as JSON in the output directory.
json_path = output_dir / "radical3.json"
with json_path.open("w") as handle:
    json.dump(dd, handle)

In [None]:
# Plain-text report: for every year, list each lemma that has at least one
# match together with the individual matched forms and their counts.
for year, lemma_counts in dd.items():
    print(year)
    for lemma, forms in lemma_counts.items():
        tot = sum(forms.values())
        if tot <= 0:
            continue
        print(lemma.upper(), f"(n={tot})")
        for word, count in forms.items():
            print("\t", word, count)