In [1]:
import xml.etree.ElementTree as ET
import re
from string import punctuation
from os.path import exists, getsize
from os import listdir, remove
from pathlib import Path
import json
import time
import tracemalloc

Reference for monitoring memory usage with `tracemalloc`: https://www.geeksforgeeks.org/monitoring-memory-usage-of-a-running-python-program/

In [2]:
# Location of the source XML corpus passed to `build_temporal_corpus`.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
file_path = "/home/max/datasets/flashback-mat.xml"

```
corpus
    thread
        text #date
            paragraph
                sentence
                    token
```

In [None]:
# 119 000 000 lines passed in 4 minutes and 19 seconds.

In [None]:
# from 2000 to 2022

In [3]:
# Load the stopword list: one stopword per line.
# Surrounding whitespace (including "\r" from CRLF files) is stripped and
# blank lines are skipped, so no empty or padded entries end up in the list.
stopwords_path = Path("utils/stopwords-sv.txt")
with open(stopwords_path, mode = "r") as f:
    stopwords = [line.strip() for line in f if line.strip()]

In [4]:
def inititialize(directory, start_year, stop_year, suffix):
    """
    Creates a set of files to populate with examples.
    WARNING: replaces existing files with empty ones.

    One file is created per year in `start_year`..`stop_year` (inclusive),
    named `<year>.txt` when `suffix` is empty and `<year>_<suffix>.txt`
    otherwise -- the same naming scheme `save_sentences()` appends to.
    The output directory is created if it does not exist yet.
    """

    directory = Path(directory)
    directory.mkdir(parents = True, exist_ok = True)

    for year in range(start_year, stop_year + 1):
        filename = f"{year}.txt" if suffix == "" else f"{year}_{suffix}.txt"
        # Opening in "w" mode truncates an existing file / creates an empty one.
        with open(directory / filename, mode = "w"):
            pass
    

In [5]:
def save_sentences(batch, directory, suffix):
    """
    Appends each example of a batch to the .txt file of its year.
    Assumes initiation by `inititialize()`.

    `batch` is an iterable of `(year, example_str)` pairs. Examples are
    grouped per year first so that every year file is opened only once per
    batch; the order of examples within a year is preserved.
    """

    directory = Path(directory)

    # Group the examples by year, keeping their original relative order.
    examples_by_year = {}
    for year, example_str in batch:
        examples_by_year.setdefault(year, []).append(example_str)

    for year, examples in examples_by_year.items():
        filename = f"{year}.txt" if suffix == "" else f"{year}_{suffix}.txt"

        with open(directory / filename, mode="a") as f:
            f.writelines(example_str + "\n" for example_str in examples)

In [6]:
def clean_up(directory):
    """
    Removes empty files.

    `directory` may be a string or a `Path`. Only regular files of size 0
    directly inside `directory` are deleted; sub-directories are left alone.
    """

    directory = Path(directory)

    for file in listdir(directory):
        file_path = directory / file
        if file_path.is_file() and getsize(file_path) == 0:
            remove(file_path)
    

In [7]:
def preprocess(example,
               remove_stopwords = False,
               stopwords = None,
               remove_numbers = True,
               remove_urls = True,
               remove_punctuations = True,
               min_tok_utterances = None,
               lower = True):
    """ 
    Preprocess a list of words and returns a string.

    Parameters
    ----------
    example : list of str (None entries are ignored)
        The tokens of one sentence.
    remove_stopwords : bool
        Drop tokens found in `stopwords`. The comparison is case-insensitive
        when `lower` is True.
    stopwords : list or set, required when `remove_stopwords` is True
    remove_numbers : bool
        Delete every run of digits from the joined string.
    remove_urls : bool
        Delete `http(s)://...` and `www....` substrings (up to the next
        whitespace) from the joined string.
    remove_punctuations : bool
        Drop tokens that consist solely of ASCII punctuation characters.
    min_tok_utterances : int or None
        If given, sentences with this many tokens or fewer (after stopword
        and punctuation filtering) are rejected.
    lower : bool
        Lowercase the result.

    Returns
    -------
    str, or the empty list `[]` when the sentence is rejected by
    `min_tok_utterances`.
    """

    if remove_stopwords:
        assert isinstance(stopwords, list) or isinstance(stopwords, set), "You have selected to use stopwords, but no stopword list is provided."

    # Tokens without text arrive as None; they carry no content.
    tokens = [token for token in example if token is not None]

    if lower:
        tokens = [token.lower() for token in tokens]

    if remove_stopwords:
        # A set gives constant-time lookups; lowercase it when the tokens are
        # lowercased so that e.g. "Och" is matched by the stopword "och".
        if lower:
            stopword_set = {stopword.lower() for stopword in stopwords}
        else:
            stopword_set = set(stopwords)
        tokens = [token for token in tokens if token not in stopword_set]

    if remove_punctuations:
        # Test character by character: a substring test against `punctuation`
        # would keep tokens such as "..." or "?!".
        punctuation_chars = set(punctuation)
        tokens = [token for token in tokens
                  if not all(char in punctuation_chars for char in token)]

    if min_tok_utterances is not None:
        if len(tokens) <= min_tok_utterances:
            return []

    text = " ".join(tokens)

    if remove_urls:
        # \S* stops at the next whitespace, so the words following a URL are
        # kept. URLs are removed before numbers so they are matched intact.
        text = re.sub(r"https?://\S*", "", text)
        text = re.sub(r"www\.\S*", "", text)

    if remove_numbers:
        text = re.sub(r"[0-9]+", "", text)

    return text

In [8]:
def build_temporal_corpus(xml_file_path, 
                          directory_out, 
                          stop = 400,
                          interval = 100000,
                          start_year = 2000, 
                          end_year = 2022,
                          buffer_limit = 500, 
                          suffix = ""):
    """
    Builds temporal corpus from xml-file; one file per year.

    The XML file is read line by line. A `<text ...>` line sets the current
    year (taken from its `date` attribute), `<token>` lines are collected
    into the current sentence, and every `</sentence>` line triggers
    preprocessing and buffering of the sentence.

    Parameters
    ----------
    xml_file_path : path of the XML corpus to read.
    directory_out : directory receiving one `.txt` file per year.
    stop          : stop once the line index exceeds this value;
                    `None` processes the whole file.
    interval      : number of lines between progress reports (and between
                    resets of the XML parser).
    start_year, end_year : inclusive range of years to build files for.
                    Sentences dated outside this range, or seen before any
                    `<text>` line, are skipped.
    buffer_limit  : number of buffered sentences that triggers a write.
    suffix        : optional file name suffix (`<year>_<suffix>.txt`).

    Side effects: truncates/creates the year files, writes `counter.log`
    (JSON with per-year example and token counts) in the working directory,
    removes empty year files, and prints progress including the memory
    traced by `tracemalloc`.
    """

    t0 = time.time()
    tracemalloc.start()
    memory0 = (0, 0)

    directory_out = Path(directory_out)

    collector = []

    counter = {str(year): {"examples": 0, "word_tokens": 0} for year in range(start_year, end_year + 1)}

    # Bound up front so that malformed input (tokens before any <sentence>,
    # sentences before any <text>) or an empty file cannot raise NameError.
    year = None
    raw = []
    i = 0

    try:
        inititialize(directory_out, start_year, end_year, suffix)

        xml_parser = ET.XMLPullParser(['start'])

        with open(xml_file_path, "r") as xml:

            for i, line in enumerate(xml):

                if i % interval == 0:

                    t = time.time() - t0
                    m = int(t / 60)
                    s = t % 60

                    norm, unit = (1000000, "MB")
                    memory1 = tracemalloc.get_traced_memory()
                    memory  = round(memory1[0]/norm, 1)
                    memory_delta = round((memory1[0]-memory0[0])/norm, 1)
                    memory0 = memory1

                    print(f"{i} lines; in {m} m, {s:.1f} s; memory={memory} {unit} (+{memory_delta} {unit}) ", end = "\r")

                    # This manoeuvre is to avoid memory explosion: the parser
                    # keeps everything fed to it, so start over with a fresh one.
                    del xml_parser
                    xml_parser = ET.XMLPullParser(['start'])

                if stop is not None and i > stop:
                    break

                if re.search("<text.*?>", line) is not None:
                    xml_parser.feed(line)
                    date = [tag for _, tag in xml_parser.read_events()][0].attrib["date"]
                    year = date.split()[0].split("-")[0] # "2012-08-17 10:22"
                    continue

                if re.search("<sentence.*?>", line) is not None:
                    # A new sentence starts: begin a fresh token list.
                    # (Lemmas and POS tags are deliberately not collected;
                    # inspection suggested the lemmas are unreliable.)
                    raw = list()
                    continue

                if re.search("<token.*?>", line) is not None:
                    xml_parser.feed("<root>"+line) # fake root to avoid ParseError
                    tag = [tag for _, tag in xml_parser.read_events()][-1]
                    raw.append(tag.text)
                    continue

                if re.search("</sentence>", line) is not None:
                    # Only years that have a counter entry (and an initialised
                    # file) are collected; indexing `counter` with any other
                    # year would raise KeyError.
                    if year not in counter:
                        continue

                    example = preprocess(raw)    # consider other parameters: pos, lemmas

                    # `preprocess` returns [] for rejected sentences and may
                    # return an empty/blank string when nothing is left; in
                    # both cases there is nothing worth saving.
                    if not example or not example.strip():
                        continue

                    counter[year]["examples"] += 1
                    counter[year]["word_tokens"] += len(example.split())
                    collector.append((year, example))

                    # Write in batches: saving every sentence separately would
                    # mean a lot of file opening and closing, and keeping all
                    # year files open at once would need too many open files.
                    if len(collector) >= buffer_limit:
                        save_sentences(batch = collector, directory = directory_out, suffix = suffix)
                        collector.clear()

        # Flush whatever is left in the buffer.
        if collector:
            save_sentences(batch = collector, directory = directory_out, suffix = suffix)

        # Only log years that actually received examples.
        counter = {year: counts for year, counts in counter.items() if sum(counts.values()) != 0}
        with open("counter.log", "w") as log:
            log.write(json.dumps(counter))

        clean_up(directory_out)

        print()
        print("Lines, total:", i)
        print("Done!")

    finally:
        # Stop tracing even when processing fails part-way.
        tracemalloc.stop()

In [9]:
# Build the corpus from the complete XML file into the "diamat1" directory.
build_temporal_corpus(
    xml_file_path=file_path,
    directory_out="diamat1",
    stop=None,  # no line limit: process every line of the file
)

119000000 lines; in 82 m, 13.5 s; memory=124.1 MB (+-0.6 MB) 
Lines, total: 119054753
Done!


In [None]:
# -preprocessing -saving 6M lines: 01:51

### Old code for `build_temporal_corpus` 
The old code uses `xml.etree.ElementTree.iterparse()`, but this does not seem to behave as a proper iterator: the computer gets paralyzed by memory overload.