# Ad-hoc code for v4 JSON -> v5 XML -> JSON

In [51]:
import json

xml_filename = "Winterreise.corpus.xml"

# Load the corpus and rename its <html> wrapper to <HUH> in a single pass.
# The renamed element is looked up later via soup.find("huh"); presumably the
# rename stops the HTML parser from treating it as the document root
# (TODO confirm).
with open(xml_filename, "rt", encoding="utf-8") as fin:
	xml = fin.read().replace("<html>", "<HUH>").replace("</html>", "</HUH>")

In [52]:
from bs4 import BeautifulSoup as Soup
# Parse with lxml's HTML parser (not an XML parser): later cells look the
# renamed wrapper up by its lower-cased tag name "huh".
soup = Soup(markup=xml, features="lxml")

  soup = Soup(xml, "lxml")


In [53]:
# Collect every <item type="..."> under <metadata> into a {type: text} dict.
# If a type occurs more than once, the last occurrence wins.
metadata_e = soup.find("metadata")
metadata = {
	item["type"]: item.get_text()
	for item in metadata_e.find_all("item")
}
print(metadata)

{'title': 'Winterreise', 'author': 'Wilhelm Müller', 'annotation-info': 'Machine-glossed with fine-tuned gpt-4o-mini (https://github.com/parkchamchi/GlossySnake/blob/master/src/tools/data/gs_240918.jsonl)', 'original-language': 'de', 'gloss-language': 'en', 'note': 'Initially generated as a v4 file.'}


In [54]:
# Build {span id: {"g_txt": gloss text, "g_special": special marker or None}}
# from every <gloss for="..."> element under <glosses>.
glosses_e = soup.find("glosses")
glosses = {}
for g in glosses_e.find_all("gloss"):
	g_target = g["for"]
	g_txt = g.get_text()
	# Tag.get() returns None when the attribute is absent. The previous
	# `g.has_attr("special") in g` tested whether a boolean was among the tag's
	# children, which is never true, so the marker was silently dropped and
	# every gloss ended up with g_special=None.
	g_special = g.get("special")
	if g_special:
		print(g_special)
	glosses[g_target] = {"g_txt": g_txt, "g_special": g_special}

print(glosses)

{'0': {'g_txt': '#', 'g_special': None}, '1': {'g_txt': 'Good', 'g_special': None}, '2': {'g_txt': 'Night.', 'g_special': None}, '3': {'g_txt': 'Stranger', 'g_special': None}, '4': {'g_txt': 'am', 'g_special': None}, '5': {'g_txt': 'I', 'g_special': None}, '6': {'g_txt': 'gone-in,', 'g_special': None}, '7': {'g_txt': 'stranger', 'g_special': None}, '8': {'g_txt': 'draw', 'g_special': None}, '9': {'g_txt': 'I', 'g_special': None}, '10': {'g_txt': 'again (now)', 'g_special': None}, '11': {'g_txt': 'out.', 'g_special': None}, '12': {'g_txt': 'The', 'g_special': None}, '13': {'g_txt': 'May', 'g_special': None}, '14': {'g_txt': 'was', 'g_special': None}, '15': {'g_txt': 'to-me', 'g_special': None}, '16': {'g_txt': 'favourable', 'g_special': None}, '17': {'g_txt': 'With', 'g_special': None}, '18': {'g_txt': 'many-a', 'g_special': None}, '19': {'g_txt': 'flower-bunch.', 'g_special': None}, '20': {'g_txt': 'The', 'g_special': None}, '21': {'g_txt': 'girl', 'g_special': None}, '22': {'g_txt': '

In [55]:
# Regex for a paragraph break: two <br/> tags with optional whitespace between.
paragraph_delimiter = r"<br/>\s*<br/>"
# Minimum length (in characters of markup) a chunk must reach before a new one starts.
chars_per_paragraph = 2048

# Inner markup of the renamed <HUH> wrapper (the parser lower-cases tag names).
huh_e = soup.find("huh")
html = huh_e.decode_contents()
#print(html)

import re

def divide_into_paragraphs(html, chars_per_paragraph, paragraph_delimiter):
    """Cut ``html`` into chunks that only break right after a paragraph delimiter.

    The text is split on ``paragraph_delimiter`` (a regex) with the delimiters
    kept. Pieces are then glued back together left to right: a delimiter is
    always attached to the chunk before it, and a text piece is attached to
    the current chunk while that chunk is still shorter than
    ``chars_per_paragraph`` characters. Otherwise the piece opens a new chunk.

    Args:
        html: The markup string to divide.
        chars_per_paragraph: Length a chunk must reach before the next text
            piece is allowed to start a new chunk.
        paragraph_delimiter: Regex source matching one paragraph break.

    Returns:
        A list of strings whose concatenation equals ``html``.
    """
    pieces = re.split(f"({paragraph_delimiter})", html)

    chunks = []
    for piece in pieces:
        # A new chunk starts for the very first piece, or for a non-delimiter
        # piece once the current chunk has grown long enough.
        opens_new_chunk = not chunks or (
            re.fullmatch(paragraph_delimiter, piece) is None
            and len(chunks[-1]) >= chars_per_paragraph
        )
        if opens_new_chunk:
            chunks.append(piece)
        else:
            chunks[-1] = chunks[-1] + piece

    return chunks

paragraphs = divide_into_paragraphs(html, chars_per_paragraph, paragraph_delimiter)
# Chunking must be lossless: gluing the chunks back together restores the input.
assert ''.join(paragraphs) == html
# Show the length of every chunk on one line, each followed by a space.
print(" ".join(str(len(p)) for p in paragraphs), end=" ")


3670 3808 2314 2204 2742 2678 2955 2626 2527 2404 2106 2762 2717 2564 3141 2334 2835 2155 2188 2858 2400 5214 3005 2721 2817 2110 2217 2134 3197 2149 2869 2105 3014 2735 2428 691 

In [56]:
def construct_paragraph(p):
	"""Convert one chunk of v4-style markup into a list of token dicts.

	Every ``<span ...>text</span>`` becomes a word token
	``{"txt": text, "gloss": ..., "is_delimiter": False}``. Its gloss is looked
	up in the module-level ``glosses`` dict by the digits of the span's ``id``
	attribute; a ``g_special`` of ``"unknown"`` or ``"to-reannotate"`` is
	replaced by the marker ``"!UNKNOWN"`` or ``"!TO_REANNOTATE"``.

	Everything between spans is split on whitespace runs (the runs are kept),
	``<br/>`` is replaced by a newline, and each non-empty piece becomes
	``{"txt": piece, "gloss": None, "is_delimiter": True}``.

	Args:
		p: Markup string for one paragraph chunk.

	Returns:
		The list of token dicts in document order.
	"""
	span_re = r"(<span.*?</span>)"

	# Splitting on a capturing pattern keeps the spans themselves in the result.
	pieces = [piece for piece in re.split(span_re, p) if piece != ""]
	assert p == ''.join(pieces)

	result = []
	for piece in pieces:
		if re.fullmatch(span_re, piece) is None:
			# Text between spans: whitespace runs and other separators.
			result.extend(
				{"txt": part.replace("<br/>", "\n"), "gloss": None, "is_delimiter": True}
				for part in re.split(r"(\s+)", piece)
				if part != ""
			)
			continue

		# A glossed word: resolve its gloss through the span id.
		span_id = re.search(r'id="(\d+)"', piece).group(1)
		entry = glosses[span_id]
		flag = entry["g_special"]
		gloss = (
			"!UNKNOWN" if flag == "unknown"
			else "!TO_REANNOTATE" if flag == "to-reannotate"
			else entry["g_txt"]
		)

		inner = re.search(r'<span.*?>(.*?)</span>', piece)
		result.append({
			"txt": inner.group(1) if inner else "",
			"gloss": gloss,
			"is_delimiter": False,
		})

	return result

# Tokenize every chunk, then wrap each token list as {"tokens": [...]}.
# NOTE: `paragraphs` is rebound from a list of strings to a list of dicts here,
# so this cell cannot be re-run without first re-running the chunking cell.
new_paragraphs = [construct_paragraph(p) for p in paragraphs]
paragraphs = [{"tokens": tokens} for tokens in new_paragraphs]
	

In [None]:
# Assemble the v5 corpus and write it next to the source XML.
corpus = dict(version=5, metadata=metadata, paragraphs=paragraphs)

# NOTE(review): the "..json" suffix produces a double dot in the file name
# ("Winterreise.corpus..json"); confirm whether that is intended.
output_filename = xml_filename.replace(".xml", "..json")
with open(output_filename, "wt", encoding="utf-8") as fout:
	json.dump(corpus, fout)