- See /docs/README.md

In [7]:
import json

class Metadata:
	"""Descriptive fields attached to one v4 corpus file.

	Each constructor argument is stored unchanged on the instance under
	the attribute of the same name:

	v4_filename       -- name of the v4 JSON file
	author            -- author string
	annotation_info   -- free-text description of how the glosses were made
	original_language -- language code of the text
	gloss_language    -- language code of the glosses
	note              -- free-text note
	"""

	def __init__(self, v4_filename, author, annotation_info, original_language, gloss_language, note):
		# Plain attribute bag: bind every argument in one parallel assignment.
		(
			self.v4_filename,
			self.author,
			self.annotation_info,
			self.original_language,
			self.gloss_language,
			self.note,
		) = (
			v4_filename,
			author,
			annotation_info,
			original_language,
			gloss_language,
			note,
		)
	
# The v4 corpora to convert. All entries share the same annotation info,
# gloss language and note; only filename, author and original language vary,
# so those three values are listed per entry and the rest is filled in once.
v4s = [
	Metadata(
		v4_filename=corpus_filename,
		author=corpus_author,
		annotation_info="Machine-glossed with fine-tuned gpt-4o-mini (https://github.com/parkchamchi/GlossySnake/blob/master/src/tools/data/gs_240918.jsonl)",
		original_language=corpus_language,
		gloss_language="en",
		note="Initially generated as a v4 file.",
	)
	for corpus_filename, corpus_author, corpus_language in (
		("Tristesse d'Olympio.corpus.json", "Victor Hugo", "fr"),
		("Der Prozess.corpus.json", "Franz Kafka", "de"),
		("Die Leiden des jungen Werther.corpus.json", "Goethe", "de"),
		("Ethica - Pars I.corpus.json", "Spinoza", "la"),
		("Le papillon et le fleur.corpus.json", "Victor Hugo", "fr"),
		("Winterreise.corpus.json", "Wilhelm Müller", "de"),
	)
]

def v4_to_v5(meta):
	"""Convert one v4 corpus JSON file into a v5 XML file.

	The JSON file named by ``meta.v4_filename`` is read; it must hold a
	"paragraphs" list whose items each hold a "tokens" list. Every token
	provides "txt" and "is_delimiter"; non-delimiter tokens also provide
	"gloss". Each non-delimiter token becomes a numbered ``<span>`` in the
	``<html>`` section and a matching ``<gloss for='N'>`` in ``<glosses>``.
	The glosses "!UNKNOWN" and "!TO_REANNOTATE" additionally get a
	``special`` attribute.

	The result is written next to the input: a trailing ".json" is swapped
	for ".xml"; a name without that suffix gets ".xml" appended, so the
	input file is never overwritten.

	Parameters:
		meta: object exposing ``v4_filename``, ``author``,
			``annotation_info``, ``original_language``, ``gloss_language``
			and ``note`` attributes (e.g. a ``Metadata`` instance).

	Returns:
		None. The only effect is the XML file written to disk.

	Raises:
		OSError: if the input cannot be read or the output cannot be written.
		json.JSONDecodeError: if the input is not valid JSON.
		KeyError: if an expected key is missing from the JSON data.
	"""
	# Function-scope import so the block does not depend on the file's import list.
	from xml.sax.saxutils import escape

	def _text(value):
		# Element content only needs '&', '<' and '>' escaped. str() mirrors
		# the implicit conversion an f-string placeholder would perform.
		return escape(str(value))

	with open(meta.v4_filename, "rt", encoding="utf-8") as fin:
		v4 = json.load(fin)

	# Naive: the title is the filename with ".corpus.json" removed.
	title = meta.v4_filename.replace(".corpus.json", "")

	# Token text additionally escapes both quote characters.
	quote_entities = {"'": "&apos;", '"': "&quot;"}

	html_parts = []
	gloss_parts = []
	t_id = 0
	for p in v4["paragraphs"]:
		for t in p["tokens"]:
			# Escape XML characters first, then turn newlines into markup so
			# the inserted "<br />" is not itself escaped.
			p_out = escape(t["txt"], quote_entities)
			p_out = p_out.replace('\n', "<br />")

			if not t["is_delimiter"]:
				p_out = f"<span class='token-span' id='{ t_id }'>{ p_out }</span>"

				gloss = t["gloss"]
				special = ""
				if gloss == "!UNKNOWN":
					special = " special='unknown'"
				elif gloss == "!TO_REANNOTATE":
					special = " special='to-reannotate'"
				# The gloss is escaped as well; a raw '&' or '<' would make
				# the document malformed.
				gloss_parts.append(f"<gloss for='{ t_id }'{ special }>{ _text(gloss) }</gloss>")

				t_id += 1 #Unique
			html_parts.append(p_out)

	p_out_all = "".join(html_parts)
	g_out_all = "".join(gloss_parts)

	# The trailing replace() calls flatten the template's own layout onto one
	# line; they also turn any tab or newline left in the content into a space.
	out = f"""<?xml version='1.0' encoding='UTF-8' ?>
	<glossysnake-text version='5'>
		<metadata>
			<item type='title'>{ _text(title) }</item>
			<item type='author'>{ _text(meta.author) }</item>
			<item type='annotation-info'>{ _text(meta.annotation_info) }</item>
			<item type='original-language'>{ _text(meta.original_language) }</item>
			<item type='gloss-language'>{ _text(meta.gloss_language) }</item>
			<item type='note'>{ _text(meta.note) }</item>
		</metadata>

		<html>{ p_out_all }</html>

		<glosses>{ g_out_all }</glosses>
	</glossysnake-text>""".replace('\n', ' ').replace('\t', ' ')

	# Only swap a trailing ".json"; otherwise append ".xml" so the output
	# path can never equal the input path.
	if meta.v4_filename.endswith(".json"):
		out_filename = meta.v4_filename[:-len(".json")] + ".xml"
	else:
		out_filename = meta.v4_filename + ".xml"

	with open(out_filename, "wt", encoding="utf-8") as fout:
		fout.write(out)

# Convert every corpus listed in v4s; each call reads one JSON file and
# writes one XML file.
for v4 in v4s:
	v4_to_v5(v4)