In [13]:
# Maps each UPOS (coarse part-of-speech) tag to the corpus-specific XPOS tags
# that should be reported under it.
UPOS_XPOS_MAP = {
    "VERB": ["vi", "vd", "vc", "v", "verb"],
    "NOUN": ["n", "nl", "nmlz", "noun"],
    "PRON": ["pron", "int"],
    "PROPN": ["propn"],
    "DET": ["adn", "determiner"],
    "ADV": ["adv"],
    "CCONJ": ["cconj"],
    # NOTE(review): "cconj" is listed here as well as under CCONJ, which looks
    # like a copy-paste slip. The reverse lookup below resolves the tag to
    # CCONJ (first listing wins); confirm and drop the duplicate if so.
    "SCONJ": ["cconj", "sconj", "padv"],
    # NOTE(review): "POST" is not one of the Universal Dependencies UPOS tags;
    # confirm whether it is intentional or should be folded into ADP.
    "POST": ["post"],
    "AUX": ["auxv", "cop"],
    "PART": ["sfp", "pers"],
    "INTJ": ["intj"],
    "ADP": ["advp", "postp", "parti"],
    "NUM": ["num"],
    "PUNCT": ["punct"],
}


def _build_xpos_to_upos(upos_to_xposes):
    """Invert a UPOS -> [XPOS, ...] mapping, keeping the first UPOS per XPOS."""
    reverse_map = {}
    for upos, xposes in upos_to_xposes.items():
        for xpos in xposes:
            # setdefault (rather than plain assignment) stops a later duplicate
            # listing from silently overriding the earlier one.
            reverse_map.setdefault(xpos, upos)
    return reverse_map


# Built once here instead of on every lookup; the lookup runs once per token.
_XPOS_TO_UPOS = _build_xpos_to_upos(UPOS_XPOS_MAP)


def generate_upos_from_xpos(xpos: str) -> str:
    """Return the UPOS tag for an XPOS tag.

    Args:
        xpos: A corpus-specific POS tag such as ``"vi"``.

    Returns:
        The UPOS key of ``UPOS_XPOS_MAP`` that lists ``xpos``. If several keys
        list it, the one defined first wins. ``"_"`` is returned for tags that
        are not listed at all.
    """
    return _XPOS_TO_UPOS.get(xpos, "_")

# Quick sanity check: "vi" is listed under "VERB" in UPOS_XPOS_MAP.
generate_upos_from_xpos("vi")

'VERB'

In [16]:
import pickle
from typing import List, Dict
from pathlib import Path
from tqdm.notebook import tqdm

def create_conllu_entry(
    token_id: int, form: str, pos_tags: List[str], glosses: List[str]
) -> str:
    """Build one tab-separated CoNLL-U token line.

    Column order: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC.

    Args:
        token_id: Value written to the ID column.
        form: Token text; written to both FORM and LEMMA.
        pos_tags: POS tags of the token. The first one is converted with
            ``generate_upos_from_xpos`` for UPOS; all of them, joined by
            ``"|"``, make up XPOS. Empty/None gives ``"_"`` in both columns.
        glosses: Glosses of the token, written to MISC as ``Gloss=`` followed
            by the glosses joined with ``"|"``. Empty/None gives ``"_"``.

    Returns:
        The token line without a trailing newline. FEATS, HEAD, DEPREL and
        DEPS are always ``"_"``.
    """
    if pos_tags:
        # Only the leading tag decides the coarse UPOS category.
        upos = generate_upos_from_xpos(pos_tags[0])
        xpos = "|".join(pos_tags)
    else:
        upos = xpos = "_"

    misc = "_"
    if glosses:
        # NOTE(review): "|" is also the item separator inside a CoNLL-U MISC
        # field, so readers may split multi-gloss values into separate items;
        # confirm that downstream tools cope with this.
        misc = "Gloss=" + "|".join(glosses)

    lemma = form  # TODO: placeholder, the surface form stands in for the lemma

    # Columns with no available information are filled with "_".
    columns = (token_id, form, lemma, upos, xpos, "_", "_", "_", "_", misc)
    return "\t".join(f"{column}" for column in columns)


def sentence_to_conllu(sentence: Dict) -> str:
    """Render one sentence record as a CoNLL-U block.

    Args:
        sentence: Mapping with the keys ``"sentence"`` and ``"translation"``
            (strings used for the comment lines) and ``"words"``,
            ``"part_of_speech"`` and ``"glosses"`` (parallel per-token
            sequences).

    Returns:
        The ``# text`` and ``# translation`` comment lines followed by one
        token line per word, numbered from 1, each line newline-terminated.
        The blank line that closes a sentence is NOT included.
    """
    text_line = "# text = " + sentence["sentence"]
    translation_line = "# translation = " + sentence["translation"]

    # zip() stops at the shortest sequence, so tokens beyond the shortest of
    # the three parallel lists are dropped.
    token_rows = zip(
        sentence["words"], sentence["part_of_speech"], sentence["glosses"]
    )
    token_lines = [
        create_conllu_entry(position, token, tags, token_glosses)
        for position, (token, tags, token_glosses) in enumerate(token_rows, start=1)
    ]

    return "\n".join([text_line, translation_line, *token_lines]) + "\n"


# SECURITY NOTE: unpickling can execute arbitrary code stored in the file, so
# this path must only ever point at a corpus produced by a trusted pipeline.
with open("output/annotated_translated_tokenized_corpus_by_book.pkl", "rb") as pkl_file:
    sentences_by_book = pickle.load(pkl_file)

# Make sure the destination folder is there before any file is written.
output_dir = Path("output/conllu")
output_dir.mkdir(parents=True, exist_ok=True)

# Export one .conllu file per book, with a progress bar over the books.
for book_title, sentences in tqdm(list(sentences_by_book.items())):
    # File name: keep only alphanumerics, spaces and hyphens from the title,
    # then turn the spaces into underscores.
    # NOTE(review): two titles that differ only in dropped characters map to
    # the same file name and the later book overwrites the earlier one.
    kept_chars = [ch for ch in book_title if ch.isalnum() or ch in (" ", "-")]
    filename = "".join(kept_chars).replace(" ", "_") + ".conllu"

    with (output_dir / filename).open("w", encoding="utf-8") as conllu_file:
        for sentence in sentences:
            # The extra newline is the blank line that separates sentences.
            conllu_file.write(sentence_to_conllu(sentence) + "\n")

  0%|          | 0/53 [00:00<?, ?it/s]