In [40]:
import pandas as pd
import json
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
from rich import print as rprint
import os
import random
from dbfread import DBF

load_dotenv()

True

In [272]:
uri = os.environ["MONGODB_URI"]

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi("1"))

# Send a ping to confirm a successful connection
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


### Loading entries from the database


In [323]:
entries = list(
    client.get_database("dictionary")
    .get_collection("entries")
    .find(
        projection={
            "headword": 1,
            "french_translation": 1,
            "english_translation": 1,
            "part_of_speech": 1,
            "examples": 1,
        }
    )
)

## Processing XML


In [34]:
from pathlib import Path
from mbay_dict.schemas.generated.mbay import NewDataSet
from xsdata.formats.dataclass.parsers import XmlParser

xml_string = Path("./../../data/SaraBagirmiXmlXsd/Mbay.xml").read_text(encoding="utf-8")
parser = XmlParser()
dataset = parser.from_string(xml_string, NewDataSet)

In [35]:
def show(seq, n=10):
    print("count of items:", len(seq))
    rprint(random.sample(seq, n))

In [32]:
rprint("kÉ-sòlï".encode("utf-8"))
rprint("kɨ́-sòlī".encode("utf-8"))

In [47]:
show(dataset.entry)

count of items: 5697


In [27]:
show(dataset.samples)

count of items: 7543


In [48]:
rprint(next(t for t in dataset.translations if t.entrycode == "  202"))

In [33]:
show(dataset.translations)

count of items: 8551


In [29]:
show(dataset.expressions)

count of items: 3299


In [72]:
from dataclasses import asdict

In [88]:
entries = pd.DataFrame(dataset.entry)
entries["entrycode"] = entries["entrycode"].str.strip().astype(int)

samples = pd.DataFrame(dataset.samples)
samples["entrycode"] = samples["entrycode"].str.strip().astype(int, errors="ignore")
samples["trancode"] = samples["trancode"].str.strip().astype(int, errors="ignore")
samples["entrytranc"] = samples["entrytranc"].str.strip().astype(int, errors="ignore")

translations = pd.DataFrame(dataset.translations)
translations["entrycode"] = (
    translations["entrycode"].str.strip().astype(int, errors="ignore")
)
translations["trancode"] = (
    translations["trancode"].str.strip().astype(int, errors="ignore")
)
translations["entrytranc"] = (
    translations["entrytranc"].str.strip().astype(int, errors="ignore")
)

expressions = pd.DataFrame(dataset.expressions)
expressions["entrycode"] = (
    expressions["entrycode"].str.strip().astype(int, errors="ignore")
)
expressions["entrytranc"] = (
    expressions["entrytranc"].str.strip().astype(int, errors="ignore")
)

In [91]:
entries_w_trans = entries.join(
    translations, on="entrycode", how="left", lsuffix="_entry", rsuffix="_translation"
)

In [107]:
translations[translations["translate"] == "God"]

Unnamed: 0,translate,category,entrycode,trancode,entrytranc,relword,gramnote,sortfield
1539,God,NP,16,1,161,,,


In [109]:
entries_w_trans[entries_w_trans["translate"] == "fried millet dough"]

Unnamed: 0,entry,entrycode_entry,soundfile,translate,category,entrycode_translation,trancode,entrytranc,relword,gramnote,sortfield
3575,ndáa,1538,c04_000113.wav,fried millet dough,NI,15.0,1,151,,,


## Fix the encoding of the entries


In [116]:
import openai
import instructor
from pydantic import BaseModel, Field

instructor.patch()


class AlphabetMapping(BaseModel):
    """Mapping from correct unicode letters to the incorrect ones for us to fix the rest of the text. All complex unicodes in the Mbay phonetic alphabet are mapped."""

    mapping: dict[str, list[str] | None] = Field(
        ...,
        description="Mapping from correct unicode letters to the incorrect ones for us to fix the rest of the text. Return your mappings as json object whose python type would be dict[str, list[str] | None]. If no mapping exists for a letter, set it as None so we know we are missing samples. If there is more than one value found, append to the list so we know the incorrect values are incoherent.",
    )


examples = [
    ("Kāgɨ̄-yòo lò kàa-ḿ ì Kútúu.", "Kägæ-yòo lò kàa-» ì Kútúu."),
    ("ɓìr̄-n̄ nà̰ā̰ kɨ̀ dèē", "ßì®-ñ n¼º kÆ dèë"),
    ("Dèē-kɨ́-dḛ̀ḛ́ ɓìr̄-n̄ nà̰ā̰ kɨ̀ ngàw-ǹ.", "Dèë-kÉ-dŸÏ ßì®-ñ n¼º kÆ ngàw-µ."),
    ("Ngōn kɨ́ ngè ɓōĺ kɨ̀là tɔ́y tɔ̀ӯ.", "Ngön kÉ ngè ßö• kÆlà tôy tîÿ."),
    ("M-āw kɨ̀ ɗóo kɨ́lá-ḿ.", "M-äw kÆ ÷óo kÉlá-»."),
    (
        "Tùbò ɔ̀dɨ̀ ngè-kɔ́l lɔ́w kám-tā̰á̰ lā tɔ̄l-á yé.",
        "Tùbò îdÆ ngè-kôl lôw kám-tºª lä tûl-á yé.",
    ),
    ("bɔ̀mbɔ́ḿ", "bîmbô»"),
    ("Bùl̄-mṵ̀ṵ àndɨ̄ kùm ngōn-kó̰o̰-í tɨ́.", "bù£-m“¾ àndæ kùm ngön-kÇ¢-í tÉ."),
    ("píẁ", "píÞ"),
    ("ɓèe-óríyò-góỳ-góỳ", "ßèe-óríyò-góØ-góØ"),
    ("ɓèe-dà-tànjɨ̀", "ßèe-dà-tànjÆ"),
    ("álàpō", "álàpö"),
    ("àhá̰à̰", "àhª¼"),
    ("nà̰-tâ", "n¼-tΓ"),
    ("m̄-ɗèē-ň m-ādɨ̄-ī.", "«-÷èë-² m-ädæ-ï."),
    ("bɔ̀ӯ", "bîÿ"),
    ("Jùwḛ́ḛ̀,Jùwḛ̄ḛ,Jṵ̀wḛ́ḛ̀", "JùwÏŸ,Jùwþõ,J“wÏŸ"),
    ("bɨ̀lò-kḭ̀ḭ", "bÆlò-k¡¿"),
    ("á̰yóỳ,á̰yōy", "ªyóØ,ªyöy"),
    ("bàgɨ̀r-bèŕ", "bàgÆr-bè©"),
]

formatted_examples = ""
for example in examples:
    formatted_examples += f"Correct: {example[0]}\nIncorrect: {example[1]}\n\n"


prompt = f"""
I have correct and broken spellings for various strings in the Mbay dialect. 
Build a mapping from the correct unicode letters to the incorrect ones for us to fix the rest of the text. 
If no mapping exists for a letter, set it as None so we know we are missing samples. If there is more than one value found, append to the list so we know the incorrect values are incoherent.

Letters: à á â è é ì í ò ó ù ú ý ā ē ī ĺ ń ň ō ŕ ū ɓ ɔ ɗ ɨ ӯ ḛ ḭ ḿ ṵ ẁ ỳ

{formatted_examples}
"""

# response = openai.ChatCompletion.create(
#     model="gpt-4",
#     messages=[{"role":"user", "content": prompt}],
#     temperature=0.0,
# )

In [117]:
# rprint(mapping["choices"][0]["message"]["content"])

In [118]:
rprint(prompt)

In [334]:
mapping = {
    # a
    "à": ["à"],
    "á": ["á"],
    "â": ["Γ"],
    "ā": ["ä"],
    "a̰": ["½"],
    "à̰": ["¼"],
    "á̰": ["ª"],
    "ā̰": ["º"],
    # e
    "è": ["è"],
    "é": ["é"],
    "ē": ["ë"],
    "ḛ": ["õ"],
    "ḛ̄": ["þ"],
    "ḛ̀": ["Ÿ"],
    "ḛ́": ["Ï"],
    # i
    "ì": ["ì"],
    "í": ["í"],
    "ī": ["ï"],
    "ḭ̄": ["¬"],
    # o
    "ò": ["ò"],
    "ó": ["ó"],
    "ō": ["ö"],
    "o̰": ["¢"],
    "ò̰": ["Å"],
    "ó̰": ["Ç"],
    "ō̰": ["ç"],
    # u
    "ù": ["ù"],
    "ú": ["ú"],
    "ū": ["ü"],
    "ṵ": ["¾"],
    "ṵ̀": ["“"],
    "ṵ́": ["ø"],
    "ṵ̄": ["¦"],
    # l
    "ĺ": ["•"],
    "l̀": ["¥"],
    "l̄": ["£"],
    # n
    "ǹ": ["µ"],
    "ń": ["Ñ"],
    "n̄": ["ñ"],
    "ň": ["²"],
    # r
    "r̀": ["±"],
    "ŕ": ["©"],
    "r̄": ["®"],
    # b
    "ɓ": ["ß"],
    # c
    "ɔ": ["ɔ"],
    "ɔ́": ["ô"],
    "ɔ̀": ["î"],
    "ɔ̄": ["û"],
    # d
    "ɗ": ["÷"],
    # i
    "ḭ": ["¿"],
    "ḭ̀": ["¡"],
    "ḭ́": ["ƒ"],
    # m
    "ḿ": ["»"],
    "m̄": ["«"],
    # w
    "ẁ": ["Þ"],
    "w̄": ["Û"],
    # y
    # y
    "ý": ["ý"],
    "ỳ": ["Ø"],
    "ӯ": ["ÿ"],
    # i special
    # "ɨ": ["Æ"],
    "ɨ̀": ["Æ"],
    "ɨ́": ["É"],
    "ɨ̄": ["æ"],
}

reverse_mapping = {}
for key, value in mapping.items():
    for v in value:
        if v in reverse_mapping:
            print(f"Duplicate values {v}: {key} and {reverse_mapping[v]}")
        reverse_mapping[v] = key

In [335]:
def fix_text(text: str):
    for key, value in reverse_mapping.items():
        text = text.replace(key, value)
    return text

In [336]:
fixed_entries = []
for entry in dataset.entry:
    fixed_entry = fix_text(entry.entry)
    fixed_entries.append(
        {
            "entrycode": entry.entrycode,
            "entry": fixed_entry,
            "old_entry": entry.entry,
        }
    )

In [337]:
yṵ́tɨ̀
yøtÆ

NameError: name 'yṵ́tɨ̀' is not defined

In [338]:
i = 740
fixed_entries[i : i + 10]

[{'entrycode': ' 5196', 'entry': 'dà-kám', 'old_entry': 'dà-kám'},
 {'entrycode': ' 5203', 'entry': 'dà-kàsɨ̀', 'old_entry': 'dà-kàsÆ'},
 {'entrycode': '  603',
  'entry': 'dà-kɨ́-bòy-dètɨ́',
  'old_entry': 'dà-kÉ-bòy-dètÉ'},
 {'entrycode': ' 4462',
  'entry': 'dà-kɨ́-mbīr-yóò',
  'old_entry': 'dà-kÉ-mbïr-yóò'},
 {'entrycode': ' 4369', 'entry': 'dà-kɨ́-ɗōrī', 'old_entry': 'dà-kÉ-÷örï'},
 {'entrycode': ' 2636', 'entry': 'dà-kɨ́rō', 'old_entry': 'dà-kÉrö'},
 {'entrycode': '  319', 'entry': 'dà-kóy', 'old_entry': 'dà-kóy'},
 {'entrycode': ' 3258', 'entry': 'dà-līi-gɨ̄', 'old_entry': 'dà-lïi-gæ'},
 {'entrycode': ' 4656', 'entry': 'dà-lūu', 'old_entry': 'dà-lüu'},
 {'entrycode': '  323', 'entry': 'dà-mbī', 'old_entry': 'dà-mbï'}]

In [339]:
entries[2]

{'_id': ObjectId('64e50c7ac1f64bbda8eac708'),
 'headword': 'àhá̰à̰',
 'english_translation': 'now you see!.',
 'french_translation': 'maintenant tu vois!',
 'part_of_speech': 'INJ',
 'examples': [{'created_at': '2023-08-19T22:59:14Z',
   'updated_at': '2023-08-19T22:59:14Z',
   'entry_id': 21,
   'mbay': 'àhá̰à̰, ngóo ń ī-tɔ̄ɔ̄ nò m-ā m-él kíyā-á.',
   'english_translation': "Now you'll see, that gourd you broke, I'm going to tell its owner.",
   'french_translation': 'Maintenant tu verras, cette calebasse que tu as cassée, je vais le dire à son propriétaire.',
   'sound_filename': 'AHaaN_SS.mp3',
   '_id': ObjectId('64e50c7ac1f64bbda8eaa36e')}]}

In [340]:
from itertools import chain
entries_map = {entry["headword"].replace(" ", ""): entry for entry in entries}
entry_examples = list(chain.from_iterable(
    (ex["mbay"] for ex in entry["examples"])
    for entry in entries
))

In [341]:
entry_examples[0]

'J-ḭ̀ḭ̄ ɓògɨ̀-gɨ̄ àĺ.'

In [342]:
def examples_contain(text: str):
    return any(text in example for example in entry_examples)

In [343]:
misses = {}

for entry in fixed_entries:
    if (entry["entry"] not in entries_map) and (not examples_contain(entry["entry"])):
        misses[entry["entry"]] = entry

In [344]:
len(misses)

104

In [345]:
rprint(misses)

## Via DBase


In [38]:
from itertools import islice

In [46]:
for record in islice(DBF("../../data/SaraLangAsDbaseTables/MBAYFONT.DBF"), 10):
    print(record)

OrderedDict([('KEYSTROKE', 'a'), ('KEYVALUE', 97), ('SINGLEKEY', 'a'), ('ASCIIVAL', 97), ('CHARVAL', 'a'), ('DEFAULTVAL', 97), ('SORTVALUE', '!'), ('SORTVALUE2', '')])
OrderedDict([('KEYSTROKE', 'shift-a'), ('KEYVALUE', 65), ('SINGLEKEY', 'a'), ('ASCIIVAL', 65), ('CHARVAL', 'A'), ('DEFAULTVAL', 65), ('SORTVALUE', '!'), ('SORTVALUE2', '')])


UnicodeDecodeError: 'ascii' codec can't decode byte 0x85 in position 0: ordinal not in range(128)