In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to imported code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import json
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
from rich import print as rprint
from mbay_dict.core import domain as d
from mbay_dict.core.models import new_object_id

# Load variables from a .env file into the process environment; the
# connection cell reads MONGODB_URI from os.environ.
load_dotenv()

True

In [3]:
# The connection string comes from the environment rather than being
# hard-coded; a missing MONGODB_URI fails right here with a KeyError.
uri = os.environ["MONGODB_URI"]

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi("1"))

# Send a ping to confirm a successful connection
try:
    client.admin.command("ping")
except Exception as e:
    # Every later cell needs a working connection, so report the problem and
    # stop here instead of letting "Run All" continue into less obvious
    # failures further down.
    print(e)
    raise
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


In [4]:
def _fetch_entry_summaries(collection_name, pos_field):
    """Return summary documents from one collection of the "dictionary" database.

    Only the headword, the French and English translations and the
    part-of-speech field are projected (MongoDB also returns ``_id``).

    Args:
        collection_name: Name of the collection to read.
        pos_field: Name of the part-of-speech field in that collection
            (the two collections spell it differently).

    Returns:
        A list with one dict per document.
    """
    collection = client.get_database("dictionary").get_collection(collection_name)
    wanted_fields = {
        "headword": 1,
        "french.translation": 1,
        "english.translation": 1,
        pos_field: 1,
    }
    return list(collection.find(projection=wanted_fields))


# v1 stores the part of speech in snake_case, the devel collection in camelCase.
entries_v1 = _fetch_entry_summaries("entries", "part_of_speech")
entries_v2 = _fetch_entry_summaries("entries-devel", "partOfSpeech")

In [5]:
# Number of documents fetched from each collection, as (v1, v2).
tuple(len(batch) for batch in (entries_v1, entries_v2))

(5690, 5132)

In [6]:
# Flatten the nested documents into columns (nested keys become dotted column
# names such as "french.translation") and index each frame by document _id.
df_v1, df_v2 = (
    pd.json_normalize(records).set_index("_id")
    for records in (entries_v1, entries_v2)
)

In [7]:
# Order both frames by headword so their previews line up. Rebinding the
# names (rather than sorting in place) keeps the cell chainable and leaves no
# partially mutated frame behind if it is interrupted.
df_v1 = df_v1.sort_values(by="headword")
df_v2 = df_v2.sort_values(by="headword")

In [8]:
# Preview the first five v1 rows (sorted by headword).
df_v1.head(n=5)

Unnamed: 0_level_0,headword,part_of_speech,french.translation,english.translation
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64eca312f6197fd20d76300c,-dé,PRA,"eux (avec sè quand prononcé sè; donc, sè, mais...","them (with sè when pronounced sè; thus, sè, bu..."
64eca312f6197fd20d763055,-dɨ́,PRA,les {forme obj.}.,them {obj. form}.
64eca313f6197fd20d763163,-gē,NAF,particule marquant le pluriel d'une expression...,particle marking plural of noun phrase (rare v...
64eca313f6197fd20d7631ae,-gɨ̄,NAF,particule marquant le pluriel de la phrase nom...,particle marking plural of noun phrase.
64eca313f6197fd20d763337,-jè,PRA,nous.,us.


In [9]:
# Preview the first five v2 rows (sorted by headword).
df_v2.head(n=5)

Unnamed: 0_level_0,headword,partOfSpeech,french.translation,english.translation
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64fc50f6286b18ef7de0b199,-dɨ́,PRA,leur,their
64fc50f7286b18ef7de0eea2,-gē,NAF,marquage de particule indiquant le pluriel d'u...,particle marking plural of noun phrase
64fc50f7286b18ef7de0cebb,-gɨ̄,AF,approximativement,approximately
64fc50f7286b18ef7de0e7f2,-jè,PRA,nous,us
64fc50f6286b18ef7de0bc52,-jɨ̀,PRA,nous,us {obj.}


In [10]:
# Anti-join on headword: keep the rows of one frame whose headword never
# occurs in the other frame.
#
# NOTE(review): the match is on the headword string alone. A v1 row whose
# headword also exists in v2 counts as "present" even if it is a different
# sense / part of speech (v1 has repeated headwords) -- confirm that dropping
# such homonyms from the merge is intended.
# NOTE(review): comparison is exact string equality; headwords that differ only
# in Unicode normalisation would be reported as missing -- TODO confirm both
# collections use the same normalisation form.
v2_headword_seen_in_v1 = df_v2["headword"].isin(df_v1["headword"])
v1_headword_seen_in_v2 = df_v1["headword"].isin(df_v2["headword"])

in_v2_not_v1 = df_v2.loc[~v2_headword_seen_in_v1]
in_v1_not_v2 = df_v1.loc[~v1_headword_seen_in_v2]

In [11]:
# Shape of each frame after keeping one row per distinct headword, as (v1, v2).
tuple(
    frame.drop_duplicates(subset="headword").shape
    for frame in (df_v1, df_v2)
)

((5132, 4), (5132, 4))

In [13]:
# Raw (rows, columns) of each frame, as (v1, v2), for comparison with the
# de-duplicated shapes.
tuple(frame.shape for frame in (df_v1, df_v2))

((5690, 4), (5132, 4))

In [14]:
# v2 rows whose headword does not occur anywhere in v1.
in_v2_not_v1

Unnamed: 0_level_0,headword,partOfSpeech,french.translation,english.translation
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64fc50f7286b18ef7de0dc42,bàa-bè,CNJ,un instant plus tard,an instant later
64fc50f7286b18ef7de0bf39,bḭ̀ḭ̄-gìdɨ̀-gījā,NE,obscurité {lit : poil sur le dos du civet},obscurity {lit: hair on back of civet}
64fc50f7286b18ef7de0ceea,bíjír,IDS,tout est en place dans la vie; avoir tout ce d...,all set up in life; having everything needed
64fc50f6286b18ef7de0b5b9,bògɨ̀r,ID,sale (blanc) {couleur de la peau lorsqu'elle e...,dirty (white) {color of skin when covered with...
64fc50f6286b18ef7de0b3de,"bālē,bàlē",NP,Samedi {jour traditionnel du balayage},Saturday {traditional day of sweeping}
...,...,...,...,...
64fc50f7286b18ef7de0e375,ɓèe-kàw-kīnjá,NI,type de champignon,type of mushroom
64fc50f7286b18ef7de0e368,ɓèe-kɨ́lē,NI,type de champignon jaune,type of yellow mushroom
64fc50f7286b18ef7de0d968,ɓɨ̀lò,NI,esclave,slave
64fc50f7286b18ef7de0d8ac,ɓɨ̄ngā,VI,être déchiré,be torn up


In [15]:
# v1 rows whose headword does not occur anywhere in v2; these are the rows
# converted into new entries further down.
in_v1_not_v2

Unnamed: 0_level_0,headword,part_of_speech,french.translation,english.translation
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64eca312f6197fd20d76300c,-dé,PRA,"eux (avec sè quand prononcé sè; donc, sè, mais...","them (with sè when pronounced sè; thus, sè, bu..."
64eca315f6197fd20d764195,-ú,AF,"dans, à (suffixe locatif) (suit les noms conte...","in, at (locative suffix) (follows nouns contai..."
64eca314f6197fd20d763d46,-ɔ́,AF,"dans, à {suffixe locatif} (suit les tiges cont...","in, at {locative suffix} (follows stems contai..."
64eca313f6197fd20d7633c5,Kàdɨ̀-yòo,N,"dieu de la magie, dieu traditionnel.","god of magic, traditional god."
64eca312f6197fd20d762d32,bàa-bèē,AVE,"pour un moment, brièvement (communément abrégé...","for a moment, briefly (commonly shortened to b..."
...,...,...,...,...
64eca312f6197fd20d762d1f,à̰a̰a̰,INJ,maintenant tu vois!,now you see!.
64eca312f6197fd20d762cf8,ámpɨ̀rmē,NI,infirmière ; aide hospitalière,nurse; hospital aide.
64eca313f6197fd20d7630f4,èdéè,VT,aider,help.
64eca312f6197fd20d762f31,ɓèe-kàw-kɨ̄njá,NI,type de champignon.,type of mushroom.


In [16]:
# v2 rows whose English translation contains "god" (substring match, so longer
# words such as "godfather" are included). na=False treats a missing
# translation as a non-match; without it a single NaN in the column makes the
# mask non-boolean and the row selection raises.
df_v2[df_v2["english.translation"].str.contains("god", na=False)]

Unnamed: 0_level_0,headword,partOfSpeech,french.translation,english.translation
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64fc50f7286b18ef7de0e2ff,Kàdɨ̀,NP,"dieu de la fortune, 'ange gardien'","god of fortune, 'guardian angel'"
64fc50f6286b18ef7de0b7d3,Kàdɨ̀-dɔ̀-ngɔ̀ɔ̀,NP,dieu des morts,god of the dead
64fc50f7286b18ef7de0eff1,Mò̰ō̰,NP,parrain de l'initiation,godfather of initiation
64fc50f7286b18ef7de0d315,bɨ̀rà-ndòo,NT,parrain de [garçon en cours d'initiation],godfather of [boy being initiated]
64fc50f6286b18ef7de0a2ac,kóo-dèē-bòo,NP,déesse de la pluie,goddess of rain


In [17]:
# v1 headwords made of a hyphen followed by exactly one character.
# NOTE(review): "." matches a single code point, so a suffix written as a base
# letter plus a combining diacritic is not listed here.
df_v1[df_v1["headword"].str.match(r"-.$")]

Unnamed: 0_level_0,headword,part_of_speech,french.translation,english.translation
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64eca312f6197fd20d762cd6,-á,PRA,lui {obj. du verbe}.,"him, her, it {obj. of verb}."
64eca312f6197fd20d762cd5,-á,AF,"dans, à {suffixe locatif} (suit les tiges cont...","in, at {locative suffix} (follows stems contai..."
64eca313f6197fd20d7630ef,-è,AF,particule marquant une phrase interrogative {s...,particle marking an interrogative sentence {as...
64eca313f6197fd20d7630f1,-é,AF,"dans, à {suffixe locatif} (suit les racines co...","in, at {locative suffix} (follows stems contai..."
64eca313f6197fd20d7632ec,-í,PRA,toi {obl.sing.},you {obl.sing.}.
64eca313f6197fd20d7632eb,-í,AF,"dans, à {suffixe locatif} (suit les racines co...","in, at {locative suffix} (follows stems contai..."
64eca314f6197fd20d763d2d,-ò,AF,particule marquant une phrase interrogative (u...,particle marking an interrogative sentence (us...
64eca314f6197fd20d763d2f,-ó,AF,"dans, à {suffixe locatif} (suit les racines co...","in, at {locative suffix} (follows stems contai..."
64eca315f6197fd20d764195,-ú,AF,"dans, à (suffixe locatif) (suit les noms conte...","in, at (locative suffix) (follows nouns contai..."
64eca313f6197fd20d7630f0,-ē,AF,particule marquant une phrase interrogative.,particle marking an interrogative sentence.


In [18]:
# v2 headwords made of a hyphen followed by exactly one character.
# NOTE(review): "." matches a single code point, so a suffix written as a base
# letter plus a combining diacritic is not listed here.
df_v2[df_v2["headword"].str.match(r"-.$")]

Unnamed: 0_level_0,headword,partOfSpeech,french.translation,english.translation
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64fc50f7286b18ef7de0e7ec,-á,AF,"dans, à {suffixe locatif}","in, at {locative suffix}"
64fc50f7286b18ef7de0d8aa,-è,AF,particule marquant une phrase interrogative {s...,particle marking an interrogative sentence {as...
64fc50f7286b18ef7de0e811,-é,AF,"dans, à {suffixe locatif}","in, at {locative suffix}"
64fc50f7286b18ef7de0e80f,-í,AF,"dans, à {suffixe locatif}","in, at {locative suffix}"
64fc50f7286b18ef7de0e839,-ò,AF,particule marquant une phrase interrogative,particle marking an interrogative sentence
64fc50f7286b18ef7de0e80d,-ó,AF,"dans, à {suffixe locatif}","in, at {locative suffix}"
64fc50f7286b18ef7de0e7ef,-ē,AF,particule marquant une phrase interrogative,particle marking an interrogative sentence
64fc50f7286b18ef7de0ca78,-ī,PRA,vous,you [obj. sing.]
64fc50f7286b18ef7de0bf2a,-ň,PRA,par lui/elle/cela,by or through him/her/it
64fc50f6286b18ef7de0a0ea,-ḿ,PRA,mon,my


In [19]:
# Fetch every field of every v1 document (no projection); the conversion loop
# needs the examples and sound filenames, not just the summary columns.
v1_collection = client.get_database("dictionary").get_collection("entries")
entries_v1_full = list(v1_collection.find())

In [29]:
# Convert every v1 document whose headword is missing from v2 into the current
# domain model and collect the serialised result.

# Index the full v1 documents by _id once, so each lookup in the loop is a
# dictionary access instead of a linear scan over all documents.
entries_v1_by_id = {item["_id"]: item for item in entries_v1_full}

new_entries = []
# `entry_v1` and `entry` intentionally stay bound after the loop: the next
# inspection cell prints the last pair. The loop variable is named `entry_id`
# so the builtin `id` is not shadowed in the notebook namespace.
for entry_id in in_v1_not_v2.index:
    # Raises KeyError naming the id if the summary and full fetches disagree.
    entry_v1 = entries_v1_by_id[entry_id]

    # v1 examples use flat "<language>_translation" keys; each one is linked
    # back to its parent entry through ParentId.
    examples = [
        d.Example(
            id=example["_id"],
            parent_id=d.ParentId(id=entry_id, type="entry"),
            mbay=example["mbay"],
            english=d.Translation.from_text(example["english_translation"]),
            french=d.Translation.from_text(example["french_translation"]),
            sound_filename=example["sound_filename"],
        )
        for example in entry_v1["examples"]
    ]

    entry = d.Entry(
        id=entry_id,
        headword=entry_v1["headword"],
        part_of_speech=entry_v1["part_of_speech"],
        english=d.Translation.from_text(entry_v1["english"]["translation"]),
        french=d.Translation.from_text(entry_v1["french"]["translation"]),
        sound_filename=entry_v1["sound_filename"],
        examples=examples,
        # No expressions are carried over from v1 documents here.
        expressions=[],
    )
    # by_alias=True dumps the model using its field aliases.
    new_entries.append(entry.model_dump(by_alias=True))

In [30]:
# Show the last processed v1 document followed by the Entry built from it.
# Both names are left over from the final iteration of the conversion loop.
for leftover in (entry_v1, entry):
    rprint(leftover)

In [31]:
# Serialised form (model_dump with aliases) of the last converted entry.
rprint(new_entries[-1])

In [32]:
# Fetch every field of every document in the devel (v2) collection.
v2_collection = client.get_database("dictionary").get_collection("entries-devel")
entries_v2_full = list(v2_collection.find())

In [33]:
# Print one converted v1 entry and one existing v2 document together so their
# structure can be compared by eye before merging.
rprint(new_entries[0], entries_v2_full[0])

In [34]:
# Final dataset: every existing v2 document followed by the converted v1-only
# entries (a new list; neither input list is modified).
output = entries_v2_full + new_entries

In [35]:
from pathlib import Path

# Destination of the merged dataset, relative to the notebook's working directory.
output_filepath = Path("../../data/xml_processing") / "completed_mongo_items.json"

In [36]:
from mbay_dict.core.serializers import CustomJSONEncoder
from bson.json_util import dumps

print(f"Saving data to {output_filepath}")
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes,
# so the file is opened as UTF-8 explicitly: the platform's default encoding
# may be unable to represent them (or may produce a non-UTF-8 JSON file).
with open(output_filepath, "w", encoding="utf-8") as f:
    f.write(dumps(output, cls=CustomJSONEncoder, ensure_ascii=False))

Saving data to ../../data/xml_processing/completed_mongo_items.json
