In [85]:
%load_ext autoreload
%autoreload 2

In [86]:
from typing import Generator, Any
import pandas as pd
import json
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
from rich import print as rprint
from mbay_nmt.utils import domain as d
from mbay_nmt.utils.models import new_object_id
from datasets import load_dataset, Dataset, DatasetDict
from rich import print as rprint

load_dotenv()

True

In [22]:
# Connection string is read from the environment; a missing MONGODB_URI raises
# KeyError here (presumably it is supplied by the .env file loaded above — confirm).
uri = os.environ["MONGODB_URI"]

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi("1"))

# Send a ping to confirm a successful connection
# NOTE(review): any failure is only printed, not re-raised, so later cells will
# still run against `client` even when the ping did not succeed.
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [23]:
# Handle on the "entries" collection of the "dictionary" database; used by find_word().
entries = client["dictionary"]["entries"]

In [27]:
def find_word(word: str):
    """Look up a dictionary entry by its exact headword.

    Queries the module-level ``entries`` collection for a document whose
    ``headword`` equals ``word``.

    Returns:
        A dict with the keys ``headword``, ``french`` and ``english`` (the last
        two taken from each language's ``translation`` field), or ``None`` when
        no document matches.
    """
    document = entries.find_one({"headword": word})
    if document:
        summary = {"headword": document["headword"]}
        for language in ("french", "english"):
            summary[language] = document[language]["translation"]
        return summary

    return None

In [28]:
find_word("már")

{'headword': 'már',
 'french': 'complètement (noir)',
 'english': 'completely (black).'}

In [102]:
from oauth2client.service_account import ServiceAccountCredentials

# Path to the JSON key file downloaded when the Service Account was created.
# It can be overridden with the GSPREAD_SERVICE_ACCOUNT_FILE environment variable
# so the notebook is not tied to one machine; the original location stays as the
# fallback so existing setups keep working unchanged.
json_key_file = os.environ.get(
    "GSPREAD_SERVICE_ACCOUNT_FILE",
    "/Users/nasoungadoy/.googlecloud/fluid-mind-303321-602c5b57c4e7.json",
)

# OAuth scopes requested for the service account.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]

In [103]:
import gspread

# Load the service-account credentials from `json_key_file` for the requested
# `scope`, then authorize a gspread client with them. `google_sheets` is the
# handle later cells use to open the spreadsheet.
credentials = ServiceAccountCredentials.from_json_keyfile_name(json_key_file, scope)
google_sheets = gspread.authorize(credentials)

{'id': 0,
 'entry_id': 1,
 'type': 2,
 'mbay': 3,
 'french': 4,
 'english': 5,
 'flagged': 6,
 'comment': 7}

In [113]:
# Open the first worksheet of the review spreadsheet.
workbook = google_sheets.open_by_url(
    "https://docs.google.com/spreadsheets/d/1KbpQS0RdcLhUOJNFNdXlnTDZBKEU2S8Q7DnJhPYrr2M"
)
sheet = workbook.sheet1

# Map each header name (row 1) to its 1-based column number, which is the
# numbering the cell-update calls below are given.
header_names = sheet.row_values(1)
columns = {name: position for position, name in enumerate(header_names, start=1)}
columns

{'id': 1,
 'entry_id': 2,
 'type': 3,
 'mbay': 4,
 'french': 5,
 'english': 6,
 'flagged': 7,
 'comment': 8}

In [38]:
sheet.cell(1, columns["mbay"]).value

'mbay'

In [40]:
# Sanity check: enumerate(..., start=1) numbers the items from 1 rather than 0.
for row_id, v in enumerate(["a", "b", "c"], start=1):
    print(row_id, v)

1 a
2 b
3 c


In [68]:
from langdetect import detect, detect_langs

In [62]:
from gspread import Cell
from ratelimit import limits, sleep_and_retry


# `limits` on its own raises RateLimitException once the budget is exhausted;
# `sleep_and_retry` turns that into "wait for the window to reset, then retry",
# so a long run of single-cell writes throttles itself instead of crashing.
@sleep_and_retry
@limits(calls=60, period=60)
def update_cell(sheet: gspread.Worksheet, row_id: int, col_id: int, value: Any):
    """Write ``value`` into one cell of ``sheet``, at most 60 times per 60 seconds.

    Args:
        sheet: Worksheet to write to.
        row_id: Row number passed to ``sheet.update_cell``.
        col_id: Column number passed to ``sheet.update_cell``.
        value: New cell content.

    When the call budget for the current window is used up, the call blocks
    until the window resets rather than raising.
    """
    sheet.update_cell(row_id, col_id, value)


def fix_ibid(sheet: gspread.Worksheet, record: dict[str, str], row_id: int):
    """Repair one "ibid" row by writing straight to the sheet, cell by cell.

    Looks up ``record["mbay"]`` with ``find_word``. When an entry is found, its
    English and French translations overwrite the row's cells. Otherwise the
    row is flagged and a comment explaining the failed lookup is written.

    Args:
        sheet: Worksheet to update.
        record: Row data; only the ``"mbay"`` key is read.
        row_id: Sheet row number of ``record``.
    """
    lookup = find_word(record["mbay"])
    if lookup:
        for language in ("english", "french"):
            update_cell(sheet, row_id, columns[language], lookup[language])
        return

    print(f"Could not find {record['mbay']}")
    for column_name, content in (
        ("flagged", True),
        ("comment", "could not find in Morkeg"),
    ):
        update_cell(sheet, row_id, columns[column_name], content)


def fix_ibid2(cells: list[Cell], record: dict[str, str], row_id: int):
    """Queue the cell updates that repair one "ibid" row (batch variant of fix_ibid).

    Nothing is written to the sheet here: the ``Cell`` objects are appended to
    ``cells`` so the caller can send them in one ``update_cells`` request.

    Args:
        cells: Accumulator list, mutated in place.
        record: Row data; only the ``"mbay"`` key is read.
        row_id: Sheet row number of ``record``.
    """
    lookup = find_word(record["mbay"])
    if lookup:
        for language in ("english", "french"):
            cells.append(Cell(row_id, columns[language], lookup[language]))
        return

    print(f"Could not find {record['mbay']}")
    for column_name, content in (
        ("flagged", True),
        ("comment", "could not find in Morkeg"),
    ):
        cells.append(Cell(row_id, columns[column_name], content))

In [63]:
# Get all records of the data
records = sheet.get_all_records()
cells = []
# Iterate over the rows; numbering starts at 2 because the enumerate start is
# chosen to skip the header row that `columns` was built from.
for row_id, record in enumerate(records, start=2):
    # get_all_records() turns numeric-looking cells into int/float, and
    # `"ibid" in 3` raises TypeError, so compare against the text form.
    if "ibid" in str(record["english"]) or "ibid" in str(record["french"]):
        fix_ibid2(cells, record, row_id)

Could not find délɨ́l
Could not find yɨ̀nè


In [64]:
cells

[<Cell R8527C6 'clearly, carefully (see, understand, examine).'>,
 <Cell R8527C5 'clairement, attentivement (voir, comprendre, examiner)'>,
 <Cell R8529C6 'plump, round and (fat) {birds}.'>,
 <Cell R8529C5 'oiseaux dodus, ronds et (gras)'>,
 <Cell R8530C6 'be large and ugly.'>,
 <Cell R8530C5 'être grand et laid.'>,
 <Cell R8531C6 'loudly (cough).'>,
 <Cell R8531C5 'fort (toux).'>,
 <Cell R8536C6 'repetitively, nonsensically (talk).'>,
 <Cell R8536C5 'de manière répétitive, sans aucun sens (parler)'>,
 <Cell R8538C6 'extremely (soft {fruit}).'>,
 <Cell R8538C5 'extrêmement (fruit doux)'>,
 <Cell R8540C6 'hard to the ground (knock down).'>,
 <Cell R8540C5 'difficile au sol (renverser)'>,
 <Cell R8543C6 'fragrant (smell).'>,
 <Cell R8543C5 'parfumé (odeur).'>,
 <Cell R8544C6 'heavily (fall) {living creature, as snake from a tree}.'>,
 <Cell R8544C5 "lourdement (tomber) {être vivant, comme un serpent d'un arbre}">,
 <Cell R8545C6 '(heavy) and hanging.'>,
 <Cell R8545C5 '(lourd) et suspend

In [65]:
sheet.update_cells(cells)

{'spreadsheetId': '1KbpQS0RdcLhUOJNFNdXlnTDZBKEU2S8Q7DnJhPYrr2M',
 'updatedRange': "'mbay-translations-review'!E8527:H10350",
 'updatedRows': 349,
 'updatedColumns': 4,
 'updatedCells': 698}

In [110]:
import openai

openai.api_key = os.environ["OPENAI_API_KEY"]


def translate_en_to_fr_with_context(input_text: str, context: str) -> str:
    """Translate English text to French with GPT-4, optionally guided by a hint.

    Args:
        input_text: English text to translate.
        context: Free-form hint added to the prompt as a ``Context:`` line.
            ``None``, an empty string or a whitespace-only string all mean
            "no hint" and leave the line out.

    Returns:
        The content of the first choice in the chat completion response.
    """
    # Normalise before formatting: a None context used to be interpolated as the
    # literal text "None", and a blank one produced an empty "Context:" line.
    hint = (context or "").strip()
    context_line = f"\nContext: {hint}" if hint else ""
    prompt = f"""
Translate from English to French.{context_line}
English: {input_text}
French:"""
    # NOTE(review): openai.ChatCompletion is the pre-1.0 client interface —
    # confirm the pinned openai version before upgrading the package.
    response = openai.ChatCompletion.create(
        model="gpt-4", messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content


def fix_failed_translation(cells: list[Cell], record: dict[str, str], row_id: int):
    print(f"Row {row_id} is English, {record['french']} | {record['english']}")
    user_input = input("Do you want to continue with the fix? (yes/no/quit): ")
    match user_input.lower():
        case "yes" | "y":
            while True:
                context = input("Please provide extra context to guide the AI: ")
                translated_text = translate_en_to_fr_with_context(
                    record["english"], context
                )
                resp = input(
                    f"result: {translated_text}.\n[a]ccept / [r]etry / [p]ass?"
                )
                match resp.lower():
                    case "a":
                        break
                    case "r":
                        continue
                    case _:
                        return True

            cells.append(Cell(row_id, columns["french"], translated_text))
            return True
        case "no" | "n":
            return True
        case "quit" | "q":
            return False

In [95]:
translate_en_to_fr_with_context("hide, skin", "this is about animal skin")

'peau, fourrure'

In [111]:
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic unless seeded; pin the seed so the same rows
# are selected on every run.
DetectorFactory.seed = 0

# Get all records of the data
records = sheet.get_all_records()
cells = []
# Iterate over the rows; numbering starts at 2 because the enumerate start is
# chosen to skip the header row that `columns` was built from.
for row_id, record in enumerate(records, start=2):
    # Cells may come back numericised, so work on the text form.
    french_text = str(record["french"])
    if not french_text.strip():
        # Nothing to detect in a blank cell.
        continue
    try:
        detected_language = detect(french_text)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. digits or punctuation only).
        continue
    if detected_language != "en":
        continue
    # `keep_going` rather than `next`, which would shadow the builtin for the
    # rest of the kernel session.
    keep_going = fix_failed_translation(cells, record, row_id)
    if not keep_going:
        break

Row 137 is English, tisserand ou gros-bec | weaver or grosbeak.
Row 200 is English, Je cherche mon stylo. | I am looking for my pen.
Row 350 is English, chaton {informel} | kitty cat {informal}
Row 364 is English, fourmilier {grand pangolin terrestre} | anteater {giant ground pangolin}
Row 369 is English, patch [of cloth] | patch [of cloth]
Row 404 is English, type of snail, shell used in gambling game | type of snail, shell used in gambling game
Row 471 is English, milk; wring out; etc. | milk; wring out; etc.
Row 511 is English, chasse collective | collective hunt
Row 565 is English, lover of hunting/fighting/fishing | lover of hunting/fighting/fishing
Row 570 is English, infirme | cripple
Row 626 is English, bale of animal feed {grasses and leaves dried and tied}; fodder | bale of animal feed {grasses and leaves dried and tied}; fodder
Row 749 is English, bec | beak
Row 783 is English, twig wilter bug {bug with oddly shaped legs, which suck sap from plants} | twig wilter bug {bug wi

In [112]:
print(len(cells))
cells

254


[<Cell R369C5 'morceau [de tissu]'>,
 <Cell R404C5 "type d'escargot, coquille utilisée dans un jeu de hasard">,
 <Cell R471C5 'traire; essorer; etc.'>,
 <Cell R511C5 'chasse collective'>,
 <Cell R565C5 'amateur de chasse / combat / pêche'>,
 <Cell R626C5 'balle de nourriture pour animaux {herbes et feuilles séchées et liées}; fourrage'>,
 <Cell R783C5 'insecte flétrisseur de brindilles {insecte aux pattes bizarrement formées, qui suce la sève des plantes}'>,
 <Cell R810C5 'idiot, imbécile'>,
 <Cell R832C5 'propreté, clarté, luminosité'>,
 <Cell R994C5 'infirme'>,
 <Cell R996C5 'pleureur public'>,
 <Cell R1000C5 'Chrétien {lit: celui qui a promis}'>,
 <Cell R1006C5 'commérage, personne qui commère'>,
 <Cell R1081C5 'peau, cuir'>,
 <Cell R1164C5 'buisson épineux; arbre [Acacia ataxacantha] et [Acacia macrostachya]'>,
 <Cell R1176C5 "type de manioc légèrement amer d'origine centrafricaine; plus couramment cultivé">,
 <Cell R1193C5 'chenille poilue {non comestible et destructrice}'>,
 <Cel

In [114]:
sheet.update_cells(cells)

{'spreadsheetId': '1KbpQS0RdcLhUOJNFNdXlnTDZBKEU2S8Q7DnJhPYrr2M',
 'updatedRange': "'mbay-translations-review'!E369:E10680",
 'updatedRows': 254,
 'updatedColumns': 1,
 'updatedCells': 254}

In [98]:
# entries = [
#     d.Entry(**entry)
#     for entry in client.get_database("dictionary").get_collection("entries").find()
# ]