In [1]:
%pip uninstall -y pyarrow
%pip install pyarrow

Found existing installation: pyarrow 23.0.0
Uninstalling pyarrow-23.0.0:
  Successfully uninstalled pyarrow-23.0.0
Note: you may need to restart the kernel to use updated packages.
Collecting pyarrow
  Using cached pyarrow-23.0.0-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Using cached pyarrow-23.0.0-cp311-cp311-win_amd64.whl (27.5 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-23.0.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
pd.__version__

'2.1.4'

In [3]:
%pip install psycopg2-binary sqlalchemy pandas

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.11-cp311-cp311-win_amd64.whl.metadata (5.1 kB)
Downloading psycopg2_binary-2.9.11-cp311-cp311-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
    --------------------------------------- 0.1/2.7 MB 825.8 kB/s eta 0:00:04
   -------------- ------------------------- 1.0/2.7 MB 8.7 MB/s eta 0:00:01
   ---------------------------------------  2.7/2.7 MB 17.0 MB/s eta 0:00:01
   ---------------------------------------- 2.7/2.7 MB 15.7 MB/s eta 0:00:00
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.11
Note: you may need to restart the kernel to use updated packages.


In [42]:
from sqlalchemy import create_engine
engine = create_engine(
    "postgresql+psycopg2://postgres:1620@localhost:5432/laz_test"
)
pd.read_sql("SELECT 1;", engine)

Unnamed: 0,?column?
0,1


In [43]:
import pandas as pd

path = r"Test Verb Present tense.csv"   # adjust path if needed
raw = pd.read_csv(path, low_memory=False)

# keep only non-empty column names that aren't "Unnamed: ..."
df = raw.loc[:, ~raw.columns.str.startswith("Unnamed:")].copy()

df.columns
df.head()

Unnamed: 0,Laz Infinitive,Category,Laz 3rd Person Singular Present,Region,Laz 3rd Person Singular Present Alternative 1,Region Alternative 1,Laz 3rd Person Singular Present Alternative 2,Region Alternative 2,English Translation,Turkish Verb
0,avara doskudu,TVM,avara doskudun,"PZ, AŞ, FA, HO",,,,,to be idle,avare kalmak
1,cebazgu,TVE,cobazgams,"PZ, AŞ",,,,,"to press, to step on something",basmak
2,cebgaru,TVE,cabgars,PZ,,,,,to lament over something,ağıt yakmak
3,ceçamu,TVE,ceçams,"PZ, AŞ",,,,,to hit something/someone,vurmak (bir şeye)
4,ceç̌u,TVE,coç̌ams,"PZ, AŞ",,,,,to begin,başlamak


In [44]:
# clean + unique category codes
cats = (
    df[["Category"]]
    .dropna()
    .assign(code=lambda x: x["Category"].astype(str).str.strip())
    .drop_duplicates(subset=["code"])
    [["code"]]
)

# explicit mappings
english_map = {
    "IVD": "Dative verb",
    "TVM": "Nominative verb",
    "TVE": "Ergative verb",
}

turkish_map = {
    "IVD": "Yönelme fiili",
    "TVM": "Nominatif fiili",
    "TVE": "Ergatif fiili",
}

cats["english_name"] = cats["code"].map(english_map)
cats["turkish_name"] = cats["code"].map(turkish_map)

# final column order
cats = cats[["english_name", "turkish_name", "code"]]

cats

Unnamed: 0,english_name,turkish_name,code
0,Nominative verb,Nominatif fiili,TVM
1,Ergative verb,Ergatif fiili,TVE
7,Dative verb,Yönelme fiili,IVD


In [45]:
pd.read_sql("""
SELECT column_name, is_nullable
FROM information_schema.columns
WHERE table_name = 'verb_category'
ORDER BY ordinal_position;
""", engine)

Unnamed: 0,column_name,is_nullable
0,verb_category_id,NO
1,english_name,NO
2,turkish_name,NO
3,code,NO


In [46]:
cats.to_sql("verb_category", engine, if_exists="append", index=False)

3

In [47]:
pd.read_sql("SELECT * FROM verb_category ORDER BY verb_category_id;", engine)

Unnamed: 0,verb_category_id,english_name,turkish_name,code
0,1,Nominative verb,Nominatif fiili,TVM
1,2,Ergative verb,Ergatif fiili,TVE
2,3,Dative verb,Yönelme fiili,IVD
3,4,Nominative verb,Nominatif fiili,TVM
4,5,Ergative verb,Ergatif fiili,TVE
5,6,Dative verb,Yönelme fiili,IVD


In [48]:
cat_map = pd.read_sql("SELECT verb_category_id, code FROM verb_category;", engine)
cat_map

Unnamed: 0,verb_category_id,code
0,1,TVM
1,2,TVE
2,3,IVD
3,4,TVM
4,5,TVE
5,6,IVD


In [49]:
# 1) build verbs from the CSV
verbs = (
    df[["Laz Infinitive", "Category"]]
    .dropna()
    .assign(
        infinitive=lambda x: x["Laz Infinitive"].astype(str).str.strip(),
        code=lambda x: x["Category"].astype(str).str.strip(),
    )
    .query("infinitive != ''")
    .drop_duplicates(subset=["infinitive", "code"])
    .merge(cat_map, on="code", how="left")
)

# 2) sanity check: any categories that didn't map?
bad = verbs[verbs["verb_category_id"].isna()][["infinitive", "code"]].drop_duplicates()
bad

Unnamed: 0,infinitive,code


In [50]:
dialects = pd.DataFrame([
    {"english_name": "Ardeşen", "turkish_name": "Ardeşen", "laz_name": "Art̆aşeni"},
    {"english_name": "Pazar", "turkish_name": "Pazar", "laz_name": "Atina"},
    {"english_name": "Fındıklı/Arhavi", "turkish_name": "Fındıklı/Arhavi", "laz_name": "Viǯe/Arkabi"},
    {"english_name": "Hopa", "turkish_name": "Hopa", "laz_name": "Xopa"},
])
dialects

Unnamed: 0,english_name,turkish_name,laz_name
0,Ardeşen,Ardeşen,Art̆aşeni
1,Pazar,Pazar,Atina
2,Fındıklı/Arhavi,Fındıklı/Arhavi,Viǯe/Arkabi
3,Hopa,Hopa,Xopa


In [51]:
existing = pd.read_sql("SELECT english_name FROM dialect;", engine)

dialects_new = dialects[~dialects["english_name"].isin(existing["english_name"])].copy()
dialects_new

Unnamed: 0,english_name,turkish_name,laz_name


In [52]:
dialects_new.to_sql("dialect", engine, if_exists="append", index=False)

0

In [53]:
from sqlalchemy import text

with engine.begin() as conn:
    conn.execute(
        text("""
        UPDATE dialect
        SET laz_name = 'Art̆aşeni'
        WHERE english_name = 'Ardeşen';
        """)
    )

In [54]:
pd.read_sql(
    "SELECT dialect_id, english_name, turkish_name, laz_name FROM dialect ORDER BY dialect_id;",
    engine
)

Unnamed: 0,dialect_id,english_name,turkish_name,laz_name
0,1,Ardeşen,Ardeşen,Art̆aşeni
1,2,Pazar,Pazar,Atina
2,3,Fındıklı/Arhavi,Fındıklı/Arhavi,Viǯe/Arkabi
3,4,Hopa,Hopa,Xopa


In [55]:
DEFAULT_DIALECT_ID = int(pd.read_sql(
    "SELECT dialect_id FROM dialect WHERE english_name='Fındıklı/Arhavi';",
    engine
).iloc[0,0])
DEFAULT_DIALECT_ID

3

In [56]:
verbs = (
    df[["Laz Infinitive", "Category"]]
    .dropna()
    .assign(
        infinitive=lambda x: x["Laz Infinitive"].astype(str).str.strip(),
        code=lambda x: x["Category"].astype(str).str.strip(),
    )
    .query("infinitive != ''")
    .drop_duplicates(subset=["infinitive", "code"])
    .merge(cat_map, on="code", how="left")
)

# must be empty
verbs[verbs["verb_category_id"].isna()]

Unnamed: 0,Laz Infinitive,Category,infinitive,code,verb_category_id


In [57]:
pd.read_sql("""
SELECT column_name, is_nullable
FROM information_schema.columns
WHERE table_name = 'verb'
ORDER BY ordinal_position;
""", engine)

Unnamed: 0,column_name,is_nullable
0,verb_id,NO
1,verb_family_id,YES
2,dialect_id,NO
3,verb_category_id,NO
4,infinitive,NO
5,present_3sg,NO
6,meaning_english,NO
7,meaning_turkish,NO
8,requires_marker,NO
9,has_optional_preverb_ko,NO


In [58]:
COL_INF = "Laz Infinitive"
COL_CAT = "Category"
COL_PRES3 = "Laz 3rd Person Singular Present"
COL_ENG = "English Translation"
COL_TR  = "Turkish Verb"

verbs = (
    df[[COL_INF, COL_CAT, COL_PRES3, COL_ENG, COL_TR]]
    .dropna(subset=[COL_INF, COL_CAT, COL_PRES3, COL_ENG, COL_TR])
    .assign(
        infinitive=lambda x: x[COL_INF].astype(str).str.strip(),
        code=lambda x: x[COL_CAT].astype(str).str.strip(),
        present_3sg=lambda x: x[COL_PRES3].astype(str).str.strip(),
        meaning_english=lambda x: x[COL_ENG].astype(str).str.strip(),
        meaning_turkish=lambda x: x[COL_TR].astype(str).str.strip(),
    )
    .query("infinitive != '' and present_3sg != '' and meaning_english != '' and meaning_turkish != ''")
    .drop_duplicates(subset=["infinitive", "code"])
    .merge(cat_map, on="code", how="left")
)

# sanity: should be empty
missing = verbs[verbs["verb_category_id"].isna()]
missing

Unnamed: 0,Laz Infinitive,Category,Laz 3rd Person Singular Present,English Translation,Turkish Verb,infinitive,code,present_3sg,meaning_english,meaning_turkish,verb_category_id


In [59]:
pd.read_sql("SELECT COUNT(*) AS n_verbs FROM verb;", engine)

Unnamed: 0,n_verbs
0,0


In [60]:
df.shape

(327, 10)

In [61]:
df[["Laz Infinitive","Category","Laz 3rd Person Singular Present","English Translation","Turkish Verb","Region"]].head(10)

Unnamed: 0,Laz Infinitive,Category,Laz 3rd Person Singular Present,English Translation,Turkish Verb,Region
0,avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO"
1,cebazgu,TVE,cobazgams,"to press, to step on something",basmak,"PZ, AŞ"
2,cebgaru,TVE,cabgars,to lament over something,ağıt yakmak,PZ
3,ceçamu,TVE,ceçams,to hit something/someone,vurmak (bir şeye),"PZ, AŞ"
4,ceç̌u,TVE,coç̌ams,to begin,başlamak,"PZ, AŞ"
5,cegapu,TVM,cagen,to get used to,alışmak,"PZ, AŞ"
6,ceginu,TVE,coginams,to get someone become familiar with something,alıştırmak,"PZ, AŞ"
7,cedginu,IVD,cadginen,to crave,aşermek,"PZ, AŞ"
8,celabalu,TVE,celabams,to hang,asmak,"PZ, AŞ"
9,cemp̌onu,TVE,comp̌onams,to vaccinate/fertilize,aşılamak,AŞ


In [62]:
COL_INF="Laz Infinitive"
COL_CAT="Category"
COL_PRES3="Laz 3rd Person Singular Present"
COL_ENG="English Translation"
COL_TR="Turkish Verb"
COL_REG="Region"

verbs = (
    df[[COL_INF, COL_CAT, COL_PRES3, COL_ENG, COL_TR, COL_REG]]
    .assign(
        infinitive=lambda x: x[COL_INF].astype(str).str.strip(),
        code=lambda x: x[COL_CAT].astype(str).str.strip(),
        present_3sg=lambda x: x[COL_PRES3].astype(str).str.strip(),
        meaning_english=lambda x: x[COL_ENG].astype(str).str.strip(),
        meaning_turkish=lambda x: x[COL_TR].astype(str).str.strip(),
        region=lambda x: x[COL_REG].astype(str).str.strip(),
    )
)

verbs.shape

(327, 12)

In [63]:
verbs[["infinitive","code","present_3sg","meaning_english","meaning_turkish","region"]].isna().sum()

infinitive         0
code               0
present_3sg        0
meaning_english    0
meaning_turkish    0
region             0
dtype: int64

In [64]:
usable = verbs.query("infinitive != '' and code != '' and present_3sg != ''")
usable.shape

(327, 12)

In [65]:
import pandas as pd

cat_map = pd.read_sql("SELECT verb_category_id, code FROM verb_category;", engine)

dialect_map = pd.read_sql("SELECT dialect_id, english_name FROM dialect;", engine)

# region code -> dialect english_name
region_to_english = {
    "AŞ": "Ardeşen",
    "PZ": "Pazar",
    "FA": "Fındıklı/Arhavi",
    "HO": "Hopa",
}

# turn dialect_map into english_name -> dialect_id
english_to_id = dict(zip(dialect_map["english_name"], dialect_map["dialect_id"]))

# final region code -> dialect_id
region_to_id = {k: english_to_id[v] for k, v in region_to_english.items()}
region_to_id

{'AŞ': 1, 'PZ': 2, 'FA': 3, 'HO': 4}

In [66]:
COL_INF="Laz Infinitive"
COL_CAT="Category"
COL_PRES3="Laz 3rd Person Singular Present"
COL_ENG="English Translation"
COL_TR="Turkish Verb"
COL_REG="Region"

verbs = (
    df[[COL_INF, COL_CAT, COL_PRES3, COL_ENG, COL_TR, COL_REG]]
    .assign(
        infinitive=lambda x: x[COL_INF].astype(str).str.strip(),
        code=lambda x: x[COL_CAT].astype(str).str.strip(),
        present_3sg=lambda x: x[COL_PRES3].astype(str).str.strip(),
        meaning_english=lambda x: x[COL_ENG].astype(str).str.strip(),
        meaning_turkish=lambda x: x[COL_TR].astype(str).str.strip(),
        region=lambda x: x[COL_REG].astype(str).str.strip(),
    )
    .query("infinitive != '' and code != '' and present_3sg != ''")
    .merge(cat_map, on="code", how="left")
)

# split "PZ, AŞ, FA" -> ["PZ","AŞ","FA"]
verbs = verbs.assign(
    region_code=lambda x: x["region"].str.split(",")
).explode("region_code")

verbs["region_code"] = verbs["region_code"].astype(str).str.strip()

# map to dialect_id
verbs["dialect_id"] = verbs["region_code"].map(region_to_id)

# sanity check: should be empty
bad = verbs[verbs["dialect_id"].isna()][["region","region_code"]].drop_duplicates()
bad

Unnamed: 0,region,region_code


In [69]:
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import insert

meta = sa.MetaData()
verb_tbl = sa.Table("verb", meta, autoload_with=engine)

def insert_ignore_duplicates(table, conn, keys, data_iter):
    """
    Pandas to_sql 'method' callable:
    Insert many rows, but skip conflicts on (dialect_id, infinitive).
    """
    rows = [dict(zip(keys, row)) for row in data_iter]
    if not rows:
        return 0

    stmt = insert(verb_tbl).values(rows)
    stmt = stmt.on_conflict_do_nothing(
        index_elements=["dialect_id", "infinitive"]
    )

    result = conn.execute(stmt)
    return result.rowcount

# build verbs_to_insert exactly like you already do...
# BUT add this:
verbs_to_insert = verbs_to_insert.drop_duplicates(subset=["dialect_id", "infinitive"])

verbs_to_insert.to_sql(
    "verb",
    engine,
    if_exists="append",
    index=False,
    method=insert_ignore_duplicates,
    chunksize=1000,  # good practice
)


# --- Finish verbs dataframe: explode regions -> dialect_id rows ---

# 1) Split "PZ, AŞ, FA" into list and explode
verbs = (
    verbs
    .assign(
        region_list=lambda x: (
            x["region"]
            .str.replace(";", ",", regex=False)
            .str.split(",")
        )
    )
    .explode("region_list")
    .assign(region_code=lambda x: x["region_list"].astype(str).str.strip())
    .drop(columns=["region_list"])
)

# 2) Map region_code -> dialect_id
verbs["dialect_id"] = verbs["region_code"].map(region_to_id)

# 3) Quick diagnostics: anything unmapped?
unmapped = verbs[verbs["dialect_id"].isna()][["infinitive", "region_code"]].drop_duplicates()
print("Unmapped region codes (should be empty):")
display(unmapped.head(50))

# 4) Also check category merge worked
missing_cat = verbs[verbs["verb_category_id"].isna()][["infinitive", "code"]].drop_duplicates()
print("Missing verb_category_id (should be empty):")
display(missing_cat.head(50))

# 5) Keep only valid rows
verbs_clean = verbs.dropna(subset=["dialect_id", "verb_category_id"]).copy()
verbs_clean["dialect_id"] = verbs_clean["dialect_id"].astype(int)
verbs_clean["verb_category_id"] = verbs_clean["verb_category_id"].astype(int)

verbs_clean.head()

Unmapped region codes (should be empty):


Unnamed: 0,infinitive,region_code


Missing verb_category_id (should be empty):


Unnamed: 0,infinitive,code


Unnamed: 0,Laz Infinitive,Category,Laz 3rd Person Singular Present,English Translation,Turkish Verb,Region,infinitive,code,present_3sg,meaning_english,meaning_turkish,region,verb_category_id,region_code,dialect_id
0,avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",1,PZ,2
0,avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",1,AŞ,1
0,avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",1,FA,3
0,avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",1,HO,4
0,avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",avara doskudu,TVM,avara doskudun,to be idle,avare kalmak,"PZ, AŞ, FA, HO",1,PZ,2


In [70]:
# --- Build verbs_to_insert to match your verb table ---

verbs_to_insert = (
    verbs_clean[[
        "verb_category_id",
        "dialect_id",
        "infinitive",
        "present_3sg",
        "meaning_english",
        "meaning_turkish",
    ]]
    .drop_duplicates()
    .copy()
)

# required booleans (set defaults for now; you can refine later)
verbs_to_insert["requires_marker"] = False
verbs_to_insert["has_optional_preverb_ko"] = False
verbs_to_insert["has_optional_preverb_do"] = False

# IMPORTANT: de-dupe on your conflict target (dialect_id, infinitive)
verbs_to_insert = verbs_to_insert.drop_duplicates(subset=["dialect_id", "infinitive"])

verbs_to_insert.head()

Unnamed: 0,verb_category_id,dialect_id,infinitive,present_3sg,meaning_english,meaning_turkish,requires_marker,has_optional_preverb_ko,has_optional_preverb_do
0,1,2,avara doskudu,avara doskudun,to be idle,avare kalmak,False,False,False
0,1,1,avara doskudu,avara doskudun,to be idle,avare kalmak,False,False,False
0,1,3,avara doskudu,avara doskudun,to be idle,avare kalmak,False,False,False
0,1,4,avara doskudu,avara doskudun,to be idle,avare kalmak,False,False,False
2,2,2,cebazgu,cobazgams,"to press, to step on something",basmak,False,False,False


In [71]:
# --- Insert into verb, skipping duplicates safely ---

with engine.begin() as conn:
    verbs_to_insert.to_sql(
        "verb",
        con=conn,
        if_exists="append",
        index=False,
        method=insert_ignore_duplicates,
        chunksize=1000,
    )

pd.read_sql("SELECT COUNT(*) AS n_verbs FROM verb;", engine)

Unnamed: 0,n_verbs
0,653


In [72]:
# --- sanity sample: did inserts land the way we expect? ---
pd.read_sql("""
SELECT dialect_id, infinitive, present_3sg, verb_category_id
FROM verb
ORDER BY verb_id DESC
LIMIT 20;
""", engine)

Unnamed: 0,dialect_id,infinitive,present_3sg,verb_category_id
0,1,cexvamu,cuxvamams,2
1,2,cexvamu,cuxvamams,2
2,3,gexvamu,gyuxvamams,2
3,4,zop̌ini,zop̌ons,2
4,3,zop̌ini,zop̌ons,2
5,1,yoxo cedvalu,yoxo codums,2
6,2,yoxo cedvalu,yoxo codums,2
7,4,yeç̌opu,yeç̌opups,2
8,4,xe oǩotvaʒinu,xe oǩotvaʒinaps,2
9,2,xe oǩoç̌apxu,xe oǩoç̌apxams,2


In [73]:
# --- Post-import checks ---

# How many rows did we insert?
pd.read_sql("SELECT COUNT(*) AS n_verbs FROM verb;", engine)

Unnamed: 0,n_verbs
0,653


In [74]:
# Check for duplicates on your natural key (dialect_id, infinitive)
pd.read_sql("""
SELECT dialect_id, infinitive, COUNT(*) AS n
FROM verb
GROUP BY dialect_id, infinitive
HAVING COUNT(*) > 1
ORDER BY n DESC, dialect_id, infinitive
LIMIT 50;
""", engine)

Unnamed: 0,dialect_id,infinitive,n


In [75]:
# Quick spot-check: sample rows
pd.read_sql("""
SELECT v.verb_id, d.english_name AS dialect, vc.english_name AS category,
       v.infinitive, v.present_3sg, v.meaning_english, v.meaning_turkish
FROM verb v
JOIN dialect d ON d.dialect_id = v.dialect_id
JOIN verb_category vc ON vc.verb_category_id = v.verb_category_id
ORDER BY v.verb_id DESC
LIMIT 25;
""", engine)

Unnamed: 0,verb_id,dialect,category,infinitive,present_3sg,meaning_english,meaning_turkish
0,692,Ardeşen,Ergative verb,cexvamu,cuxvamams,to celebrate,kutlamak
1,691,Pazar,Ergative verb,cexvamu,cuxvamams,to celebrate,kutlamak
2,690,Fındıklı/Arhavi,Ergative verb,gexvamu,gyuxvamams,to celebrate,kutlamak
3,689,Hopa,Ergative verb,zop̌ini,zop̌ons,"to mention, to talk about",bahsetmek
4,688,Fındıklı/Arhavi,Ergative verb,zop̌ini,zop̌ons,"to mention, to talk about",bahsetmek
5,687,Ardeşen,Ergative verb,yoxo cedvalu,yoxo codums,to name something/someone,adlandırmak
6,686,Pazar,Ergative verb,yoxo cedvalu,yoxo codums,to name something/someone,adlandırmak
7,685,Hopa,Ergative verb,yeç̌opu,yeç̌opups,to buy,almak
8,684,Hopa,Ergative verb,xe oǩotvaʒinu,xe oǩotvaʒinaps,to applaud,alkışlamak
9,683,Pazar,Ergative verb,xe oǩoç̌apxu,xe oǩoç̌apxams,to applaud,alkışlamak


In [76]:
# Ensure all dialect_id/category_id mappings existed in the CSV
# (These should be empty if your earlier mapping diagnostics passed)
print("Unmapped dialect_id rows:", verbs[verbs["dialect_id"].isna()].shape[0])
print("Unmapped verb_category_id rows:", verbs[verbs["verb_category_id"].isna()].shape[0])

Unmapped dialect_id rows: 0
Unmapped verb_category_id rows: 0


In [100]:
import pandas as pd

preverbs = [
    "ce","cela","ceşǩa","dolo","e","eşǩa",
    "me","mo", "gela", "ge",
    "meo","moo","mola","gola","moǩo",
    "ama","gama","eo","meşǩa","go","goo","koǯo","eǯa","eǩa",
    "oǩo","ǩoşǩa", "ela",
    "oxo","do", "ye",
]

preverb_df = pd.DataFrame({"spelling": preverbs}).drop_duplicates().sort_values("spelling")

# Optional: normalize whitespace
preverb_df["spelling"] = preverb_df["spelling"].str.strip()

preverb_df

Unnamed: 0,spelling
15,ama
0,ce
1,cela
2,ceşǩa
28,do
3,dolo
4,e
26,ela
17,eo
5,eşǩa


In [102]:
import pandas as pd

# --- Make sure we have a "spelling" column no matter what the source column is called ---

preverb_df_db = preverb_df.copy()

if "spelling" not in preverb_df_db.columns:
    if "code" in preverb_df_db.columns:
        preverb_df_db = preverb_df_db.rename(columns={"code": "spelling"})
    else:
        raise ValueError(f"Expected a 'code' or 'spelling' column, found: {list(preverb_df_db.columns)}")

# keep only the columns that exist in the SQL table
preverb_df_db = preverb_df_db[["spelling"]].copy()

# optional columns (table has them, so we can set them)
preverb_df_db["preverb_group"] = None
preverb_df_db["notes"] = None

# clean + normalize
preverb_df_db["spelling"] = (
    preverb_df_db["spelling"]
    .astype(str)
    .str.strip()
)

# drop blanks just in case
preverb_df_db = preverb_df_db[preverb_df_db["spelling"] != ""]

# dedupe + stable order
preverb_df_db = preverb_df_db.drop_duplicates(subset=["spelling"]).sort_values("spelling").reset_index(drop=True)

# 1) pull existing spellings from DB
existing = pd.read_sql("SELECT spelling FROM preverb;", engine)
existing_set = set(existing["spelling"].astype(str).str.strip())

# 2) keep only new spellings
to_insert = preverb_df_db[~preverb_df_db["spelling"].isin(existing_set)].copy()

print(f"Already in DB: {len(existing_set)}")
print(f"New to insert: {len(to_insert)}")

# 3) insert only new ones
if not to_insert.empty:
    with engine.begin() as conn:
        to_insert.to_sql(
            "preverb",
            con=conn,
            if_exists="append",
            index=False,
            chunksize=1000,
        )

pd.read_sql("""
SELECT preverb_id, spelling, preverb_group, notes
FROM preverb
ORDER BY spelling;
""", engine)

Already in DB: 29
New to insert: 0


Unnamed: 0,preverb_id,spelling,preverb_group,notes
0,1,ama,,
1,2,ce,,
2,3,cela,,
3,4,ceşǩa,,
4,5,do,,
5,6,dolo,,
6,7,e,,
7,11,eǩa,,
8,8,ela,,
9,9,eo,,


In [103]:
pd.read_sql("SELECT preverb_id, spelling, preverb_group, notes FROM preverb ORDER BY spelling;", engine)

Unnamed: 0,preverb_id,spelling,preverb_group,notes
0,1,ama,,
1,2,ce,,
2,3,cela,,
3,4,ceşǩa,,
4,5,do,,
5,6,dolo,,
6,7,e,,
7,11,eǩa,,
8,8,ela,,
9,9,eo,,
