In [13]:
import pandas as pd
import urllib
import re
from dataclasses import dataclass

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None 

In [14]:
# download the source data
DATA_SOURCE = "https://cdr.lib.unc.edu/downloads/8623j7090?locale=en"
urllib.request.urlretrieve(DATA_SOURCE, "PRONOM2022.csv")

('PRONOM2022.csv', <http.client.HTTPMessage at 0x7f561fe6a950>)

In [15]:
raw = pd.read_csv("PRONOM2022.csv")
# filter out rows without dual/trial values
data = raw[raw.loc[:,"1+2(incl), Restricted I (dual/min);": "3 Restricted II (trial/pauc)"].notnull().any(axis=1)]
data = data.sort_values(by="Language")

In [16]:
# define classes and functions for data analysis
NUMBERS = ("sg", "pl", "dual", "trial")

@dataclass
class Syncretism:
    form: str
    cells: list[str]
    
    def persons(self):
        persons = set()
        for cell in self.cells:
            m = re.match(r"\d(\+\d)?(\+\d)?", cell)
            if m:
                persons.add(m.group(0))
        return persons
    
    def syn_type(self):
        m = g = mg = False
        for cell in self.cells:
            m = m or "sg" in cell
            g = g or "pl" in cell
            mg = mg or ("dual" in cell or "trial" in cell)
        
        if m and g and mg:
            return "other"
        elif m and mg:
            return "minimal"
        elif g and mg:
            return "group"
        
    def numbers(self):
        numbers = set()
        for cell in self.cells:
            for number in NUMBERS:
                if number in cell:
                    numbers.add(number)
        return numbers

class SyncretismSet:
    def __init__(self, language: str):
        self.language: str = language
        self.syncretisms: list[Syncretism] = []

    def __repr__(self):
        return f"SyncretismSet(language='{self.language}', count={len(self.syncretisms)})"

def number_syncretisms(row, person=False):
    forms = {}

    for cell, val in row["1sg":"3 Restricted II (trial/pauc)"].dropna().items():
        if val not in forms:
            forms[val] = []
        forms[val].append(cell)

    syns = []
    for form, cells in forms.items():
        if len(cells) <= 1 or not is_number_syncretism(cells):
            continue
        if not person:
            syns.append(Syncretism(form, cells))
        else:
            # separate form by person values
            if not is_number_syncretism(cells):
                continue
            persons = {}
            for cell in cells:
                p = re.match(r"\d(\+\d)?(\+\d)?", cell).group(0)
                if p is None:
                    continue
                if p not in persons:
                    persons[p] = []
                persons[p].append(cell)
            for person, pcells in persons.items():
                if len(pcells) > 1:
                    syns.append(Syncretism(form, pcells))

    return syns

def is_number_syncretism(cells: list):
    m = g = mg = False
    for cell in cells:
        m = m or "sg" in cell
        g = g or "pl" in cell
        mg = mg or ("dual" in cell or "trial" in cell)
    return mg and (m or g)

In [17]:
"""Syncretisms by paradigm."""
syn_paradigm_df = data.copy()

def join_multiline(coll):
    out = "\n".join(str(e).strip("{}()").replace("'", "") for e in coll)
    return pd.NA if not out else out

def row_to_num_syncretisms(row, person):
    syns = [(syn.form, tuple(syn.cells)) for syn in number_syncretisms(row, person=person)]
    syns.sort(key=lambda s: s[0])
    return join_multiline(syns)

def row_to_persons(row):
    persons = tuple(syn.persons() for syn in number_syncretisms(row))
    return join_multiline(persons)

def row_to_syn_types(row):
    types = tuple(syn.syn_type() for syn in number_syncretisms(row, person=False))
    return join_multiline(types)

def row_to_numbers(row):
    numbers = tuple(syn.numbers() for syn in number_syncretisms(row, person=False))
    return join_multiline(numbers)

syn_paradigm_df['# Syncretisms'] = syn_paradigm_df.apply(lambda r : len(number_syncretisms(r, person=False)), axis=1)
syn_paradigm_df['Syncretisms'] = syn_paradigm_df.apply(lambda r : row_to_num_syncretisms(r, False), axis=1)
syn_paradigm_df = syn_paradigm_df.dropna(subset=['Syncretisms'])
syn_paradigm_df['Syncretisms (person-separated)'] = syn_paradigm_df.apply(lambda r : row_to_num_syncretisms(r, True), axis=1)
syn_paradigm_df['Persons'] = syn_paradigm_df.apply(row_to_persons, axis=1)
syn_paradigm_df['Numbers'] = syn_paradigm_df.apply(row_to_numbers, axis=1)
syn_paradigm_df['Type'] = syn_paradigm_df.apply(row_to_syn_types, axis=1)

syn_paradigm_df = syn_paradigm_df.sort_values(by="Language")

syn_paradigm_df.to_csv("PRONOM2022_syn-num.csv", index=False, mode='w')
syn_paradigm_df

Unnamed: 0,Language counter,Language,Genus,Area,Language code,paradigm,Free/Bound,1sg,2sg,3sg,1+2+3pl(incl),1+3pl(excl),2pl,3pl,"1+2(incl), Restricted I (dual/min);","1+3(excl), Restricted I (dual/min)",2 Restricted I (dual/min),3 Restricted I (dual/min),"1+2+3t(incl), Restricted II (trial/pauc)","1+3(excl), Restricted II (trial/pauc)",2 Restricted II (trial/pauc),3 Restricted II (trial/pauc),1st (No number),1st(incl) (No number),2nd (No number),3rd (No number),Other1,Other2,Notes,Source,Comments,# Syncretisms,Syncretisms,Syncretisms (person-separated),Persons,Numbers,Type
603,218,Aleut,Eskimo-Aleut,North America,ale,"Free Personal Pronouns (Postpositional, 'for/a...",Free,agalkimiŋ,agalkimin,agalˈan,agalkiŋin; agalkin,agalkiŋin; agalkin,agalkimtʃi,agalkimaŋ,agalkiŋin; agalkin,agalkiŋin; agalkin,agalkimiðik,agalkimak,,,,,,,,,,,"These postpositional pronouns indicate ""for"" o...","Geoghegan, Richard Henry. 1944. The Aleut Lang...",,1,"agalkiŋin; agalkin, (1+2+3pl(incl), 1+3pl(excl...","agalkiŋin; agalkin, (1+3pl(excl), 1+3(excl), R...","1+2+3, 1+2, 1+3","pl, dual",group
228,218,Aleut,Eskimo-Aleut,North America,ale,Free Personal Pronouns (Nominative Case),Free,θiŋ,txin,hiŋan,tuman,tuman,txitʃi,iŋakun,tuman,tuman,txiðik,iŋaku,,,,,,,,,txiðin; txitʃiŋanˈan,txitʃiŋˈan,Other I includes second person plural emphatic...,"Geoghegan, Richard Henry. 1944. The Aleut Lang...",,1,"tuman, (1+2+3pl(incl), 1+3pl(excl), 1+2(incl),...","tuman, (1+3pl(excl), 1+3(excl), Restricted I (...","1+2+3, 1+2, 1+3","pl, dual",group
619,231,Bininj Gun-Wok,Gunwinygic,Australia & New Guinea,gup,Verbal Subject and Object Agreement Affixes (T...,Bound (Verbal),(ŋ)abanbani-,jibanbani-,(ga)banbani-,garbanbani-; (ŋ)arbanbani-,(ŋ)arbanbani-,(g)urbanbani-; (ŋ)urbanbani-,(ga)bandi-,(ŋ)arbanbani-,(ŋ)arbanbani-,(g)urbanbani-; (ŋ)urbanbani-,(ga)bandi-,garbanbani-; (ŋ)arbanbani-,,,,,,,,,,The morpheme -bani indicates third person dual...,"Evans, Nicholas. 2003. Bininj Gun-Wok: A pan-d...",,4,"(g)urbanbani-; (ŋ)urbanbani-, (2pl, 2 Restrict...","(g)urbanbani-; (ŋ)urbanbani-, (2pl, 2 Restrict...","1+2+3\n1+2, 1+3\n2\n3","pl, trial\npl, dual\npl, dual\npl, dual",group\ngroup\ngroup\ngroup
617,231,Bininj Gun-Wok,Gunwinygic,Australia & New Guinea,gup,Verbal Subject and Object Agreement Affixes (T...,Bound (Verbal),(ŋ)aban-,jiban-,(ga)ban-,garban-; (ŋ)arban-,(ŋ)arban-,(ŋ)urban-; (g)urban-,(ga)bandi-,(ŋ)arban-,(ŋ)arban-,(ŋ)urban-; (g)urban-,(ga)bandi-,(ŋ)arban-,,,,,,,,,,The morpheme -ban indicates third person plura...,"Evans, Nicholas. 2003. Bininj Gun-Wok: A pan-d...",,3,"(ga)bandi-, (3pl, 3 Restricted I (dual/min)\n(...","(ga)bandi-, (3pl, 3 Restricted I (dual/min)\n(...","1+2+3, 1+2, 1+3\n2\n3","pl, dual, trial\npl, dual\npl, dual",group\ngroup\ngroup
386,17,Cherokee,Southern Iroquoian,North America,chr,Verbal Subject Agreement Affixes (Active Mood),Bound (Verbal),tsi-; k-,h(i)-,ka-; k-; a-; Ø-,it(i)-,ots(i)-,its(i)-,an(i)-,in(i)-,ost(i)-,st(i)-,an(i)-,,,,,,,,,,,There are two sets of 3rd person markers in th...,"King, Duane Harold. 1975. A Grammar and Dictio...",,1,"an(i)-, (3pl, 3 Restricted I (dual/min)","an(i)-, (3pl, 3 Restricted I (dual/min)",3,"pl, dual",group
387,17,Cherokee,Southern Iroquoian,North America,chr,Verbal Subject Agreement Affixes (Stative Mood),Bound (Verbal),aki-; akw-,ts(a)-,u-,ik(i)-,ok(i)-,its(i)-,un(i)-,ikin(i)-,okin(i)-,st(i)-,un(i)-,,,,,,,,,,,,"King, Duane Harold. 1975. A Grammar and Dictio...",,1,"un(i)-, (3pl, 3 Restricted I (dual/min)","un(i)-, (3pl, 3 Restricted I (dual/min)",3,"pl, dual",group
438,52,Gooniyandi,Bunuban,Australia & New Guinea,gni,Free Personal Pronouns (Oblique Case),Free,ŋadagi,ŋa:ŋgi,nho:wo:,ja:daŋi,ŋidaŋi,gidaŋi,bidaŋi,ŋidaŋi,ŋidaŋi,,,,,,,,,,,,,,"McGregor, William. 1990. A Functional Grammar ...",,1,"ŋidaŋi, (1+3pl(excl), 1+2(incl), Restricted I ...","ŋidaŋi, (1+3pl(excl), 1+3(excl), Restricted I ...","1+2, 1+3","pl, dual",group
182,52,Gooniyandi,Bunuban,Australia & New Guinea,gni,Free Personal Pronouns (Nominative Case),Free,ŋaɲi,ŋiɲɟi,niji,ja:di,ŋidi,gidi,bidi,ŋidi,ŋidi,,,,,,,,,,,,,Transcription is phonemic. /ŋidi/ is used for ...,"McGregor, William. 1990. A Functional Grammar ...",,1,"ŋidi, (1+3pl(excl), 1+2(incl), Restricted I (d...","ŋidi, (1+3pl(excl), 1+3(excl), Restricted I (d...","1+2, 1+3","pl, dual",group
335,14,Haida,Haida,North America,"hax, hdn",Free Possessive Pronouns,Free,di,dʌng,ɪl,itɪl,itɪl,dælʌŋ,lthæ,,,,ɪl,,,,,,,,,,,"I.e., ""my"", etc.","Harrison, Charles. 1895. Haida Grammar. Ottawa...",,1,"ɪl, (3sg, 3 Restricted I (dual/min)","ɪl, (3sg, 3 Restricted I (dual/min)",3,"sg, dual",minimal
366,14,Haida,Haida,North America,"hax, hdn",Free Possessive Pronouns (Reflexive),Free,gɪæŋ; kɪægɪn,dʌŋgɪaw,lagɪaw (anim.),itɪlgɪaw,itɪlgɪaw,dælʌŋgɪaw,lthagɪaw,,,,lagɪaw (anim.),,,,,,,,,,,"I.e., ""my own"", etc.","Harrison, Charles. 1895. Haida Grammar. Ottawa...",,1,"lagɪaw (anim.), (3sg, 3 Restricted I (dual/min)","lagɪaw (anim.), (3sg, 3 Restricted I (dual/min)",3,"sg, dual",minimal
