# Step 1: Latin occupations - dataset generation

*AIM*: This script generates a corpus of Latin occupations together with their complete morphology.

References:

1) `Waltzing JP. Étude historique sur les corporations professionnelles chez les Romains depuis les origines jusqu’à la chute de l’Empire d’Occident. Louvain: C. Peeters; 1895.`
2) `Petrikovits H v. Die Spezialisierung des römischen Handwerks. Handw Vor- Frühgesch Zeit 1 Hist Rechtshistorische Beitr Untersuchungen Zur Frühgesch Gilde Ber Über Kolloquien Komm Für Altertumskunde Mittel- Nordeur Den Jahren 1977 Bis 1980. 1981; 63–132.`


This script was originally published by Kaše V, Heřmánková P, Sobotková A (2022) Division of labor, specialization and diversity in the ancient Roman cities: A quantitative approach to Latin epigraphy. PLoS ONE 17(6): e0269869. https://doi.org/10.1371/journal.pone.0269869 under a CC BY-SA 4.0 International License.

https://github.com/sdam-au/social_diversity

The *Past Social Networks Project* adapted the script to fit the needs of the project's research agenda.

## Data:

**IN**:

1) Occupations `occupations_list_hisco.csv`


**OUT**: 

1) Declined occupations `occups_declined_dict.json`



In [1]:
import geopandas as gpd
import pandas as pd
import re
import json

In [2]:
# load the occupation list from the local data folder
# (path is relative to the notebook: ../../data/data_generation/occupations_list_hisco.csv);
# the trailing expression previews the first five rows
occupations_df = pd.read_csv("../../data/data_generation/occupations_list_hisco.csv")
occupations_df.head(5)

Unnamed: 0,Term,gen_sg,Term2,Vocab_nom_sg,Source,HISCO_majorgroup,HISCO_minorgroup,Harris_Category,Subcategory,Translation_eng
0,abetarius,i,,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
1,abietarius,i,,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
2,acceptor,oris,,acceptor,Waltzing - Rome,3.0,31.0,Finance,,"collector, gold quality checker"
3,accomodator,oris,,,Petrikovits 1981a,9.0,99.0,Unclassified,,"uncertain, craftsman"
4,aceptor,oris,,,Petrikovits 1981a,3.0,31.0,Finance,,"collector, gold quality checker"


In [3]:
# pick the word that will actually be declined (nominative singular) for one row
def update_vocab_nom_sg(row):
    """Return the row's ``Vocab_nom_sg``, falling back to its ``Term``.

    ``row`` is anything indexable by column name. When ``Vocab_nom_sg`` holds
    a float (pandas represents an empty CSV cell as the float NaN), the value
    of ``Term`` is returned instead; any other value is returned unchanged.
    """
    declinable = row["Vocab_nom_sg"]
    return row["Term"] if isinstance(declinable, float) else declinable

# fill the empty Vocab_nom_sg cells row by row, then preview the result
occupations_df["Vocab_nom_sg"] = occupations_df.apply(update_vocab_nom_sg, axis=1)
occupations_df.head(5)

Unnamed: 0,Term,gen_sg,Term2,Vocab_nom_sg,Source,HISCO_majorgroup,HISCO_minorgroup,Harris_Category,Subcategory,Translation_eng
0,abetarius,i,,abetarius,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
1,abietarius,i,,abietarius,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
2,acceptor,oris,,acceptor,Waltzing - Rome,3.0,31.0,Finance,,"collector, gold quality checker"
3,accomodator,oris,,accomodator,Petrikovits 1981a,9.0,99.0,Unclassified,,"uncertain, craftsman"
4,aceptor,oris,,aceptor,Petrikovits 1981a,3.0,31.0,Finance,,"collector, gold quality checker"


In [4]:
# one tuple per occupation: (Term, gen_sg, Vocab_nom_sg, Term2)
occup_tups = list(
    zip(
        occupations_df["Term"],
        occupations_df["gen_sg"],
        occupations_df["Vocab_nom_sg"],
        occupations_df["Term2"],
    )
)
occup_tups[:5]

[('abetarius', 'i', 'abetarius', nan),
 ('abietarius', 'i', 'abietarius', nan),
 ('acceptor', 'oris', 'acceptor', nan),
 ('accomodator', 'oris', 'accomodator', nan),
 ('aceptor', 'oris', 'aceptor', nan)]

In [5]:
# reorder from longest to shortest term (stable: ties keep their previous order)
occup_tups.sort(key=lambda tup: len(tup[0]), reverse=True)
occup_tups[:5]

[('negotiator artis vestiariae et lintiariae', 'oris', 'negotiator', nan),
 ('negotiator artis cretaria et vestiaria', 'oris', 'negotiator', nan),
 ('negotiator frumentariae et legumenaria', 'oris', 'negotiator', nan),
 ('negotiator suariae et pecuariae', 'oris', 'negotiator', nan),
 ('exactor auri argenti et aeris', 'oris', 'exactor', nan)]

In [6]:
# manually defined declensions
#
# Each entry maps a declension label to the list of endings that replace the
# nominative-singular ending of a word. By convention index 0 is the
# nominative singular ending and index 1 the genitive singular ending; these
# two positions are what `decline` uses to select a paradigm. The remaining
# positions are NOT in a uniform case order across entries, and not every
# entry lists every case.
# NOTE(review): the declension loop later in this file reads index 6 of a
# declined list as a plural form; that position only exists, and only holds
# the genitive plural, for some of the paradigms below - confirm before
# relying on it for new vocabulary.
decs = {
"first_f" : ["a", "ae", "am", "e", "as", "arum", "is"],
"first_gr_es" : ["es",  "ae", "en", "am", "e", "as", "arum", "is", "a"],

"sec_m_us" : ["us", "i", "o", "um", "o", "i", "orum", "is", "os"], # sg-nom, sg-gen, sg-dat, sg-acc, sg-abl, pl-nom, pl-gen, pl-dat/abl, pl-acc
"sec_n" : ["um", "i", "o", "a", "orum", "is"],
"sec_m_er" : ["er", "eri", "ero", "erum" , "eros", "erorum", "eris"],
"sec_m_r" : ["er", "ri", "ro", "rum" , "ro", "ri", "rorum", "ris", "ros"], # sg-nom, sg-gen, sg-dat, sg-acc, sg-abl, pl-nom, pl-gen, pl-dat/abl, pl-acc

# plural endings are "orum"/"is": the root is cut before "-os", so the
# "rorum"/"ris" endings of the "-er" paradigm would double the final consonant
"sec_gr_os" : ["os", "i", "o", "on" , "e", "orum", "is"],
"sec_gr_on" : ["on", "i", "o", "a", "orum", "is"],

"third_m_1" : ["es", "itis", "iti", "ite", "ites", "itibus", "itum"],
"third_m_2" : ["ix", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_3" : ["ex", "icis", "icem", "ici", "ice", "ices", "icibus", "icum"],
"third_m_4" : ["o", "onis", "onem", "oni", "one", "ones", "onibus", "onum"],
"third_m_5a" : ["or", "oris", "ori", "orem", "ore", "ores", "orum", "oribus", "ores"], # sg-nom, sg-gen, sg-dat, sg-acc, sg-abl, pl-nom, pl-gen, pl-dat/abl, pl-acc
"third_m_5b" : ["ur", "uris", "urem", "uri", "ure", "ures", "uribus", "urum"],
"third_m_6" : ["n", "nis", "nem", "ni", "ne", "nes", "nibus", "num"],
"third_m_7a" : ["ensis", "ensis", "ensem", "ensi", "ense", "enses", "ensibus", "ensum", "ensium"],
"third_m_7b" : ["esis", "esis", "esem", "esi", "ese", "eses", "esibus", "esum", "esium"],
"third_m_8" : ["er", "eris", "erem", "eri", "ere", "eres", "eribus", "erum", "erium"],
"third_m_9" : ["eps", "ipis", "ipem", "ipi", "ipe", "ipes", "ipibus", "ipum"],
"third_m_10" : ["ans", "antis", "antem", "ante", "antes", "antium", "antum", "antibus"],
"third_m_11" : ["er", "ineris", "ineri", "inere", "inera", "inerum", "ineribus"],
"third_m_12" : ["ut", "itis", "iti", "ite", "ita", "itibus", "itum"],
"third_m_13" : ["us", "oris", "ori", "ore", "ora", "orum", "oribus"],

"third_f_1" : ["as", "adis", "adi", "ade", "ades", "adum", "adium", "adibus"],
"third_f_2" : ["as", "atis", "ati", "atem", "ate", "ates", "atum", "atibus"],

"third_mix_1" : ["is", "is", "i", "em", "e", "es", "ium", "um", "ibus"],
"third_mix_2" : ["ns", "ntis", "nti", "ntem", "nte", "ntes", "ntium", "ntum", "ntibus"],

"fourth_us" : ["us", "us", "ui", "um", "u", "uum", "ibus"]
}

In [7]:
# re-apply the longest-to-shortest ordering by term length; this repeats the
# sort done right after occup_tups was built, and because sorted() is stable
# it leaves an already ordered list unchanged
occup_tups = sorted(occup_tups, key = lambda x: len(x[0]), reverse=True)
occup_tups[:5]

[('negotiator artis vestiariae et lintiariae', 'oris', 'negotiator', nan),
 ('negotiator artis cretaria et vestiaria', 'oris', 'negotiator', nan),
 ('negotiator frumentariae et legumenaria', 'oris', 'negotiator', nan),
 ('negotiator suariae et pecuariae', 'oris', 'negotiator', nan),
 ('exactor auri argenti et aeris', 'oris', 'exactor', nan)]

In [8]:
# manual check: look up the tuple of a single known term ("boarius");
# the [0] raises IndexError if no tuple in occup_tups has that term
term2_occup_tup = [tup for tup in occup_tups if tup[0] == "boarius"][0]
term2_occup_tup

('boarius', 'i', 'boarius', nan)

In [9]:
# all base terms (first tuple element), in the current occup_tups order
base_terms = [term for term, *_ in occup_tups]
base_terms[:10]

['negotiator artis vestiariae et lintiariae',
 'negotiator artis cretaria et vestiaria',
 'negotiator frumentariae et legumenaria',
 'negotiator suariae et pecuariae',
 'exactor auri argenti et aeris',
 'negotiator penoris et vinorum',
 'negotiator salsari leguminari',
 'negotiator artis macellariae',
 'negotiator artis purpurariae',
 'negotiator cellarum vinarium']

In [10]:
# review of problematic terms: Term2 values that are strings but do not
# occur among the base terms themselves
problematic = [
    term2
    for _, _, _, term2 in occup_tups
    if isinstance(term2, str) and term2 not in base_terms
]
problematic

['sagarius et pellicarius',
 'salsamentarius et vinarius',
 'soliarius baxiarius']

In [11]:
# custom function creating all morphological variants / declensions
def decline(nom_sg, ending, declensions=None):
    """Return the declined forms of a single word.

    A paradigm is selected when its genitive-singular ending (index 1)
    equals ``ending`` and ``nom_sg`` is one word ending in the paradigm's
    nominative-singular ending (index 0). Paradigms are tried in table order
    and the first match wins. The root is everything before the nominative
    ending; one form is produced per ending in the paradigm, in table order
    (duplicates are kept).

    Parameters:
        nom_sg: the word to decline, in the nominative singular.
        ending: the word's genitive-singular ending, e.g. ``"i"`` or ``"oris"``.
        declensions: optional mapping ``label -> list of endings`` to use
            instead of the module-level ``decs`` table.

    Returns:
        The list of declined forms, or ``[nom_sg]`` when no paradigm matches
        (a message is printed in that case).
    """
    if declensions is None:
        declensions = decs
    for forms in declensions.values():
        if forms[1] != ending:
            continue
        # \w+ requires a non-empty, single-word root in front of the ending
        hit = re.match(r"(\w+)" + re.escape(forms[0]) + "$", nom_sg)
        if hit:
            root = hit.group(1)
            return [root + end for end in forms]
    print("declining unsuccesful: " + nom_sg, ending)
    return [nom_sg]


In [12]:
# declension of occupational terms and grouping
#
# Builds `occups_declined`, a list of [base_form, all_morphs] pairs, from the
# (Term, gen_sg, Vocab_nom_sg, Term2) tuples in `occup_tups`:
#   * multi-word Term: only the head word (Vocab_nom_sg) is declined and each
#     form is substituted back into the phrase;
#   * single-word Term with a Term2: Term is combined with one or two
#     dependent words looked up in `occup_tups` ("a et b" or "a b");
#   * otherwise: Term is simply declined.
# NOTE(review): the [1] and [6] lookups below assume the dependent word was
# declined successfully and that its paradigm has at least seven forms
# (index 1 is the genitive singular; index 6 is the genitive plural for
# paradigms such as "sec_m_us") - other paradigms raise IndexError or pick a
# different case. Confirm when adding new Term2 vocabulary.

occups_declined = []
for occup_tup in occup_tups:
    base_form = occup_tup[0]
    if re.match(r"\w+\s\w+", base_form):
        all_morphs = [base_form.replace(str(occup_tup[2]), morph) for morph in decline(occup_tup[2], occup_tup[1])]
    elif isinstance(occup_tup[3], str):
        term_1_declinations = decline(base_form, occup_tup[1])
        if " " in occup_tup[3]:
            # two dependent words, either coordinated with "et" or juxtaposed
            if " et " in occup_tup[3]:
                twoterms = occup_tup[3].split(" et ")
                joiner = " et "
            else:
                twoterms = occup_tup[3].split()
                joiner = " "
            term2_occup_tup = [tup for tup in occup_tups if tup[0] == twoterms[0]][0]
            term2_declinations = decline(term2_occup_tup[0], term2_occup_tup[1])
            term3_occup_tup = [tup for tup in occup_tups if tup[0] == twoterms[1]][0]
            # each dependent word is declined with its own genitive ending
            term3_declinations = decline(term3_occup_tup[0], term3_occup_tup[1])
            # forms agreeing position by position with the head word
            all_morphs = [w1form + " " + w2form + joiner + w3form for w1form, w2form, w3form in zip(term_1_declinations, term2_declinations, term3_declinations)]
            # every head-word form followed by the dependents' fixed forms
            all_morphs += [w1form + " " + term2_declinations[1] + joiner + term3_declinations[1] for w1form in term_1_declinations]
            all_morphs += [w1form + " " + term2_declinations[6] + joiner + term3_declinations[6] for w1form in term_1_declinations]
            base_form = all_morphs[0]
            print(list(set(all_morphs)))
        else:
            # a single dependent word
            term2_occup_tup = [tup for tup in occup_tups if tup[0] == occup_tup[3]][0]
            term2_declinations = decline(term2_occup_tup[0], term2_occup_tup[1])
            all_morphs = [w1form + " " + w2form for w1form, w2form in zip(term_1_declinations, term2_declinations)]
            all_morphs += [w1form + " " + term2_declinations[1] for w1form in term_1_declinations]
            all_morphs += [w1form + " " + term2_declinations[6] for w1form in term_1_declinations]
            base_form = all_morphs[0]
    else:
        all_morphs = decline(base_form, occup_tup[1])
    # drop duplicate forms (the resulting order is arbitrary)
    all_morphs = list(set(all_morphs))
    occups_declined.append([base_form, all_morphs])

['negotiatoribus sagariis et pellicariis', 'negotiatorum sagarii et pellicarii', 'negotiatorem sagarium et pellicarium', 'negotiatore sagarii et pellicarii', 'negotiatoribus sagarii et pellicarii', 'negotiatoribus sagariorum et pellicariorum', 'negotiatoris sagarii et pellicarii', 'negotiatoris sagariorum et pellicariorum', 'negotiatori sagarii et pellicarii', 'negotiatori sagario et pellicario', 'negotiatore sagariorum et pellicariorum', 'negotiatores sagarios et pellicarios', 'negotiatori sagariorum et pellicariorum', 'negotiator sagarii et pellicarii', 'negotiatores sagariorum et pellicariorum', 'negotiator sagariorum et pellicariorum', 'negotiatores sagarii et pellicarii', 'negotiatorem sagariorum et pellicariorum', 'negotiatore sagario et pellicario', 'negotiatorum sagariorum et pellicariorum', 'negotiator sagarius et pellicarius', 'negotiatorem sagarii et pellicarii']
['negotiatores salsamentarii et vinarii', 'negotiator salsamentarius et vinarius', 'negotiator salsamentarii et v

In [13]:
# order the [base_form, forms] pairs from the longest base form to the shortest
occups_declined.sort(key=lambda pair: len(pair[0]), reverse=True)
[base for base, _ in occups_declined][:20]

['negotiator artis vestiariae et lintiariae',
 'negotiator artis cretaria et vestiaria',
 'negotiator frumentariae et legumenaria',
 'negotiator salsamentarius et vinarius',
 'negotiator sagarius et pellicarius',
 'negotiator suariae et pecuariae',
 'exactor auri argenti et aeris',
 'negotiator penoris et vinorum',
 'negotiator salsari leguminari',
 'negotiator artis macellariae',
 'negotiator artis purpurariae',
 'negotiator cellarum vinarium',
 'negotiator artis prossariae',
 'negotiator artis vestiariae',
 'negotiator artis ratiariae',
 'inclusor auri et gemmarum',
 'negotiator artis cretaria',
 'negotiator campi pecuarii',
 'negotiator manticularius',
 'negotiator margaritarius']

In [14]:
# how many occupations are declined
# (counts the [base_form, forms] pairs, one per input tuple)

len(occups_declined)

882

In [15]:
# base form -> list of its declined forms; a later pair with the same base
# form replaces an earlier one
occups_declined_dict = {base: forms for base, forms in occups_declined}


In [16]:
# saving the dictionary as indented JSON

with open("../../data/data_generation/occups_declined_dict.json", "w") as json_file:
    json_file.write(json.dumps(occups_declined_dict, indent=4))
    
