# 2. Extracting occupational data from inscriptions

*AIM*: This script extracts the occupational data from inscriptions.

References:
1) `Harris EM. Workshop, Marketplace and Household: The Nature of Technical Specialization in Classical Athens and its Influence on Economy and Society. In: Cartledge P, Cohen EE, Foxhall L, editors. Money, Labour and Land: Approaches to the Economy of Ancient Greece. London—New York: Routledge; 2001. pp. 67–99.`

2) `van Leeuwen MHD, Maas I, Miles A. HISCO: Historical International Standard Classification of Occupations. 2022 2002 [cited 27 Jan 2022]. Available: https://historyofwork.iisg.nl/`


This script was originally published by `Kaše V, Heřmánková P, Sobotková A (2022) Division of labor, specialization and diversity in the ancient Roman cities: A quantitative approach to Latin epigraphy. PLoS ONE 17(6): e0269869. https://doi.org/10.1371/journal.pone.0269869` under a CC BY-SA 4.0 International License. 

https://github.com/sdam-au/social_diversity

The *Past Social Networks Project* adapted the script to fit the needs of the project research agenda.

# Requirements

In [1]:
import json
import numpy as np
import re
import pandas as pd
import geopandas as gpd
import nltk
pd.options.display.max_columns = 1000 # to see all columns
import warnings
warnings.filterwarnings('ignore')
import sddk

# Loading datasets

In [54]:
# local version
LIST = gpd.read_parquet("../../data/large_data/LIST_v1-0.parquet") # from https://zenodo.org/records/8431323

In [3]:
LIST.head(5)

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,trismegistos_uri,pleiades_id,transcription,inscription,clean_text_conservative,clean_text_interpretive_sentence,clean_text_interpretive_word,clean_text_interpretive_word_EDCS,diplomatic_text,province,place,inscr_type,status_notation,inscr_process,status,partner_link,last_update,letter_size,type_of_inscription,work_status,year_of_find,present_location,text_edition,support_objecttype,support_material,support_decoration,keywords_term,people,type_of_inscription_clean,type_of_inscription_certainty,height_cm,width_cm,depth_cm,material_clean,type_of_monument_clean,type_of_monument_certainty,province_label_clean,province_label_certainty,country_clean,country_certainty,findspot_ancient_clean,findspot_ancient_certainty,modern_region_clean,modern_region_certainty,findspot_modern_clean,findspot_modern_certainty,findspot_clean,findspot_certainty,language_EDCS,raw_dating,not_after,not_before,Longitude,Latitude,is_geotemporal,geometry,is_within_RE,urban_context,urban_context_city,urban_context_pop_est,type_of_inscription_auto,type_of_inscription_auto_prob
445463,445464,EDCS-24900077,HD056163,https://www.trismegistos.org/text/177366,570485.0,Q(uinto) Caecilio C(ai) f(ilio) Metelo / imper...,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Q Caecilio C f Metelo imperatori Italici quei ...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metello imperatori I...,Q CAECILIO C F METELO / IMPERATORI ITALICI / Q...,Achaia,Agia Triada / Merbaka / Midea,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-11-11,,honorific inscription,no image,,,\n Quinto Caecilio Cai filio Metelo imperatori...,,,1000,69.0,"[{'age: days': None, 'age: hours': None, 'age:...",honorific inscription,False,,,,,,False,Achaia,False,Greece,False,Midea,False,Pelopónissos,False,Midhéa,False,,False,,-68 to -68,-68.0,-68.0,22.8412,37.6498,True,POINT (22.84120 37.64980),True,rural,,,honorific inscription,1.0
445464,445465,EDCS-03700724,HD052964,https://www.trismegistos.org/text/121715,531064.0,Fortissimo et piis/simo Caesari d(omino) n(ost...,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et piissimo Caesari d n Gal Val P F...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et Piissimo Caesari domino nostro G...,FORTISSIMO ET PIIS / SIMO CAESARI D N / GAL VA...,Achaia,Agios Athanasios / Photike,tituli honorarii,"Augusti/Augustae, ordo equester, tria nomina",litterae erasae,Augusti/Augustae; litterae erasae; ordo eque...,http://db.edcs.eu/epigr/partner.php?s_language...,2014-09-16,3-5.3 cm,honorific inscription,checked with photo,,Fragma Kalama,\n Fortissimo et piissimo Caesari domino nostr...,57.0,,1000,69.0,"[{'age: days': None, 'age: hours': None, 'age:...",honorific inscription,False,99.0,67.0,67.0,,statue base,False,Epirus,False,Greece,False,Photike,False,Ípeiros,False,Paramythía,False,{Agios Athanasios},False,,309 to 313,313.0,309.0,20.7668,39.4512,True,POINT (20.76680 39.45120),True,rural,,,honorific inscription,1.0
445465,445466,EDCS-13800065,HD017714,https://www.trismegistos.org/text/177100,570049.0,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei quei Aegei negotiantur P Rutilium P f...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,ITALICEI / QVEI AEGEI NEGOTIANTVR / P RVTILIVM...,Achaia,Aigio / Egio / Aiyion / Aegeum,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-03-29,3.5-3.7 cm,votive inscription,checked with photo,,,\n Italicei quei Aegei negotiantur Publium Rut...,257.0,,1000,372.0,"[{'age: days': None, 'age: hours': None, 'age:...",votive inscription,False,58.0,61.0,16.0,,tabula,False,Achaia,False,Greece,False,Aegeum,False,Dytikí Elláda,False,Aígion,False,,False,,-74 to -74,-74.0,-74.0,22.0845,38.2487,True,POINT (22.08450 38.24870),True,small,Aegium,1000.0,votive inscription,1.0
445466,445467,EDCS-03300852,HD051000,https://www.trismegistos.org/text/177273,240855.0,[Imp(eratori) Caes(ari) M(arco)] / An[nio] Flo...,Imp(eratori) / Floriano / P(io) F(elici) Aug(u...,An Floriano P F Aug p p m p III Imp Caes M Aur...,Imperatori Caesari Marco Annio Floriano Pio Fe...,Imperatori Caesari Marco Annio Floriano Pio Fe...,Imperatori Floriano Pio Felici Augusto patri p...,[ ] / AN[ ] FLORIANO / P F AVG / P P / M P III...,Achaia,Alea / Tegea,miliaria,"Augusti/Augustae, viri",,Augusti/Augustae; miliaria; viri,http://db.edcs.eu/epigr/partner.php?s_language...,2011-05-24,3.3-6 cm,mile-/leaguestone,checked with photo,,"Tegea, Mus.",\n Imperatori Caesari Marco Annio Floriano Pio...,89.0,,1000,,"[{'age: days': None, 'age: hours': None, 'age:...",mile-/leaguestone,False,44.0,24.0,,,mile-/leaguestone,False,Achaia,False,Greece,False,Tegea,False,Pelopónissos,False,Alea,False,"Stringu, bei",False,,a: 276 to 276; b: 276 to 282,282.0,276.0,22.4171,37.4319,True,POINT (22.41710 37.43190),True,large,Tegea,46362.0,mile-/leaguestone,1.0
445467,445468,EDCS-28500283,HD021396,https://www.trismegistos.org/text/177131,,T[i(berius)] Claudius Caesar Aug(ustus) / G[er...,T[(iberius)] Claudius Caesar Aug(ustus) / G[er...,T Claudius Caesar Aug Ganicus pontif max trib ...,Tiberius Claudius Caesar Augustus Germanicus p...,Tiberius Claudius Caesar Augustus Germanicus p...,Tiberius Claudius Caesar Augustus Germanicus p...,T[ ] CLAVDIVS CAESAR AVG / G[ ]ANICVS PONTIF M...,Achaia,Alea / Tegea,leges,"Augusti/Augustae, viri",,Augusti/Augustae; leges; viri,http://db.edcs.eu/epigr/partner.php?s_language...,2011-05-10,,public legal inscription,checked with photo,,,\n Tiberius Claudius Caesar Augustus Germanicu...,257.0,,1000,5.0,"[{'age: days': None, 'age: hours': None, 'age:...",public legal inscription,False,160.0,58.0,17.0,,tabula,False,Achaia,False,Greece,False,Tegea,False,Pelopónissos,False,Alea,False,,False,,49 to 50,50.0,49.0,22.420877,37.454501,True,POINT (22.42088 37.45450),True,large,Tegea,46362.0,public legal inscription,1.0


In [4]:
# list of all columns

print(LIST.columns)

Index(['LIST-ID', 'EDCS-ID', 'EDH-ID', 'trismegistos_uri', 'pleiades_id',
       'transcription', 'inscription', 'clean_text_conservative',
       'clean_text_interpretive_sentence', 'clean_text_interpretive_word',
       'clean_text_interpretive_word_EDCS', 'diplomatic_text', 'province',
       'place', 'inscr_type', 'status_notation', 'inscr_process', 'status',
       'partner_link', 'last_update', 'letter_size', 'type_of_inscription',
       'work_status', 'year_of_find', 'present_location', 'text_edition',
       'support_objecttype', 'support_material', 'support_decoration',
       'keywords_term', 'people', 'type_of_inscription_clean',
       'type_of_inscription_certainty', 'height_cm', 'width_cm', 'depth_cm',
       'material_clean', 'type_of_monument_clean',
       'type_of_monument_certainty', 'province_label_clean',
       'province_label_certainty', 'country_clean', 'country_certainty',
       'findspot_ancient_clean', 'findspot_ancient_certainty',
       'modern_region_cle

#  Custom function to extract occupations

In [5]:
# Occupation dictionary used by extract_occup: each key maps to the list of
# declined forms that are searched for in the inscription texts.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("../../data/occups_declined_dict.json", encoding="utf-8") as fh:
    occups_declined_dict = json.load(fh)

In [6]:
# older functional version
def extract_occup(inscription_text, declined_dict=None):
    """Return the occupations mentioned in one inscription text.

    Each declined form is searched as a whole word (or whole phrase), i.e.
    it must not be directly preceded or followed by a word character.
    Forms are processed in dictionary order and every matched form is
    removed from the working text, so a shorter term contained in an
    already matched longer phrase is not counted a second time.

    Parameters
    ----------
    inscription_text : object
        Text of the inscription. Anything that is not a ``str`` (for
        example a missing value) yields an empty result.
    declined_dict : dict, optional
        Mapping ``occupation -> iterable of declined forms``. Defaults to
        the module-level ``occups_declined_dict``.

    Returns
    -------
    list of str
        The occupation key repeated once per occurrence found, in the
        order in which the forms were processed.
    """
    if declined_dict is None:
        declined_dict = occups_declined_dict
    if not isinstance(inscription_text, str):  # e.g. NaN / None for a missing text
        return []

    occups_found = []
    for occup, occup_morphs in declined_dict.items():
        for occup_morph in occup_morphs:
            # Cheap substring test first; the regex is only run on candidates.
            if not occup_morph or occup_morph not in inscription_text:
                continue
            # Lookarounds (instead of consuming "(\W|^)" / "(\W|$)" groups) let
            # two occurrences separated by a single space both match, and
            # re.escape keeps any regex metacharacter in the form literal.
            pattern = r"(?<!\w)" + re.escape(occup_morph) + r"(?!\w)"
            # subn removes the matches and reports how many there were; the
            # surrounding separators stay, so neighbouring words never merge.
            inscription_text, occup_morph_N = re.subn(pattern, "", inscription_text)
            occups_found.extend([occup] * occup_morph_N)
    return occups_found

In [7]:
extract_occup("curatores, procuratores et negotiatores curatori navium et curatori")

['curator navium', 'negotiator', 'curator', 'curator']

# Occupational data extraction

In [8]:
# check that our occupations are properly arranged (from the longest...)
list(occups_declined_dict.keys())[:20]

['negotiator artis vestiariae et lintiariae',
 'negotiator artis cretaria et vestiaria',
 'negotiator frumentariae et legumenaria',
 'negotiator salsamentarius et vinarius',
 'negotiator sagarius et pellicarius',
 'negotiator suariae et pecuariae',
 'exactor auri argenti et aeris',
 'negotiator penoris et vinorum',
 'negotiator salsari leguminari',
 'negotiator artis macellariae',
 'negotiator artis purpurariae',
 'negotiator cellarum vinarium',
 'negotiator artis prossariae',
 'negotiator artis vestiariae',
 'negotiator artis ratiariae',
 'inclusor auri et gemmarum',
 'negotiator artis cretaria',
 'negotiator campi pecuarii',
 'negotiator manticularius',
 'negotiator margaritarius']

In [55]:
%%time

# extraction process; takes a couple of minutes
LIST["occups"] = LIST["clean_text_interpretive_word"].apply(extract_occup)

CPU times: user 3min 25s, sys: 7.45 ms, total: 3min 25s
Wall time: 3min 25s


In [56]:
LIST["occups_N"] = LIST["occups"].apply(len)

In [11]:
# how many times occupation is mentioned
LIST["occups_N"].sum() # LIRE dataset had 5222 instances

10570

In [12]:
# how many inscriptions mention at least one occupation
len(LIST[LIST["occups_N"]>0])

8475

In [13]:
# overview of the most common occupations
LIST_occups_list = [el for sublist in LIST["occups"].tolist() for el in sublist]
occupations_counts = pd.DataFrame(nltk.FreqDist(LIST_occups_list).most_common(), columns=["occupation", "count"])
occupations_counts.head(10)

Unnamed: 0,occupation,count
0,curator,1934
1,faber,958
2,aerarius,453
3,medicus,448
4,scriba,421
5,sagittarius,347
6,frumentarius,213
7,centonarius,202
8,negotiator,179
9,argentarius,176


In [14]:
# how many unique occupations there are
len(occupations_counts)

514

In [15]:
#occupations with their counts
occupations_counts

Unnamed: 0,occupation,count
0,curator,1934
1,faber,958
2,aerarius,453
3,medicus,448
4,scriba,421
...,...,...
509,funerarius,1
510,negotiator margaritarius,1
511,farmacopola,1
512,sarcitor,1


In [16]:
# how many occupations occur only once
len(occupations_counts[occupations_counts["count"]==1])

169

In [17]:
# overview of occupations with their counts
LIST_occups_list = [el for sublist in LIST["occups"].tolist() for el in sublist]
occupations_counts = pd.DataFrame(nltk.FreqDist(LIST_occups_list).most_common(), columns=["occupation", "count"])
occupations_counts.head(10)

Unnamed: 0,occupation,count
0,curator,1934
1,faber,958
2,aerarius,453
3,medicus,448
4,scriba,421
5,sagittarius,347
6,frumentarius,213
7,centonarius,202
8,negotiator,179
9,argentarius,176


# Load the occupation list with all metadata


In [18]:
# load the occupation list with all metadata
occupations_df = pd.read_csv("../../data/occupations_list_hisco.csv")
occupations_df.head(5)

Unnamed: 0,Term,gen_sg,Term2,Vocab_nom_sg,Source,HISCO_majorgroup,HISCO_minorgroup,Harris_Category,Subcategory,Translation_eng
0,abetarius,i,,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
1,abietarius,i,,,Petrikovits 1981a,8.0,81.0,Building,Wood worker,"a joiner, wood worker"
2,acceptor,oris,,acceptor,Waltzing - Rome,3.0,31.0,Finance,,"collector, gold quality checker"
3,accomodator,oris,,,Petrikovits 1981a,9.0,99.0,Unclassified,,"uncertain, craftsman"
4,aceptor,oris,,,Petrikovits 1981a,3.0,31.0,Finance,,"collector, gold quality checker"


In [19]:
# categorise occupations according to their HISCO group

def get_int(x):
    """Return ``x`` as an integer-valued string, or "" if it cannot be converted.

    The value is truncated with ``int`` (so ``8.0`` becomes ``"8"``).
    Values that ``int`` rejects, such as NaN, ``None``, non-numeric text or
    infinity, give an empty string instead of raising.
    """
    try:
        return str(int(x))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no value"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return ""
occupations_df["HISCO_majorgroup"] = occupations_df["HISCO_majorgroup"].apply(get_int)
occupations_df["HISCO_minorgroup"] = occupations_df["HISCO_minorgroup"].apply(get_int)

In [20]:
occupations_df.head(5)

Unnamed: 0,Term,gen_sg,Term2,Vocab_nom_sg,Source,HISCO_majorgroup,HISCO_minorgroup,Harris_Category,Subcategory,Translation_eng
0,abetarius,i,,,Petrikovits 1981a,8,81,Building,Wood worker,"a joiner, wood worker"
1,abietarius,i,,,Petrikovits 1981a,8,81,Building,Wood worker,"a joiner, wood worker"
2,acceptor,oris,,acceptor,Waltzing - Rome,3,31,Finance,,"collector, gold quality checker"
3,accomodator,oris,,,Petrikovits 1981a,9,99,Unclassified,,"uncertain, craftsman"
4,aceptor,oris,,,Petrikovits 1981a,3,31,Finance,,"collector, gold quality checker"


In [57]:
def term1_plus_term2(row):
    """Join a row's "Term" and "Term2" into a single space-separated label.

    "Term2" is appended only when it is a string; any other value (such as
    a NaN for an empty cell) leaves "Term" unchanged.
    """
    first = row["Term"]
    second = row["Term2"]
    if not isinstance(second, str):
        return first
    return first + " " + second

occupations_df["Term"] = occupations_df.apply(lambda row: term1_plus_term2(row), axis=1)

In [22]:
# Lookup table: occupation term -> dict of its metadata fields listed in `keys`.
keys = ["Harris_Category", "Source", "HISCO_majorgroup", "Subcategory", "HISCO_minorgroup", "Translation_eng"]
occupation_dict = {}
# One pass over the rows instead of a positional .iloc lookup per key.
for _, row in occupations_df.iterrows():
    # If a term occurs in several rows, the last row wins.
    occupation_dict[row["Term"]] = {key: row[key] for key in keys}

In [23]:
occupation_dict

{'abetarius': {'Harris_Category': 'Building',
  'Source': 'Petrikovits 1981a',
  'HISCO_majorgroup': '8',
  'Subcategory': 'Wood worker',
  'HISCO_minorgroup': '81',
  'Translation_eng': 'a joiner, wood worker'},
 'abietarius': {'Harris_Category': 'Building',
  'Source': 'Petrikovits 1981a',
  'HISCO_majorgroup': '8',
  'Subcategory': 'Wood worker',
  'HISCO_minorgroup': '81',
  'Translation_eng': 'a joiner, wood worker'},
 'acceptor': {'Harris_Category': 'Finance',
  'Source': 'Waltzing - Rome',
  'HISCO_majorgroup': '3',
  'Subcategory': nan,
  'HISCO_minorgroup': '31',
  'Translation_eng': 'collector, gold quality checker'},
 'accomodator': {'Harris_Category': 'Unclassified',
  'Source': 'Petrikovits 1981a',
  'HISCO_majorgroup': '9',
  'Subcategory': nan,
  'HISCO_minorgroup': '99',
  'Translation_eng': 'uncertain, craftsman'},
 'aceptor': {'Harris_Category': 'Finance',
  'Source': 'Petrikovits 1981a',
  'HISCO_majorgroup': '3',
  'Subcategory': nan,
  'HISCO_minorgroup': '31',
  '

In [24]:
for key in keys:
    occupations_counts[key] = occupations_counts["occupation"].apply(lambda x: occupation_dict[x][key])
occupations_counts.head(10)

Unnamed: 0,occupation,count,Harris_Category,Source,HISCO_majorgroup,Subcategory,HISCO_minorgroup,Translation_eng
0,curator,1934,Managerial,Waltzing,2,,21,"he who takes charge, a manager, overseer, supe..."
1,faber,958,Unclassified,Waltzing - Rome,9,,99,"a worker in wood, stone, metal, etc., a forger..."
2,aerarius,453,Metal-Working,Waltzing - Rome,8,,83,metal worker
3,medicus,448,Miscellaneous Services,Waltzing - Rome,0,,6,a surgeon
4,scriba,421,Education,Waltzing - Rome,3,,30,"a public writer, official scribe, professional..."
5,sagittarius,347,Metal-Working,Petrikovits 1981a,8,,83,"arrow-makers, arrow-smiths"
6,frumentarius,213,Retail,EDH/EDCS,4,,43,trader with corn
7,centonarius,202,Clothing,Waltzing - Rome,7,,79,"a maker of patchwork, a dealer in rags"
8,negotiator,179,Retail,Waltzing - Rome,4,,43,"one who does business by wholesale, a wholesal..."
9,argentarius,176,Metal-Working,Waltzing - Rome,8,,88,"jewellery maker, banker"


In [26]:
(occupations_counts["count"]==1).sum()

169

In [27]:
# HISCO major groups definition

HISCO_majorgroup_dict = {
    "0" : "Professional, technical and related workers",
    "1" : "Professional, technical and related workers",
    "2" : "Administrative and managerial workers",
    "3" : "Clerical and related workers",
    "4" : "Sales workers",
    "5" : "Service workers",
    "6" : "Agricultural, animal husbandry and forestry workers, fishermen and hunters",
    "7" : "Production and related workers, transport equipment operators and labourers",
    "8" : "Production and related workers, transport equipment operators and labourers",
    "9" : "Production and related workers, transport equipment operators and labourers",
    "" : ""
}

In [29]:
occupations_counts["HISCO_majorgroup_descr"] =  occupations_counts["HISCO_majorgroup"].apply(lambda x: HISCO_majorgroup_dict[x])
occupations_counts.head(5)

Unnamed: 0,occupation,count,Harris_Category,Source,HISCO_majorgroup,Subcategory,HISCO_minorgroup,Translation_eng,HISCO_majorgroup_descr
0,curator,1934,Managerial,Waltzing,2,,21,"he who takes charge, a manager, overseer, supe...",Administrative and managerial workers
1,faber,958,Unclassified,Waltzing - Rome,9,,99,"a worker in wood, stone, metal, etc., a forger...","Production and related workers, transport equi..."
2,aerarius,453,Metal-Working,Waltzing - Rome,8,,83,metal worker,"Production and related workers, transport equi..."
3,medicus,448,Miscellaneous Services,Waltzing - Rome,0,,6,a surgeon,"Professional, technical and related workers"
4,scriba,421,Education,Waltzing - Rome,3,,30,"a public writer, official scribe, professional...",Clerical and related workers


In [30]:
# save a list of occupations with their counts
occupations_counts.to_csv("../../data/occupations_counts.csv")

# Exploring individual cases: Faber (worker) and Metal-Working category

In [61]:
# how many times the term faber (worker) occurs = generic term for manual and relatively unskilled labour
occupations_counts[occupations_counts["occupation"]=="faber"]

Unnamed: 0,occupation,count,Harris_Category,Source,HISCO_majorgroup,Subcategory,HISCO_minorgroup,Translation_eng,HISCO_majorgroup_descr
1,faber,958,Unclassified,Waltzing - Rome,9,,99,"a worker in wood, stone, metal, etc., a forger...","Production and related workers, transport equi..."


In [62]:
# how many occurrences of occupations belong to the Metal-Working category based on Harris 2001
occupations_counts[occupations_counts["Harris_Category"]=="Metal-Working"]["count"].sum()


1221

In [33]:
# NOTE(review): both figures are hard-coded. The denominator 1211 differs from
# the Metal-Working total of 1221 computed in the previous cell, and the origin
# of 659 is not shown in this notebook (the faber count above is 958).
# TODO confirm the intended values or compute them from occupations_counts.
659 / 1211

0.5441783649876135

## Occupations - basic summary

In [34]:
print("LIST - number of occupation occurances: " + str(LIST["occups_N"].sum()))
print("LIST - number of inscriptions with at least one occupation mentioned: " + str(len(LIST[LIST["occups_N"] > 0])))

LIST - number of occupation occurances: 10570
LIST - number of inscriptions with at least one occupation mentioned: 8475


In [35]:
# How many occupations come from the EDH data
LIST[LIST["EDH-ID"].notnull()]["occups_N"].sum() # LIRE had 1272

3139

In [63]:
# How many occupations come from the EDCS data
LIST[LIST["EDH-ID"].isnull()]["occups_N"].sum() # LIRE had 2568

7431

In [64]:
# grouping of occupations by their HISCO major group name
# Aggregate only the numeric "count" column: a bare .sum() would also try to
# aggregate the text columns on pandas versions that no longer drop them.
occupations_counts.groupby("HISCO_majorgroup_descr")[["count"]].sum()

Unnamed: 0_level_0,count
HISCO_majorgroup_descr,Unnamed: 1_level_1
,10
Administrative and managerial workers,2153
"Agricultural, animal husbandry and forestry workers, fishermen and hunters",260
Clerical and related workers,705
"Production and related workers, transport equipment operators and labourers",4121
"Professional, technical and related workers",1675
Sales workers,916
Service workers,730


In [65]:
# grouping of occupations by their HISCO major group number
# Aggregate only the numeric "count" column: a bare .sum() would also try to
# aggregate the text columns on pandas versions that no longer drop them.
occupations_counts.groupby("HISCO_majorgroup")[["count"]].sum()

Unnamed: 0_level_0,count
HISCO_majorgroup,Unnamed: 1_level_1
,10
0.0,1026
1.0,649
2.0,2153
3.0,705
4.0,916
5.0,730
6.0,260
7.0,691
8.0,1667


# Organizational terms - extraction

In [66]:
# load declined organizational terms: each key maps to the list of declined
# forms that extract_orgs searches for; the context manager closes the file
with open("../../data/organizations_declined_dict.json", "r", encoding="utf-8") as fh:
    orgs_declined_dict = json.load(fh)

In [41]:
# check that the terms are properly arranged (from the longest...)
list(orgs_declined_dict.keys())[:15]

['collegatarius',
 'collegiarius',
 'collegiatus',
 'corporatus',
 'sodalicium',
 'collegium',
 'collegius',
 'concilium',
 'conlegium',
 'sodalitas',
 'colegium',
 'sociatio',
 'societas',
 'collega',
 'corpus']

In [42]:
# custom function extracting the terms

def extract_orgs(inscription_text, declined_dict=None):
    """Return the organizational terms mentioned in one inscription text.

    Each declined form is searched as a whole word (or whole phrase), i.e.
    it must not be directly preceded or followed by a word character.
    Forms are processed in dictionary order and every matched form is
    removed from the working text, so a shorter term contained in an
    already matched longer one is not counted a second time.

    Parameters
    ----------
    inscription_text : object
        Text of the inscription. Anything that is not a ``str`` (for
        example a missing value) yields an empty result.
    declined_dict : dict, optional
        Mapping ``term -> iterable of declined forms``. Defaults to the
        module-level ``orgs_declined_dict``.

    Returns
    -------
    list of str
        The term key repeated once per occurrence found, in the order in
        which the forms were processed.
    """
    if declined_dict is None:
        declined_dict = orgs_declined_dict
    if not isinstance(inscription_text, str):  # e.g. NaN / None for a missing text
        return []

    orgs_found = []
    for org, org_morphs in declined_dict.items():
        for orgs_morph in org_morphs:
            # Cheap substring test first; the regex is only run on candidates.
            if not orgs_morph or orgs_morph not in inscription_text:
                continue
            # Lookarounds (instead of consuming "(\W|^)" / "(\W|$)" groups) let
            # two occurrences separated by a single space both match, and
            # re.escape keeps any regex metacharacter in the form literal.
            pattern = r"(?<!\w)" + re.escape(orgs_morph) + r"(?!\w)"
            # subn removes the matches and reports how many there were; the
            # surrounding separators stay, so neighbouring words never merge.
            inscription_text, orgs_morph_N = re.subn(pattern, "", inscription_text)
            orgs_found.extend([org] * orgs_morph_N)
    return orgs_found

In [43]:
%%time
LIST["organizations"] = LIST["clean_text_interpretive_word"].apply(extract_orgs)

CPU times: user 4.27 s, sys: 4.01 ms, total: 4.27 s
Wall time: 4.29 s


In [44]:
LIST["organizations_N"] = LIST["organizations"].apply(len)

In [45]:
# how many organizational terms there are in the corpus

LIST_organizations_list = [el for sublist in LIST["organizations"].tolist() for el in sublist]
print(len(LIST_organizations_list)) 
print(nltk.FreqDist(LIST_organizations_list).most_common(30))

3240
[('collegium', 1642), ('corpus', 1080), ('collega', 248), ('corporatus', 95), ('collegius', 37), ('concilium', 35), ('societas', 27), ('sodalicium', 26), ('collegiatus', 25), ('conlegium', 21), ('sodalitas', 2), ('colegium', 1), ('collegiarius', 1)]


In [46]:
# overview of organizational terms and their counts

organizations_counts = pd.DataFrame(nltk.FreqDist(LIST_organizations_list).most_common(), columns=["term", "count"])
organizations_counts

Unnamed: 0,term,count
0,collegium,1642
1,corpus,1080
2,collega,248
3,corporatus,95
4,collegius,37
5,concilium,35
6,societas,27
7,sodalicium,26
8,collegiatus,25
9,conlegium,21


## Organizations - basic summary

In [47]:
# How many organizational terms come from the EDH data
LIST[LIST["EDH-ID"].notnull()]["organizations_N"].sum()

914

In [48]:
# How many organizational terms come from the EDCS data
LIST[LIST["EDH-ID"].isnull()]["organizations_N"].sum()

2326

In [50]:
# saving organizational terms and their counts
organizations_counts.to_csv("../../data/organizations_counts.csv")

In [68]:
#overview
print("LIST - number of organization occurances: " + str(LIST["organizations_N"].sum()))
print("LIST - number of inscriptions with at least one organization mentioned: " + str(len(LIST[LIST["organizations_N"] > 0])))

KeyError: 'organizations_N'

#   Saving locally

In [53]:
LIST.to_parquet("../../data/large_data/LISTg_occupsorgs.parquet")