# 8. Extraction of the dataset

*AIM:* Extraction of the data in multiple formats for external use.


## Data:

### IN:

1) Inscriptions with occupational data, urban contexts, industry types `LIST_occupsorgs_industry.parquet`

### OUT: 

1) Simplified (reduced to the necessary attributes only) inscriptions with occupational data, urban contexts, industry types
   * `LIST_occupsorgs_industry_simple_20231206.parquet` with geometries
   * `LIST_occupsorgs_industry_simple_20231206.json` without geometries


# Requirements

In [17]:
import pandas as pd
pd.set_option('display.max_columns', 200)
import numpy as np
import fiona

import geopandas as gpd

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import json

from nltk import ngrams

import seaborn as sns
import geoplot as gplt

import warnings
warnings.filterwarnings('ignore')

import sddk
import tempun

In [18]:
# Notebook-wide plotting defaults: white seaborn-derived style, 300 dpi figures,
# 8 pt sans-serif text.
# The previously used style name is kept below for reference only.
#plt.style.use("seaborn-white")
plt.style.use("seaborn-v0_8-white")
plt.rcParams.update({
    'figure.dpi': 300,
    "font.family": "sans-serif",
    'font.size': 8,
})

def save_figure(fig, fig_name):
    """Save `fig` under the base name `fig_name`, locally and through the cloud session.

    Local copies are written with `fig.savefig` into "../../figures/" as
    .tiff, .svg and .png. Remote copies (.eps and .png) are written with
    `s.write_file` under the prefix `figpath`. Both `s` and `figpath` are
    notebook-level globals and must exist when the function is called.

    Five files are written in total: PNG is produced both locally and remotely.

    NOTE(review): `figpath` names the same shared folder the session is opened
    on, so the remote path may repeat that folder - confirm against how
    `s.write_file` resolves paths.
    """
    # Local copies, in the same order as before: TIFF, SVG, PNG.
    for suffix in (".tiff", ".svg", ".png"):
        fig.savefig("../../figures/" + fig_name + suffix)

    # Remote copies through the cloud session: EPS, then PNG.
    for suffix in (".eps", ".png"):
        s.write_file(figpath + fig_name + suffix, fig)

In [21]:
# Open a session on sciencedata.dk for the shared project folder.
# Running this cell prompts interactively for the ScienceData username and password.
# The third argument is presumably the account that owns the shared folder -
# replace it with your own details if you are not that user (TODO confirm).
s = sddk.cloudSession(
    "sciencedata.dk",
    "./PSNP_shared/lat_socdiv/",
    "648560@au.dk",
)
# Path prefix that save_figure() prepends to the files it writes through `s`.
figpath = "./PSNP_shared/lat_socdiv/"

Your ScienceData username (e.g. '123456@au.dk'):  648560@au.dk
Your ScienceData password:  ········


connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/./PSNP_shared/lat_socdiv//


# Import the dataset

In [23]:
# Read the full input dataset (with geometries) from the local parquet file.
LIST = gpd.read_parquet(
    "../../data/large_data/LIST_occupsorgs_industry.parquet"
)
# Preview the first three records to confirm the load.
LIST.head(n=3)

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,trismegistos_uri,pleiades_id,transcription,inscription,clean_text_conservative,clean_text_interpretive_sentence,clean_text_interpretive_word,clean_text_interpretive_word_EDCS,diplomatic_text,province,place,inscr_type,status_notation,inscr_process,status,partner_link,last_update,letter_size,type_of_inscription,work_status,year_of_find,present_location,text_edition,support_objecttype,support_material,support_decoration,keywords_term,people,type_of_inscription_clean,type_of_inscription_certainty,height_cm,width_cm,depth_cm,material_clean,type_of_monument_clean,type_of_monument_certainty,province_label_clean,province_label_certainty,country_clean,country_certainty,findspot_ancient_clean,findspot_ancient_certainty,modern_region_clean,modern_region_certainty,findspot_modern_clean,findspot_modern_certainty,findspot_clean,findspot_certainty,language_EDCS,raw_dating,not_after,not_before,Longitude,Latitude,is_geotemporal,geometry,is_within_RE,urban_context,urban_context_city,urban_context_pop_est,type_of_inscription_auto,type_of_inscription_auto_prob,occups,occups_N,organizations,organizations_N,is_western,large_city_ID,large_city_geo,large_city_dist,medium_city_ID,medium_city_geo,medium_city_dist,small_city_ID,small_city_geo,small_city_dist,urban_context_alt,occups_cats
0,445464,EDCS-24900077,HD056163,https://www.trismegistos.org/text/177366,570485,Q(uinto) Caecilio C(ai) f(ilio) Metelo / imper...,Q(uinto) Caecilio C(ai) f(ilio) Metel(l)o / im...,Q Caecilio C f Metelo imperatori Italici quei ...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metelo imperatori It...,Quinto Caecilio Cai filio Metello imperatori I...,Q CAECILIO C F METELO / IMPERATORI ITALICI / Q...,Achaia,Agia Triada / Merbaka / Midea,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-11-11,,honorific inscription,no image,,,\n Quinto Caecilio Cai filio Metelo imperatori...,,,1000,69,"[{'age: days': None, 'age: hours': None, 'age:...",honorific inscription,False,,,,,,False,Achaia,False,Greece,False,Midea,False,Pelopónissos,False,Midhéa,False,,False,,-68 to -68,-68.0,-68.0,22.8412,37.6498,True,POINT (22.841 37.650),True,rural,,,honorific inscription,1.0,[],0,[],0,False,13,"[22.719769, 37.631278]",10912.81,85,"[22.641069, 37.831664]",26806.86,36,"[23.132244, 37.626573]",25817.21,rural,[]
1,445465,EDCS-03700724,HD052964,https://www.trismegistos.org/text/121715,531064,Fortissimo et piis/simo Caesari d(omino) n(ost...,Fortissimo et Piis/simo Caesari d(omino) n(ost...,Fortissimo et piissimo Caesari d n Gal Val P F...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et piissimo Caesari domino nostro G...,Fortissimo et Piissimo Caesari domino nostro G...,FORTISSIMO ET PIIS / SIMO CAESARI D N / GAL VA...,Achaia,Agios Athanasios / Photike,tituli honorarii,"Augusti/Augustae, ordo equester, tria nomina",litterae erasae,Augusti/Augustae; litterae erasae; ordo eque...,http://db.edcs.eu/epigr/partner.php?s_language...,2014-09-16,3-5.3 cm,honorific inscription,checked with photo,,Fragma Kalama,\n Fortissimo et piissimo Caesari domino nostr...,57.0,,1000,69,"[{'age: days': None, 'age: hours': None, 'age:...",honorific inscription,False,99.0,67.0,67.0,,statue base,False,Epirus,False,Greece,False,Photike,False,Ípeiros,False,Paramythía,False,{Agios Athanasios},False,,309 to 313,313.0,309.0,20.7668,39.4512,True,POINT (20.767 39.451),True,rural,,,honorific inscription,1.0,[],0,[],0,False,65,"[20.733395, 39.026505]",47237.78,7,"[20.987, 39.158]",37687.28,31,"[20.787767, 39.546432]",10725.84,rural,[]
2,445466,EDCS-13800065,HD017714,https://www.trismegistos.org/text/177100,570049,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei / quei Aegei negotiantur / P(ublium) ...,Italicei quei Aegei negotiantur P Rutilium P f...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,Italicei quei Aegei negotiantur Publium Rutili...,ITALICEI / QVEI AEGEI NEGOTIANTVR / P RVTILIVM...,Achaia,Aigio / Egio / Aiyion / Aegeum,tituli honorarii,"officium/professio, ordo senatorius, tria nomi...",,officium/professio; ordo senatorius; tituli ...,http://db.edcs.eu/epigr/partner.php?s_language...,2011-03-29,3.5-3.7 cm,votive inscription,checked with photo,,,\n Italicei quei Aegei negotiantur Publium Rut...,257.0,,1000,372,"[{'age: days': None, 'age: hours': None, 'age:...",votive inscription,False,58.0,61.0,16.0,,tabula,False,Achaia,False,Greece,False,Aegeum,False,Dytikí Elláda,False,Aígion,False,,False,,-74 to -74,-74.0,-74.0,22.0845,38.2487,True,POINT (22.084 38.249),True,small,Aegium,1000.0,votive inscription,1.0,[],0,[],0,False,92,"[22.726044, 37.9799]",63682.52,24,"[22.044647, 37.885611]",40453.51,6,"[22.081952, 38.252707]",497.57,small,[]


# Simplify the dataset

In [24]:
# Sanity check that the dataset loaded: list every attribute (column) name.
print(list(LIST.columns))

['LIST-ID', 'EDCS-ID', 'EDH-ID', 'trismegistos_uri', 'pleiades_id', 'transcription', 'inscription', 'clean_text_conservative', 'clean_text_interpretive_sentence', 'clean_text_interpretive_word', 'clean_text_interpretive_word_EDCS', 'diplomatic_text', 'province', 'place', 'inscr_type', 'status_notation', 'inscr_process', 'status', 'partner_link', 'last_update', 'letter_size', 'type_of_inscription', 'work_status', 'year_of_find', 'present_location', 'text_edition', 'support_objecttype', 'support_material', 'support_decoration', 'keywords_term', 'people', 'type_of_inscription_clean', 'type_of_inscription_certainty', 'height_cm', 'width_cm', 'depth_cm', 'material_clean', 'type_of_monument_clean', 'type_of_monument_certainty', 'province_label_clean', 'province_label_certainty', 'country_clean', 'country_certainty', 'findspot_ancient_clean', 'findspot_ancient_certainty', 'modern_region_clean', 'modern_region_certainty', 'findspot_modern_clean', 'findspot_modern_certainty', 'findspot_clean', 

In [25]:
# Keep only the attributes needed for external use; column order is preserved.
LISTsimple = LIST[[
    # identifiers
    'LIST-ID', 'EDCS-ID', 'EDH-ID',
    # location (coordinates plus geometry)
    'Longitude', 'Latitude', 'geometry',
    # urban context
    'urban_context', 'urban_context_city', 'urban_context_pop_est', 'urban_context_alt',
    # occupational data
    'occups', 'occups_N', 'occups_cats',
    # text and inscription type
    'clean_text_interpretive_word', 'type_of_inscription_auto',
]]

In [26]:
LISTsimple

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,Longitude,Latitude,geometry,urban_context,urban_context_city,urban_context_pop_est,urban_context_alt,occups,occups_N,occups_cats,clean_text_interpretive_word,type_of_inscription_auto
0,445464,EDCS-24900077,HD056163,22.841200,37.649800,POINT (22.841 37.650),rural,,,rural,[],0,[],Quinto Caecilio Cai filio Metelo imperatori It...,honorific inscription
1,445465,EDCS-03700724,HD052964,20.766800,39.451200,POINT (20.767 39.451),rural,,,rural,[],0,[],Fortissimo et piissimo Caesari domino nostro G...,honorific inscription
2,445466,EDCS-13800065,HD017714,22.084500,38.248700,POINT (22.084 38.249),small,Aegium,1000.0,small,[],0,[],Italicei quei Aegei negotiantur Publium Rutili...,votive inscription
3,445467,EDCS-03300852,HD051000,22.417100,37.431900,POINT (22.417 37.432),large,Tegea,46362.0,large,[],0,[],Imperatori Caesari Marco Annio Floriano Pio Fe...,mile-/leaguestone
4,445468,EDCS-28500283,HD021396,22.420877,37.454501,POINT (22.421 37.455),large,Tegea,46362.0,large,[],0,[],Tiberius Claudius Caesar Augustus Germanicus p...,public legal inscription
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511968,445459,EDCS-10700071,,13.026040,46.461796,POINT (13.026 46.462),small,Iulium Carnicum,1000.0,small,[],0,[],ACLAI,identification inscription
511969,445460,EDCS-10700072,,13.026040,46.461796,POINT (13.026 46.462),small,Iulium Carnicum,1000.0,small,[],0,[],IN P,identification inscription
511970,445461,EDCS-10700073,,13.026040,46.461796,POINT (13.026 46.462),small,Iulium Carnicum,1000.0,small,[],0,[],RIC,identification inscription
511971,445462,EDCS-10700074,,13.026040,46.461796,POINT (13.026 46.462),small,Iulium Carnicum,1000.0,small,[],0,[],ius Cai s L,identification inscription


# Save the dataset in multiple formats

## JSON - without geometries

In [30]:
# The JSON export is published without geometries. It is derived from the
# simplified table instead of repeating the column list against LIST, so the
# JSON and parquet exports always share the same attributes in the same order.
# pd.DataFrame(...) ensures a plain pandas frame regardless of what the
# geopandas version in use returns from drop().
LISTsimplejson = pd.DataFrame(LISTsimple.drop(columns="geometry"))

In [31]:
# Upload the geometry-free table as JSON to the shared ScienceData folder.
# The two earlier attempts kept below passed `LISTsimple`, which still includes
# the geometry column; they were recorded as not working "because of the
# serialised data". Writing the plain pandas frame is what is used here.
#s.write_file("../data/large_data/LIST_occupsorgs_industry_simple_20231206.json", LISTsimple)
#s.write_file("LIST_occupsorgs_industry_simple_20231206.json", LISTsimple)
s.write_file(
    "LIST_occupsorgs_industry_simple_20231206.json",
    pd.DataFrame(LISTsimplejson),
)

Your <class 'pandas.core.frame.DataFrame'> object has been succesfully written as "https://sciencedata.dk/files/./PSNP_shared/lat_socdiv//LIST_occupsorgs_industry_simple_20231206.json"


In [32]:
# Write the same geometry-free table as JSON to the local data folder.
pd.DataFrame(LISTsimplejson).to_json(
    path_or_buf='../../data/large_data/LIST_occupsorgs_industry_simple_20231206.json'
)

## GEOJSON - does not work currently

In [68]:
# to geojson on SDDK - does not currently work
# FIXME(review): the reason for the failure is not recorded in this notebook -
# confirm the cause before re-enabling the line below.

#s.write_file("data_public/LIST_occupsorgs_industry_simple_20231206.geojson", LISTsimple)

In [33]:
# to geojson locally - does not currently work
# FIXME(review): cause not recorded here; presumably the list-valued columns
# (e.g. `occups`, `occups_cats`) cannot be written as GeoJSON fields - confirm.

#LISTsimple.to_file("../../data/large_data/LIST_occupsorgs_industry_simple_20231206.geojson", driver='GeoJSON')


## Parquet - with geometries

In [14]:
# Write the simplified table, geometries included, to a local parquet file.
LISTsimple.to_parquet(
    path="../../data/large_data/LIST_occupsorgs_industry_simple_20231206.parquet"
)

# Loading the data back to make sure it works

In [35]:
# Re-read the locally written JSON export and preview it to confirm it loads.
LISTjson = pd.read_json(
    "../../data/large_data/LIST_occupsorgs_industry_simple_20231206.json"
)
LISTjson.head(n=3)

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,Longitude,Latitude,urban_context,urban_context_city,urban_context_pop_est,urban_context_alt,occups,occups_N,occups_cats,clean_text_interpretive_word,type_of_inscription_auto
0,445464,EDCS-24900077,HD056163,22.8412,37.6498,rural,,,rural,[],0,[],Quinto Caecilio Cai filio Metelo imperatori It...,honorific inscription
1,445465,EDCS-03700724,HD052964,20.7668,39.4512,rural,,,rural,[],0,[],Fortissimo et piissimo Caesari domino nostro G...,honorific inscription
2,445466,EDCS-13800065,HD017714,22.0845,38.2487,small,Aegium,1000.0,small,[],0,[],Italicei quei Aegei negotiantur Publium Rutili...,votive inscription


In [67]:
# Read-back check for the GeoJSON export; disabled because the GeoJSON export
# itself is currently not written by this notebook.
#LISTgeojson = gpd.read_file("../../data/large_data/LIST_occupsorgs_industry_simple_20231206.geojson", driver="GeoJSON")
#LISTgeojson.head(3)

In [16]:
# Re-read the locally written parquet export (with geometries) and preview it.
LISTparquet = gpd.read_parquet(
    "../../data/large_data/LIST_occupsorgs_industry_simple_20231206.parquet"
)
LISTparquet.head(n=3)

Unnamed: 0,LIST-ID,EDCS-ID,EDH-ID,Longitude,Latitude,geometry,urban_context,urban_context_city,urban_context_pop_est,urban_context_alt,occups,occups_N,occups_cats,clean_text_interpretive_word,type_of_inscription_auto
0,445464,EDCS-24900077,HD056163,22.8412,37.6498,POINT (22.841 37.650),rural,,,rural,[],0,[],Quinto Caecilio Cai filio Metelo imperatori It...,honorific inscription
1,445465,EDCS-03700724,HD052964,20.7668,39.4512,POINT (20.767 39.451),rural,,,rural,[],0,[],Fortissimo et piissimo Caesari domino nostro G...,honorific inscription
2,445466,EDCS-13800065,HD017714,22.0845,38.2487,POINT (22.084 38.249),small,Aegium,1000.0,small,[],0,[],Italicei quei Aegei negotiantur Publium Rutili...,votive inscription
