In [2]:
import pandas as pd
import numpy as np

Generate a dataframe keyed by geographic name (GN). For each GN, store the list of P numbers (texts) in which it appears, the list of neighboring GNs within each of those texts, and the list of dates looked up by P number.

In [3]:
# The nine input CSVs, numbered 1 through 9.
files = [f'filtered_{n}_with_neighbors_normalized.csv' for n in range(1, 10)]

In [4]:
# Walk every row of every input file and collect, per text, the lemmas that are
# tagged as geographic names (lemmas whose last two characters are 'GN').
geo_names = {}       # P number -> GN lemmas found in that text, in order of appearance
p_number = ''        # P number of the text currently being read; '' until the first row
places = []          # GN lemmas gathered so far for the current text
geo_names_list = []  # every distinct GN lemma, in first-seen order

for file in files:
    f = pd.read_csv(file)
    # Iterate the two needed columns directly instead of iterrows(): same values,
    # without building a Series object for every row.
    for id_word, w in zip(f['id_word'], f['lemma']):
        # The first 7 characters of id_word are used as the P number; when it
        # changes, store what was gathered for the previous text and start afresh.
        # NOTE: a P number that reappears after other texts overwrites its earlier entry.
        if id_word[:7] != p_number:
            geo_names[p_number] = places
            p_number = id_word[:7]
            places = []
        # An empty lemma cell is read by pandas as NaN (a float), which cannot be sliced.
        if isinstance(w, str) and w[-2:] == 'GN':
            places.append(w)
            if w not in geo_names_list:
                geo_names_list.append(w)
# Store the final text. The list is stored as-is (not wrapped in another list) so
# this entry has the same shape as every other one and its names are not lost downstream.
geo_names[p_number] = places
# Remove the placeholder entry keyed by the initial empty P number.
del geo_names['']

In [5]:
# Invert the per-text listing into per-name lookups.
geo_dict = {}       # GN -> P numbers it occurs in (one entry per occurrence)
geo_neighbors = {}  # GN -> {P number -> the other GNs listed for that text}

for p_num, text_places in geo_names.items():
    for place in text_places:
        # Only string entries are geographic names; anything else is skipped.
        if not isinstance(place, str):
            continue
        geo_dict.setdefault(place, []).append(p_num)
        # Neighbors are all entries of the same text that differ from this name.
        others = [other for other in text_places if other != place]
        geo_neighbors.setdefault(place, {})[p_num] = others

In [6]:
# One row per geographic name: the texts it occurs in and, per text, its neighbors.
d = {
    'geo_name': list(geo_dict),
    'p_nums': [geo_dict[name] for name in geo_dict],
}
geo_df = pd.DataFrame(d)
geo_df['neighbors'] = geo_df['geo_name'].map(geo_neighbors)
geo_df.head()

Unnamed: 0,geo_name,p_nums,neighbors
0,tum-ma-al{ki}[]GN,"[P124149, P103274, P107843, P134976, P131918, ...","{'P124149': ['eridu{ki}[]GN'], 'P103274': ['ša..."
1,eridu{ki}[]GN,"[P124149, P107420, P212153, P126352, P210370, ...","{'P124149': ['tum-ma-al{ki}[]GN'], 'P107420': ..."
2,unu{ki}[]GN,"[P405489, P104590, P103800, P107843, P123981, ...","{'P405489': [], 'P104590': ['nibru{ki}-a[]GN',..."
3,zimbir{ki}[]GN,"[P332036, P131582, P111934, P117626, P134703, ...","{'P332036': ['uri₅{ki}-ma[]GN', 'mu-ri-iq-ti-i..."
4,uri₅{ki}-ma[]GN,"[P332036, P124307, P103657, P106357, P104590, ...","{'P332036': ['zimbir{ki}[]GN', 'mu-ri-iq-ti-id..."


In [7]:
# Inspect the neighbors mapping of the first geographic name (row label 0).
geo_df.loc[0, 'neighbors']

{'P124149': ['eridu{ki}[]GN'],
 'P103274': ['ša-aš-ru{ki}[]GN'],
 'P107843': ['unu{ki}[]GN'],
 'P134976': ['ki-maš{ki}[]GN'],
 'P131918': [],
 'P320910': [],
 'P123770': [],
 'P134033': [],
 'P390787': ['ha-ar-ši{ki}[]GN'],
 'P113794': ['ur-bi₂-lum{ki}[]GN', 'ša-aš-ru{ki}[]GN', 'ša-aš-ru{ki}[]GN'],
 'P126465': ['ki-maš{ki}[]GN', 'hur-ti{ki}[]GN'],
 'P407961': ['ur-bi₂-lum{ki}[]GN'],
 'P109476': [],
 'P113713': [],
 'P128830': ['ki-maš{ki}[]GN', 'hur-ti{ki}[]GN'],
 'P123521': ['ha-ar-ši{ki}[]GN', 'ki-maš{ki}[]GN'],
 'P116837': [],
 'P134267': ['ki-maš{ki}[]GN'],
 'P133905': ['ša-aš-ru{ki}[]GN'],
 'P107770': ['unu{ki}-ga-še₃[]GN', 'ur-bi₂-lum{ki}[]GN'],
 'P110439': ['ki-maš{ki}[]GN', 'hur-ti{ki}[]GN'],
 'P131929': ['umma{ki}[]GN'],
 'P113737': [],
 'P126389': [],
 'P143976': [],
 'P126414': ['ki-maš{ki}[]GN'],
 'P103709': ['ki-maš{ki}[]GN'],
 'P128852': ['ki-maš{ki}[]GN'],
 'P127861': [],
 'P126333': ['ki-maš{ki}[]GN'],
 'P123616': ['unu{ki}-ga[]GN', 'ki-maš{ki}[]GN'],
 'P122752': [],
 '

In [15]:
# Load the date metadata table; the 'CDLI No', 'Original date' and
# 'Converted Date' columns are the ones used further down.
dates = pd.read_csv("bdtns_metadata/dates.csv")

In [16]:
# Spot-check the date record(s) of a single P number.
dates.loc[dates['CDLI No'].eq('P142868')]

Unnamed: 0.1,Unnamed: 0,BDTNS No,CDLI No,Original date,Converted Date
23,23,38779,P142868,AS02 - 05 - 00,68.05


In [27]:
# Keep only the columns needed for the lookup.
date_columns = ['CDLI No', 'Original date', 'Converted Date']
dates = dates[date_columns]
# Build P number -> [original date, converted date] by indexing on the P number
# and turning each (transposed) column into a list.
dates_by_p_number = dates.set_index('CDLI No')
date_dict = dates_by_p_number.T.to_dict('list')
date_dict

  


{'P142785': ['IS01 - 07 - 00', 85.07],
 'P142787': ['SS05 - 06 - 12', 80.0612],
 'P142788': ['SH48 - 12 - 20', 66.122],
 'P142789': ['SS04 - XX - 15', 79.0015],
 'P142790': ['XXXX - 01 - 01', 0.0101],
 'P142791': ['AS06 - 11 - 19', 72.1119],
 'P142793': ['SS07 - 11 - 00', 82.11],
 'P142794': ['SH44 - 12 - 28', 62.1228],
 'P142799': ['SH45 - 11 - 00', 63.11],
 'P142803': ['SS06 - 11d - 29', 81.1129],
 'P142805': ['XXXX - XX - 23', 0.0023],
 'P142807': ['SS07 - 02 - 00', 82.02],
 'P142809': ['AS05 - 09 - 00?', 71.09],
 'P142810': ['SS03 - 00 - 00', 78.0],
 'P142811': ['SS01 - 05 - 03+', 76.0503],
 'P142812': ['AS03 - 08 - 00', 69.08],
 'P142813': ['AS03 - 11d - 28', 69.1128],
 'P142849': ['SH47 - 02 - 00', 65.02],
 'P142852': ['XXXX - XX - XX', 0.0],
 'P142855': ['AS01 - 06 - 25', 67.0625],
 'P142857': ['SH42 - 09 - 16', 60.09160000000001],
 'P142858': ['SS07 - 11 - 03', 82.1103],
 'P142859': ['SS04 - 07 - 00', 79.07],
 'P142868': ['AS02 - 05 - 00', 68.05],
 'P248580': ['AS01 - 00 - 00',

In [20]:
def get_converted_dates(p_nums):
    '''
    Look up the converted date of every p number in P_NUMS.

    Input: a list containing p numbers.
    Output: a list of the same length holding, for each p number, the second
            element of its DATE_DICT entry, or NaN if the p number has no entry.
    '''
    converted = []
    for p_num in p_nums:
        if p_num in date_dict:
            converted.append(date_dict[p_num][1])
        else:
            converted.append(np.nan)
    return converted

# For every geographic name, translate its list of P numbers into converted dates.
date_col = geo_df['p_nums'].map(get_converted_dates)
date_col

0      [74.08149999999998, 72.08, 71.08149999999998, ...
1      [74.08149999999998, 74.0822, 74.10079999999998...
2      [71.11, 76.1, 65.071, 71.08149999999998, 71.08...
3      [80.1108, 77.03, 79.12, 73.05, 69.0917, 84.01,...
4      [80.1108, 75.12100000000002, 68.1, 76.0128, 76...
                             ...                        
580                                              [0.003]
581                                            [85.0625]
582               [65.12270000000001, 65.12270000000001]
583                                            [69.1228]
584                                             [74.061]
Name: p_nums, Length: 585, dtype: object

In [24]:
def min_no_0(dates):
    '''
    Input: a list of converted dates.
    Output: the minimum date excluding 0 and missing (NaN) values.
            If no value remains after that exclusion (including an empty
            list), return NaN.

    NaN values are dropped before taking the minimum because the built-in
    min() is order-dependent when NaN is present: every comparison with NaN
    is False, so min([nan, 3, 1]) is nan while min([3, nan, 1]) is 1.

    NOTE(review): only values exactly equal to 0 are excluded; small positive
    values such as 0.0019 are kept and can become the minimum.
    '''
    usable = [i for i in dates if i != 0 and not pd.isna(i)]
    # default= covers the case where nothing usable is left.
    return min(usable, default=np.nan)

geo_df['converted_dates'] = date_col
# The built-in max() is order-dependent when a list contains NaN (every
# comparison with NaN is False, so a leading NaN is returned as the "maximum").
# Drop NaN values first; a list with no remaining value yields NaN.
geo_df['max date'] = geo_df['converted_dates'].apply(
    lambda converted: max(
        (date for date in converted if not pd.isna(date)),
        default=np.nan,
    )
)
geo_df['min date'] = geo_df['converted_dates'].apply(min_no_0)
geo_df.head()

Unnamed: 0,geo_name,p_nums,neighbors,converted_dates,max date,min date
0,tum-ma-al{ki}[]GN,"[P124149, P103274, P107843, P134976, P131918, ...","{'P124149': ['eridu{ki}[]GN'], 'P103274': ['ša...","[74.08149999999998, 72.08, 71.08149999999998, ...",84.0828,0.0019
1,eridu{ki}[]GN,"[P124149, P107420, P212153, P126352, P210370, ...","{'P124149': ['tum-ma-al{ki}[]GN'], 'P107420': ...","[74.08149999999998, 74.0822, 74.10079999999998...",86.0708,0.0026
2,unu{ki}[]GN,"[P405489, P104590, P103800, P107843, P123981, ...","{'P405489': [], 'P104590': ['nibru{ki}-a[]GN',...","[71.11, 76.1, 65.071, 71.08149999999998, 71.08...",86.11,0.0729
3,zimbir{ki}[]GN,"[P332036, P131582, P111934, P117626, P134703, ...","{'P332036': ['uri₅{ki}-ma[]GN', 'mu-ri-iq-ti-i...","[80.1108, 77.03, 79.12, 73.05, 69.0917, 84.01,...",84.01,0.0725
4,uri₅{ki}-ma[]GN,"[P332036, P124307, P103657, P106357, P104590, ...","{'P332036': ['zimbir{ki}[]GN', 'mu-ri-iq-ti-id...","[80.1108, 75.12100000000002, 68.1, 76.0128, 76...",87.12,0.0006


However, some P numbers (CDLI No) correspond to multiple dates.