In [1]:
import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import pandas as pd
import numpy as np
import time
import sys
import os

from utils import download_data, fetch_urls, weighted_mean, get_max
from mappings import value_mapping


In [2]:
df = download_data("https://seshatdata.com/api/core/polities/?page_size=1000")
# create the dattaframe staring with the polity data
template = pd.DataFrame(columns = ["NGA", "PolityID", "PolityName"])
# specify the columns data types
template['PolityID'] = template['PolityID'].astype('int')

# polity_home_nga_id, polity_id, polity_name 
polityIDs = df.id.unique()

for polID in polityIDs:
    pol_df = df.loc[df.id == polID, ['home_nga_name', 'id', 'new_name','start_year','end_year']]
    # create a temporary dataframe with all data for current polity
    pol_df_new = pd.DataFrame(dict({"NGA" : pol_df.home_nga_name.values[0], 
                                    "PolityID": pol_df.id.values[0], 
                                    "PolityName": pol_df.new_name.values[0], 
                                    "StartYear": pol_df.start_year.values[0],
                                    "EndYear": pol_df.end_year.values[0]}), index = [0])

    # Ensure the index is unique before concatenating
    if not pol_df_new.index.is_unique:
        pol_df_new = pol_df_new.reset_index(drop=True)
    template = pd.concat([template, pol_df_new])
template.reset_index(drop=True, inplace=True)

Downloaded 839 rows


In [3]:
def compare_dicts(dict1, dict2):
    differences = {}
    
    # Get all keys from both dictionaries
    all_keys = set(dict1.keys()).union(set(dict2.keys()))
    
    for key in all_keys:
        value1 = dict1.get(key, None)
        value2 = dict2.get(key, None)
        
        if value1 != value2:
            if pd.isnull(value1) and pd.isnull(value2):
                continue
            differences[key] = (value1, value2)
    
    return differences

def compare_rows(row1, row2):
    differences = compare_dicts(dict(row1), dict(row2))
    
    return differences

def is_same(row1, row2):
    return compare_rows(row1, row2) == {}

def check_for_nans(d):

    if not isinstance(d, dict):
        
        if np.isnan(d):
            return False
        else:
            print(d)
        return False
    
    def contains_nan(values):
        # Check if the values are numeric and contain NaNs
        if isinstance(values, (list, np.ndarray)):
            return any(isinstance(v, (int, float)) and np.isnan(v) for v in values)
        return False
    
    ts = d.get('t', [])
    if contains_nan(ts):
        return True
    
    vals = d.get('value', [])
    for val_row in vals:
        for (x, y) in val_row:
            if (isinstance(x, (int, float)) and np.isnan(x)) or (isinstance(y, (int, float)) and np.isnan(y)):
                return True
    
    years = d.get('polity_years', [])
    if contains_nan(years):
        return True
    
    return False

def check_nan_polities(pol, df, variable_name):
    pol_df = df.loc[df.polity_id == pol]
    if pol_df.empty:
        return True
    if pol_df[variable_name].isnull().all():
        return True
    return False

def get_values(val_from, val_to):
    if (val_from is None) and (val_to is None):
        return None
    elif (val_from is not None) and (val_to is None):
        val_to = val_from
    elif (val_from is None) and (val_to is not None):
        val_from = val_to
    return (val_from, val_to)

def add_empty_col(template, variable_name):
    range_var =  variable_name + "_from" in df.columns
    template[variable_name] = np.nan
    template[variable_name] = template[variable_name].astype('object')
    return template, range_var

In [4]:
urls = {}
urls.update(fetch_urls('sc'))

In [5]:
key = list(urls.keys())[7]
url = urls[key]

In [6]:
for key in urls.keys():
    url = urls[key]
    polities = template.PolityID.unique()
    # download the data
    print(key)
    df = download_data(url)
    variable_name = df.name.unique()[0].lower()
    range_var =  variable_name + "_from" in df.columns
    template, range_var = add_empty_col(template, variable_name)

    for pol in polities:
        # create a dataframe with only the data for the current polity and sort it by year
        # this allows to assume entries are dealth with in chronological order
        pol_df = df.loc[df.polity_id == pol]
        pol_df = pol_df.sort_values(by = 'year_from')
        pol_df = pol_df.reset_index(drop=True)

        polity_years = np.array([template.loc[template.PolityID == pol, 'StartYear'].values[0], template.loc[template.PolityID == pol, 'EndYear'].values[0]])
        if pol_df.empty:
            continue
        # reset variable dict variables
        times = []
        values = [[]]

        for ind,row in pol_df.iterrows():
            # reset variables
            disp = False
            unc = False
            t = []
            value = []
            # check if the polity has multiple rows
            if ind > 0:
                if range_var:
                    relevant_columns = ['polity_id','year_from', 'year_to', 'is_disputed', 'is_uncertain', variable_name+'_from', variable_name +'_to']
                else:
                    relevant_columns = ['polity_id','year_from', 'year_to', 'is_disputed', 'is_uncertain', variable_name]
                # if the row is a duplicate of the previous row, skip it
                if pol_df.loc[:ind, relevant_columns].apply(lambda x: is_same(x, pol_df.loc[ind,relevant_columns]), axis=1).any():
                    print("Duplicate rows found")
                    # display(pol_df[['polity_id','polity_new_name','year_from', 'year_to', 'is_disputed', 'is_uncertain', variable_name+'_from', variable_name +'_to']])
                    continue
                elif pol_df.loc[ind,'is_disputed']:
                    # check if the disputed row has the same year as a previous row
                    if pol_df.loc[:ind-1,'year_from'].apply(lambda x: x == pol_df.loc[ind,'year_from']).any():
                        disp = True
                        print("Disputed rows found")
                    # check if the disputed row doesn't have a year
                    elif pol_df.loc[:ind-1,'year_from'].isna().any() and pol_df.loc[ind,'year_from'].isna():
                        disp = True
                        print("Disputed rows found")
                elif pol_df.loc[ind,'is_uncertain']:
                    if pol_df.loc[:ind-1,'year_from'].apply(lambda x: x == pol_df.loc[ind,'year_from']).any():
                        unc = True
                        print("Uncertain rows found")
                    elif pol_df.loc[:ind-1,'year_from'].isna().any() and pol_df.loc[ind,'year_from'].isna():
                        unc = True
                        print("Uncertain rows found")

            if ind < len(pol_df)-1:
                # in the case of the year to being the same as the year from of the next row, subtract one year to the year from to remove overlap
                if (pol_df.loc[ind,'year_from'] is not None):
                    if (pol_df.loc[ind,'year_to'] == pol_df.loc[ind+1,'year_from']) and (pol_df.loc[ind,'year_from'] != pol_df.loc[ind+1,'year_from']):
                        if row.year_to == row.year_from:
                            sys.exit(7)
                        row.year_to = row.year_to - 1
                    
            # check if polity has no year data and in that case use the polity start and end year
            if (row.year_from is None) and (row.year_to is None):
                # if the variable is a range variable, check if the range is defined
                if range_var:
                    val_from = row[variable_name + "_from"]
                    val_to = row[variable_name + "_to"]
                    # if no range variables are defined skip the row
                    val = get_values(val_from, val_to)
                    if val is None:
                        continue
                else:
                    if (value_mapping[row[variable_name]] is None) or pd.isna(value_mapping[row[variable_name]]):
                        continue
                    val = (value_mapping[row[variable_name]], value_mapping[row[variable_name]])

                # append the values and times to the lists
                value.append(val)
                value.append(val)
                t.append(template.loc[template.PolityID == pol, 'StartYear'].values[0])
                t.append(template.loc[template.PolityID == pol, 'EndYear'].values[0])
                
            # check if only one year is defined, either because the year_from and year_to are the same or one of them is None
            elif (row.year_from == row.year_to) or ((row.year_from is None) and (row.year_to is not None)) or ((row.year_from is not None) and (row.year_to is None)):
                # if variable is a range variable, check if the range is defined
                if range_var:
                    val_from = row[variable_name + "_from"]
                    val_to = row[variable_name + "_to"]
                    # if no range variables are defined skip the row
                    if (val_from is None) and (val_to is None):
                        continue
                    # if only one of the range variables are defined, set the other to the same value
                    elif (val_from is not None) and (val_to is None):
                        val_to = val_from
                    elif (val_from is None) and (val_to is not None):
                        val_from = val_to
                    val = (val_from, val_to)
                else:
                    if (value_mapping[row[variable_name]] is None) or pd.isna(value_mapping[row[variable_name]]):
                        continue
                    val = (value_mapping[row[variable_name]], value_mapping[row[variable_name]])
                value.append(val)
                year = row.year_from if row.year_from is not None else row.year_to
                
                if year < template.loc[template.PolityID == pol, 'StartYear'].values[0]:
                    print("Error: The year is outside the polity's start and end year")
                    continue
                elif year > template.loc[template.PolityID == pol, 'EndYear'].values[0]:
                    print("Error: The year is outside the polity's start and end year")
                    continue
                t.append(year)

            elif (row.year_from != row.year_to) and (row.year_from is not None) and (row.year_to is not None):
                
                if range_var:
                    val_from = row[variable_name + "_from"]
                    val_to = row[variable_name + "_to"]
                    # if no range variables are defined skip the row
                    val = get_values(val_from, val_to)
                    if val is None:
                        continue
                else:
                    if (value_mapping[row[variable_name]] is None) or pd.isna(value_mapping[row[variable_name]]):
                        continue
                    val = (value_mapping[row[variable_name]], value_mapping[row[variable_name]])

                value.append(val)
                value.append(val)
                t_from = row.year_from
                t_to = row.year_to
                if t_from<template.loc[template.PolityID == pol, 'StartYear'].values[0]:
                    print("Error: The start year is outside the polity's start and end year")
                    # t_from = template.loc[template.PolityID == pol, 'StartYear'].values[0]
                    continue
                elif t_to > template.loc[template.PolityID == pol, 'EndYear'].values[0]:
                    print("Error: The end year is outside the polity's start and end year")
                    # t_to = template.loc[template.PolityID == pol, 'EndYear'].values[0]
                    continue
                    
                t.append(t_from)
                t.append(t_to)
            else:
                print('new')
                sys.exit(1) 
                
            if disp or unc:
                # find position in t vector of disputed years
                if times == []:
                    times = t
                else:
                    times = times + t
                    times = list(np.unique(times))
                # find the position of the disputed years in the t vector
                positions = list(np.where(np.isin(times, t))[0].astype(int))
                # create a list of new timelines
                new_vals = []
                for val_row in values:
                    new_row = np.array(val_row.copy())
                    new_row[positions] = val
                    new_vals.append(list(new_row))
                #  append new timeline to the value entry of the dictionary
                values = values + new_vals
            else:
                for val_row in range(len(values)):
                    values[val_row] = values[val_row] + value
                if times == []:
                    times = t
                else:
                    times = times + t
                    times = list(np.unique(times))

                
        variable_dict = dict({"t": times, "value": values, "polity_years": polity_years})
        for dict_row in variable_dict['value']:
            if len(variable_dict["t"]) != len(dict_row):
                print("Error: The length of the time and value arrays are not the same")
                if pol == 601:
                    continue
                if pol == 508 and variable_name == "administrative_level":
                    continue
                sys.exit(3)

        if variable_dict['t'] == []:
            continue
            
        template.loc[template.PolityID == pol, variable_name] = [variable_dict]

    if template[variable_name].apply(lambda x: check_for_nans(x)).any():
        print("Error: NaNs found in the data")
        sys.exit(4)
    if range_var:
        var_name = variable_name + "_from"
    else:
        var_name = variable_name
    if (template['PolityID'].apply(lambda x: check_nan_polities(x, df, var_name)) > template[variable_name].isna()).all():
        print("Nans in template that are not in the template")
        sys.exit(5)
    elif (template['PolityID'].apply(lambda x: check_nan_polities(x, df, var_name)) < template[variable_name].isna()).all():
        print("Extra entries in the template")
        sys.exit(6)

sc/polity-territories
Downloaded 464 rows
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Error: The year is outside the polity's start and end year
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows found
Duplicate rows fo

In [7]:
template

Unnamed: 0,NGA,PolityID,PolityName,StartYear,EndYear,polity_territory,polity_population,population_of_the_largest_settlement,settlement_hierarchy,administrative_level,...,fiction,article,token,precious_metal,foreign_coin,indigenous_coin,paper_currency,courier,postal_station,general_postal_service
0,Southern Mesopotamia,132,iq_abbasid_cal_1,750.0,946.0,"{'t': [800], 'value': [[(8300000, 8300000)]], ...","{'t': [900], 'value': [[(9000000, 11000000)]],...","{'t': [800], 'value': [[(700000, 700000)]], 'p...","{'t': [750.0, 946.0], 'value': [[(6, 6), (6, 6...","{'t': [750.0, 946.0], 'value': [[(6, 6), (6, 6...",...,"{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...",,,"{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(0, 0), (0, 0...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1..."
1,Southern Mesopotamia,484,iq_abbasid_cal_2,1191.0,1258.0,"{'t': [1200], 'value': [[(750000, 750000)]], '...","{'t': [1200], 'value': [[(3900000, 3900000)]],...","{'t': [1191.0, 1258.0], 'value': [[(1000000, 1...",,"{'t': [1191.0, 1258.0], 'value': [[(5, 5), (5,...",...,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...","{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...",,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...",,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...","{'t': [1191.0, 1258.0], 'value': [[(0, 0), (0,...",,,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,..."
2,Susiana,107,ir_achaemenid_emp,-550.0,-331.0,"{'t': [-539, -501], 'value': [[(2500000, 41000...","{'t': [-500], 'value': [[(20000000, 26000000)]...","{'t': [-550.0, -331.0], 'value': [[(200000, 20...","{'t': [-550.0, -331.0], 'value': [[(5, 6), (5,...","{'t': [-550.0, -331.0], 'value': [[(6, 8), (6,...",...,"{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...",,"{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550, -516], 'value': [[(0, 0), (0, 0)]...","{'t': [-550.0, -331.0], 'value': [[(0, 0), (0,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...",
3,,637,so_adal_sultanate,1375.0,1543.0,,,,,"{'t': [1375.0, 1543.0], 'value': [[(4, 7), (4,...",...,,,"{'t': [1375.0, 1543.0], 'value': [[(1, 1), (1,...",,,,,,,
4,,872,tn_aghlabid_dyn,800.0,908.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,,227,et_zagwe,1137.0,1269.0,,,,,,...,,,,,,,,,,
835,,231,dz_zayyanid_dyn,1235.0,1509.0,,,,,,...,,,,,,,,,,
836,,222,tn_zirid_dyn,973.0,1148.0,,,,,,...,,,,,,,,,,
837,Orkhon Valley,444,mn_zungharian_emp,1670.0,1757.0,"{'t': [1700], 'value': [[(170000, 210000)]], '...","{'t': [1670.0, 1757.0], 'value': [[(600000, 16...",,"{'t': [1670.0, 1757.0], 'value': [[(2, 3), (2,...",,...,,"{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(0, 0), (0,...",,"{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(0, 0), (0,...","{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...",


In [44]:
from Template import Template
template2 = Template(categories = ['sc'], file_path = "/Users/mperuzzo/Documents/repos/SeshatDatasetAnalysis/datasets/test.csv")
template2.template

Loaded template from /Users/mperuzzo/Documents/repos/SeshatDatasetAnalysis/datasets/test.csv


Unnamed: 0,NGA,PolityID,PolityName,StartYear,EndYear,polity_territory,polity_population,population_of_the_largest_settlement,settlement_hierarchy,administrative_level,...,fiction,article,token,precious_metal,foreign_coin,indigenous_coin,paper_currency,courier,postal_station,general_postal_service
0,Southern Mesopotamia,132,iq_abbasid_cal_1,750.0,946.0,"{'t': [800], 'value': [[(8300000, 8300000)]], ...","{'t': [900], 'value': [[(9000000, 11000000)]],...","{'t': [800], 'value': [[(700000, 700000)]], 'p...","{'t': [750.0, 946.0], 'value': [[(6, 6), (6, 6...","{'t': [750.0, 946.0], 'value': [[(6, 6), (6, 6...",...,"{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...",,,"{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(0, 0), (0, 0...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1...","{'t': [750.0, 946.0], 'value': [[(1, 1), (1, 1..."
1,Southern Mesopotamia,484,iq_abbasid_cal_2,1191.0,1258.0,"{'t': [1200], 'value': [[(750000, 750000)]], '...","{'t': [1200], 'value': [[(3900000, 3900000)]],...","{'t': [1191.0, 1258.0], 'value': [[(1000000, 1...",,"{'t': [1191.0, 1258.0], 'value': [[(5, 5), (5,...",...,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...","{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...",,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...",,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,...","{'t': [1191.0, 1258.0], 'value': [[(0, 0), (0,...",,,"{'t': [1191.0, 1258.0], 'value': [[(1, 1), (1,..."
2,Susiana,107,ir_achaemenid_emp,-550.0,-331.0,"{'t': [-539, -501], 'value': [[(2500000, 41000...","{'t': [-500], 'value': [[(20000000, 26000000)]...","{'t': [-550.0, -331.0], 'value': [[(200000, 20...","{'t': [-550.0, -331.0], 'value': [[(5, 6), (5,...","{'t': [-550.0, -331.0], 'value': [[(6, 8), (6,...",...,"{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...",,"{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550, -516], 'value': [[(0, 0), (0, 0)]...","{'t': [-550.0, -331.0], 'value': [[(0, 0), (0,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...","{'t': [-550.0, -331.0], 'value': [[(1, 1), (1,...",
3,,637,so_adal_sultanate,1375.0,1543.0,,,,,"{'t': [1375.0, 1543.0], 'value': [[(4, 7), (4,...",...,,,"{'t': [1375.0, 1543.0], 'value': [[(1, 1), (1,...",,,,,,,
4,,872,tn_aghlabid_dyn,800.0,908.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,,227,et_zagwe,1137.0,1269.0,,,,,,...,,,,,,,,,,
835,,231,dz_zayyanid_dyn,1235.0,1509.0,,,,,,...,,,,,,,,,,
836,,222,tn_zirid_dyn,973.0,1148.0,,,,,,...,,,,,,,,,,
837,Orkhon Valley,444,mn_zungharian_emp,1670.0,1757.0,"{'t': [1700], 'value': [[(170000, 210000)]], '...","{'t': [1670.0, 1757.0], 'value': [[(600000, 16...",,"{'t': [1670.0, 1757.0], 'value': [[(2, 3), (2,...",,...,,"{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(0, 0), (0,...",,"{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(0, 0), (0,...","{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...","{'t': [1670.0, 1757.0], 'value': [[(1, 1), (1,...",


In [46]:
url = "https://seshatdata.com/api/crisisdb/power-transitions/"
df = download_data(url)

Downloaded 2390 rows


Index(['id', 'year_from', 'year_to', 'description', 'note', 'finalized',
       'created_date', 'modified_date', 'tag', 'is_disputed', 'is_uncertain',
       'expert_reviewed', 'drb_reviewed', 'predecessor', 'successor',
       'reign_number_predecessor', 'name', 'culture_group', 'contested',
       'overturn', 'predecessor_assassination', 'intra_elite',
       'military_revolt', 'popular_uprising', 'separatist_rebellion',
       'external_invasion', 'external_interference', 'comment', 'citations',
       'curator', 'polity_id', 'polity_name', 'polity_start_year',
       'polity_end_year', 'polity_long_name', 'polity_new_name',
       'polity_polity_tag', 'polity_general_description',
       'polity_shapefile_name', 'polity_private_comment',
       'polity_created_date', 'polity_modified_date', 'polity_home_nga_id',
       'polity_home_nga_name', 'polity_home_nga_subregion',
       'polity_home_nga_longitude', 'polity_home_nga_latitude',
       'polity_home_nga_capital_city', 'polity_h

nan