In [1]:
import csv
import contextlib
import os, errno
from collections import OrderedDict, Counter

from IPython.core.display import display, HTML

from pandas import DataFrame

from periodo_reconciler import (
    RProperty,
    RQuery,
    PeriodoReconciler,
    CsvReconciler
)

In [2]:
def output_path_name(inpath, test_data_dir="../test-data/",
                     test_output_dir="../test-output/"):
    """
    given a path in the test directory, return the path for corresponding output in the test output dir
    """
 
    return os.path.join(test_output_dir, os.path.relpath(inpath, start=test_data_dir))

def makedirs_for_path(fpath):
    """
    make sure that the directory for fpath exists
    """
    
    (fdir, fname) = os.path.split(fpath)
    try:
        os.makedirs(fdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
            
    return True


In [3]:
def reconcile_to_csv_df(csv_path, kw, p_recon):
    """
    Inputs:  
    csv_path: the path of the csv file to reconcile
    kw: a dict holding the query, location, start, stop fields
    p_recon: a PeriodoReconciler object
    
    Side-effect:
    * a csv file with the matches in the output directory
    * returns a Pandas DataFrame holding the input data along with match data
    
    """

    output_path = output_path_name(csv_path)
    makedirs_for_path(output_path)
    
    # https://stackoverflow.com/a/19412700/7782
    with contextlib.ExitStack() as stack:
        csvfile = stack.enter_context(open(csv_path))
        outputfile = stack.enter_context(open(output_path, "w"))                  

        # p_recon = PeriodoReconciler(host='localhost:8142')
        c_recon = CsvReconciler(csvfile, p_recon, **kw)

        matches = list(c_recon.matches())

        c_recon.to_csv(outputfile, matches)
        df = DataFrame(matches)

        return df


# simple example

In [4]:
# simple example

csv_path = "../test-data/periodo_simple_example.csv"
kw = {
   'location': 'location',
   'query': 'query',
   'start': 'start',
   'stop': 'end'
}

p_recon = PeriodoReconciler(host='localhost:8142')
df = reconcile_to_csv_df(csv_path, kw, p_recon)

df.match_num.value_counts()

1    2
0    1
Name: match_num, dtype: int64

# OpenContext examples

In [5]:
# the list of OpenContext CSV files along with the columns to be fed to the reconciler

OPENCONTEXT_TEST_FILES = [{'csv_path': '../test-data/OpenContext/Cyprus PKAP Survey.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Period',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE'}},
 {'csv_path': '../test-data/OpenContext/European Cattle with Periods.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Period',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'ignored_queries':'Not determined'
  }},
 {'csv_path': '../test-data/OpenContext/Petra Artifacts.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Culture',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'ignored_queries':'other,lb'
    }}]


kw_map = dict([(f['csv_path'], f['kw']) for f in OPENCONTEXT_TEST_FILES])

In [6]:
# write out incantation for the command line CSV reconciler

for f in OPENCONTEXT_TEST_FILES:
    options_from_kw = " ".join(['--{}="{}"'.format(k,v) for (k,v) in f['kw'].items()])
    print('periodo-reconciler-py {} "{}" -'.format(options_from_kw, f['csv_path'] ))

periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" "../test-data/OpenContext/Cyprus PKAP Survey.csv" -
periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" --ignored_queries="Not determined" "../test-data/OpenContext/European Cattle with Periods.csv" -
periodo-reconciler-py --location="Context (1)" --query="Culture" --start="Early BCE/CE" --stop="Late BCE/CE" --ignored_queries="other,lb" "../test-data/OpenContext/Petra Artifacts.csv" -


In [7]:
matching_results = {}

p_recon = PeriodoReconciler(host='localhost:8142')

for file in OPENCONTEXT_TEST_FILES:
    csv_path = file['csv_path']
    kw = file['kw']
    print ("\r{}".format(csv_path), end='')
    df = reconcile_to_csv_df(csv_path, kw, p_recon)
    matching_results[csv_path] = df # df.match_num.value_counts()


../test-data/OpenContext/Petra Artifacts.csvh Periods.csv

In [8]:
from itertools import islice

for (csv_path, df) in islice(matching_results.items(),None):
    print (csv_path, df.match_num.value_counts().get(0), df.match_num.value_counts().get(1))

        

../test-data/OpenContext/Cyprus PKAP Survey.csv 8278 159
../test-data/OpenContext/European Cattle with Periods.csv 5668 None
../test-data/OpenContext/Petra Artifacts.csv 52382 105


In [9]:
df = matching_results[OPENCONTEXT_TEST_FILES[1]['csv_path']]

df.head()

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context URI,Latitude (WGS-84),Longitude (WGS-84),...,Specimen Location,"Period, note",ID,Phase,Date C14 BP,Date cal BC,Date C14,match_num,match_name,match_id
0,http://opencontext.org/subjects/8436872f-dbe1-...,5033-Bonelit,Biometrical Database of European Aurochs and D...,http://opencontext.org/projects/1816A043-92E2-...,Sweden,Stora Slågarp,,http://opencontext.org/subjects/405a1fa1-e932-...,55.45,13.16667,...,,Mesolithic,898,,8330±80,7420,,0,,
1,http://opencontext.org/subjects/6632f80d-2565-...,5002-Bonelit,Biometrical Database of European Aurochs and D...,http://opencontext.org/projects/1816A043-92E2-...,Sweden,Nevishög,,http://opencontext.org/subjects/fca9ebe5-7e3f-...,55.63333,13.21667,...,,Mesolithic,894,,8380±90,7470-7440,,0,,
2,http://opencontext.org/subjects/7ab9d34c-5f72-...,4929-Bonelit,Biometrical Database of European Aurochs and D...,http://opencontext.org/projects/1816A043-92E2-...,Sweden,Ageröd I D,,http://opencontext.org/subjects/1d5c729f-35cd-...,55.93333,13.4,...,,Mesolithic,966,,"7220±70, 7710±80, 7740±80, 7770±80, 7860±80, 7...",6206-6017,,0,,
3,http://opencontext.org/subjects/6bc9ec6a-0874-...,4445-Bonelit,Biometrical Database of European Aurochs and D...,http://opencontext.org/projects/1816A043-92E2-...,Portugal,Castro do Zambujal,,http://opencontext.org/subjects/b43d08c9-d1e2-...,39.07446,-9.28544,...,,Late Chalcolithic,739,4.0,,2400-1600 (whole site),,0,,
4,http://opencontext.org/subjects/db3d2d84-6ef0-...,4443-Bonelit,Biometrical Database of European Aurochs and D...,http://opencontext.org/projects/1816A043-92E2-...,Portugal,Castro do Zambujal,,http://opencontext.org/subjects/b43d08c9-d1e2-...,39.07446,-9.28544,...,,Late Chalcolithic,737,3.0,,2400-1600 (whole site),,0,,


## rows matched vs not matched for each test file

In [10]:

rows_html = ("".join(["<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
 csv_path, df.match_num.value_counts().get(1, 0), df.match_num.value_counts().get(0, 0))
for (csv_path, df) in matching_results.items()]))


html_ = """<table>
<tr>
    <th>path</th>
    <th>matches</th>
    <th>non-matches</th>
</tr>
{}
</table>""".format(rows_html)

display(HTML(html_))

path,matches,non-matches
../test-data/OpenContext/Cyprus PKAP Survey.csv,159,8278
../test-data/OpenContext/European Cattle with Periods.csv,0,5668
../test-data/OpenContext/Petra Artifacts.csv,105,52382


## which combination of query/location/start/stop were matched?

In [11]:
matching_results.keys()

dict_keys(['../test-data/OpenContext/Cyprus PKAP Survey.csv', '../test-data/OpenContext/European Cattle with Periods.csv', '../test-data/OpenContext/Petra Artifacts.csv'])

In [12]:
rows_html = []

for (csv_path, df) in matching_results.items():
    kw = kw_map[csv_path]
    counter = Counter(list(df.apply(lambda row: (row[kw['query']],
                      row[kw['location']],
                      row[kw['start']],
                      row[kw['stop']],
                      row['match_num'],
                      row['match_id'],
                      row['match_name']
                     ), axis=1)))
    
    matching_items = [(k,c) for (k,c) in counter.items() if k[4] > 0]

    for (match, count) in matching_items:
        row_html = """<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td>
                      <td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>""".format(
             csv_path, match[0], match[1], match[2], match[3], count, match[5], match[6])
        rows_html.append(row_html)
        

rows_html = "".join(rows_html)

html_ = """<table>
<tr>
    <th>path</th>
    <th>query</th>
    <th>location</th>
    <th>start</th>
    <th>stop</th>
    <th>num of rows</th>
    <th>match_id</th>
    <th>match_name</th>
</tr>
{}
</table>""".format(rows_html)
        
display(HTML(html_))

path,query,location,start,stop,num of rows,match_id,match_name
../test-data/OpenContext/Cyprus PKAP Survey.csv,Cypro-Archaic,Cyprus,-750,-312,3,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus PKAP Survey.csv,Post-Prehistoric,Cyprus,-999,-999,60,http://n2t.net/ark:/99152/p0dg76f98q7,"Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]"
../test-data/OpenContext/Cyprus PKAP Survey.csv,Cypro-Archaic,Cyprus,-750,-475,1,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus PKAP Survey.csv,Ancient-Medieval,Cyprus,-3500,1570,92,http://n2t.net/ark:/99152/p0dg76fjh5t,"Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]"
../test-data/OpenContext/Cyprus PKAP Survey.csv,Roman-Modern,Cyprus,-99,-99,3,http://n2t.net/ark:/99152/p0dg76fd4wb,"Roman-Modern [Cyprus, Cyprus: -0098 to 2008]"
../test-data/OpenContext/Petra Artifacts.csv,Herodian,Jordan,-200,360,4,http://n2t.net/ark:/99152/p0m63njw2xp,"Herodian Period [Israel, Jordan, Israel, Jordan: -0037 to 0070]"
../test-data/OpenContext/Petra Artifacts.csv,Islamic,Jordan,-200,360,93,http://n2t.net/ark:/99152/p044n254d9s,"Medieval Islamic [South eastern Kazakhstan, Kazakhstan: 0750 to 1200]"
../test-data/OpenContext/Petra Artifacts.csv,Roman Provincial,Jordan,-200,360,3,http://n2t.net/ark:/99152/p0gjgrsmhnc,"Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]"
../test-data/OpenContext/Petra Artifacts.csv,Hasmonaean? or Herodian?,Jordan,-200,360,1,http://n2t.net/ark:/99152/p0m63njw2xp,"Herodian Period [Israel, Jordan, Israel, Jordan: -0037 to 0070]"
../test-data/OpenContext/Petra Artifacts.csv,Roman (Provincial?),Jordan,-200,360,2,http://n2t.net/ark:/99152/p0gjgrsmhnc,"Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]"


In [13]:
len(counter)

99

# studying non-matches vs matches

In [14]:
# how to merge two counters?

from collections import Counter

def recon_data(df, csv_path, kw):

    if len(df):
        k0 = df.apply(lambda row: (
                          csv_path,
                          row[kw['query']],
                          row[kw['location']],
                          row[kw['start']],
                          row[kw['stop']],
                          row['match_num'],
                          row['match_id'],
                          row['match_name']
                         ), axis=1)
        k = list(k0)
        return (k)
    else:
        return []
    

matches = Counter()
non_matches = Counter()

for (csv_path, df) in matching_results.items():
    kw = kw_map[csv_path]
    matches.update(recon_data(df[df.match_num > 0], csv_path, kw))
    non_matches.update(recon_data(df[df.match_num == 0], csv_path, kw))



len(matches), len(non_matches)

(11, 335)

In [15]:
(csv_path, df) = list(matching_results.items())[1]
kw = kw_map[csv_path]
df2 = df[df.match_num > 0]
k = recon_data(df2, csv_path, kw)
len(k)

0

In [16]:
for row in matches.keys():
    print(row)

('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Cypro-Archaic', 'Cyprus', '-750', '-312', 1, 'http://n2t.net/ark:/99152/p08m57h2cv3', 'Cypro-Archaic [Cyprus: -0749 to -0449]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Post-Prehistoric', 'Cyprus', '-999', '-999', 1, 'http://n2t.net/ark:/99152/p0dg76f98q7', 'Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Cypro-Archaic', 'Cyprus', '-750', '-475', 1, 'http://n2t.net/ark:/99152/p08m57h2cv3', 'Cypro-Archaic [Cyprus: -0749 to -0449]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Ancient-Medieval', 'Cyprus', '-3500', '1570', 1, 'http://n2t.net/ark:/99152/p0dg76fjh5t', 'Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Roman-Modern', 'Cyprus', '-99', '-99', 1, 'http://n2t.net/ark:/99152/p0dg76fd4wb', 'Roman-Modern [Cyprus, Cyprus: -0098 to 2008]')
('../test-data/OpenContext/Petra Artifacts.csv', 'Herodian', 'Jor

In [17]:
[ "|{}|".format("|".join([str(col) for col in row]))
 for row in matches.keys()]

['|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-312|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Post-Prehistoric|Cyprus|-999|-999|1|http://n2t.net/ark:/99152/p0dg76f98q7|Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-475|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Ancient-Medieval|Cyprus|-3500|1570|1|http://n2t.net/ark:/99152/p0dg76fjh5t|Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Roman-Modern|Cyprus|-99|-99|1|http://n2t.net/ark:/99152/p0dg76fd4wb|Roman-Modern [Cyprus, Cyprus: -0098 to 2008]|',
 '|../test-data/OpenContext/Petra Artifacts.csv|Herodian|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0m63njw2xp|Herodian Period [Israel, Jordan, Isra

In [18]:
from IPython.display import Markdown, HTML, display
from jinja2 import Template

matches_template = Template("""
|path|query|location|start|stop|match_id|match_name|
|--|--|--|--|--|--|--|
{% for item in items %}{{item}}\n{% endfor %}
""")

Markdown(matches_template.render(items=[ "|{}|".format("|".join([str(col) for col in row]))
 for row in matches.keys()]))


|path|query|location|start|stop|match_id|match_name|
|--|--|--|--|--|--|--|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-312|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Post-Prehistoric|Cyprus|-999|-999|1|http://n2t.net/ark:/99152/p0dg76f98q7|Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-475|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Ancient-Medieval|Cyprus|-3500|1570|1|http://n2t.net/ark:/99152/p0dg76fjh5t|Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Roman-Modern|Cyprus|-99|-99|1|http://n2t.net/ark:/99152/p0dg76fd4wb|Roman-Modern [Cyprus, Cyprus: -0098 to 2008]|
|../test-data/OpenContext/Petra Artifacts.csv|Herodian|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0m63njw2xp|Herodian Period [Israel, Jordan, Israel, Jordan: -0037 to 0070]|
|../test-data/OpenContext/Petra Artifacts.csv|Islamic|Jordan|-200|360|1|http://n2t.net/ark:/99152/p044n254d9s|Medieval Islamic [South eastern Kazakhstan, Kazakhstan: 0750 to 1200]|
|../test-data/OpenContext/Petra Artifacts.csv|Roman Provincial|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0gjgrsmhnc|Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]|
|../test-data/OpenContext/Petra Artifacts.csv|Hasmonaean? or Herodian?|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0m63njw2xp|Herodian Period [Israel, Jordan, Israel, Jordan: -0037 to 0070]|
|../test-data/OpenContext/Petra Artifacts.csv|Roman (Provincial?)|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0gjgrsmhnc|Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]|
|../test-data/OpenContext/Petra Artifacts.csv|Roman (Provincial)?|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0gjgrsmhnc|Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]|


In [19]:
# matches that are wrong

queries = [
    RQuery("Not determined", label="with query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),     
    RQuery("", label="empty query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),
    RQuery("Spain", label="use location for query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),
    RQuery("Bronze", label="use location for query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ])   
]

r = p_recon.reconcile(queries, method='post')

r

{'empty query': {'result': []},
 'use location for query': {'result': [{'id': 'http://n2t.net/ark:/99152/p0z3skmnss7',
    'match': False,
    'name': 'Bronze [Palestine, Israel, Jordan: -3299 to -1199]',
    'score': 0,
    'type': [{'id': 'http://www.w3.org/2004/02/skos/core#Concept',
      'name': 'Period definition'}]},
   {'id': 'http://n2t.net/ark:/99152/p03wskdjqsh',
    'match': False,
    'name': 'Bronze Age Malta (ca. 2,500-700 BC) [Malta, Malta: -2500 to -0700]',
    'score': 1,
    'type': [{'id': 'http://www.w3.org/2004/02/skos/core#Concept',
      'name': 'Period definition'}]},
   {'id': 'http://n2t.net/ark:/99152/p06v8w4bz9k',
    'match': False,
    'name': 'Age de Bronze [Morocco, Morocco: -2300 to -1000]',
    'score': 1,
    'type': [{'id': 'http://www.w3.org/2004/02/skos/core#Concept',
      'name': 'Period definition'}]},
   {'id': 'http://n2t.net/ark:/99152/p0rrjd9pg6v',
    'match': False,
    'name': 'âge du Bronze [France, France: -2200 to -0800]',
    'score'