In [1]:
import csv
import contextlib
import os, errno
from collections import OrderedDict, Counter

from IPython.core.display import display, HTML

from pandas import DataFrame
import pandas as pd
import numpy as np

from periodo_reconciler import (
    RProperty,
    RQuery,
    PeriodoReconciler,
    CsvReconciler
)

In [2]:
def output_path_name(inpath, test_data_dir="../test-data/",
                     test_output_dir="../test-output/"):
    """
    given a path in the test directory, return the path for corresponding output in the test output dir
    """
 
    return os.path.join(test_output_dir, os.path.relpath(inpath, start=test_data_dir))

def makedirs_for_path(fpath):
    """
    make sure that the directory for fpath exists
    """
    
    (fdir, fname) = os.path.split(fpath)
    try:
        os.makedirs(fdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
            
    return True


In [3]:
def reconcile_to_csv_df(csv_path, kw, p_recon):
    """
    Inputs:  
    csv_path: the path of the csv file to reconcile
    kw: a dict holding the query, location, start, stop fields
    p_recon: a PeriodoReconciler object
    
    Side-effect:
    * a csv file with the matches in the output directory
    * returns a Pandas DataFrame holding the input data along with match data
    
    """

    output_path = output_path_name(csv_path)
    makedirs_for_path(output_path)
    
    # https://stackoverflow.com/a/19412700/7782
    with contextlib.ExitStack() as stack:
        csvfile = stack.enter_context(open(csv_path))
        outputfile = stack.enter_context(open(output_path, "w"))                  

        # p_recon = PeriodoReconciler(host='localhost:8142')
        c_recon = CsvReconciler(csvfile, p_recon, **kw)

        matches = list(c_recon.matches())

        c_recon.to_csv(outputfile, matches)
        df = DataFrame(matches)

        return df


# simple example of running the reconciler against a CSV file

In [4]:
# simple example

csv_path = "../test-data/periodo_simple_example.csv"
kw = {
   'location': 'location',
   'query': 'query',
   'start': 'start',
   'stop': 'end'
}

p_recon = PeriodoReconciler(host='localhost:8142')
df = reconcile_to_csv_df(csv_path, kw, p_recon)

df.match_num.value_counts()

1    2
0    1
Name: match_num, dtype: int64

# OpenContext examples

In [5]:
# the list of OpenContext CSV files along with the columns to be fed to the reconciler

OPENCONTEXT_TEST_FILES = [{'csv_path': '../test-data/OpenContext/Cyprus PKAP Survey.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Period',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE'}},
 {'csv_path': '../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv',
  'kw': {'location': 'Context (1)',
   'query': 'periodo-pre-match',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE'}},
 {'csv_path': '../test-data/OpenContext/European Cattle with Periods.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Period',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'ignored_queries':'Not determined'
  }},
 {'csv_path': '../test-data/OpenContext/Petra Artifacts.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Culture',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'ignored_queries':'other,lb'
    }}]


kw_map = dict([(f['csv_path'], f['kw']) for f in OPENCONTEXT_TEST_FILES])

In [6]:
# write out incantation for the command line CSV reconciler

for f in OPENCONTEXT_TEST_FILES:
    options_from_kw = " ".join(['--{}="{}"'.format(k,v) for (k,v) in f['kw'].items()])
    print('periodo-reconciler-py {} "{}" -'.format(options_from_kw, f['csv_path'] ))

periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" "../test-data/OpenContext/Cyprus PKAP Survey.csv" -
periodo-reconciler-py --location="Context (1)" --query="periodo-pre-match" --start="Early BCE/CE" --stop="Late BCE/CE" "../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv" -
periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" --ignored_queries="Not determined" "../test-data/OpenContext/European Cattle with Periods.csv" -
periodo-reconciler-py --location="Context (1)" --query="Culture" --start="Early BCE/CE" --stop="Late BCE/CE" --ignored_queries="other,lb" "../test-data/OpenContext/Petra Artifacts.csv" -


In [7]:
matching_results = {}

p_recon = PeriodoReconciler(host='localhost:8142')

for file in OPENCONTEXT_TEST_FILES:
    csv_path = file['csv_path']
    kw = file['kw']
    print ("\r{}".format(csv_path), end='')
    df = reconcile_to_csv_df(csv_path, kw, p_recon)
    matching_results[csv_path] = df # df.match_num.value_counts()


../test-data/OpenContext/Petra Artifacts.csvh Periods.csvv

In [8]:
from itertools import islice

for (csv_path, df) in islice(matching_results.items(),None):
    print (csv_path, df.match_num.value_counts().get(0), df.match_num.value_counts().get(1))

        

../test-data/OpenContext/Cyprus PKAP Survey.csv 8278 159
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv 8278 159
../test-data/OpenContext/European Cattle with Periods.csv 5668 None
../test-data/OpenContext/Petra Artifacts.csv 52382 105


In [9]:
df = matching_results[OPENCONTEXT_TEST_FILES[1]['csv_path']]

df.head()

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context (4),Context (5),Context URI,...,periodo,recon-match-id,match-is-best,judgement,reconcile-key,Extant Part,Fabric group,match_num,match_name,match_id
0,http://opencontext.org/subjects/8C2A609B-B20B-...,Batch 131 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,True,matched,Late Roman|Cyprus|300|749,rim,coarse,0,,
1,http://opencontext.org/subjects/636D6826-BCBD-...,Batch 17,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Dhekeleia,Unit 223,,http://opencontext.org/subjects/8E773523-4391-...,...,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,True,matched,Late Roman|Cyprus|300|749,handle,amphora,0,,
2,http://opencontext.org/subjects/CA8C3EE0-0F09-...,Batch 19,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,West Pyla,Unit 546,,http://opencontext.org/subjects/34C7E1EC-F44D-...,...,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,True,matched,Late Roman|Cyprus|300|749,handle,amphora,0,,
3,http://opencontext.org/subjects/622BEE81-FC52-...,Batch 140 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Archaic-Classical [Cyprus, Cyprus: -0749 to -0...",http://n2t.net/ark:/99152/p0dg76fk4nc,False,matched,Classical|Cyprus|-474|-312,handle,fine,0,,
4,http://opencontext.org/subjects/58920419-0245-...,Batch 7 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Archaic-Hellenistic [Cyprus, Cyprus: -0749 to ...",http://n2t.net/ark:/99152/p0dg76f4rhx,True,matched,Archaic-Hellenistic|Cyprus|-750|-100,handle,coarse,0,,


# PKAP specifically

In [10]:
# let's look at the PKAP dataset
# look for unique 4-tuples in the PKAP dataset

from collections import Counter

recon_inputs = ['query', 'location', 'start', 'stop']

#path = '../test-data/OpenContext/Cyprus PKAP Survey.csv'
path = '../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv'

df = matching_results[path]

_ = df[[kw_map[path][input_] for input_ in recon_inputs]]
pkap_counts = Counter([tuple(x) for x in _.values])
len(pkap_counts)

78

In [11]:
queries = []

# use str(key) as label

for key in pkap_counts.keys():
    q_dict = dict(zip(recon_inputs, key))
    queries.append(
        RQuery(q_dict['query'], label=str(key), 
        properties=[
          RProperty('location', q_dict['location']),
          RProperty('start', q_dict['start']),
          RProperty('stop', q_dict['stop'])
        ])
    )

recon_results = p_recon.reconcile(queries, method='post')

In [12]:
# compare with the hand-reconciled CSV file
# ../test-data/OpenContext/Petra Artifacts-reconiled.csv

import csv

#csv_path = "../test-data/OpenContext/Petra Artifacts-reconciled.csv"
csv_path = "../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv"

with contextlib.ExitStack() as stack:
    csvfile = stack.enter_context(open(csv_path))

    df = pd.read_csv(csvfile)
    df = df.fillna('')
    df['rquery'] = df.apply(lambda row: tuple([row[kw_map[path][input_]] for input_ in recon_inputs]), axis=1)

In [13]:
# establish that Adam was consistent in reconciliation judgments

k = df.groupby('rquery').apply(lambda rows: len(rows['recon-match-id'].unique()))
assert np.alltrue(k==1)


In [14]:
# mapping

reconciled_map = df.groupby('rquery').apply(lambda rows: rows['recon-match-id'].unique()[0])

In [15]:
# now compare mapping to recon_results

def id_loc_in_list(id_, list_):
    if len(id_) == 0:
        if len(list_) == 0:
            return -1
        else:
            return -1
        
    pos = -1
    for (i, item) in enumerate(list_):
        if id_ == item:
            pos = i
            break
            
    return pos


matched_results = []

for key in pkap_counts.keys():
    v = recon_results[str(key)]
    result_ids = [r['id'] for r in v['result']]
    print (key, reconciled_map[key], len(result_ids), id_loc_in_list(reconciled_map[key], result_ids))
    matched_results.append({'qt':key, 'candidate0': reconciled_map[key]})


('Late Roman', 'Cyprus', '300', '749') http://n2t.net/ark:/99152/p0dg76fbqff 15 2
('Classical', 'Cyprus', '-474', '-312') http://n2t.net/ark:/99152/p0dg76fk4nc 33 17
('Archaic-Hellenistic', 'Cyprus', '-750', '-100') http://n2t.net/ark:/99152/p0dg76f4rhx 89 0
('Classical-Hellenistic', 'Cyprus', '-474', '-100') http://n2t.net/ark:/99152/p0dg76fkv78 70 0
('Cypro-Archaic', 'Cyprus', '-750', '-312') http://n2t.net/ark:/99152/p08m57h2cv3 1 0
('Roman', 'Cyprus', '-99', '749') http://n2t.net/ark:/99152/p0dg76fx784 43 1
('Iron Age', 'Cyprus', '-1050', '-312') http://n2t.net/ark:/99152/p0dg76f6zd3 97 14
('Hellenistic', 'Cyprus', '-311', '-100') http://n2t.net/ark:/99152/p0dg76fqj57 38 14
('Hellenistic-Early Roman', 'Cyprus', '-311', '299') http://n2t.net/ark:/99152/p0dg76fk9wq 53 0
('Ancient Historic', 'Cyprus', '-750', '749') http://n2t.net/ark:/99152/p0dg76fxkgg 2 0
('Ancient', 'Cyprus', '-9000', '749') http://n2t.net/ark:/99152/p0dg76fj92d 3 0
('Cypro-Geometric', 'Cyprus', '-1050', '-751') ht

In [16]:
# compare the sets of 4-tuples that feed into the reconcilers 

raw_tuples = set(pkap_counts.keys())
reconciled_tuples = set(df['rquery'])

raw_tuples == reconciled_tuples

True

In [17]:
# Check by hand

def reconcile(p_recon, qt, candidate0=None):
    """
    returns all candidates, position of candidate0 among the candidates
    """
    queries = [RQuery(qt[0], label='q0', properties=[
            RProperty('location', qt[1]),
            RProperty('start', qt[2]),
            RProperty('end', qt[3])
    ])]
    r = p_recon.reconcile(queries, method='post')['q0']
    results = r['result']
    
    if candidate0 is not None:
        result_ids = [r['id'] for r in results]
        loc_id = id_loc_in_list(candidate0, result_ids)
        if loc_id > -1:
            matched = results[loc_id]['match']
        else:
            matched = False
    else:
        loc_id = -1
    
    return {
        'results': results,
        'candidate0': candidate0,
        'loc_id':loc_id,
        'matched':matched,
        'results_count': len(results)
    }


    


In [18]:
# r = reconcile(p_recon,('Late Roman', 'Cyprus', '300', '749'),  'http://n2t.net/ark:/99152/p0dg76fbqff')
r = reconcile(p_recon,('Classical', 'Cyprus', '-474', '-312'), 'http://n2t.net/ark:/99152/p0dg76fk4nc')

r['loc_id'], r['results_count'], r['results'][r['loc_id']]['name'] if r['loc_id'] > -1 else '', r['matched']

(17, 33, 'Archaic-Classical [Cyprus, Cyprus: -0749 to -0311]', False)

In [19]:
# Loop through the matches done in OpenReconcile
# and redo the reconcilation, calculating whether the results matched and which results matched

results = []

for match in matched_results:
    r = reconcile(p_recon, match['qt'], match['candidate0'])
    try:
        match_name = r['results'][r['loc_id']]['name'] if (r['loc_id'] > -1 and r['results_count'] > 0) else ''
        results.append({'query': match['qt'][0],
                'location': match['qt'][1],
                'start': match['qt'][2],
                'stop': match['qt'][3],
                'loc_id': r['loc_id'], 
                'results_count': r['results_count'], 
                'match_name': match_name,
                'match_id':match['candidate0'],
                'matched':r['matched']
            })
    except Exception as e:
        print (e)

In [20]:
df2 = DataFrame(results, columns=['query', 'location', 'start', 'stop', 'loc_id', 'results_count',
                                  'match_name', 'match_id', 'matched'])
df2.head()

Unnamed: 0,query,location,start,stop,loc_id,results_count,match_name,match_id,matched
0,Late Roman,Cyprus,300,749,2,15,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,False
1,Classical,Cyprus,-474,-312,17,33,"Archaic-Classical [Cyprus, Cyprus: -0749 to -0...",http://n2t.net/ark:/99152/p0dg76fk4nc,False
2,Archaic-Hellenistic,Cyprus,-750,-100,0,89,"Archaic-Hellenistic [Cyprus, Cyprus: -0749 to ...",http://n2t.net/ark:/99152/p0dg76f4rhx,False
3,Classical-Hellenistic,Cyprus,-474,-100,0,70,"Classical-Hellenistic [Cyprus, Cyprus: -0473 t...",http://n2t.net/ark:/99152/p0dg76fkv78,False
4,Cypro-Archaic,Cyprus,-750,-312,0,1,Cypro-Archaic [Cyprus: -0749 to -0449],http://n2t.net/ark:/99152/p08m57h2cv3,True


In [21]:
df2.matched.value_counts()

False    73
True      5
Name: matched, dtype: int64

In [22]:
df2[df2.matched]

Unnamed: 0,query,location,start,stop,loc_id,results_count,match_name,match_id,matched
4,Cypro-Archaic,Cyprus,-750,-312,0,1,Cypro-Archaic [Cyprus: -0749 to -0449],http://n2t.net/ark:/99152/p08m57h2cv3,True
26,Post-Prehistoric,Cyprus,-999,-999,0,1,"Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]",http://n2t.net/ark:/99152/p0dg76f98q7,True
35,Cypro-Archaic,Cyprus,-750,-475,0,1,Cypro-Archaic [Cyprus: -0749 to -0449],http://n2t.net/ark:/99152/p08m57h2cv3,True
37,Ancient-Medieval,Cyprus,-3500,1570,0,1,"Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]",http://n2t.net/ark:/99152/p0dg76fjh5t,True
41,Roman-Modern,Cyprus,-99,-99,0,1,"Roman-Modern [Cyprus, Cyprus: -0098 to 2008]",http://n2t.net/ark:/99152/p0dg76fd4wb,True


# sort these rows

* matched = True, False (matched desc)
* if loc_id > -1, ==-1
* loc_id ascending 



In [23]:
df2['sort_key'] = df2.apply(lambda row: (not row['matched'], 
                                         row['loc_id'] < 0, 
                                         row['loc_id'], 
                                         len(row['query']) == 0),
                             axis=1)
df2.head()

Unnamed: 0,query,location,start,stop,loc_id,results_count,match_name,match_id,matched,sort_key
0,Late Roman,Cyprus,300,749,2,15,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,False,"(True, False, 2, False)"
1,Classical,Cyprus,-474,-312,17,33,"Archaic-Classical [Cyprus, Cyprus: -0749 to -0...",http://n2t.net/ark:/99152/p0dg76fk4nc,False,"(True, False, 17, False)"
2,Archaic-Hellenistic,Cyprus,-750,-100,0,89,"Archaic-Hellenistic [Cyprus, Cyprus: -0749 to ...",http://n2t.net/ark:/99152/p0dg76f4rhx,False,"(True, False, 0, False)"
3,Classical-Hellenistic,Cyprus,-474,-100,0,70,"Classical-Hellenistic [Cyprus, Cyprus: -0473 t...",http://n2t.net/ark:/99152/p0dg76fkv78,False,"(True, False, 0, False)"
4,Cypro-Archaic,Cyprus,-750,-312,0,1,Cypro-Archaic [Cyprus: -0749 to -0449],http://n2t.net/ark:/99152/p08m57h2cv3,True,"(False, False, 0, False)"


In [24]:
df2.sort_values(by='sort_key')

Unnamed: 0,query,location,start,stop,loc_id,results_count,match_name,match_id,matched,sort_key
35,Cypro-Archaic,Cyprus,-750,-475,0,1,Cypro-Archaic [Cyprus: -0749 to -0449],http://n2t.net/ark:/99152/p08m57h2cv3,True,"(False, False, 0, False)"
37,Ancient-Medieval,Cyprus,-3500,1570,0,1,"Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]",http://n2t.net/ark:/99152/p0dg76fjh5t,True,"(False, False, 0, False)"
4,Cypro-Archaic,Cyprus,-750,-312,0,1,Cypro-Archaic [Cyprus: -0749 to -0449],http://n2t.net/ark:/99152/p08m57h2cv3,True,"(False, False, 0, False)"
26,Post-Prehistoric,Cyprus,-999,-999,0,1,"Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]",http://n2t.net/ark:/99152/p0dg76f98q7,True,"(False, False, 0, False)"
41,Roman-Modern,Cyprus,-99,-99,0,1,"Roman-Modern [Cyprus, Cyprus: -0098 to 2008]",http://n2t.net/ark:/99152/p0dg76fd4wb,True,"(False, False, 0, False)"
22,Ceramic Age,Cyprus,-1700,-1700,0,11,"Ceramic Age [Cyprus, Cyprus: -3499 to 2008]",http://n2t.net/ark:/99152/p0dg76fzg6j,False,"(True, False, 0, False)"
21,Classical-Roman,Cyprus,-474,749,0,75,"Classical-Roman [Cyprus, Cyprus: -0473 to 0749]",http://n2t.net/ark:/99152/p0dg76frgs8,False,"(True, False, 0, False)"
46,Late Cypriot-Roman,Cyprus,-1650,749,0,49,"Late Cypriot-Roman\t [Cyprus, Cyprus: -1649 to...",http://n2t.net/ark:/99152/p0dg76f6jkf,False,"(True, False, 0, False)"
50,Late Cypriot,Cyprus,-1650,-1050,0,7,"Late Bronze Age (= Late Cypriot) [Cyprus, Cypr...",http://n2t.net/ark:/99152/p083p5rpb46,False,"(True, False, 0, False)"
18,Archaic-Classical,Cyprus,-750,-312,0,84,"Archaic-Classical [Cyprus, Cyprus: -0749 to -0...",http://n2t.net/ark:/99152/p0dg76fk4nc,False,"(True, False, 0, False)"


In [25]:
df2.loc_id.value_counts()

-1     25
 0     24
 2      5
 1      4
 9      2
 17     2
 3      2
 14     2
 6      2
 7      1
 47     1
 10     1
 37     1
 13     1
 15     1
 16     1
 20     1
 28     1
 12     1
Name: loc_id, dtype: int64

In [26]:
!ls ../test-data/

[34mOpenContext[m[m                      periodo_simple_example-recon.csv
[34mperiodo_reconciler_testdata[m[m      periodo_simple_example.csv


In [28]:
df2.sort_values(by='sort_key').to_csv('../test-data/OpenContext/Cyprus PKAP Survey-2019.04.02.csv')

## rows matched vs not matched for each test file

In [29]:

rows_html = ("".join(["<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
 csv_path, df.match_num.value_counts().get(1, 0), df.match_num.value_counts().get(0, 0))
for (csv_path, df) in matching_results.items()]))


html_ = """<table>
<tr>
    <th>path</th>
    <th>matches</th>
    <th>non-matches</th>
</tr>
{}
</table>""".format(rows_html)

display(HTML(html_))

path,matches,non-matches
../test-data/OpenContext/Cyprus PKAP Survey.csv,159,8278
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv,159,8278
../test-data/OpenContext/European Cattle with Periods.csv,0,5668
../test-data/OpenContext/Petra Artifacts.csv,105,52382


## which combination of query/location/start/stop were matched?

In [30]:
matching_results.keys()

dict_keys(['../test-data/OpenContext/Cyprus PKAP Survey.csv', '../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv', '../test-data/OpenContext/European Cattle with Periods.csv', '../test-data/OpenContext/Petra Artifacts.csv'])

In [31]:
rows_html = []

for (csv_path, df) in matching_results.items():
    kw = kw_map[csv_path]
    counter = Counter(list(df.apply(lambda row: (row[kw['query']],
                      row[kw['location']],
                      row[kw['start']],
                      row[kw['stop']],
                      row['match_num'],
                      row['match_id'],
                      row['match_name']
                     ), axis=1)))
    
    matching_items = [(k,c) for (k,c) in counter.items() if k[4] > 0]

    for (match, count) in matching_items:
        row_html = """<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td>
                      <td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>""".format(
             csv_path, match[0], match[1], match[2], match[3], count, match[5], match[6])
        rows_html.append(row_html)
        

rows_html = "".join(rows_html)

html_ = """<table>
<tr>
    <th>path</th>
    <th>query</th>
    <th>location</th>
    <th>start</th>
    <th>stop</th>
    <th>num of rows</th>
    <th>match_id</th>
    <th>match_name</th>
</tr>
{}
</table>""".format(rows_html)
        
display(HTML(html_))

path,query,location,start,stop,num of rows,match_id,match_name
../test-data/OpenContext/Cyprus PKAP Survey.csv,Cypro-Archaic,Cyprus,-750,-312,3,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus PKAP Survey.csv,Post-Prehistoric,Cyprus,-999,-999,60,http://n2t.net/ark:/99152/p0dg76f98q7,"Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]"
../test-data/OpenContext/Cyprus PKAP Survey.csv,Cypro-Archaic,Cyprus,-750,-475,1,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus PKAP Survey.csv,Ancient-Medieval,Cyprus,-3500,1570,92,http://n2t.net/ark:/99152/p0dg76fjh5t,"Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]"
../test-data/OpenContext/Cyprus PKAP Survey.csv,Roman-Modern,Cyprus,-99,-99,3,http://n2t.net/ark:/99152/p0dg76fd4wb,"Roman-Modern [Cyprus, Cyprus: -0098 to 2008]"
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv,Cypro-Archaic,Cyprus,-750,-312,3,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv,Post-Prehistoric,Cyprus,-999,-999,60,http://n2t.net/ark:/99152/p0dg76f98q7,"Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]"
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv,Cypro-Archaic,Cyprus,-750,-475,1,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv,Ancient-Medieval,Cyprus,-3500,1570,92,http://n2t.net/ark:/99152/p0dg76fjh5t,"Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]"
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv,Roman-Modern,Cyprus,-99,-99,3,http://n2t.net/ark:/99152/p0dg76fd4wb,"Roman-Modern [Cyprus, Cyprus: -0098 to 2008]"


In [32]:
len(counter)

99

# studying non-matches vs matches

In [33]:
# how to merge two counters?

from collections import Counter

def recon_data(df, csv_path, kw):

    if len(df):
        k0 = df.apply(lambda row: (
                          csv_path,
                          row[kw['query']],
                          row[kw['location']],
                          row[kw['start']],
                          row[kw['stop']],
                          row['match_num'],
                          row['match_id'],
                          row['match_name']
                         ), axis=1)
        k = list(k0)
        return (k)
    else:
        return []
    

matches = Counter()
non_matches = Counter()

for (csv_path, df) in matching_results.items():
    kw = kw_map[csv_path]
    matches.update(recon_data(df[df.match_num > 0], csv_path, kw))
    non_matches.update(recon_data(df[df.match_num == 0], csv_path, kw))



len(matches), len(non_matches)

(16, 408)

In [34]:
(csv_path, df) = list(matching_results.items())[1]
kw = kw_map[csv_path]
df2 = df[df.match_num > 0]
k = recon_data(df2, csv_path, kw)
len(k)

159

In [35]:
for row in matches.keys():
    print(row)

('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Cypro-Archaic', 'Cyprus', '-750', '-312', 1, 'http://n2t.net/ark:/99152/p08m57h2cv3', 'Cypro-Archaic [Cyprus: -0749 to -0449]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Post-Prehistoric', 'Cyprus', '-999', '-999', 1, 'http://n2t.net/ark:/99152/p0dg76f98q7', 'Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Cypro-Archaic', 'Cyprus', '-750', '-475', 1, 'http://n2t.net/ark:/99152/p08m57h2cv3', 'Cypro-Archaic [Cyprus: -0749 to -0449]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Ancient-Medieval', 'Cyprus', '-3500', '1570', 1, 'http://n2t.net/ark:/99152/p0dg76fjh5t', 'Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]')
('../test-data/OpenContext/Cyprus PKAP Survey.csv', 'Roman-Modern', 'Cyprus', '-99', '-99', 1, 'http://n2t.net/ark:/99152/p0dg76fd4wb', 'Roman-Modern [Cyprus, Cyprus: -0098 to 2008]')
('../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv', 'C

In [36]:
[ "|{}|".format("|".join([str(col) for col in row]))
 for row in matches.keys()]

['|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-312|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Post-Prehistoric|Cyprus|-999|-999|1|http://n2t.net/ark:/99152/p0dg76f98q7|Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-475|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Ancient-Medieval|Cyprus|-3500|1570|1|http://n2t.net/ark:/99152/p0dg76fjh5t|Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]|',
 '|../test-data/OpenContext/Cyprus PKAP Survey.csv|Roman-Modern|Cyprus|-99|-99|1|http://n2t.net/ark:/99152/p0dg76fd4wb|Roman-Modern [Cyprus, Cyprus: -0098 to 2008]|',
 '|../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv|Cypro-Archaic|Cyprus|-750|-312|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cy

In [37]:
from IPython.display import Markdown, HTML, display
from jinja2 import Template

matches_template = Template("""
|path|query|location|start|stop|match_id|match_name|
|--|--|--|--|--|--|--|
{% for item in items %}{{item}}\n{% endfor %}
""")

Markdown(matches_template.render(items=[ "|{}|".format("|".join([str(col) for col in row]))
 for row in matches.keys()]))


|path|query|location|start|stop|match_id|match_name|
|--|--|--|--|--|--|--|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-312|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Post-Prehistoric|Cyprus|-999|-999|1|http://n2t.net/ark:/99152/p0dg76f98q7|Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Cypro-Archaic|Cyprus|-750|-475|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Ancient-Medieval|Cyprus|-3500|1570|1|http://n2t.net/ark:/99152/p0dg76fjh5t|Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]|
|../test-data/OpenContext/Cyprus PKAP Survey.csv|Roman-Modern|Cyprus|-99|-99|1|http://n2t.net/ark:/99152/p0dg76fd4wb|Roman-Modern [Cyprus, Cyprus: -0098 to 2008]|
|../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv|Cypro-Archaic|Cyprus|-750|-312|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|
|../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv|Post-Prehistoric|Cyprus|-999|-999|1|http://n2t.net/ark:/99152/p0dg76f98q7|Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]|
|../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv|Cypro-Archaic|Cyprus|-750|-475|1|http://n2t.net/ark:/99152/p08m57h2cv3|Cypro-Archaic [Cyprus: -0749 to -0449]|
|../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv|Ancient-Medieval|Cyprus|-3500|1570|1|http://n2t.net/ark:/99152/p0dg76fjh5t|Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]|
|../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv|Roman-Modern|Cyprus|-99|-99|1|http://n2t.net/ark:/99152/p0dg76fd4wb|Roman-Modern [Cyprus, Cyprus: -0098 to 2008]|
|../test-data/OpenContext/Petra Artifacts.csv|Herodian|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0m63njw2xp|Herodian Period [Israel, Jordan, Israel, Jordan: -0037 to 0070]|
|../test-data/OpenContext/Petra Artifacts.csv|Islamic|Jordan|-200|360|1|http://n2t.net/ark:/99152/p044n254d9s|Medieval Islamic [South eastern Kazakhstan, Kazakhstan: 0750 to 1200]|
|../test-data/OpenContext/Petra Artifacts.csv|Roman Provincial|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0gjgrsmhnc|Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]|
|../test-data/OpenContext/Petra Artifacts.csv|Hasmonaean? or Herodian?|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0m63njw2xp|Herodian Period [Israel, Jordan, Israel, Jordan: -0037 to 0070]|
|../test-data/OpenContext/Petra Artifacts.csv|Roman (Provincial?)|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0gjgrsmhnc|Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]|
|../test-data/OpenContext/Petra Artifacts.csv|Roman (Provincial)?|Jordan|-200|360|1|http://n2t.net/ark:/99152/p0gjgrsmhnc|Greek and Roman Provincial [United Kingdom, United Kingdom: -0699 to 0297]|


In [38]:
# matches that are wrong

queries = [
    RQuery("Not determined", label="with query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),     
    RQuery("", label="empty query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),
    RQuery("Spain", label="use location for query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),
    RQuery("Bronze", label="use location for query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ])   
]

r = p_recon.reconcile(queries, method='post')

r

{'empty query': {'result': []},
 'use location for query': {'result': [{'id': 'http://n2t.net/ark:/99152/p0z3skmnss7',
    'match': False,
    'name': 'Bronze [Palestine, Israel, Jordan: -3299 to -1199]',
    'score': 0,
    'type': [{'id': 'http://www.w3.org/2004/02/skos/core#Concept',
      'name': 'Period definition'}]},
   {'id': 'http://n2t.net/ark:/99152/p03wskdfd83',
    'match': False,
    'name': 'Bronze Age Britain (ca. 2500 - ca. 800 BC/BCE) [Britain, United Kingdom: -2500 to -0800]',
    'score': 1,
    'type': [{'id': 'http://www.w3.org/2004/02/skos/core#Concept',
      'name': 'Period definition'}]},
   {'id': 'http://n2t.net/ark:/99152/p03wskdjqsh',
    'match': False,
    'name': 'Bronze Age Malta (ca. 2,500-700 BC) [Malta, Malta: -2500 to -0700]',
    'score': 1,
    'type': [{'id': 'http://www.w3.org/2004/02/skos/core#Concept',
      'name': 'Period definition'}]},
   {'id': 'http://n2t.net/ark:/99152/p06v8w4bz9k',
    'match': False,
    'name': 'Age de Bronze [Moroc