In [1]:
import csv
import contextlib
import os, errno
from collections import OrderedDict, Counter

from IPython.core.display import display, HTML

from pandas import DataFrame

from periodo_reconciler import (
    RProperty,
    RQuery,
    PeriodoReconciler,
    CsvReconciler
)

In [2]:
def output_path_name(inpath, test_data_dir="../test-data/",
                     test_output_dir="../test-output/"):
    """
    Map a path inside the test data tree to the matching path in the output tree.

    inpath: path of a file, normally located under test_data_dir
    test_data_dir: root directory of the input tree
    test_output_dir: root directory of the output tree

    Returns test_output_dir joined with the part of inpath that is relative
    to test_data_dir.  Nothing is created on disk.
    """
    relative_part = os.path.relpath(inpath, start=test_data_dir)
    mirrored_path = os.path.join(test_output_dir, relative_part)
    return mirrored_path

def makedirs_for_path(fpath):
    """
    Make sure that the directory which is to contain fpath exists.

    fpath: path of a file; the file itself is neither created nor required
           to exist

    Returns True once the parent directory is known to exist.
    Raises OSError if the directory cannot be created, including the case
    where something that is not a directory already occupies that path.
    """

    fdir = os.path.dirname(fpath)

    # A bare file name has no directory component: it lives in the current
    # working directory, and os.makedirs("") would raise FileNotFoundError.
    if fdir:
        # exist_ok=True tolerates a directory that is already there
        os.makedirs(fdir, exist_ok=True)

    return True


In [3]:
def reconcile_to_csv_df(csv_path, kw, p_recon):
    """
    Reconcile one csv file, write the matches to disk and return them.

    Inputs:
    csv_path: the path of the csv file to reconcile
    kw: a dict holding the query, location, start, stop fields
        (passed on to CsvReconciler as keyword arguments)
    p_recon: a PeriodoReconciler object

    Returns:
    a Pandas DataFrame holding the input data along with match data

    Side-effect:
    * a csv file with the matches is written to the path that
      output_path_name derives from csv_path; missing parent directories
      are created first
    """

    output_path = output_path_name(csv_path)
    makedirs_for_path(output_path)

    # newline="" is what the csv module documentation asks for on file objects
    # given to csv readers/writers: without it, newlines embedded in quoted
    # fields are misread and an extra "\r" is written on platforms whose line
    # ending is "\r\n".
    # NOTE(review): assumes CsvReconciler reads/writes through the csv module -- confirm.
    with open(csv_path, newline="") as csvfile, \
            open(output_path, "w", newline="") as outputfile:
        c_recon = CsvReconciler(csvfile, p_recon, **kw)

        # materialise once: the same list feeds both the csv file and the DataFrame
        matches = list(c_recon.matches())

        c_recon.to_csv(outputfile, matches)
        return DataFrame(matches)


# simple example

In [4]:
# simple example: reconcile the small sample file against the reconciler on localhost

csv_path = "../test-data/periodo_simple_example.csv"

# reconciler field -> name of the csv column that supplies it
kw = dict(location='location', query='query', start='start', stop='end')

p_recon = PeriodoReconciler(host='localhost:8142')
df = reconcile_to_csv_df(csv_path, kw, p_recon)

# number of rows for each match_num value
df['match_num'].value_counts()

1    2
0    1
Name: match_num, dtype: int64

# OpenContext examples

In [5]:
# the list of OpenContext CSV files along with the columns to be fed to the reconciler

OPENCONTEXT_TEST_FILES = [
    {
        'csv_path': '../test-data/OpenContext/Cyprus PKAP Survey.csv',
        'kw': {
            'location': 'Context (1)',
            'query': 'Period',
            'start': 'Early BCE/CE',
            'stop': 'Late BCE/CE',
        },
    },
    {
        'csv_path': '../test-data/OpenContext/European Cattle with Periods.csv',
        'kw': {
            'location': 'Context (1)',
            'query': 'Period',
            'start': 'Early BCE/CE',
            'stop': 'Late BCE/CE',
        },
    },
    {
        'csv_path': '../test-data/OpenContext/Petra Artifacts.csv',
        'kw': {
            'location': 'Context (1)',
            'query': 'Culture',
            'start': 'Early BCE/CE',
            'stop': 'Late BCE/CE',
        },
    },
]

# lookup table: csv path -> the kw dict declared for that file
kw_map = {entry['csv_path']: entry['kw'] for entry in OPENCONTEXT_TEST_FILES}

In [6]:
# write out incantation for the command line CSV reconciler:
# one line per test file, every kw entry rendered as a --key="value" option,
# followed by the quoted csv path and a trailing "-"

for f in OPENCONTEXT_TEST_FILES:
    options_from_kw = " ".join(
        '--{}="{}"'.format(option, column) for (option, column) in f['kw'].items()
    )
    command_line = 'periodo-reconciler-py {} "{}" -'.format(options_from_kw, f['csv_path'])
    print(command_line)

periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" "../test-data/OpenContext/Cyprus PKAP Survey.csv" -
periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" "../test-data/OpenContext/European Cattle with Periods.csv" -
periodo-reconciler-py --location="Context (1)" --query="Culture" --start="Early BCE/CE" --stop="Late BCE/CE" "../test-data/OpenContext/Petra Artifacts.csv" -


In [7]:
# Reconcile every OpenContext test file; keep one result DataFrame per csv path.
matching_results = {}

p_recon = PeriodoReconciler(host='localhost:8142')

for file in OPENCONTEXT_TEST_FILES:
    csv_path = file['csv_path']
    kw = file['kw']
    # Progress log, one line per file.  Rewriting a single line with "\r"
    # does not clear it, so a shorter path printed after a longer one left
    # the tail of the longer path on screen.  flush=True makes the name
    # appear before the reconciliation call below returns.
    print(csv_path, flush=True)
    df = reconcile_to_csv_df(csv_path, kw, p_recon)
    matching_results[csv_path] = df


../test-data/OpenContext/Petra Artifacts.csvh Periods.csv

## rows matched vs not matched for each test file

In [8]:

# Tally match_num once per file (csv path -> Series of counts indexed by match_num).
match_counts = {csv_path: df.match_num.value_counts()
                for (csv_path, df) in matching_results.items()}

# One table row per file: rows with match_num == 1, then rows with match_num == 0.
# .get(label, 0) reports 0 instead of raising KeyError when a file has no
# rows with that match_num value.
rows_html = "".join(
    "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
        csv_path, counts.get(1, 0), counts.get(0, 0))
    for (csv_path, counts) in match_counts.items())


html_ = """<table>
<tr>
    <th>path</th>
    <th>matches</th>
    <th>non-matches</th>
</tr>
{}
</table>""".format(rows_html)

display(HTML(html_))

path,matches,non-matches
../test-data/OpenContext/Cyprus PKAP Survey.csv,159,8278
../test-data/OpenContext/European Cattle with Periods.csv,11,5657
../test-data/OpenContext/Petra Artifacts.csv,114,52373


## which combinations of query/location/start/stop were matched?

In [9]:
rows_html = []

for (csv_path, df) in matching_results.items():
    kw = kw_map[csv_path]

    # One tuple per data row: the four columns named in kw (query, location,
    # start, stop) followed by the three match columns.
    row_keys = df.apply(
        lambda row: (
            row[kw['query']],
            row[kw['location']],
            row[kw['start']],
            row[kw['stop']],
            row['match_num'],
            row['match_id'],
            row['match_name'],
        ),
        axis=1,
    )

    # how many rows share each tuple
    counter = Counter(list(row_keys))

    # keep only the tuples whose match_num (position 4) is positive
    matching_items = [item for item in counter.items() if item[0][4] > 0]

    for (match, count) in matching_items:
        row_html = """<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td>
                      <td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>""".format(
             csv_path, match[0], match[1], match[2], match[3], count, match[5], match[6])
        rows_html.append(row_html)


# collapse the collected rows into one string for the table body
rows_html = "".join(rows_html)

html_ = """<table>
<tr>
    <th>path</th>
    <th>query</th>
    <th>location</th>
    <th>start</th>
    <th>stop</th>
    <th>num of rows</th>
    <th>match_id</th>
    <th>match_name</th>
</tr>
{}
</table>""".format(rows_html)

display(HTML(html_))

path,query,location,start,stop,num of rows,match_id,match_name
../test-data/OpenContext/Cyprus PKAP Survey.csv,Cypro-Archaic,Cyprus,-750,-312,3,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus PKAP Survey.csv,Post-Prehistoric,Cyprus,-999,-999,60,http://n2t.net/ark:/99152/p0dg76f98q7,"Post-Prehistoric [Cyprus, Cyprus: -0998 to 2008]"
../test-data/OpenContext/Cyprus PKAP Survey.csv,Cypro-Archaic,Cyprus,-750,-475,1,http://n2t.net/ark:/99152/p08m57h2cv3,Cypro-Archaic [Cyprus: -0749 to -0449]
../test-data/OpenContext/Cyprus PKAP Survey.csv,Ancient-Medieval,Cyprus,-3500,1570,92,http://n2t.net/ark:/99152/p0dg76fjh5t,"Ancient-Medieval [Cyprus, Cyprus: -3499 to 1570]"
../test-data/OpenContext/Cyprus PKAP Survey.csv,Roman-Modern,Cyprus,-99,-99,3,http://n2t.net/ark:/99152/p0dg76fd4wb,"Roman-Modern [Cyprus, Cyprus: -0098 to 2008]"
../test-data/OpenContext/European Cattle with Periods.csv,Not determined,Spain,-1500,-714,7,http://n2t.net/ark:/99152/p086kj9s4tx,"Not Reported [Missouri, Missouri: 1950 to 1950]"
../test-data/OpenContext/European Cattle with Periods.csv,Not determined,Spain,-2000,-714,2,http://n2t.net/ark:/99152/p086kj9s4tx,"Not Reported [Missouri, Missouri: 1950 to 1950]"
../test-data/OpenContext/European Cattle with Periods.csv,Not determined,Italy,-9500,1950,2,http://n2t.net/ark:/99152/p086kj9s4tx,"Not Reported [Missouri, Missouri: 1950 to 1950]"
../test-data/OpenContext/Petra Artifacts.csv,Herodian,Jordan,-200,360,4,http://n2t.net/ark:/99152/p0m63njw2xp,"Herodian Period [Israel, Jordan, Israel, Jordan: -0037 to 0070]"
../test-data/OpenContext/Petra Artifacts.csv,Islamic,Jordan,-200,360,93,http://n2t.net/ark:/99152/p044n254d9s,"Medieval Islamic [South eastern Kazakhstan, Kazakhstan: 0750 to 1200]"


In [10]:
# Number of distinct tuples in `counter`.  `counter` is rebound on every pass
# of the loop in the previous cell, so this covers only the last file iterated.
len(counter)

99

# studying non-matches vs matches

In [11]:
# the csv paths for which a result DataFrame was stored
matching_results.keys()

dict_keys(['../test-data/OpenContext/Cyprus PKAP Survey.csv', '../test-data/OpenContext/European Cattle with Periods.csv', '../test-data/OpenContext/Petra Artifacts.csv'])

In [12]:
# let's study the first csv file: take its result DataFrame out of
# matching_results (keyed by csv path) and preview the first rows

df = matching_results[OPENCONTEXT_TEST_FILES[0]['csv_path']]
df.head()

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context (4),Context (5),Context URI,...,Has type,Chronotype,Collection Type,Material,Period,Extant Part,Fabric group,match_num,match_name,match_id
0,http://opencontext.org/subjects/8C2A609B-B20B-...,Batch 131 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,,"Coarse Ware, Roman Late",Survey,pottery,"Roman, Late",rim,coarse,0,,
1,http://opencontext.org/subjects/636D6826-BCBD-...,Batch 17,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Dhekeleia,Unit 223,,http://opencontext.org/subjects/8E773523-4391-...,...,Late Roman 1 Amphora; amphora,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,0,,
2,http://opencontext.org/subjects/CA8C3EE0-0F09-...,Batch 19,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,West Pyla,Unit 546,,http://opencontext.org/subjects/34C7E1EC-F44D-...,...,Late Roman 1 Amphora; amphora,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,0,,
3,http://opencontext.org/subjects/622BEE81-FC52-...,Batch 140 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,,"Black Glazed, Attic, Classical",Survey,pottery,Classical,handle,fine,0,,
4,http://opencontext.org/subjects/58920419-0245-...,Batch 7 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,,"Storage, Archaic Basket Handle",Survey,pottery,Archaic-Hellenistic,handle,coarse,0,,
