In [1]:
import csv
import io
import contextlib
import os, errno
from collections import OrderedDict, Counter

from IPython.core.display import display, HTML

from pandas import DataFrame
import pandas as pd
import numpy as np

from periodo_reconciler import (
    RProperty,
    RQuery,
    PeriodoReconciler,
    CsvReconciler
)

In [2]:
def output_path_name(inpath, test_data_dir="../test-data/",
                     test_output_dir="../test-output/"):
    """
    Map a file under the test-data directory to its mirror location under
    the test-output directory.

    inpath: path of a file, expected to sit below test_data_dir
    test_data_dir: root of the input tree
    test_output_dir: root of the output tree

    Returns test_output_dir joined with inpath's path relative to test_data_dir.
    """
    # Part of the path that is specific to this file within the input tree.
    relative_part = os.path.relpath(inpath, start=test_data_dir)
    mirrored = os.path.join(test_output_dir, relative_part)
    return mirrored

def makedirs_for_path(fpath):
    """
    Make sure that the directory that will contain fpath exists.

    fpath: path of a file; only its directory part is created, the file
           itself is not touched.

    Returns True once the directory is known to exist (or when fpath has no
    directory part, in which case there is nothing to create).
    Raises OSError if the directory cannot be created.
    """
    fdir = os.path.dirname(fpath)

    # A bare file name has an empty directory part; os.makedirs('') would
    # raise, so there is nothing to do in that case.
    if fdir:
        # exist_ok=True makes an already-existing directory a no-op.
        os.makedirs(fdir, exist_ok=True)

    return True


In [3]:
def reconcile_to_csv_df(csv_path, kw, p_recon):
    """
    Reconcile one CSV file and collect the results as DataFrames.

    Inputs:
    csv_path: the path of the csv file to reconcile
    kw: a dict of keyword arguments forwarded to CsvReconciler
        (e.g. the query, location, start, stop fields)
    p_recon: a PeriodoReconciler object

    Side-effect:
    * a csv file with the matches is written to the path given by
      output_path_name(csv_path); its directory is created if needed

    Returns:
    * a tuple (df, df_summary): df is a Pandas DataFrame built from the
      matches produced by the reconciler; df_summary is a DataFrame parsed
      from the match summary CSV the reconciler emits

    """

    output_path = output_path_name(csv_path)
    makedirs_for_path(output_path)

    # https://stackoverflow.com/a/19412700/7782
    with contextlib.ExitStack() as stack:
        # Explicit UTF-8 so non-ASCII queries do not depend on the platform's
        # default encoding. newline="" is what the csv module documentation
        # asks for on files used for CSV reading/writing.
        # NOTE(review): assumes CsvReconciler goes through the csv module - confirm.
        csvfile = stack.enter_context(
            open(csv_path, encoding="utf-8", newline=""))
        outputfile = stack.enter_context(
            open(output_path, "w", encoding="utf-8", newline=""))

        c_recon = CsvReconciler(csvfile, p_recon, **kw)

        # Materialize once: the same matches feed both the CSV and the DataFrame.
        matches = list(c_recon.matches())

        match_summary = io.StringIO()
        c_recon.match_summary_to_csv(match_summary)

        c_recon.to_csv(outputfile, matches)

    # Both files are closed here; the remaining work is in-memory only.
    df = DataFrame(matches)
    df_summary = pd.read_csv(io.StringIO(match_summary.getvalue()))

    return (df, df_summary)


# simple example of running the reconciler against a CSV file

In [4]:
# Reconcile the small sample file and show how many rows matched
# alongside the per-query summary.

csv_path = "../test-data/periodo_simple_example.csv"

# column names in the sample file, plus reconciler options
kw = dict(
    location='location',
    query='query',
    start='start',
    stop='end',
    transpose_query=True,
    match_top_candidate=True,
)

p_recon = PeriodoReconciler(host='localhost:8142')
df, match_summary = reconcile_to_csv_df(csv_path, kw, p_recon)

(df['match_num'].value_counts(), match_summary)

(1    4
 0    1
 Name: match_num, dtype: int64,
             query location  start   stop  match_num  \
 0              北宋      NaN    NaN    NaN          1   
 1      bronze age      NaN    NaN    NaN          0   
 2  Ранньоримський  Ukraine  200.0  600.0          1   
 3      Late Roman   Cyprus  300.0  749.0          1   
 4     Roman, Late   Cyprus  300.0  749.0          1   
 
                                           match_name  \
 0         Northern Song [China, China: 0960 to 1127]   
 1  Bronze [Palestine, Israel, Jordan: -3299 to -1...   
 2  Ранньоримський період [Ukraine, Ukraine: -0049...   
 3          Late Roman [Cyprus, Cyprus: 0300 to 0749]   
 4          Late Roman [Cyprus, Cyprus: 0300 to 0749]   
 
                                 match_id  candidates_count  match_fallback_id  \
 0  http://n2t.net/ark:/99152/p0fp7wvjvn8                 1                NaN   
 1  http://n2t.net/ark:/99152/p0z3skmnss7                83                NaN   
 2  http://n2t.net/ark:/

# OpenContext examples

In [5]:
# OpenContext CSV files to reconcile. Each entry pairs a file path
# ('csv_path') with the keyword arguments ('kw') -- column names and
# options -- to be fed to the reconciler for that file.

OPENCONTEXT_TEST_FILES = [
    {
        'csv_path': '../test-data/OpenContext/Cyprus PKAP Survey.csv',
        'kw': {
            'location': 'Context (1)',
            'query': 'Period',
            'start': 'Early BCE/CE',
            'stop': 'Late BCE/CE',
            'transpose_query': True,
            'match_top_candidate': True,
        },
    },
    {
        'csv_path': '../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv',
        'kw': {
            'location': 'Context (1)',
            'query': 'periodo-pre-match',
            'start': 'Early BCE/CE',
            'stop': 'Late BCE/CE',
        },
    },
    {
        'csv_path': '../test-data/OpenContext/European Cattle with Periods.csv',
        'kw': {
            'location': 'Context (1)',
            'query': 'Period',
            'start': 'Early BCE/CE',
            'stop': 'Late BCE/CE',
            'ignored_queries': 'Not determined',
        },
    },
    {
        'csv_path': '../test-data/OpenContext/Petra Artifacts.csv',
        'kw': {
            'location': 'Context (1)',
            'query': 'Culture',
            'start': 'Early BCE/CE',
            'stop': 'Late BCE/CE',
            'ignored_queries': 'other,lb',
        },
    },
]


# lookup of the reconciler keyword arguments by csv path
kw_map = {entry['csv_path']: entry['kw'] for entry in OPENCONTEXT_TEST_FILES}

In [6]:
# Reconcile every OpenContext file and keep the results keyed by csv path.
matching_results = {}  # csv_path -> first DataFrame returned by reconcile_to_csv_df
match_summaries = {}   # csv_path -> summary DataFrame returned by reconcile_to_csv_df

p_recon = PeriodoReconciler(host='localhost:8142')

# The progress text is rewritten in place with "\r". Pad it to the longest
# path so a shorter path fully overwrites a longer one instead of leaving
# the tail of the previous path on screen.
progress_width = max(
    (len(f['csv_path']) for f in OPENCONTEXT_TEST_FILES), default=0)

for file in OPENCONTEXT_TEST_FILES:
    csv_path = file['csv_path']
    kw = file['kw']
    print("\r{:<{width}}".format(csv_path, width=progress_width), end='')
    (df, match_summary) = reconcile_to_csv_df(csv_path, kw, p_recon)
    matching_results[csv_path] = df
    match_summaries[csv_path] = match_summary


../test-data/OpenContext/Petra Artifacts.csvh Periods.csvv

In [7]:
from itertools import islice

# For each reconciled file, print how many rows have match_num == 0 and how
# many have match_num == 1 (None is printed if a value never occurs).
# islice(..., None) applies no limit; swap None for a small number to look
# at only the first few files.
for (csv_path, df) in islice(matching_results.items(), None):
    # Count once per frame rather than recomputing value_counts() per lookup.
    match_num_counts = df.match_num.value_counts()
    print(csv_path, match_num_counts.get(0), match_num_counts.get(1))

        

../test-data/OpenContext/Cyprus PKAP Survey.csv 5381 3056
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv 5381 3056
../test-data/OpenContext/European Cattle with Periods.csv 5270 398
../test-data/OpenContext/Petra Artifacts.csv 52181 306


# PKAP

In [8]:
# Select the results for the first entry of OPENCONTEXT_TEST_FILES
# (the Cyprus PKAP Survey file). Later cells read this `df`.
df = matching_results[OPENCONTEXT_TEST_FILES[0]['csv_path']]

df.head()

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context (4),Context (5),Context URI,...,Material,Period,Extant Part,Fabric group,candidates_count,match_num,match_name,match_id,match_fallback_id,match_fallback_name
0,http://opencontext.org/subjects/8C2A609B-B20B-...,Batch 131 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,pottery,"Roman, Late",rim,coarse,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,,
1,http://opencontext.org/subjects/636D6826-BCBD-...,Batch 17,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Dhekeleia,Unit 223,,http://opencontext.org/subjects/8E773523-4391-...,...,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,,
2,http://opencontext.org/subjects/CA8C3EE0-0F09-...,Batch 19,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,West Pyla,Unit 546,,http://opencontext.org/subjects/34C7E1EC-F44D-...,...,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,,
3,http://opencontext.org/subjects/622BEE81-FC52-...,Batch 140 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,pottery,Classical,handle,fine,8,0,Classical (Greco-Roman; 550 BC-330 BC) [Afghan...,http://n2t.net/ark:/99152/p03wskd389m,,
4,http://opencontext.org/subjects/58920419-0245-...,Batch 7 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,pottery,Archaic-Hellenistic,handle,coarse,19,0,"Archaic-Hellenistic [Cyprus, Cyprus: -0749 to ...",http://n2t.net/ark:/99152/p0dg76f4rhx,,


In [9]:
# Row count of `df` (expected to be the PKAP frame selected in the previous cell).
len(df)

8437

In [10]:
# Number of rows for each candidates_count value
# (presumably the number of candidates returned per row -- confirm against the reconciler).
df.candidates_count.value_counts()

1     3056
2     2813
5      789
0      443
24     384
10     202
12     178
11     137
8      121
6       97
7       69
9       50
18      40
19      40
3        8
16       5
15       4
4        1
Name: candidates_count, dtype: int64

In [11]:
# rewrite CSV to patch up rows without any matches

In [12]:
# Scratch experiment: a defaultdict whose missing keys start as an empty Counter,
# so values can be tallied per key without initializing each key first.
from collections import Counter, defaultdict

c = defaultdict(Counter)

# Counter.update takes an iterable and adds one count per element.
c['bach'].update(['1','2'])
c['mozart'].update([3])

In [13]:
# most_common(1) gives a one-element list of (element, count); take that pair.
c['bach'].most_common(1)[0]

('1', 1)

In [14]:
# Show the whole key -> Counter structure built above.
c

defaultdict(collections.Counter,
            {'bach': Counter({'1': 1, '2': 1}), 'mozart': Counter({3: 1})})

In [15]:
# Number of rows per match_fallback_id value in `df`.
# NOTE(review): `df` must still be the PKAP frame selected earlier -- depends on cell order.
df.match_fallback_id.value_counts()

                                         8365
http://n2t.net/ark:/99152/p0dg76f2smn      70
http://n2t.net/ark:/99152/p0dg76fbqff       1
http://n2t.net/ark:/99152/p0dg76fkv78       1
Name: match_fallback_id, dtype: int64

In [16]:
# Rows where both match_id and match_fallback_id are empty strings,
# i.e. rows that carry neither a match id nor a fallback id.
df[(df.match_id == '') & (df.match_fallback_id == '')]

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context (4),Context (5),Context URI,...,Material,Period,Extant Part,Fabric group,candidates_count,match_num,match_name,match_id,match_fallback_id,match_fallback_name
129,http://opencontext.org/subjects/1771E5F4-E411-...,Batch 1,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla-Ridge,Unit 1402,,http://opencontext.org/subjects/A3CE5232-A6A1-...,...,stone/lithics,,rim,,0,0,,,,
553,http://opencontext.org/subjects/7F33E5E7-128F-...,Artifact 51.46,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Paleokastro,Unit 51,Batch 46,http://opencontext.org/subjects/EC9D2A24-5D58-...,...,,,,,0,0,,,,
878,http://opencontext.org/subjects/4C15D78A-03FD-...,Artifact 70.52,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Paleokastro,Unit 70,Batch 52,http://opencontext.org/subjects/F6A912DD-4379-...,...,,,,,0,0,,,,
2215,http://opencontext.org/subjects/35AF81FE-78EE-...,Artifact 204.27,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Kokkinokremos Plain,Unit 204,Batch 27,http://opencontext.org/subjects/71386C7D-3B5C-...,...,,,,,0,0,,,,
3173,http://opencontext.org/subjects/8C921D14-B3E3-...,Artifact 61.33,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Paleokastro,Unit 61,Batch 33,http://opencontext.org/subjects/66E4F080-0820-...,...,,,,,0,0,,,,
3622,http://opencontext.org/subjects/110F1828-175C-...,Artifact 101.9,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Paleokastro,Unit 101,Batch 9,http://opencontext.org/subjects/1ABAE5B5-0CD8-...,...,,,,,0,0,,,,
7348,http://opencontext.org/subjects/E4DFB59D-D5B0-...,Artifact 62.36,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Paleokastro,Unit 62,Batch 36,http://opencontext.org/subjects/C7032FAD-4CCF-...,...,,,,,0,0,,,,
7471,http://opencontext.org/subjects/68F96348-BB88-...,Artifact 9.42,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Koutsopetria - Paleokastro,Unit 9,Batch 42,http://opencontext.org/subjects/06D17669-1EFC-...,...,,,,,0,0,,,,
7608,http://opencontext.org/subjects/6E68F6A7-4002-...,Artifact [18.1].42,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Koutsopetria - Paleokastro,Unit 18,Batch 42,http://opencontext.org/subjects/F2A86EA6-42CD-...,...,,,,,0,0,,,,
7621,http://opencontext.org/subjects/8523872D-763A-...,Artifact 71.45,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Paleokastro,Unit 71,Batch 45,http://opencontext.org/subjects/2D99013E-B7E0-...,...,,,,,0,0,,,,


In [18]:
# Summary frame for the PKAP file. The path literal is the same as
# OPENCONTEXT_TEST_FILES[0]['csv_path'], which was used to select `df` earlier.
df_summary = match_summaries['../test-data/OpenContext/Cyprus PKAP Survey.csv']
df_summary

Unnamed: 0,query,location,start,stop,match_num,match_name,match_id,candidates_count,match_fallback_id,match_fallback_name,row_count
0,Ancient Historic,Cyprus,-750,749,0,"Ancient Historic [Cyprus, Cyprus: -0749 to 0749]",http://n2t.net/ark:/99152/p0dg76fxkgg,2,,,2665
1,"Roman, Late",Cyprus,300,749,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff,1,,,2594
2,Roman,Cyprus,-99,749,0,"Roman [Cyprus, Cyprus: -0098 to 0749]",http://n2t.net/ark:/99152/p0dg76fx784,5,,,766
3,Late Bronze Age-Hellenistic,Cyprus,-1650,-100,0,"Late Bronze Age (= Late Cypriot) [Cyprus, Cypr...",http://n2t.net/ark:/99152/p083p5rpb46,24,,,384
4,,Cyprus,300,749,0,,,0,,,216
5,Iron Age,Cyprus,-1050,-312,0,"Iron Age [Cyprus, Cyprus: -1049 to -0311]",http://n2t.net/ark:/99152/p0dg76f6zd3,10,,,201
6,Unknown,Cyprus,-9000,-9000,1,"Unknown [Cyprus, Cyprus: -8999 to 2008]",http://n2t.net/ark:/99152/p0dg76f2smn,1,,,179
7,Hellenistic,Cyprus,-311,-100,0,"Hellenistic [Cyprus, Cyprus: -0310 to -0099]",http://n2t.net/ark:/99152/p0dg76fqj57,11,,,130
8,Hellenistic-Early Roman,Cyprus,-311,299,0,"Hellenistic-Early Roman [Cyprus, Cyprus: -0310...",http://n2t.net/ark:/99152/p0dg76fk9wq,12,,,130
9,"Roman, Early",Cyprus,-99,299,0,"Early Roman [Cyprus, Cyprus: -0098 to 0299]",http://n2t.net/ark:/99152/p0dg76fpqkt,2,,,125


In [19]:
# Sanity check: total of the per-query row counts in the summary;
# compare with len(df) for the same file.
df_summary.row_count.sum()

8437