In [1]:
import csv
import contextlib
import os, errno
from collections import OrderedDict, Counter

from IPython.core.display import display, HTML

from pandas import DataFrame
import pandas as pd
import numpy as np

from periodo_reconciler import (
    RProperty,
    RQuery,
    PeriodoReconciler,
    CsvReconciler
)

In [2]:
def output_path_name(inpath, test_data_dir="../test-data/",
                     test_output_dir="../test-output/"):
    """
    given a path in the test directory, return the path for corresponding output in the test output dir
    """
 
    return os.path.join(test_output_dir, os.path.relpath(inpath, start=test_data_dir))

def makedirs_for_path(fpath):
    """
    make sure that the directory for fpath exists
    """
    
    (fdir, fname) = os.path.split(fpath)
    try:
        os.makedirs(fdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
            
    return True


In [3]:
def reconcile_to_csv_df(csv_path, kw, p_recon):
    """
    Inputs:  
    csv_path: the path of the csv file to reconcile
    kw: a dict holding the query, location, start, stop fields
    p_recon: a PeriodoReconciler object
    
    Side-effect:
    * a csv file with the matches in the output directory
    * returns a Pandas DataFrame holding the input data along with match data
    
    """

    output_path = output_path_name(csv_path)
    makedirs_for_path(output_path)
    
    # https://stackoverflow.com/a/19412700/7782
    with contextlib.ExitStack() as stack:
        csvfile = stack.enter_context(open(csv_path))
        outputfile = stack.enter_context(open(output_path, "w"))                  

        # p_recon = PeriodoReconciler(host='localhost:8142')
        c_recon = CsvReconciler(csvfile, p_recon, **kw)

        matches = list(c_recon.matches())

        c_recon.to_csv(outputfile, matches)
        df = DataFrame(matches)

        return df


# simple example of running the reconciler against a CSV file

In [4]:
# simple example

csv_path = "../test-data/periodo_simple_example.csv"
kw = {
   'location': 'location',
   'query': 'query',
   'start': 'start',
   'stop': 'end',
   'transpose_query': True,
   'match_top_candidate': True
}

p_recon = PeriodoReconciler(host='localhost:8142')
df = reconcile_to_csv_df(csv_path, kw, p_recon)

df.match_num.value_counts()

1    4
0    1
Name: match_num, dtype: int64

In [5]:
df

Unnamed: 0,query,location,start,end,candidates_count,match_num,match_name,match_id
0,北宋,,,,1,1,"Northern Song [China, China: 0960 to 1127]",http://n2t.net/ark:/99152/p0fp7wvjvn8
1,bronze age,,,,83,0,"Bronze [Palestine, Israel, Jordan: -3299 to -1...",http://n2t.net/ark:/99152/p0z3skmnss7
2,Ранньоримський,Ukraine,200.0,600.0,1,1,"Ранньоримський період [Ukraine, Ukraine: -0049...",http://n2t.net/ark:/99152/p06v8w4dbcf
3,Late Roman,Cyprus,300.0,749.0,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
4,"Roman, Late",Cyprus,300.0,749.0,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff


# OpenContext examples

In [6]:
# the list of OpenContext CSV files along with the columns to be fed to the reconciler

OPENCONTEXT_TEST_FILES = [{'csv_path': '../test-data/OpenContext/Cyprus PKAP Survey.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Period',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'transpose_query': True}},
 {'csv_path': '../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv',
  'kw': {'location': 'Context (1)',
   'query': 'periodo-pre-match',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE'}},
 {'csv_path': '../test-data/OpenContext/European Cattle with Periods.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Period',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'ignored_queries':'Not determined'
  }},
 {'csv_path': '../test-data/OpenContext/Petra Artifacts.csv',
  'kw': {'location': 'Context (1)',
   'query': 'Culture',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'ignored_queries':'other,lb'
    }}]


kw_map = dict([(f['csv_path'], f['kw']) for f in OPENCONTEXT_TEST_FILES])

In [7]:
# write out incantation for the command line CSV reconciler
# TO DO: handle the boolean option correctly

for f in OPENCONTEXT_TEST_FILES:
    options_from_kw = " ".join(['--{}="{}"'.format(k,v) for (k,v) in f['kw'].items()])
    print('periodo-reconciler-py {} "{}" -'.format(options_from_kw, f['csv_path'] ))

periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" --transpose_query="True" "../test-data/OpenContext/Cyprus PKAP Survey.csv" -
periodo-reconciler-py --location="Context (1)" --query="periodo-pre-match" --start="Early BCE/CE" --stop="Late BCE/CE" "../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv" -
periodo-reconciler-py --location="Context (1)" --query="Period" --start="Early BCE/CE" --stop="Late BCE/CE" --ignored_queries="Not determined" "../test-data/OpenContext/European Cattle with Periods.csv" -
periodo-reconciler-py --location="Context (1)" --query="Culture" --start="Early BCE/CE" --stop="Late BCE/CE" --ignored_queries="other,lb" "../test-data/OpenContext/Petra Artifacts.csv" -


In [8]:
p_recon = PeriodoReconciler(host='localhost:8142')
csv_path = '../test-data/OpenContext/Cyprus PKAP Survey.csv'
kw = {'location': 'Context (1)',
   'query': 'Period',
   'start': 'Early BCE/CE',
   'stop': 'Late BCE/CE',
   'transpose_query': True,
   'match_top_candidate': True
}

df = reconcile_to_csv_df(csv_path, kw, p_recon)

In [9]:
df.head()

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context (4),Context (5),Context URI,...,Chronotype,Collection Type,Material,Period,Extant Part,Fabric group,candidates_count,match_num,match_name,match_id
0,http://opencontext.org/subjects/8C2A609B-B20B-...,Batch 131 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Coarse Ware, Roman Late",Survey,pottery,"Roman, Late",rim,coarse,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
1,http://opencontext.org/subjects/636D6826-BCBD-...,Batch 17,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Dhekeleia,Unit 223,,http://opencontext.org/subjects/8E773523-4391-...,...,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
2,http://opencontext.org/subjects/CA8C3EE0-0F09-...,Batch 19,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,West Pyla,Unit 546,,http://opencontext.org/subjects/34C7E1EC-F44D-...,...,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
3,http://opencontext.org/subjects/622BEE81-FC52-...,Batch 140 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Black Glazed, Attic, Classical",Survey,pottery,Classical,handle,fine,8,0,Classical (Greco-Roman; 550 BC-330 BC) [Afghan...,http://n2t.net/ark:/99152/p03wskd389m
4,http://opencontext.org/subjects/58920419-0245-...,Batch 7 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Storage, Archaic Basket Handle",Survey,pottery,Archaic-Hellenistic,handle,coarse,19,0,"Archaic-Hellenistic [Cyprus, Cyprus: -0749 to ...",http://n2t.net/ark:/99152/p0dg76f4rhx


In [10]:
# p_recon._call_reconciler.cache_info()

In [11]:
df.head()

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context (4),Context (5),Context URI,...,Chronotype,Collection Type,Material,Period,Extant Part,Fabric group,candidates_count,match_num,match_name,match_id
0,http://opencontext.org/subjects/8C2A609B-B20B-...,Batch 131 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Coarse Ware, Roman Late",Survey,pottery,"Roman, Late",rim,coarse,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
1,http://opencontext.org/subjects/636D6826-BCBD-...,Batch 17,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Dhekeleia,Unit 223,,http://opencontext.org/subjects/8E773523-4391-...,...,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
2,http://opencontext.org/subjects/CA8C3EE0-0F09-...,Batch 19,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,West Pyla,Unit 546,,http://opencontext.org/subjects/34C7E1EC-F44D-...,...,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
3,http://opencontext.org/subjects/622BEE81-FC52-...,Batch 140 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Black Glazed, Attic, Classical",Survey,pottery,Classical,handle,fine,8,0,Classical (Greco-Roman; 550 BC-330 BC) [Afghan...,http://n2t.net/ark:/99152/p03wskd389m
4,http://opencontext.org/subjects/58920419-0245-...,Batch 7 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Storage, Archaic Basket Handle",Survey,pottery,Archaic-Hellenistic,handle,coarse,19,0,"Archaic-Hellenistic [Cyprus, Cyprus: -0749 to ...",http://n2t.net/ark:/99152/p0dg76f4rhx


In [12]:
matching_results = {}

p_recon = PeriodoReconciler(host='localhost:8142')

for file in OPENCONTEXT_TEST_FILES:
    csv_path = file['csv_path']
    kw = file['kw']
    print ("\r{}".format(csv_path), end='')
    df = reconcile_to_csv_df(csv_path, kw, p_recon)
    matching_results[csv_path] = df # df.match_num.value_counts()


../test-data/OpenContext/Petra Artifacts.csvh Periods.csvv

In [13]:
from itertools import islice

for (csv_path, df) in islice(matching_results.items(),None):
    print (csv_path, df.match_num.value_counts().get(0), df.match_num.value_counts().get(1))

        

../test-data/OpenContext/Cyprus PKAP Survey.csv 5381 3056
../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv 5381 3056
../test-data/OpenContext/European Cattle with Periods.csv 5270 398
../test-data/OpenContext/Petra Artifacts.csv 52181 306


# PKAP

In [14]:
df = matching_results[OPENCONTEXT_TEST_FILES[0]['csv_path']]

df.head()

Unnamed: 0,URI,Item Label,Project Label,Project URI,Context (1),Context (2),Context (3),Context (4),Context (5),Context URI,...,Chronotype,Collection Type,Material,Period,Extant Part,Fabric group,candidates_count,match_num,match_name,match_id
0,http://opencontext.org/subjects/8C2A609B-B20B-...,Batch 131 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Coarse Ware, Roman Late",Survey,pottery,"Roman, Late",rim,coarse,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
1,http://opencontext.org/subjects/636D6826-BCBD-...,Batch 17,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Dhekeleia,Unit 223,,http://opencontext.org/subjects/8E773523-4391-...,...,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
2,http://opencontext.org/subjects/CA8C3EE0-0F09-...,Batch 19,Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,West Pyla,Unit 546,,http://opencontext.org/subjects/34C7E1EC-F44D-...,...,"Amphora, Late Roman 1",Survey,pottery,"Roman, Late",handle,amphora,1,1,"Late Roman [Cyprus, Cyprus: 0300 to 0749]",http://n2t.net/ark:/99152/p0dg76fbqff
3,http://opencontext.org/subjects/622BEE81-FC52-...,Batch 140 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Black Glazed, Attic, Classical",Survey,pottery,Classical,handle,fine,8,0,Classical (Greco-Roman; 550 BC-330 BC) [Afghan...,http://n2t.net/ark:/99152/p03wskd389m
4,http://opencontext.org/subjects/58920419-0245-...,Batch 7 (exp),Pyla-Koutsopetria Archaeological Project I: Pe...,http://opencontext.org/projects/3F6DCD13-A476-...,Cyprus,PKAP Survey Area,Vigla,Unit 501,,http://opencontext.org/subjects/C3EDBDC2-6B6B-...,...,"Storage, Archaic Basket Handle",Survey,pottery,Archaic-Hellenistic,handle,coarse,19,0,"Archaic-Hellenistic [Cyprus, Cyprus: -0749 to ...",http://n2t.net/ark:/99152/p0dg76f4rhx


In [15]:
len(df)

8437

In [16]:
df.candidates_count.value_counts()

1     3056
2     2813
5      789
0      443
24     384
10     202
12     178
11     137
8      121
6       97
7       69
9       50
18      40
19      40
3        8
16       5
15       4
4        1
Name: candidates_count, dtype: int64

# PKAP

In [17]:
matching_results.keys()

dict_keys(['../test-data/OpenContext/Cyprus PKAP Survey.csv', '../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv', '../test-data/OpenContext/European Cattle with Periods.csv', '../test-data/OpenContext/Petra Artifacts.csv'])

# PKAP specifically

In [18]:
# let's look at the PKAP dataset
# look for unique 4-tuples in the PKAP dataset

from collections import Counter

recon_inputs = ['query', 'location', 'start', 'stop']


path = '../test-data/OpenContext/Cyprus PKAP Survey.csv'
#path = '../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv'

#df = matching_results[path]
df = pd.read_csv(path)
df = df.fillna('')

_ = df[[kw_map[path][input_] for input_ in recon_inputs]]
pkap_counts = Counter([tuple(x) for x in _.values])
len(pkap_counts)

79

In [19]:
# if we include the results with the inputs, do we get the same counts?

In [20]:
df.columns

Index(['URI', 'Item Label', 'Project Label', 'Project URI', 'Context (1)',
       'Context (2)', 'Context (3)', 'Context (4)', 'Context (5)',
       'Context URI', 'Latitude (WGS-84)', 'Longitude (WGS-84)',
       'Early BCE/CE', 'Late BCE/CE', 'Item Category', 'Published Date',
       'Updated Date', 'Has type', 'Chronotype', 'Collection Type', 'Material',
       'Period', 'Extant Part', 'Fabric group'],
      dtype='object')

In [21]:
in_out_columns = [kw_map[path][input_] for input_ in recon_inputs] + list(CsvReconciler.match_column_fields)
in_out_columns

_ = df[in_out_columns]
pkap_in_out_counts = Counter([tuple(x) for x in _.values])
len(pkap_in_out_counts)

KeyError: "['match_num' 'match_name' 'match_id' 'candidates_count'] not in index"

In [None]:
df[df.candidates_count == 0][in_out_columns[0]].value_counts()

In [None]:
df[df[in_out_columns[0]] == 'Roman, Late'].match_id.value_counts()

In [None]:


df[in_out_columns[0]].value_counts()

In [None]:
pkap_counts

In [None]:
queries = []

# use str(key) as label

for key in pkap_counts.keys():

    q_dict = dict(zip(recon_inputs, key))
    queries.append(
        RQuery(q_dict['query'], label=str(key), 
        properties=[
          RProperty('location', q_dict['location']),
          RProperty('start', q_dict['start']),
          RProperty('stop', q_dict['stop'])
        ])
    )

recon_results = p_recon.reconcile(queries, method='post')

In [None]:
# compare with the hand-reconciled CSV file
# ../test-data/OpenContext/Petra Artifacts-reconiled.csv

import csv

#csv_path = "../test-data/OpenContext/Petra Artifacts-reconciled.csv"
csv_path = "../test-data/OpenContext/Cyprus-PKAP-Survey-2019-03-29.csv"

with contextlib.ExitStack() as stack:
    csvfile = stack.enter_context(open(csv_path))

    df = pd.read_csv(csvfile)
    df = df.fillna('')
    df['rquery'] = df.apply(lambda row: tuple([row[kw_map[path][input_]] for input_ in recon_inputs]), axis=1)

In [None]:
# establish that Adam was consistent in reconciliation judgments

k = df.groupby('rquery').apply(lambda rows: len(rows['recon-match-id'].unique()))
assert np.alltrue(k==1)

In [None]:
# mapping

reconciled_map = df.groupby('rquery').apply(lambda rows: rows['recon-match-id'].unique()[0])

In [None]:
# now compare mapping to recon_results

def id_loc_in_list(id_, list_):
    if len(id_) == 0:
        if len(list_) == 0:
            return -1
        else:
            return -1
        
    pos = -1
    for (i, item) in enumerate(list_):
        if id_ == item:
            pos = i
            break
            
    return pos


matched_results = []

for key in pkap_counts.keys():
    v = recon_results[str(key)]
    result_ids = [r['id'] for r in v['result']]
    print (key, reconciled_map[key], len(result_ids), id_loc_in_list(reconciled_map[key], result_ids))
    matched_results.append({'qt':key, 'candidate0': reconciled_map[key]})


In [None]:
# compare the sets of 4-tuples that feed into the reconcilers 

raw_tuples = set(pkap_counts.keys())
reconciled_tuples = set(df['rquery'])

raw_tuples == reconciled_tuples

In [None]:
# Check by hand

def reconcile(p_recon, qt, candidate0=None):
    """
    returns all candidates, position of candidate0 among the candidates
    """
    queries = [RQuery(qt[0], label='q0', properties=[
            RProperty('location', qt[1]),
            RProperty('start', qt[2]),
            RProperty('end', qt[3])
    ])]
    r = p_recon.reconcile(queries, method='post')['q0']
    
    # import pdb; pdb.set_trace()
    
    results = r['result']
    
    if candidate0 is not None:
        result_ids = [r['id'] for r in results]
        loc_id = id_loc_in_list(candidate0, result_ids)
        if loc_id > -1:
            matched = results[loc_id]['match']
        else:
            matched = False
    else:
        loc_id = -1
    
    return {
        'results': results,
        'candidate0': candidate0,
        'loc_id':loc_id,
        'matched':matched,
        'results_count': len(results)
    }


    


In [None]:
# r = reconcile(p_recon,('Late Roman', 'Cyprus', '300', '749'),  'http://n2t.net/ark:/99152/p0dg76fbqff')
r = reconcile(p_recon,('Classical', 'Cyprus', '-474', '-312'), 'http://n2t.net/ark:/99152/p0dg76fk4nc')

# r = reconcile(p_recon,('Ceramic Age', 'Cyprus', '-1700', '-1700'), 'http://n2t.net/ark:/99152/p0dg76fzg6j')

 
r['loc_id'], r['results_count'], r['results'][r['loc_id']]['name'] if r['loc_id'] > -1 else '', r['matched']

In [None]:
# Loop through the matches done in OpenReconcile
# and redo the reconcilation, calculating whether the results matched and which results matched

results = []

for match in matched_results:
    r = reconcile(p_recon, match['qt'], match['candidate0'])
    try:
        match_name = r['results'][r['loc_id']]['name'] if (r['loc_id'] > -1 and r['results_count'] > 0) else ''
        results.append({'query': match['qt'][0],
                'location': match['qt'][1],
                'start': match['qt'][2],
                'stop': match['qt'][3],
                'loc_id': r['loc_id'], 
                'results_count': r['results_count'], 
                'match_name': match_name,
                'match_id':match['candidate0'],
                'matched':r['matched']
            })
    except Exception as e:
        print (e)

In [None]:
df2 = DataFrame(results, columns=['query', 'location', 'start', 'stop', 'loc_id', 'results_count',
                                  'match_name', 'match_id', 'matched'])
df2.head()

In [None]:
df2.matched.value_counts()

In [None]:
df2[df2.matched]

# sort these rows

* matched = True, False (matched desc)
* if loc_id > -1, ==-1
* loc_id ascending 



In [None]:
df2['sort_key'] = df2.apply(lambda row: (not row['matched'], 
                                         row['loc_id'] < 0, 
                                         row['loc_id'], 
                                         len(row['query']) == 0),
                             axis=1)
df2.head()

In [None]:
df2.sort_values(by='sort_key')

In [None]:
df2.loc_id.value_counts()

In [None]:
!ls ../test-data/

In [None]:
df2.sort_values(by='sort_key').to_csv('../test-data/OpenContext/Cyprus PKAP Survey-2019.04.02.csv')

## rows matched vs not matched for each test file

In [None]:

rows_html = ("".join(["<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
 csv_path, df.match_num.value_counts().get(1, 0), df.match_num.value_counts().get(0, 0))
for (csv_path, df) in matching_results.items()]))


html_ = """<table>
<tr>
    <th>path</th>
    <th>matches</th>
    <th>non-matches</th>
</tr>
{}
</table>""".format(rows_html)

display(HTML(html_))

## which combination of query/location/start/stop were matched?

In [None]:
matching_results.keys()

In [None]:
rows_html = []

for (csv_path, df) in matching_results.items():
    kw = kw_map[csv_path]
    counter = Counter(list(df.apply(lambda row: (row[kw['query']],
                      row[kw['location']],
                      row[kw['start']],
                      row[kw['stop']],
                      row['match_num'],
                      row['match_id'],
                      row['match_name']
                     ), axis=1)))
    
    matching_items = [(k,c) for (k,c) in counter.items() if k[4] > 0]

    for (match, count) in matching_items:
        row_html = """<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td>
                      <td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>""".format(
             csv_path, match[0], match[1], match[2], match[3], count, match[5], match[6])
        rows_html.append(row_html)
        

rows_html = "".join(rows_html)

html_ = """<table>
<tr>
    <th>path</th>
    <th>query</th>
    <th>location</th>
    <th>start</th>
    <th>stop</th>
    <th>num of rows</th>
    <th>match_id</th>
    <th>match_name</th>
</tr>
{}
</table>""".format(rows_html)
        
display(HTML(html_))

In [None]:
len(counter)

# studying non-matches vs matches

In [None]:
# how to merge two counters?

from collections import Counter

def recon_data(df, csv_path, kw):

    if len(df):
        k0 = df.apply(lambda row: (
                          csv_path,
                          row[kw['query']],
                          row[kw['location']],
                          row[kw['start']],
                          row[kw['stop']],
                          row['match_num'],
                          row['match_id'],
                          row['match_name']
                         ), axis=1)
        k = list(k0)
        return (k)
    else:
        return []
    

matches = Counter()
non_matches = Counter()

for (csv_path, df) in matching_results.items():
    kw = kw_map[csv_path]
    matches.update(recon_data(df[df.match_num > 0], csv_path, kw))
    non_matches.update(recon_data(df[df.match_num == 0], csv_path, kw))



len(matches), len(non_matches)

In [None]:
(csv_path, df) = list(matching_results.items())[1]
kw = kw_map[csv_path]
df2 = df[df.match_num > 0]
k = recon_data(df2, csv_path, kw)
len(k)

In [None]:
for row in matches.keys():
    print(row)

In [None]:
[ "|{}|".format("|".join([str(col) for col in row]))
 for row in matches.keys()]

In [None]:
from IPython.display import Markdown, HTML, display
from jinja2 import Template

matches_template = Template("""
|path|query|location|start|stop|match_id|match_name|
|--|--|--|--|--|--|--|
{% for item in items %}{{item}}\n{% endfor %}
""")

Markdown(matches_template.render(items=[ "|{}|".format("|".join([str(col) for col in row]))
 for row in matches.keys()]))

In [None]:
# matches that are wrong

queries = [
    RQuery("Not determined", label="with query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),     
    RQuery("", label="empty query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),
    RQuery("Spain", label="use location for query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ]),
    RQuery("Bronze", label="use location for query",  properties=[
        RProperty('location', 'Spain'),
        RProperty('start', -1500),
        RProperty('end', -714)
    ])   
]

r = p_recon.reconcile(queries, method='post')

r