## Imports

In [102]:
import base64
import csv
import json
import os
from urllib.parse import quote

import pandas as pd
import requests
from IPython.display import HTML

## Config params

In [130]:
# Omeka API credentials; check_table() sends them as query parameters with every request.
# NOTE(security): credentials should not live in a shared notebook. Set the
# OMEKA_KEY_IDENTITY / OMEKA_KEY_CREDENTIAL environment variables to override
# them; the literals below are kept only as fallbacks so existing runs keep
# working, and should be rotated and removed.
API_KEYS = {
    "key_identity": os.environ.get("OMEKA_KEY_IDENTITY", "4iNoLCQW8QAtfTf4OudZaujaAmnHyQVd"),
    "key_credential": os.environ.get("OMEKA_KEY_CREDENTIAL", "eeUuqkTlrJrH2pBUeXIVgwj823DtfABX"),
}

# NOTE(review): the base URL is plain http, so the keys above travel unencrypted.
OMEKA_BASE_URL = "http://137.204.168.11/palread/"
OMEKA_BASE_API = OMEKA_BASE_URL + "api/"
OMEKA_API_ITEMS = OMEKA_BASE_API + "items"
# Misspelled historical name, kept as an alias because other cells reference it.
OMAKE_API_ITEMS = OMEKA_API_ITEMS

# For each item class: the resource template id and the property id that the
# query URLs filter on.
OMEKA_VOCAB_MAP = {
    "Person": {"resource_template_id": 5, "property_id": 303},
    "Periodical": {"resource_template_id": 4, "property_id": 301},
    "Article": {"resource_template_id": 12, "property_id": 301}
}

def name_parts(x):
    """Split a name string into its whitespace-separated parts.

    Tokens that consist of a lone comma are dropped, so ``"Doe , John"``
    yields ``["Doe", "John"]``.

    Args:
        x: the name as a single string.

    Returns:
        A list with one string per name part, in their original order.
    """
    # str.split() without arguments already trims surrounding whitespace and
    # never yields empty strings, so only the stand-alone comma has to be
    # filtered out. (The previous `!= "" or != ","` test was always true and
    # filtered nothing.)
    return [part for part in x.split() if part != ","]

def _omeka_query_factory(item_class, match_type, query_id, preprocessing=None):
    """Build the ``col_name -> query description`` callable stored in QUERY.

    Args:
        item_class: key of OMEKA_VOCAB_MAP selecting the resource template and
            the property the query filters on.
        match_type: value placed in ``property[0][type]`` ("in" or "eq" here).
        query_id: identifier under which the results of this query are grouped.
        preprocessing: optional callable turning a cell value into the list of
            values to search; omitted from the description when None.

    Returns:
        A function taking the TSV column name and returning the query
        description dict (keys: "query", "col_name", "item_class", "query_id"
        and, when given, "preprocessing").
    """
    def for_column(col_name):
        def build_url(val):
            # Both ids come from the same vocabulary entry. The previous code
            # looked the property id up through an undefined global
            # `item_class`, which raised NameError on a fresh kernel.
            vocab = OMEKA_VOCAB_MAP[item_class]
            return (
                OMAKE_API_ITEMS + "?"
                + "resource_template_id=" + str(vocab["resource_template_id"])
                + "&property[0][property]=" + str(vocab["property_id"])
                + "&property[0][type]=" + match_type
                # Percent-encode the searched text so characters such as
                # "&", "#" or "+" cannot alter the query string.
                + "&property[0][text]=" + quote(str(val), safe="")
            )

        query_obj = {
            "query": build_url,
            "col_name": col_name,
            "item_class": item_class,
            "query_id": query_id,
        }
        if preprocessing is not None:
            query_obj["preprocessing"] = preprocessing
        return query_obj

    return for_column


# Available query types. Each entry is called with the TSV column name and
# returns the query description consumed by check_table().
QUERY = {
    # Persons: the name is split into parts and every part is searched with "in".
    "person_name": _omeka_query_factory(
        "Person", "in", "omeka_person",
        preprocessing=lambda x: name_parts(x),
    ),
    # Periodicals and articles: the whole cell value is searched with "eq".
    "magazine_title": _omeka_query_factory("Periodical", "eq", "omeka_magazine"),
    "article_title": _omeka_query_factory("Article", "eq", "omeka_article"),
}

## Functions

In [171]:
def _fetch_item_ids(url, timeout):
    """Run one Omeka API query and return the set of ``o:id`` values it found."""
    response = requests.get(url, params=API_KEYS, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to read item ids out of
    # an error payload.
    response.raise_for_status()
    payload = response.json()
    if not isinstance(payload, list):
        raise ValueError("Unexpected Omeka API response for " + url + ": expected a JSON list of items")
    # NOTE(review): only the items contained in this single response are
    # considered; if the API paginates its results, queries with many matches
    # may be truncated — confirm.
    return {item["o:id"] for item in payload}


def check_table(operations, output_print=True, timeout=30):
    """Look up the rows of one or more TSV tables in Omeka.

    For every row and every query configured for its table, the value of the
    query's column is (optionally) preprocessed into several search values,
    one API request is made per search value, and the item ids returned by
    all requests are intersected. The row then gets a new
    ``<col_name>_omeka`` column holding:

    * ``"NONE"`` when no item matches every search value,
    * the item id when exactly one item matches,
    * ``"DOUBTFULL"`` when several items match.

    Args:
        operations: dict mapping the path of a TSV file to a list of query
            descriptions (dicts with "query", "col_name", "query_id" and
            optionally "preprocessing").
        output_print: when True, print the searched values and the ids found
            for every query.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A tuple ``(res_tables, results)``: ``res_tables`` maps each table path
        to a DataFrame of its rows with the added ``*_omeka`` columns;
        ``results`` maps each table path and query id to the original cell
        values grouped under "in_omeka", "doubtfull" and "not_in_omeka".

    Raises:
        requests.HTTPError: when the API answers with an error status.
        ValueError: when the API response is not a JSON list.
    """
    res_tables = dict()
    results = dict()

    for tab_key, all_queries in operations.items():
        checked_rows = []
        results[tab_key] = {}
        # Explicit UTF-8 so non-ASCII cell values are read the same way on
        # every platform; newline="" is what the csv module expects.
        with open(tab_key, encoding="utf-8", newline="") as tsv_file:
            reader = csv.DictReader(tsv_file, delimiter='\t')
            for row in reader:
                for query_obj in all_queries:
                    query_id = query_obj["query_id"]
                    col_name = query_obj["col_name"]
                    buckets = results[tab_key].setdefault(
                        query_id,
                        {"in_omeka": [], "doubtfull": [], "not_in_omeka": []},
                    )

                    # prepare the queries: one per search value
                    cell_value = row[col_name]
                    if "preprocessing" in query_obj:
                        val = query_obj["preprocessing"](cell_value)
                    else:
                        val = [cell_value]
                    api_queries = [query_obj["query"](a_val) for a_val in val]

                    # call the API
                    query_res_ids = [_fetch_item_ids(a_query, timeout) for a_query in api_queries]

                    if output_print:
                        print("Queries performed: "+str(len(api_queries))+"\nValues searched: "+str(val)+ "\nResults found (Items ids in Omeka) for each query: "+ str(query_res_ids))

                    # An item matches only if every query returned it. The
                    # intersection is recomputed from scratch for each row, so
                    # a query without results (or a cell without any search
                    # value) yields an empty set rather than reusing the
                    # previous row's outcome.
                    res_set = set.intersection(*query_res_ids) if query_res_ids else set()

                    if output_print:
                        print("Results intersection: "+str(res_set)+"\n")

                    if len(res_set) == 0:
                        row[col_name+"_omeka"] = "NONE"
                        buckets["not_in_omeka"].append(cell_value)
                    elif len(res_set) == 1:
                        row[col_name+"_omeka"] = next(iter(res_set))
                        buckets["in_omeka"].append(cell_value)
                    else:
                        row[col_name+"_omeka"] = "DOUBTFULL"
                        buckets["doubtfull"].append(cell_value)

                checked_rows.append(row)

        res_tables[tab_key] = pd.DataFrame(checked_rows)

    return (res_tables, results)

    
def create_download_link(res_tables):
    """Build one HTML download link per result table.

    Each table is serialised as tab-separated text and embedded in the link
    as a base64 data URI, so the file can be saved without writing anything
    to disk.

    Args:
        res_tables: dict mapping the path of the original TSV file to the
            DataFrame holding its checked rows.

    Returns:
        An ``IPython.display.HTML`` object containing the links, one per line.
    """
    links = []
    for tab, df in res_tables.items():
        filename = tab.replace(".tsv","__omekaids.tsv")
        title = "Download the TSV: "+filename
        # The download is named and announced as a TSV, so write it
        # tab-separated (to_csv defaults to commas) and label it with the
        # matching media type.
        tsv_text = df.to_csv(index=False, sep="\t")
        payload = base64.b64encode(tsv_text.encode()).decode()
        link = '<a download="{filename}" href="data:text/tab-separated-values;charset=utf-8;base64,{payload}" target="_blank">{title}</a>'
        links.append(link.format(payload=payload, title=title, filename=filename) + "<br>")
    return HTML("<div>" + "".join(links) + "</div>")


def print_stats(results):
    """Print a per-table, per-query summary of the lookup outcome.

    Args:
        results: dict mapping each table to a dict of query ids, each holding
            the lists "not_in_omeka", "doubtfull" and "in_omeka".
    """
    # (bucket key, sentence fragment) pairs, in the order they are reported.
    report_lines = (
        ("not_in_omeka", " items HAVEN'T been found in Omeka: "),
        ("doubtfull", " items are doubtfull (need a manual check): "),
        ("in_omeka", " items HAVE been found in Omeka: "),
    )
    for table_name, per_query in results.items():
        print("Table: " + table_name)
        for query_id, buckets in per_query.items():
            print("  * Query: " + query_id)
            for bucket_key, label in report_lines:
                values = buckets[bucket_key]
                print("    -  A total of " + str(len(values)) + label + str(values))
            print("\n")
        print("\n")

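## Inputs – this is the only part that should be defined by the user
#### *OPERATIONS*: maps the path of each TSV table that needs to be checked to the list of queries to run on it
#### Each query is written as `QUERY[<query type>](<column name>)`, where the column name is the column of the TSV table used to identify its records
#### The query type (`person_name`, `magazine_title` or `article_title`) determines the class of items searched in Omeka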

In [156]:
# USER INPUTS

# OPERATIONS maps <path to the table> to [a list of queries] run on each of its rows.
# Every query is created as QUERY[<query type>](<name of the TSV column to look up>).
OPERATIONS = {
    # Disabled entry; uncomment to also check this table.
    #"alsharekh magazine archive - author_sample.tsv":[
    #    QUERY["person_name"]("authorName") 
    #],
    "alsharekh magazine archive - authors list.tsv":[
        QUERY["person_name"]("authorName") 
    ],
    "alsharekh magazine archive - articles.tsv":[
        QUERY["article_title"]("articleTitle"),
        QUERY["person_name"]("authorName"),
        QUERY["magazine_title"]("magazineName")
    ],
    # NOTE(review): "preiodicals" looks like a misspelling of "periodicals" —
    # confirm it matches the actual file name before changing it.
    "alsharekh magazine archive - preiodicals.tsv":[
        # NOTE(review): this column is searched as a person name although it is
        # called editorInChiefID — confirm it holds names rather than ids.
        QUERY["person_name"]("editorInChiefID"),
        QUERY["magazine_title"]("magazineName")
    ]
}


## Main

In [174]:
# Run every configured check; check_table returns the pair (tables, statistics).
res = check_table(OPERATIONS, output_print=False)
omeka_tables, omeka_stats = res

print_stats(omeka_stats)

print("\nDownload the new TSV tables (with the corresponding Omeka ids)")
# Last expression of the cell, so the links are rendered as its output.
create_download_link(omeka_tables)

Table: alsharekh magazine archive - author_sample.tsv
  * Query: omeka_author
    -  A total of 0 items HAVEN'T been found in Omeka: []
    -  A total of 0 items are doubtfull (need a manual check): []
    -  A total of 6 items HAVE been found in Omeka: ['فيصل دراج', 'فيصل دراج', 'فيصل دراج', 'فيصل دراج', 'فيصل دراج', 'فيصل دراج']





Download the new TSV tables (with the corresponding Omeka ids)
