# Omeka Lookup

## Imports

In [None]:
!pip install pandas
!pip install requests
!pip install numpy

In [1]:
import requests
import json
import csv
import pandas as pd
from IPython.display import HTML
import base64
from datetime import datetime
import ssl
# Workaround for the error "SSL: CERTIFICATE_VERIFY_FAILED".
# WARNING: this replaces the ssl module's default HTTPS context factory with the
# unverified one, so code that relies on that default no longer checks server
# certificates. The override is process-wide and stays active until the kernel
# is restarted. Keep it only when a source really needs it.
# NOTE(review): presumably needed for loading remote tables by URL - confirm.
ssl._create_default_https_context = ssl._create_unverified_context

## Omeka Address
### Note: the user should define this section

In [89]:
# Root URL of the Omeka S installation to query.
# Keep the trailing slash: the API address is built by appending "api/" to this value.
OMEKA_BASE_URL = "http://160.45.15.183/omeka-s/"

## Config params 
### Note: only the person configuring the notebook should edit this section

In [94]:
# Root of the Omeka API, derived from the user-supplied base URL.
OMEKA_BASE_API = OMEKA_BASE_URL + "api/"
# Items endpoint used by every query defined in QUERY.
# NOTE: the name is spelled "OMAKE" (not "OMEKA"); the queries refer to this exact spelling.
OMAKE_API_ITEMS = OMEKA_BASE_API + "items"
# For each item class, the ids placed in the `resource_template_id` and
# `property[0][property]` parameters of the items query.
# NOTE(review): these ids are presumably specific to one Omeka installation - verify
# them whenever OMEKA_BASE_URL changes.
OMEKA_VOCAB_MAP = {
    "Person": {"resource_template_id": 13, "property_id": 245},
    "Periodical": {"resource_template_id": 12, "property_id": 242},
    "Article": {"resource_template_id": 3, "property_id": 242},
    "Book": {"resource_template_id": 4, "property_id": 242}
    # Alternative ids kept for reference (presumably from another installation - TODO confirm):
    #"Person": {"resource_template_id": 5, "property_id": 303},
    #"Periodical": {"resource_template_id": 4, "property_id": 301},
    #"Article": {"resource_template_id": 12, "property_id": 301}
}

# Here we define the preprocessing functions.
# Each preprocessing function must return a list.
# Each element of the list becomes one API call (i.e., one query).
# A result must satisfy all the queries (AND operation): only the items returned by every query are kept.
def name_parts(x):
    """Split a person name into the tokens that are searched one by one.

    Args:
        x: the name as a single string (e.g. "Jane Doe").

    Returns:
        The whitespace-separated tokens of `x`, in order, without the tokens
        that are empty or consist of a lone comma.
    """
    # str.split() without arguments splits on runs of whitespace, so the tokens
    # carry no surrounding blanks. A token is dropped when it is empty or a lone
    # comma; the original test joined the two conditions with `or`, which is
    # always true and therefore never filtered anything.
    return [token for token in x.split() if token not in ("", ",")]

def article_parts(x):
    """Return a one-element list holding the text of `x` before its first ";".

    When `x` contains no ";" the whole string is returned unchanged.
    """
    leading_part, _, _ = x.partition(";")
    return [leading_part]

def _omeka_text_query_url(item_class, match_type, val):
    """Build the items-API URL that searches one property of `item_class` for `val`.

    Args:
        item_class: key of OMEKA_VOCAB_MAP ("Person", "Periodical", ...).
        match_type: value of the `property[0][type]` parameter ("in" or "eq").
        val: the text to search for; it is converted with str().

    Returns:
        The full URL as a string. The searched text is percent-encoded so that
        characters such as "&", "#" or "+" stay part of the value instead of
        being read as URL syntax.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    # Looked up at call time, so a changed configuration is picked up by existing queries.
    vocab = OMEKA_VOCAB_MAP[item_class]
    return (
        OMAKE_API_ITEMS + "?"
        + "resource_template_id=" + str(vocab["resource_template_id"])
        + "&property[0][property]=" + str(vocab["property_id"])
        + "&property[0][type]=" + match_type
        + "&property[0][text]=" + quote(str(val), safe="")
    )


# Each entry of QUERY is a factory that takes:
#   col_name  (REQUIRED) the table column whose values are looked up
#   multi_val (OPTIONAL) the separator used when a cell holds several values
# and returns the query description consumed by check_table.
QUERY = {
    # QUERY-1
    "person_name": lambda col_name, multi_val=None: {
        "query": lambda val: _omeka_text_query_url("Person", "in", val),
        "col_name": col_name,
        "item_class": "Person",
        # Optional preprocessing step: it must return a list. The query is run on
        # each element of the list and the Omeka item is taken from the
        # intersection of the results.
        "preprocessing": lambda x: name_parts(x),
        "multivalues": multi_val,
        "query_id": "omeka_person"
    },

    # QUERY-2
    "magazine_title": lambda col_name, multi_val=None: {
        "query": lambda val: _omeka_text_query_url("Periodical", "eq", val),
        "col_name": col_name,
        "multivalues": multi_val,
        "item_class": "Periodical",
        "query_id": "omeka_magazine"
    },

    # QUERY-3
    "article_title": lambda col_name, multi_val=None: {
        "query": lambda val: _omeka_text_query_url("Article", "eq", val),
        "col_name": col_name,
        "multivalues": multi_val,
        "item_class": "Article",
        "preprocessing": lambda x: article_parts(x),
        "query_id": "omeka_article"
    },

    # QUERY-4
    "book_title": lambda col_name, multi_val=None: {
        "query": lambda val: _omeka_text_query_url("Book", "eq", val),
        "col_name": col_name,
        "multivalues": multi_val,
        "item_class": "Book",
        "query_id": "omeka_book"
    }
}

## Inputs 
### Note: the user should define this section

In [90]:
# Omeka API credentials. check_table passes this dict as the query parameters
# (`params`) of every API request it sends.
# Fill the values in locally and do not commit or share real keys with the notebook.
API_KEYS = {
    "key_identity": "",
    "key_credential": ""
}

# Each table we want to analyze must be specified in a separate block with these keys:
# "table": the name used for the generated table
# "source": the source of the table; it can be a local PATH or a URL
# "queries": the list of queries to run on each row of the table

# The available queries are the ones defined in the config variable QUERY.
# To insert a query, refer to it by name: QUERY[<key_of_the_query>] (e.g. QUERY["person_name"]).
# Then pass, between round brackets, the column of the table used as input to the query (e.g. QUERY["person_name"]("name")).

OPERATIONS = [
    {
        "table": "authors",
        # Remote copy of the same sample table:
        #"source": "https://raw.githubusercontent.com/palread/import_csv/main/sample/alsharekh%20magazine%20archive%20-%20authors__sample.tsv",
        "source": "sample/alsharekh magazine archive - authors__sample.tsv", # LOCAL
        "queries": [
            # Between round brackets you must specify:
            # the name of the column (REQUIRED)
            # the string used as separator when a cell holds multiple values (OPTIONAL)
            QUERY["person_name"]("name", multi_val="_;;_") 
        ]
    }
]

## Functions

In [91]:
def _lookup_item_ids(query_obj, value, output_print=True):
    """Run every API query derived from one value and intersect the item ids found.

    Args:
        query_obj: a query description as produced by the QUERY factories.
        value: the single value to look up.
        output_print: when true, print the queries and their results.

    Returns:
        The set of Omeka item ids ("o:id") returned by ALL the queries. The set
        is empty when any query finds nothing or when preprocessing yields no
        search term.

    Raises:
        requests.HTTPError: when the API answers with an error status.
    """
    if "preprocessing" in query_obj:
        search_terms = query_obj["preprocessing"](value)
    else:
        search_terms = [value]

    api_queries = [query_obj["query"](term) for term in search_terms]
    if output_print:
        print("The queries (after preprocessing): ", api_queries)

    ids_per_query = []
    for a_query in api_queries:
        response = requests.get(a_query, params=API_KEYS)
        # Fail with a clear HTTP error instead of tripping over an error payload below.
        response.raise_for_status()
        json_results = json.loads(response.content)
        ids_per_query.append({item["o:id"] for item in json_results})

    if output_print:
        print("Queries performed: "+str(len(api_queries))+"\nValues searched: "+str(search_terms)+ "\nResults found (Items ids in Omeka) for each query: "+ str(ids_per_query))

    # No search term means no query and therefore no match (previously this
    # case left the result as None and crashed on len()).
    matching_ids = set.intersection(*ids_per_query) if ids_per_query else set()

    if output_print:
        print("Results intersection: "+str(matching_ids)+"\n")
    return matching_ids


def check_table(operations, output_print=True):
    """Look up the values of the configured table columns in Omeka.

    For every entry of `operations` the source table is read and, for each row
    and each query, a column "<col_name>_omeka" is added. It holds one label per
    looked-up value, joined by ", ":
      * the Omeka item id when exactly one item matches,
      * "DOUBTFULL" when several items match,
      * "NONE" when no item matches.
    A cell with a missing value yields an empty "<col_name>_omeka" cell and is
    not counted in the statistics.

    Args:
        operations: list of dicts with the keys "table", "source" and "queries".
        output_print: when true, print the details of every lookup.

    Returns:
        A tuple (res_tables, none_tables, results):
          * res_tables: table name -> DataFrame with the added "_omeka" columns.
          * none_tables: item class -> DataFrame with a "value" column listing
            each distinct value that was not found.
          * results: table name -> query id -> dict with the lists "in_omeka",
            "doubtfull" and "not_in_omeka".
    """
    res_tables = dict()
    none_tables_index = dict()
    none_tables = dict()
    results = dict()

    for entry in operations:
        tab_name = entry["table"]
        source = entry["source"]
        all_queries = entry["queries"]

        results[tab_name] = {}
        res_rows = []

        separator = '\t' if 'tsv' in source else ','
        # `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was
        # removed in pandas 2.0; it needs pandas >= 1.3.
        df_table = pd.read_csv(source, on_bad_lines="skip", sep=separator, header=0)

        for _, row in df_table.iterrows():
            res_row = row.copy()

            for query_obj in all_queries:
                col_name = query_obj["col_name"]
                query_id = query_obj["query_id"]
                item_class = query_obj["item_class"]
                multi_separator = query_obj.get("multivalues")
                query_stats = results[tab_name].setdefault(
                    query_id, {"in_omeka": [], "doubtfull": [], "not_in_omeka": []}
                )

                cell = row[col_name]
                if pd.api.types.is_scalar(cell) and pd.isna(cell):
                    # Empty cell: there is nothing to look up.
                    values = []
                elif multi_separator is not None:
                    values = str(cell).split(multi_separator)
                else:
                    values = [cell]

                labels = []
                for org_val in values:
                    matching_ids = _lookup_item_ids(query_obj, org_val, output_print)

                    if len(matching_ids) == 0:
                        labels.append("NONE")
                        query_stats["not_in_omeka"].append(org_val)

                        # Also list the value, once, in the per-class table of missing items.
                        already_listed = none_tables_index.setdefault(item_class, set())
                        missing_rows = none_tables.setdefault(item_class, [])
                        if org_val not in already_listed:
                            already_listed.add(org_val)
                            missing_rows.append({"value": org_val})
                    elif len(matching_ids) == 1:
                        labels.append(str(next(iter(matching_ids))))
                        query_stats["in_omeka"].append(org_val)
                    else:
                        labels.append("DOUBTFULL")
                        query_stats["doubtfull"].append(org_val)

                # Joining per query keeps every "_omeka" column free of a trailing
                # separator (previously only the last query's column was trimmed).
                res_row[col_name+"_omeka"] = ", ".join(labels)

            res_rows.append(res_row)

        res_tables[tab_name] = pd.DataFrame(res_rows)

    for rsc_template in none_tables:
        none_tables[rsc_template] = pd.DataFrame(none_tables[rsc_template])

    return (res_tables, none_tables, results)

    
def create_download_links(res_tables, tab_type):
    """Build one in-notebook download link per table.

    Each table is serialised as tab-separated text, matching the ".tsv" file
    name and the "Download the TSV" link title, and embedded in the link as a
    base64 data URI.

    Args:
        res_tables: dict mapping a table name (str) to a pandas DataFrame.
        tab_type: label inserted in the file names (e.g. "overview", "new").

    Returns:
        An IPython HTML object holding a <div> with one <a> element per table,
        each followed by <br>.
    """
    from html import escape  # local import: only this function needs it

    # One timestamp for the whole batch, so all the files of a run share it.
    timestamp = datetime.timestamp(datetime.now())
    name_suffix = "__"+str(tab_type)+"__"+str(timestamp).replace(".","_")+".tsv"

    link_template = '<a download="{filename}" href="data:text/tab-separated-values;charset=utf-8;base64,{payload}" target="_blank">{title}</a>'

    str_html = ""
    for tab_k, df in res_tables.items():
        filename = tab_k + name_suffix
        title = "Download the TSV: "+filename
        # sep="\t" makes the content a real TSV; the default separator is a comma.
        tsv_text = df.to_csv(index=False, sep="\t")
        payload = base64.b64encode(tsv_text.encode("utf-8")).decode("ascii")
        # Escape the names so that quotes or "&" cannot break the markup.
        str_html += link_template.format(
            payload=payload,
            title=escape(title),
            filename=escape(filename, quote=True),
        ) + "<br>"
    return HTML("<div>"+str_html+"</div>")


def print_stats(results):
    """Print, for every table and query, how many values fall in each outcome.

    Args:
        results: dict mapping a table name to a dict that maps a query id to
            the lists "not_in_omeka", "doubtfull" and "in_omeka".
    """
    # Outcome key and the text printed between the count and the list of values.
    report_lines = (
        ("not_in_omeka", " items HAVEN'T been found in Omeka: "),
        ("doubtfull", " items are doubtfull (need a manual check): "),
        ("in_omeka", " items HAVE been found in Omeka: "),
    )

    for table_name, per_query in results.items():
        print("Table: " + table_name)
        for query_id, outcomes in per_query.items():
            print("  * Query: " + query_id)
            for outcome_key, message in report_lines:
                values = outcomes[outcome_key]
                print("    -  A total of " + str(len(values)) + message + str(values))
            print("\n")
        print("\n")
        

## Main

### Run

In [92]:
# Run every configured lookup once, then unpack the three collections it returns:
# the annotated tables, the tables of missing values and the statistics.
res = check_table(OPERATIONS, output_print=False)
tabs, none_tabs, stats = res

### Show results

In [65]:
# Print, per table and query, which values were found, are ambiguous, or are missing in Omeka
print_stats(stats)

Table: article_sample
  * Query: omeka_article
    -  A total of 5 items HAVEN'T been found in Omeka: ['المرأة العربية في مواجهة العصر;Arab Women and the Age', 'Arab Women and the Age', 'المرأة العربية في مواجهة الع', 'دار الفتى العربي', 'The Project for the Translation of Arabic']
    -  A total of 0 items are doubtfull (need a manual check): []
    -  A total of 0 items HAVE been found in Omeka: []






In [59]:
# Create one download link per generated overview table (the input table plus its "_omeka" columns).
# The call is the last expression of the cell, so the notebook renders the returned HTML.
print("Download the overview tables")
create_download_links(tabs,"overview")

Download the overview tables


In [60]:
# Create one download link per item class, listing the distinct values that were not found in Omeka.
print("Download the tables for the items not found in Omeka")
create_download_links(none_tabs,"new")

Download the tables for the items not found in Omeka
