Step 0 - Ground Truth Extraction

In [4]:
import os
import sys
import argparse
import sparql
import pandas as pd
import time
import csv
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from endpoints import WikidataEndpoint
import re
import requests
import xml.etree.ElementTree as ET
from lookup import WikidataAPI

# Notebook configuration, declared as argparse flags. The defaults point at
# ../SemTab_DataSets/Round1DataSets/Valid and write to ./output under the
# directory the notebook was started from.
current_path = os.getcwd()

parser = argparse.ArgumentParser()
parser.add_argument('--output_dir', default=os.path.join(current_path, 'output'),
                    type=str, help='Directory of output')
parser.add_argument('--in_dir', default='../SemTab_DataSets/Round1DataSets/Valid',
                    type=str, help='Input dir containing data tables gt and targets')
parser.add_argument('--tables', default='/tables',
                    type=str, help='input data tables')
# Alternative default/help pairs for the other target files:
#   default='/targets/cta_targets.csv', help='Target file for CTA'
#   default='/targets/cpa_targets.csv', help='Target file for CPA'
parser.add_argument('--target', default='/targets/cea_targets.csv',
                    type=str, help='Target file for CEA')

# parse_known_args() does not fail on arguments it does not recognise; those
# are returned separately in `unparsed`.
FLAGS, unparsed = parser.parse_known_args()


In [5]:
wd_prefix = 'http://www.wikidata.org/entity/'

def get_wikidata_superclasses(query, attempts = 1):
    """Return the classes of a Wikidata entity and all of their superclasses.

    Sends ``wd:<query> wdt:P31/wdt:P279* ?uri`` to the Wikidata SPARQL
    endpoint, i.e. follows one "instance of" (P31) edge and then any number
    of "subclass of" (P279) edges.

    Args:
        query: Wikidata entity id (for example ``'Q42'``); it is interpolated
            directly after the ``wd:`` prefix.
        attempts: Total number of tries. At least one try is always made.

    Returns:
        A set of ids (result URIs that start with ``wd_prefix``, with the
        prefix removed), or ``None`` when every attempt failed.
    """
    sparqlw = SPARQLWrapper(
            'https://query.wikidata.org/bigdata/namespace/wdq/sparql')
    # Keep the SPARQL text in its own variable: retries must re-send this
    # exact text, not wrap it into another query as the entity id.
    sparql_query = """
            SELECT DISTINCT ?uri
            WHERE {
                wd:%s wdt:P31/wdt:P279* ?uri.
            }""" % (query)

    remaining = attempts
    while True:
        try:
            sparqlw.setQuery(sparql_query)
            sparqlw.setReturnFormat(JSON)
            results = sparqlw.query().convert()

            return {
                binding["uri"]["value"][len(wd_prefix):]
                for binding in results["results"]["bindings"]
                if binding["uri"]["value"].startswith(wd_prefix)
            }
        except Exception as e:
            print(e)
            print("Query '%s' failed. Attempts: %s" % (sparql_query, str(remaining)))
            # Back off after every failure (also the last one) so that the
            # caller's next request does not immediately hit the rate limit.
            time.sleep(60)
            remaining -= 1
            if remaining <= 0:
                return None

def get_wikidata_classes(query, limit=1):
    """Look `query` up through ``WikidataAPI`` and return the ids it matched.

    Args:
        query: Value handed to ``WikidataAPI.getKGEntities`` as the search
            term (the notebook passes table cell values).
        limit: Maximum number of results requested from the lookup.

    Returns:
        A list with one id per returned entity, obtained by stripping
        ``wd_prefix`` from ``entity.getId()``. The list is empty (never a
        dict) when the lookup returned nothing.
    """
    wikidata = WikidataAPI()
    entities = wikidata.getKGEntities(query, limit, 'item')
    classes = [ent.getId().split(wd_prefix)[1] for ent in entities]
    if not classes:
        # Checked after the loop so the message can actually be reached.
        print('Zero classes')
    return classes


In [6]:
# reading the target file
# Only the first 100 target rows are read (nrows=100).
# NOTE(review): assumes the target CSV columns are ordered
# table id, row id, column id - confirm against the dataset in use.
target_df = pd.read_csv(FLAGS.in_dir + FLAGS.target, header=None, nrows=100)
target_df.columns = ['Table_id', 'Row_id', 'Column_id']

# table id -> list of distinct target column indices (first-seen order).
# The target file has one row per cell, so the same column id repeats;
# keep each column only once.
target_dict = dict()
for _, target_row in target_df.iterrows():
    table_columns = target_dict.setdefault(target_row['Table_id'], [])
    target_column = int(target_row['Column_id'])
    if target_column not in table_columns:
        table_columns.append(target_column)

# reading the input data
# data[table] = {'column_titles': [...], 'data': {column: [cells]}, 'gt': [ids]}
data = dict()

for file in target_dict:
    data[file] = dict()

    filename = file + '.csv'
    tab_data_file = os.path.join(FLAGS.in_dir+FLAGS.tables, filename)

    # The first line of each table is read separately as the header row so
    # that it does not influence dtype inference of the data rows.
    df_data = pd.read_csv(tab_data_file, header=None, skiprows=[0],
                          usecols=target_dict[file])
    df_title = pd.read_csv(tab_data_file, header=None,
                           usecols=target_dict[file], nrows=1)

    # adding the column headers to the data dictionary
    if len(df_title) > 0:
        data[file]['column_titles'] = list(df_title.iloc[0, :])

    file_element = dict()
    for column in df_data.columns:
        # without cell value repetition; drop_duplicates keeps first-seen
        # order and, unlike set(), collapses repeated NaN values into one
        file_element[column] = df_data[column].drop_duplicates().tolist()
    data[file]['data'] = file_element

    # getting superclasses
    # Accumulate over ALL target columns of the table; the accumulator must
    # not be reset per column or only the last column would be kept.
    table_gt = set()
    for column_cells in data[file]['data'].values():
        for cell in column_cells:
            for cls in get_wikidata_classes(cell):
                table_gt.add(cls)
                wd_superclass = get_wikidata_superclasses(cls)
                if wd_superclass:
                    table_gt.update(wd_superclass)
    data[file]['gt'] = list(table_gt)

# make sure the output directory exists before writing into it
os.makedirs(FLAGS.output_dir, exist_ok=True)
with open(os.path.join(FLAGS.output_dir, 'column_gt_extend.json'), 'w') as f:
    json.dump(data, f)

Step 1 - Lookup

In [7]:
print('''Lookup and update new entities and classes''')

# ent_cls maps a (bracket-stripped) cell value to
# {'candidate_classes': <result of get_wikidata_classes(cell, 1)>}.
ent_cls = dict()

# Write next to the other outputs (honours --output_dir) and make sure the
# directory exists.
os.makedirs(FLAGS.output_dir, exist_ok=True)
entities_classes_path = os.path.join(FLAGS.output_dir, 'entities_classes.json')

for file in data:
    for col in data[file]['data']:
        for cell in data[file]['data'][col]:
            if isinstance(cell, str):  # Check if cell is a string
                cell = cell.replace('[', '').replace(']', '')
            # each distinct cell value is looked up only once
            if cell not in ent_cls:
                ent_cls[cell] = {
                    'candidate_classes': get_wikidata_classes(cell, 1)
                }
    # rewrite the file after every table so partial results are kept on disk
    with open(entities_classes_path, 'w') as fp:
        json.dump(ent_cls, fp)


Lookup and update new entities and classes


In [8]:
# candidate_dict[table][column] = list of distinct (candidate_class, cell)
# pairs for the cells of that target column.
candidate_dict = dict()

for filename in target_dict:
    candidate_dict[filename] = dict()
    table_cells = data.get(filename, {}).get('data', {})

    for col_index in target_dict[filename]:
        column_candidates = []
        seen_pairs = set()

        # Walk the cells of THIS column (not always column 0).
        for raw_cell in table_cells.get(col_index, []):
            # ent_cls is keyed by the bracket-stripped cell text, so apply
            # the same normalisation before looking the cell up.
            if isinstance(raw_cell, str):
                cell = raw_cell.replace('[', '').replace(']', '')
            else:
                cell = raw_cell
            if cell not in ent_cls:
                continue
            for candidate_class in ent_cls[cell]['candidate_classes']:
                pair = (candidate_class, cell)
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    column_candidates.append(pair)

        candidate_dict[filename][col_index] = column_candidates

# saving the candidate_dict to a JSON file
os.makedirs(FLAGS.output_dir, exist_ok=True)
with open(os.path.join(FLAGS.output_dir, 'candidate_columns.json'), 'w') as fp:
    json.dump(candidate_dict, fp)

In [9]:
# Flatten every (type, entity) pair of every table/column into one list and
# build the frame once, instead of growing it with the private
# DataFrame._append inside the loop.
entity_rows = [
    pair
    for table_columns in candidate_dict.values()
    for column_pairs in table_columns.values()
    for pair in column_pairs
]
df_entities = pd.DataFrame(entity_rows, columns=['type', 'entity'])


df_entities


Unnamed: 0,type,entity
0,Q6386554,Kelso Township
1,Q6500028,Laurel Township
2,Q987536,Brookville Township
3,Q5604743,Greenville Township
4,Q7998039,Mill Township
...,...,...
64,Q796762,Bookends
65,Q217750,Violetta Villas
66,Q71278710,Easy Pieces
67,Q290268,Dirty Deeds Done Dirt Cheap


In [10]:

# One entry per looked-up cell: the candidate classes stored for it in ent_cls.
classes_list = [entry['candidate_classes'] for entry in ent_cls.values()]

# getting unique tuples
unique_classes = {tuple(row) for row in classes_list}

In [11]:
def get_candidate_classes(cand_dict):
    """Build the per-class sample registry from the candidate dictionary.

    Args:
        cand_dict: Mapping ``table -> column -> sequence of items`` where the
            first element of every item is a class identifier.

    Returns:
        A dict keyed by class identifier. Each value holds four sets:
        ``coexist_cls`` (the other classes that occur in at least one column
        together with the key) and ``positive_samples``,
        ``negative_samples``, ``general_pos_samples`` (all left empty here).
    """
    def _blank_entry():
        return {
            'coexist_cls': set(),
            'positive_samples': set(),
            'negative_samples': set(),
            'general_pos_samples': set()
        }

    registry = {}

    for table_columns in cand_dict.values():
        for column_items in table_columns.values():
            column_classes = set()

            # register every class seen in this column (first-seen order)
            for item in column_items:
                class_id = item[0]
                column_classes.add(class_id)
                registry.setdefault(class_id, _blank_entry())

            # every class of the column co-exists with all the others in it
            for class_id in column_classes:
                registry[class_id]['coexist_cls'] |= column_classes - {class_id}

    return registry

def get_entities(cls, limit=1):
    """Look `cls` up through ``WikidataAPI`` and return the matched ids.

    This is a best-effort lookup: if it fails, the error is printed and
    whatever was collected so far (possibly nothing) is returned.

    Args:
        cls: Value handed to ``WikidataAPI.getKGEntities`` as the search term.
        limit: Maximum number of results requested from the lookup.

    Returns:
        A list of ids, one per returned entity, obtained by stripping
        ``wd_prefix`` from ``entity.getId()``.
    """
    wikidata = WikidataAPI()
    classes = list()
    try:
        for entity in wikidata.getKGEntities(cls, limit, "item"):
            classes.append(entity.getId().split(wd_prefix)[1])
    except Exception as e:
        # Catch Exception only, so KeyboardInterrupt/SystemExit still stop a
        # long-running loop, and report the failure instead of hiding it.
        print("Entity lookup for '%s' failed: %s" % (cls, e))

    return classes
    
def process_candidate_classes(sample_classes, df_entities, class_groups=None):
    """Fill the sample sets of every candidate class (in place).

    For each class in `sample_classes`:
      * ``positive_samples`` gains the entities whose ``type`` is the class;
      * ``coexist_cls`` loses every class that shares a group in
        `class_groups` with the class (classes found for the same entity);
      * ``negative_samples`` gains the entities typed with one of the
        remaining co-existing classes;
      * ``general_pos_samples`` is replaced by ``get_entities(cls, 1)``.
    All four collections are stored as lists on return so the result can be
    dumped to JSON.

    Args:
        sample_classes: Dict as produced by ``get_candidate_classes``. The
            collections may be sets or lists, so the function can be re-run
            on its own output.
        df_entities: DataFrame with ``type`` and ``entity`` columns.
        class_groups: Iterable of class-id collections. Defaults to the
            module-level ``unique_classes``.

    Returns:
        The same `sample_classes` dict, updated.
    """
    if class_groups is None:
        class_groups = unique_classes

    # Index the entities by type once instead of filtering the frame for
    # every class.
    entities_by_type = {}
    if not df_entities.empty:
        for type_id, entity in zip(df_entities['type'], df_entities['entity']):
            entities_by_type.setdefault(type_id, set()).add(entity)

    for cand_class, entry in sample_classes.items():
        # Coerce to sets so lists left behind by a previous run also work.
        coexisting = set(entry['coexist_cls'])
        positives = set(entry['positive_samples'])
        negatives = set(entry['negative_samples'])

        positives.update(entities_by_type.get(cand_class, ()))  # updating pos samples

        for cls_set in class_groups:
            if cand_class in cls_set:
                coexisting -= set(cls_set)  # remove coexisting classes for same entity

        for neighbour_cls in coexisting:
            negatives.update(entities_by_type.get(neighbour_cls, ()))  # updating neg samples

        entry['coexist_cls'] = list(coexisting)
        entry['positive_samples'] = list(positives)
        entry['negative_samples'] = list(negatives)
        entry['general_pos_samples'] = list(set(get_entities(cand_class, 1)))  # general pos samples

    return sample_classes

In [12]:
# initialise sample classes
sample_classes = get_candidate_classes(candidate_dict)

# printing information about the classes and the number of neighbours
# for key, value in sample_classes.items():
#     print(f"Class: {key} with {len(value['coexist_cls'])} neighbouring classes")

# processing candidate classes and updating samples
sample_classes = process_candidate_classes(sample_classes, df_entities)

# saving the processed sample classes
# Write into the configured output directory (honours --output_dir) and make
# sure it exists first.
os.makedirs(FLAGS.output_dir, exist_ok=True)
with open(os.path.join(FLAGS.output_dir, 'sample_classes.json'), 'w') as fp:
    json.dump(sample_classes, fp)
