Step 3 - Prediction

In [19]:
import os
import sys
import argparse
import numpy as np
import tensorflow as tf
import json
import random
import time
import re
import requests
import xml.etree.ElementTree as ET
from tensorflow import keras
from IPython.display import clear_output
from pattern.text.en import tokenize
from gensim.models import Word2Vec
from util_cnn import generate_synthetic_columns, synthetic_columns2sequence, sequence2matrix, random_cells2synthetic_columns, ordered_cells2synthetic_columns, permutation_cells2synthetic_columns
from lookup import WikidataAPI

# Default paths are resolved against the directory the notebook is run from.
current_path = os.getcwd()
parser = argparse.ArgumentParser()

parser.add_argument(
    '--model_dir',
    type=str,
    default='../../enwiki_model/',
    help='Directory of word2vec model')
parser.add_argument(
    '--synthetic_column_size',
    type=int,
    default=4,
    help='Size of synthetic column')
parser.add_argument(
    '--synthetic_column_type',
    type=int,
    default=-1,
    help='synthetic column num to sample for each column; '
         '>=1: sample a number; 0: sliding window; -1: permutation combination and voting')
parser.add_argument(
    '--sequence_size',
    type=int,
    default=60,
    help='Length of word sequence of entity unit')
parser.add_argument(
    '--cnn_evaluate',
    type=str,
    default=os.path.join(current_path, 'output/cnn/cnn_1_2_1.00'),
    help='Directory of trained models')
parser.add_argument(
    '--output_dir',
    type=str,
    default=os.path.join(current_path, 'output/'),
    help='Directory of output')

# parse_known_args collects unrecognised command-line arguments in `unparsed`
# instead of aborting, so only the flags declared above end up in FLAGS.
FLAGS, unparsed = parser.parse_known_args()

# All prediction files produced by this notebook are written below this folder.
prediction_dir = os.path.join(FLAGS.output_dir, 'predictions')
# makedirs also creates missing parent directories (e.g. a fresh --output_dir)
# and, with exist_ok=True, is a no-op when the folder is already there.
os.makedirs(prediction_dir, exist_ok=True)


In [20]:
# Load the gensim word2vec model stored under --model_dir; it is used later
# for its `vector_size` and is passed to `sequence2matrix`.
print('load word2vec model ...')
w2v_model_path = os.path.join(FLAGS.model_dir, 'word2vec_gensim')
w2v_model = Word2Vec.load(w2v_model_path)


load word2vec model ...


In [None]:
def load_json(file):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding so
    # non-ASCII text loads the same way on every machine.
    with open(file, encoding='utf-8') as json_file:
        return json.load(json_file)


def load_model(cnn_model_directory, candidate_class):
    """Load the saved Keras model of one candidate class.

    Args:
        cnn_model_directory: Directory that contains the saved models.
        candidate_class: Name of the entry inside that directory to load.

    Returns:
        The object returned by ``keras.models.load_model`` for that entry.
    """
    # os.path.join uses the platform's separator; the previous hard-coded
    # backslash only produced a valid path on Windows.
    model_path = os.path.join(cnn_model_directory, candidate_class)
    return keras.models.load_model(model_path)


Prediction using the CNN classifiers

In [None]:
def align_samples(pos, neg):
    """Oversample the shorter of two sample lists to the length of the longer.

    The shorter list is repeated as many whole times as fits and then topped
    up with a random draw (without replacement) from itself; the longer list
    is returned as a shallow copy. Neither input list is modified.

    Args:
        pos: List of positive samples.
        neg: List of negative samples.

    Returns:
        Tuple ``(pos_new, neg_new)`` of two lists with equal length.

    Raises:
        ZeroDivisionError: If the shorter list is empty; callers are expected
            to skip classes without samples before calling this function.
    """
    if len(pos) <= len(neg):
        # Integer division: a list can only be repeated an integral number of times.
        pos_new = pos * (len(neg) // len(pos))
        neg_new = neg * 1
        # The remainder is always smaller than len(pos), so sample() is valid.
        pos_new += random.sample(pos, len(neg_new) - len(pos_new))
    else:
        # The original used true division here, which yields a float and
        # makes the list repetition raise TypeError.
        neg_new = neg * (len(pos) // len(neg))
        pos_new = pos * 1
        neg_new += random.sample(neg, len(pos_new) - len(neg_new))
    return pos_new, neg_new


def embedding(entities_positive, entities_negative):
    """Build a shuffled, labelled input tensor from positive/negative entities.

    Both entity lists are passed through ``generate_synthetic_columns`` and
    ``synthetic_columns2sequence``; every resulting sequence is converted with
    ``sequence2matrix`` into one slice of the output tensor.

    Args:
        entities_positive: Entities whose samples get label 1.
        entities_negative: Entities whose samples get label 0.

    Returns:
        Tuple ``(x, y)``: ``x`` has shape
        ``(n_samples, FLAGS.sequence_size, w2v_model.vector_size, 1)`` and
        ``y`` has shape ``(n_samples, 1)``; both are permuted with the same
        index order.
    """
    seq_len = FLAGS.sequence_size

    # Synthetic columns: positives are generated first, then negatives.
    pos_units = generate_synthetic_columns(
        entities_positive, FLAGS.synthetic_column_size)
    neg_units = generate_synthetic_columns(
        entities_negative, FLAGS.synthetic_column_size)

    # One word sequence per synthetic column.
    pos_sequences = [synthetic_columns2sequence(unit, seq_len)
                     for unit in pos_units]
    neg_sequences = [synthetic_columns2sequence(unit, seq_len)
                     for unit in neg_units]
    n_pos, n_neg = len(pos_sequences), len(neg_sequences)

    # Positives occupy the first n_pos rows, negatives the rest.
    x = np.zeros((n_pos + n_neg, seq_len, w2v_model.vector_size, 1))
    for row, sequence in enumerate(pos_sequences + neg_sequences):
        x[row] = sequence2matrix(sequence, seq_len, w2v_model)

    y = np.concatenate((np.ones((n_pos, 1)), np.zeros((n_neg, 1))))

    # Fixed seed so the permutation is the same on every call; note that this
    # reseeds NumPy's global random state.
    np.random.seed(10)
    order = np.random.permutation(np.arange(y.shape[0]))
    return x[order], y[order]


In [None]:
# Directory the CNN models are loaded from (see --cnn_evaluate).
cnn_dir = FLAGS.cnn_evaluate

# load cnn classifiers: every entry name of the model directory is treated as
# the name of an available classifier
cnn_classifiers = set(os.listdir(FLAGS.cnn_evaluate))

# load gt, samples, entities
# os.path.join keeps these paths valid even when --output_dir is given
# without a trailing separator.
data = load_json(os.path.join(FLAGS.output_dir, 'column_gt_extend.json'))
samples = load_json(os.path.join(FLAGS.output_dir, 'sample_classes.json'))
entities = load_json(os.path.join(FLAGS.output_dir, 'entities_classes.json'))


def predict(test_x, classifier_name):
    """Score every sample in ``test_x`` with one saved classifier.

    Args:
        test_x: 4-D array of samples; its second axis is cropped to the
            length the model's first layer reports as input.
        classifier_name: Name of the model to load from ``cnn_dir``.

    Returns:
        Flat NumPy array with one value per model output, after a sigmoid.
    """
    # The model is loaded from disk on every call.
    loaded_model = load_model(cnn_dir, classifier_name)

    # Only the second axis is adjusted to the model; the other axes of test_x
    # are passed through unchanged.
    model_seq_len = loaded_model.layers[0].input_shape[1]
    test_x_cropped = test_x[:, :model_seq_len, :, :]

    # Make predictions using the loaded model
    predictions = loaded_model.predict(test_x_cropped)
    # NOTE(review): a sigmoid is applied to the raw model output here; if the
    # saved model already ends in a sigmoid this squashes twice - confirm.
    probabilities = tf.nn.sigmoid(predictions).numpy().flatten()

    return probabilities


In [None]:
# Maps 'column,classifier' to the mean classifier score.
col_class_p = dict()

for col_i, col in enumerate(data.keys()):
    # Cells of the column: the first value stored under the column's 'data'.
    cells = list(data[col]['data'].values())[0]

    # Entity record of the column; when several cells have a record the last
    # one wins (unchanged behaviour).
    corresponding_ent = None
    for cell in cells:
        if cell in entities:
            corresponding_ent = entities[cell]

    # A column whose cells have no entity record has no candidate classes.
    # (Previously this case indexed a list with a string and raised TypeError.)
    if corresponding_ent is None:
        candidate_classes = []
    else:
        candidate_classes = corresponding_ent['candidate_classes']

    if FLAGS.synthetic_column_type >= 0:
        if FLAGS.synthetic_column_type > 0:
            units = random_cells2synthetic_columns(
                cells, FLAGS.synthetic_column_size, FLAGS.synthetic_column_type)
        else:
            units = ordered_cells2synthetic_columns(
                cells, FLAGS.synthetic_column_size)
    else:
        units = permutation_cells2synthetic_columns(cells)

    # NOTE(review): this per-column tensor is built but never passed to
    # predict(); the scores below are computed on the class's sample
    # embedding instead. Confirm whether predict(X, classifier) was intended.
    X = np.zeros((len(units), FLAGS.sequence_size, w2v_model.vector_size, 1))
    for i, unit in enumerate(units):
        seq = synthetic_columns2sequence(unit, FLAGS.sequence_size)
        X[i] = sequence2matrix(seq, FLAGS.sequence_size, w2v_model)

    for classifier in candidate_classes:
        if classifier not in cnn_classifiers:
            continue
        pos_samples = samples[classifier]['general_pos_samples']
        neg_samples = samples[classifier]['negative_samples']
        # align_samples cannot balance against an empty list.
        if len(pos_samples) == 0 or len(neg_samples) == 0:
            continue
        p_ents, n_ents = align_samples(pos_samples, neg_samples)
        # Separate name so the column tensor X above is not clobbered.
        X_samples, _ = embedding(p_ents, n_ents)
        col_class = '%s,%s' % (col, classifier)
        p = predict(X_samples, classifier)
        score = np.mean(p)
        col_class_p[col_class] = score

    if col_i % 5 == 0:
        print('     column %d predicted' % col_i)

# json cannot serialise NumPy scalar types such as float32, so convert every
# score to a built-in float regardless of its NumPy dtype.
col_class_p_serializable = {
    key: float(value) for key, value in col_class_p.items()
}

out_filename = 'p_%s.json' % os.path.basename(FLAGS.cnn_evaluate)
with open(os.path.join(prediction_dir, out_filename), 'w') as fp:
    json.dump(col_class_p_serializable, fp)


     column 0 predicted
     column 5 predicted
     column 10 predicted
     column 15 predicted
     column 20 predicted


Prediction by Lookup

In [None]:
wd_prefix = 'http://www.wikidata.org/entity/'


def lookup_wikidata_classes(query, limit=1):
    """Look up ``query`` through ``WikidataAPI`` and return the matched ids.

    Args:
        query: Text to search for.
        limit: Maximum number of entities requested from the API.

    Returns:
        List with, for every entity returned by ``getKGEntities``, the part
        of its id that follows ``wd_prefix``. The list is empty when nothing
        is found (previously an empty dict was returned in that case, so the
        return type depended on the result).

    Raises:
        IndexError: If an entity id does not contain ``wd_prefix``.
    """
    wikidata = WikidataAPI()
    entities = wikidata.getKGEntities(query, limit, 'item')
    # Strip the URI prefix so only the identifier after it remains.
    return [ent.getId().split(wd_prefix)[1] for ent in entities]


# lookup entities and classes from DBPedia
def lookup_resources(cell_text):
    """Query the DBpedia keyword lookup service for the text of one cell.

    The cell text is split into its main part and any parenthesised parts;
    every part longer than two characters is sent as a separate query.

    Args:
        cell_text: Raw text of a table cell.

    Returns:
        Dict mapping each returned entity name (resource URI without the
        ``http://dbpedia.org/resource/`` prefix) to the list of its class
        names (class URIs under ``http://dbpedia.org/ontology/`` without
        that prefix).
    """
    dbo_prefix = 'http://dbpedia.org/ontology/'
    dbp_prefix = 'http://dbpedia.org/resource/'
    lookup_endpoint = 'http://lookup.dbpedia.org/api/search/KeywordSearch'
    entity_classes = dict()
    cell_items = list()

    # Raw string: '\(' in a normal string literal is an invalid escape sequence.
    cell_brackets = re.findall(r'\((.*?)\)', cell_text)
    for cell_bracket in cell_brackets:
        cell_text = cell_text.replace('(%s)' % cell_bracket, '')
    cell_text = cell_text.strip()
    if len(cell_text) > 2:
        cell_items.append(cell_text)
    for cell_bracket in cell_brackets:
        if len(cell_bracket) > 2:
            cell_items.append(cell_bracket.strip())

    for cell_item in cell_items:
        try:
            # Pass the query through `params` so requests URL-encodes it;
            # characters such as '&' or '#' in the cell text would otherwise
            # be interpreted as part of the URL syntax.
            lookup_res = requests.get(
                lookup_endpoint,
                params={'MaxHits': 2, 'QueryString': cell_item})
            root = ET.fromstring(lookup_res.content)
            for child in root:
                # Positional access: the 2nd child element holds the resource
                # URI and the 4th holds the class entries of a result.
                entity = child[1].text.split(dbp_prefix)[1]
                classes = list()
                for cc in child[3]:
                    cls_URI = cc[1].text
                    if dbo_prefix in cls_URI:
                        classes.append(cls_URI.split(dbo_prefix)[1])
                entity_classes[entity] = classes
        except UnicodeDecodeError:
            # Best effort: an item that cannot be decoded is skipped.
            pass
    return entity_classes


In [None]:
print('Lookup-based prediction column by column')

# Maps 'column,class' to the fraction of the column's cells that voted for it.
col_class_p = dict()
n_columns = len(data)
for col_i, col in enumerate(data.keys()):
    cells = list(data[col]['data'].values())[0]

    # One lookup per distinct cell text: repeated cells reuse the first
    # result instead of issuing the same network request again.
    cell_classes = dict()
    for cell in cells:
        if cell not in cell_classes:
            cell_classes[cell] = lookup_wikidata_classes(cell)

    # Every cell occurrence contributes at most one vote per class.
    votes = dict()
    for cell in cells:
        for cls in set(cell_classes[cell]):
            votes[cls] = votes.get(cls, 0) + 1

    for cls, count in votes.items():
        col_class = '%s,%s' % (col, cls)
        col_class_p[col_class] = float(count) / float(len(cells))

    if col_i % 10 == 0:
        print('     column %d annotated' % col_i)
    # Pause for five minutes after every 30 columns, except after the last
    # column, where no further requests follow.
    if (col_i + 1) % 30 == 0 and (col_i + 1) < n_columns:
        time.sleep(60*5)

out_filename = 'p_lookup.json'
with open(os.path.join(prediction_dir, out_filename), 'w') as fp:
    json.dump(col_class_p, fp)


Lookup-based prediction column by column
     column 0 annotated
     column 10 annotated
     column 20 annotated


Ensemble of the lookup and CNN predictions

In [None]:
# file containing prediction by CNN training; named exactly as the CNN
# prediction step writes it, so it follows --cnn_evaluate and --output_dir
predictions_model = os.path.join(
    prediction_dir, 'p_%s.json' % os.path.basename(FLAGS.cnn_evaluate))
# file containing prediction by lookup
predictions_voting = os.path.join(prediction_dir, 'p_lookup.json')

p_voting = load_json(predictions_voting)
p_model = load_json(predictions_model)

# Lookup scores at or above VOTE_TRUSTED, or below VOTE_UNDECIDED, are kept
# as they are; in between, the CNN score replaces the lookup score whenever
# the CNN produced one for the same column/class pair.
VOTE_TRUSTED = 0.6
VOTE_UNDECIDED = 0.25

p = dict()
for col_cls, vote in p_voting.items():
    if VOTE_UNDECIDED <= vote < VOTE_TRUSTED and col_cls in p_model:
        p[col_cls] = p_model[col_cls]
    else:
        p[col_cls] = vote

out_filename = '%s_lookup.json' % os.path.basename(
    predictions_model).split('.json')[0]
with open(os.path.join(prediction_dir, out_filename), 'w') as fp:
    json.dump(p, fp)
