# 시간 단축 only

# KB에 없는 negative triple 생성하여 symbolic Unification

# Neural Theorem Prover using pandas and Pytorch

## 1. Symbolic Unification using pandas DataFrame
- Load Files 
- Define Functions 
- Generate Meta Tables
- Run Symbolic Unification and generate batch 

## 2. NTP Model Training with PyTorch
- Define Model Structure using PyTorch
- Define Forward Function 
- Training Model

## 3. Extract Rules from Trained Embedding Vectors
- Matching Rule templates with Embedding vectors 
- Extract Induced Rules

## 4. Test Model 
- Evaluate Model with Test data

### import packages

In [1]:
import numpy as np
import pandas as pd
import re
import collections
import pprint
from termcolor import colored, cprint
import random
import copy
from datetime import datetime, timedelta
from pprint import pprint
from itertools import permutations

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# import tensorflow as tf

# to print pandas dataframe
from IPython.display import display

## 1. Symbolic Unification using pandas DataFrame
### Load Data Files using pandas
- train : Knowledge Graph file with triple form
- test : query with triple form

In [2]:
# Label for this run / data set (not read anywhere in the visible cells).
data_name = 'ex_neg'

In [3]:
# Both files are read as tab-separated triples without a header row;
# the three columns are named subj / pred / obj.
# `train` is the knowledge graph, `test` holds the query triples.
train = pd.read_csv('./data/example_6.txt', sep='\t', names=['subj','pred','obj'])
test = pd.read_csv('./data/example_test_6.txt', sep='\t', names=['subj','pred','obj'])


In [4]:
train

Unnamed: 0,subj,pred,obj
0,BART,nationality,USA
1,USA,hasCitizen,BART
2,BART,placeOfBirth,NEWYORK
3,NEWYORK,locatedIn,USA
4,BART,hasFather,HOMMER
5,HOMMER,nationality,USA


In [5]:
# Shuffle the query rows (sample(frac=1) returns every row in random order)
# and renumber the index from 0. No seed is set, so the order differs per run.
test = test.sample(frac=1).reset_index(drop=True)
# test

### Load Rule template and parsing using regular expression

In [6]:
def trim(string):
    """
    - function: strip whitespace from both ends of a string
    :param string: an input string

    :return: the string without leading or trailing whitespace
    """
    # Raw string literal: "\A", "\s" and "\Z" are regex escapes, not valid
    # Python string escapes, and a non-raw literal triggers a warning.
    return re.sub(r"\A\s+|\s+\Z", "", string)

def load_from_file(path, rule_template=False):
    """
    - function: load a knowledge-base or rule-template file and parse it
    :param path: file's location
    :param rule_template: True when the file holds rule templates
        (forwarded unchanged to parse_rules)

    :return : parsed kb or rule template, as returned by parse_rules
    """
    # Only the read needs the file handle; parsing happens after it is closed.
    with open(path, "r") as f:
        lines = f.readlines()

    # Drop comment lines (starting with "%") and blank lines.
    lines = [x for x in lines if not x.startswith("%") and x.strip() != ""]
    text = "".join(lines)

    # A statement ends with "." followed by a newline or by the end of text.
    # Raw string literal so "\." and "\Z" reach the regex engine untouched.
    rules = [x for x in re.split(r"\.\n|\.\Z", text) if x != "" and
             x != "\n" and not x.startswith("%")]
    print('rules')
    print(rules)
    return parse_rules(rules, rule_template=rule_template)
    
def parse_rules(rules, delimiter="#####", rule_template=False):
    """
    - function: parse rule strings into lists of (predicate, arg1, arg2) atoms
    :param rules: rule strings, e.g. '3\\t#1(X, Y) :- #2(Y, X)'
    :param delimiter: marker substituted for ':-' and for the ',' between
        atoms before the rule is split into atoms
    :param rule_template: when True every rule must start with an integer
        (the augmentation count); it is stripped from the rule text and
        appended as the last element of the parsed rule

    :return: parsed rules; each rule is a list of atom tuples, followed by
        the augmentation count only when rule_template is True
    """
    kb = []
    for rule in rules:
        print(rule)
        num = None
        if rule_template:
            # The capture group keeps the leading number in the split result:
            # ['', '<num>', '<rest of rule>'].
            splits = re.split(r"\A\n?([0-9]?[0-9]+)", rule)
            print(splits)
            # fixme: should be 0 and 1 respectively
            num = int(splits[1])
            rule = splits[2]
        # Plain str.replace: the delimiter is literal text, not a regex
        # replacement template.
        rule = rule.replace(":-", delimiter)
        print('rule1')
        print(rule)
        rule = rule.replace("),", ")" + delimiter)
        print('rule2')
        print(rule)
        # str.strip() removes the same leading/trailing whitespace as trim().
        rule = [x.strip() for x in rule.split(delimiter)]
        rule = [x for x in rule if x != ""]
        if len(rule) > 0:
            atoms = []
            for atom in rule:
                splits = atom.split("(")
                predicate = splits[0]
                args = [x for x in re.split(r"\s?,\s?|\)", splits[1]) if x != ""]
                atoms.append((predicate, args[0], args[1]))
            # @jaeseung : get augment number
            # Only templates carry a count; without this guard `num` was
            # undefined (or stale) for rule_template=False.
            if rule_template:
                atoms.append(num)

            kb.append(atoms)
    return kb

In [7]:
# Parse the rule templates. Each parsed rule is a list of
# (predicate, arg1, arg2) atoms followed by its augmentation count.
rules = load_from_file('./data/example.nlt', rule_template=True)
# Alternative template files:
# rules = load_from_file('./new_data/kinship.nlt', rule_template=True)
# rules = load_from_file('./new_data/train2.nlt', rule_template=True)
rules

rules
['3\t#1(X, Y) :- #2(Y, X)', '3\t#1(X, Y) :- #2(X, Z),#3(Z, Y)']
3	#1(X, Y) :- #2(Y, X)
['', '3', '\t#1(X, Y) :- #2(Y, X)']
rule1
	#1(X, Y) ##### #2(Y, X)
rule2
	#1(X, Y) ##### #2(Y, X)
3	#1(X, Y) :- #2(X, Z),#3(Z, Y)
['', '3', '\t#1(X, Y) :- #2(X, Z),#3(Z, Y)']
rule1
	#1(X, Y) ##### #2(X, Z),#3(Z, Y)
rule2
	#1(X, Y) ##### #2(X, Z)######3(Z, Y)


[[('#1', 'X', 'Y'), ('#2', 'Y', 'X'), 3],
 [('#1', 'X', 'Y'), ('#2', 'X', 'Z'), ('#3', 'Z', 'Y'), 3]]

### Generate Dictionary from Train & Test data

In [8]:
# Two-way symbol table, filled in a later cell: integer id -> symbol and symbol -> integer id.
id2sym_dict = {}
sym2id_dict = {}

In [9]:
# get entities from train & test data 
# Sorted list of every distinct entity that appears as subject or object
# in either the train or the test triples.
entities_list = sorted(set(train.subj.values).union(set(train.obj.values)).union(set(test.subj.values)).union(set(test.obj.values)))
entities_list

['BART', 'HOMMER', 'NEWYORK', 'USA']

In [10]:
# Predicates that occur in the train or test triples.
predicate_list = sorted(set(train.pred.values) | set(test.pred.values))
print(predicate_list)

# One placeholder symbol per (template atom, template index, augmentation
# index), named '<atom predicate>_<template index>_<augmentation index>'.
# rule[:-1] are the atoms, rule[-1] is the augmentation count.
tmp_pred_list = [
    atom[0] + '_' + str(rule_idx) + '_' + str(aug_idx)
    for rule_idx, rule in enumerate(rules)
    for atom in rule[:-1]
    for aug_idx in range(rule[-1])
]

print(tmp_pred_list)

# Final vocabulary: data predicates plus template placeholders, sorted.
predicate_list = sorted(set(predicate_list).union(tmp_pred_list))
predicate_list

['hasCitizen', 'hasFather', 'locatedIn', 'nationality', 'placeOfBirth']
['#1_0_0', '#1_0_1', '#1_0_2', '#2_0_0', '#2_0_1', '#2_0_2', '#1_1_0', '#1_1_1', '#1_1_2', '#2_1_0', '#2_1_1', '#2_1_2', '#3_1_0', '#3_1_1', '#3_1_2']


['#1_0_0',
 '#1_0_1',
 '#1_0_2',
 '#1_1_0',
 '#1_1_1',
 '#1_1_2',
 '#2_0_0',
 '#2_0_1',
 '#2_0_2',
 '#2_1_0',
 '#2_1_1',
 '#2_1_2',
 '#3_1_0',
 '#3_1_1',
 '#3_1_2',
 'hasCitizen',
 'hasFather',
 'locatedIn',
 'nationality',
 'placeOfBirth']

In [11]:
# Id 0 is reserved for the unknown symbol.
id2sym_dict[0] = 'UNK'
sym2id_dict['UNK'] = 0

# constant_ids = []
# Predicates receive consecutive ids starting at 1, in predicate_list order.
predicate_ids = list(range(1, len(predicate_list) + 1))
for pred_id, pred in zip(predicate_ids, predicate_list):
    id2sym_dict[pred_id] = pred
    sym2id_dict[pred] = pred_id

# @blocked : remove entities from dict
# for i, e in enumerate(entities_list):
#     id2sym_dict[i+len(predicate_list)+1] = e
#     sym2id_dict[e] = i+len(predicate_list)+1
#     constant_ids.append(i+len(predicate_list)+1)

# print(constant_ids)
# print()
print(predicate_ids)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [12]:
id2sym_dict

{0: 'UNK',
 1: '#1_0_0',
 2: '#1_0_1',
 3: '#1_0_2',
 4: '#1_1_0',
 5: '#1_1_1',
 6: '#1_1_2',
 7: '#2_0_0',
 8: '#2_0_1',
 9: '#2_0_2',
 10: '#2_1_0',
 11: '#2_1_1',
 12: '#2_1_2',
 13: '#3_1_0',
 14: '#3_1_1',
 15: '#3_1_2',
 16: 'hasCitizen',
 17: 'hasFather',
 18: 'locatedIn',
 19: 'nationality',
 20: 'placeOfBirth'}

In [13]:
sym2id_dict

{'UNK': 0,
 '#1_0_0': 1,
 '#1_0_1': 2,
 '#1_0_2': 3,
 '#1_1_0': 4,
 '#1_1_1': 5,
 '#1_1_2': 6,
 '#2_0_0': 7,
 '#2_0_1': 8,
 '#2_0_2': 9,
 '#2_1_0': 10,
 '#2_1_1': 11,
 '#2_1_2': 12,
 '#3_1_0': 13,
 '#3_1_1': 14,
 '#3_1_2': 15,
 'hasCitizen': 16,
 'hasFather': 17,
 'locatedIn': 18,
 'nationality': 19,
 'placeOfBirth': 20}

## Define Functions 

### pseudo code
- goal: query (e.g. kim nationality korea)
- rule: rule template (e.g. #1(X,Y) :- #2(X,Z), #3(Z,Y))

- 1. 주어진 rule template의 conclusion과 query를 매칭
    - conclusion의 X,Y와 같은 Variable에 대하여   
    query를 참조하여 X/kim, Y/korea와 같이 binding
    - binding 후 각 Variable에 대하여 subject file과 object file 생성
    - substitution dictionary에 key를 Variable로, value를 (subject file, object file)로 저장  



- 2. 앞서 binding된 Variable을 참조하여 rule body의 Variable을 매칭
    - conclusion의 #1(X,Y)를 통해 binding된 X에 대한 substitution을 참조하여  
    #2(X,Z)와 같은 body의 variable인 Z를 binding하는 작업을 수행
        - 위 경우에는 substitution으로부터 X의 subject file을 참조하여 Z에 대하여 binding 수행
    - rule의 모든 body의 variable에 대하여 binding을 수행하고 결과를 substitution dictionary에 저장
    
    
    


In [19]:
def search_triples(entity, kb):
    '''
    - function: collect the triples in which the given entity (or entities)
      appears as subject and as object
    :param entity: a single entity, or a list of candidate entities
    :param kb: a knowledge graph DataFrame with 'subj' and 'obj' columns

    :return: tuple (subject file, object file) - the rows of kb whose subject
        matches, and the rows whose object matches
    '''
    # Single entity: two plain boolean-mask selections.
    if not isinstance(entity, list):
        print("else")
        print(entity)
        return kb[kb['subj'] == entity], kb[kb['obj'] == entity]

    # Several candidate entities arrive as a list.
    print("isinstance is list")
    print(entity)

    def _rows_matching(column):
        # One selection per candidate, concatenated in candidate order.
        frames = [kb[kb[column] == ent] for ent in entity]
        if not frames:
            # 0815 update : an empty substitution yields an empty, typed frame
            return pd.DataFrame([], columns=['subj', 'pred', 'obj'])
        return pd.concat(frames)

    subj_df = _rows_matching('subj')
    obj_df = _rows_matching('obj')
    return subj_df, obj_df

def join_triples(substitution, rule, pos):
    '''
    - function: join triples along a rule template to get proof paths
    :param substitution: a dictionary
        - key 'Q': DataFrame holding the query triple(s)
        - key <Variable>: [bound value(s), (subject file, object file)]
    :param rule: a rule template - atoms (predicate, arg1, arg2) followed by
        the augmentation count as last element
    :param pos: positive / negative sampling flag (currently not used here)

    :return: list of proof paths; each path is a list of
        (template predicate, KB predicate) pairs, one per atom. Empty when
        the rule has no body atom or nothing could be joined.

    NOTE(review): comVar is only assigned when consecutive atoms share a
    variable; if they do not, the first step raises and later steps reuse the
    previous value - confirm templates always chain.
    '''
    proof_path = []

    tmp_result_df = None

    query_df = substitution['Q']

    # One join per pair of consecutive atoms in the rule.
    for i in range(len(rule)-2):
        # get variable from rule template
        head_subj = rule[i][1]
        head_obj = rule[i][2]

        body_subj = rule[i+1][1]
        body_obj = rule[i+1][2]

        # check common variable, e.g. #1(X,Y) #2(Y,X)
        if head_subj == body_subj or head_subj == body_obj:
            comVar = head_subj
        elif head_obj == body_subj or head_obj == body_obj:
            comVar = head_obj

        # get head dataframe using rule's variable
        # if this is first step
        if tmp_result_df is None:
            # 0812 update : use query df only
            head_df = query_df

            # new DF format with [Common Variable, Triples]
            tmp_head = []
            for col in head_df.itertuples(index=False):
                # check common variable
                if comVar == head_subj:
                    tmp_head.append([col.subj, [list(col)]])
                elif comVar == head_obj:
                    tmp_head.append([col.obj, [list(col)]])

            head_df = pd.DataFrame(tmp_head, columns=['comVar', 'triples'])

        # if previous step's result df exists
        else:
            # Re-key the accumulated paths on the value the last joined
            # triple holds for the new common variable.
            tmp_head = []
            for col in tmp_result_df.itertuples(index=False):
                # check common variable
                if comVar == head_subj:
                    tmp_head.append([col.triples[-1][0], col.triples])
                elif comVar == head_obj:
                    tmp_head.append([col.triples[-1][2], col.triples])

            head_df = pd.DataFrame(tmp_head, columns=['comVar', 'triples'])

        # get body dataframe using rule's variable: triples that are both in
        # the subject file of the body's first variable and in the object
        # file of its second variable
        body_df_subj = substitution[body_subj][1][0]
        body_df_obj = substitution[body_obj][1][1]

        body_df = pd.merge(body_df_subj, body_df_obj, how='inner')
        print('merge body df = ')
        print(body_df)

        # new DF format with [Common Variable, Triples]
        tmp_body = []
        for col in body_df.itertuples(index=False):
            # check common variable
            if comVar == body_subj:
                tmp_body.append([col.subj, [list(col)]])
            elif comVar == body_obj:
                tmp_body.append([col.obj, [list(col)]])

        body_df = pd.DataFrame(tmp_body, columns=['comVar', 'triples'])
        print('final body df = ')
        print(body_df)

        # merge two dataframe on common variable
        print('before join')
        print(head_df)
        print(body_df)
        tmp_result_df = pd.merge(head_df, body_df, how='inner', on='comVar', suffixes=('_left', '_right'))
        print('after join tmp_result_df = ')
        print(tmp_result_df)

        # Todo : need fix
        # Keep only rows whose left and right triple lists differ, so a
        # triple is not joined with itself.
        check_same_list = []
        for row, col in tmp_result_df.iterrows():
            if sorted(col['triples_left']) != sorted(col['triples_right']):
                check_same_list.append(row)

        # Explicit copy: the columns below are assigned on this selection,
        # which must not be treated as a slice of the merged frame.
        tmp_result_df = tmp_result_df.iloc[check_same_list].copy()

        # merge left & right triples columns into one column
        tmp_result_df['triples'] = tmp_result_df['triples_left'] + tmp_result_df['triples_right']
        del tmp_result_df['triples_left']
        del tmp_result_df['triples_right']

    # if rule ends, return final result df
    result = tmp_result_df
    print('del triple_left, right, + +')
    print(result)
    # check result dataframe and generate proof paths
    # (result is None when the rule has no body atom: nothing to join)
    if result is not None:
        for col in result.itertuples(index=False):
            tmp_path = []
            for i in range(len(rule)-1):
                # pair the template predicate with the KB predicate it matched
                tmp_path.append((rule[i][0], col.triples[i][1]))

            if tmp_path not in proof_path:
                proof_path.append(tmp_path)
    print('proof_path = ')
    print(proof_path)
    return proof_path

def unify(goal, rule, kb, depth=0, substitution=None, neg_per_pos=4):
    '''
    - function:
        1. Unify Variables and store information in substitution dictionary
        2. Check Common Variable from Rules and Join each Triples

    :param goal: at depth 0 a query triple as a [subj, pred, obj] sequence
        (e.g. [kim nationality korea]); on recursive calls a DataFrame of
        candidate triples with 'subj' / 'obj' columns
    :param rule: a given rule template (e.g. [#1(X,Y) :- #2(Y,X), 2])
    :param kb: knowledge graph DataFrame searched by search_triples
    :param depth: index of the rule atom currently being unified
    :param substitution: a dictionary which has information of unified variables
        - key: Variable / value: [bound value(s), (subject file, object file)]
        - key 'Q': the query as a one-row DataFrame
        Omit it (or pass an empty dict) to start a fresh unification; a dict
        passed in is filled in place.
    :param neg_per_pos: forwarded to the recursive calls; not otherwise used here
    :return: a one-element list holding the proof paths produced by
        join_triples, or None when two consecutive atoms share no variable
    '''
    # A fresh dict per call: a `{}` default would be shared between calls and
    # leak the bindings of one query into the next.
    if substitution is None:
        substitution = {}

    subj_var = rule[depth][1]
    obj_var = rule[depth][2]

    # if substitution is empty: bind the head variables from the query itself
    if len(substitution) == 0:
        print('depth : ', depth)
        substitution['Q'] = pd.DataFrame(goal, index=['subj','pred','obj']).transpose()
        # subject variable binding
        if subj_var not in substitution.keys():
            substitution[subj_var] = [goal[0], search_triples(goal[0], kb)]
            print(goal[0])
        # object variable binding
        if obj_var not in substitution.keys():
            substitution[obj_var] = [goal[2], search_triples(goal[2], kb)]
            print(goal[2])
    else:
        print('depth : ', depth)
        # Bind whichever variable of this atom is still free to every
        # distinct value found in the sub-goal triples.
        # check subject variable
        if subj_var not in substitution.keys():
            tmp_var = list(set(goal.subj.values))
            substitution[subj_var] = [tmp_var, search_triples(tmp_var, kb)]
        # check object variable
        if obj_var not in substitution.keys():
            tmp_var = list(set(goal.obj.values))
            substitution[obj_var] = [tmp_var, search_triples(tmp_var, kb)]

    # if last rule body: all variables are bound, build the proof paths
    if depth == len(rule)-2:
        # 0812 update : negative sampling
        # Todo : if there are only few predicate exists - like countries[locatedIn, neighbourOf]
        # Only the positive paths are generated here.
        return [join_triples(substitution, rule, pos=True)]

    depth = depth + 1
    prev_atom = rule[depth-1]
    next_atom = rule[depth]

    # Find the variable shared by the previous and the next atom, trying the
    # argument positions in the order (1,1), (1,2), (2,1), (2,2). The sub-goal
    # is the shared variable's subject file when it is the next atom's first
    # argument, and its object file when it is the second.
    for prev_pos, next_pos in ((1, 1), (1, 2), (2, 1), (2, 2)):
        if prev_atom[prev_pos] == next_atom[next_pos]:
            print('##[' + str(prev_pos) + '],[' + str(next_pos) + ']##')
            print('substitution = ')
            pprint(substitution)
            print('sub-goal = ')
            sub_goal = substitution[prev_atom[prev_pos]][1][next_pos-1]
            pprint(sub_goal)
            return unify(sub_goal, rule, kb, depth, substitution, neg_per_pos)

    # No shared variable between consecutive atoms: nothing to unify.
    return None

### Run Symbolic Unification 


# Change entities to generate negative triples

In [15]:
pos_per_batch = 1
sampling_scheme = False

# Negatives generated per positive example: 4 with the sampling scheme, 2 without.
neg_per_pos = 4 if sampling_scheme else 2

# Every positive contributes itself plus its negatives to a batch.
batch_size = pos_per_batch * (1 + neg_per_pos)
print(batch_size)


3


In [16]:
predicate_list

['#1_0_0',
 '#1_0_1',
 '#1_0_2',
 '#1_1_0',
 '#1_1_1',
 '#1_1_2',
 '#2_0_0',
 '#2_0_1',
 '#2_0_2',
 '#2_1_0',
 '#2_1_1',
 '#2_1_2',
 '#3_1_0',
 '#3_1_1',
 '#3_1_2',
 'hasCitizen',
 'hasFather',
 'locatedIn',
 'nationality',
 'placeOfBirth']

In [17]:
# predicate_set = set([pred for pred in predicate_list if pred[0] != '#'])
# predicate_set

In [18]:
# 0818 : generate batch Function
def generate_batch(query, rules, neg_per_pos):
    '''
    - function: generate batch from given query triples & rule templates

    :param query: query triples with dataframe format
    :param rules: rule templates (e.g. [#1(X,Y) :- #2(Y,X), 2]); the last
        element of each template is its augmentation count
    :param neg_per_pos: a number of negative data for each positive data
        (forwarded to unify)
    :return: (ids r1, ids r2, symbols r1, symbols r2) - r1 holds the rule
        template predicates, r2 the unified KB predicates; each is nested as
        query -> template -> proof path -> augmentation -> atom
    '''

    def _symbols_to_ids(nested_symbols):
        # Same nesting as the input, every symbol replaced by its id.
        return [
            [
                [
                    [[sym2id_dict[sym] for sym in augmented] for augmented in path]
                    for path in template
                ]
                for template in sample
            ]
            for sample in nested_symbols
        ]

    total_syms_list_r1 = []
    total_syms_list_r2 = []

    # debug : to check time
    started = datetime.now()

    for triple in query.itertuples(index=False):
        # One slot per rule template: [[[], [], ...]]
        rule_side = [[[] for _ in rules]]   # rule predicate
        kb_side = [[[] for _ in rules]]     # kb predicate

        for rule_idx, rule in enumerate(rules):
            unified = unify(list(triple), rule, train, substitution={}, neg_per_pos=neg_per_pos)
            for k, paths in enumerate(unified):
                for path in paths:
                    # One copy of the path per augmentation; only the
                    # template side carries the '_<template>_<aug>' suffix.
                    suffixes = ['_' + str(rule_idx) + '_' + str(aug) for aug in range(rule[-1])]
                    rule_side[k][rule_idx].append(
                        [[r1 + suffix for r1, _ in path] for suffix in suffixes]
                    )
                    kb_side[k][rule_idx].append(
                        [[r2 for _, r2 in path] for _ in suffixes]
                    )

        for k in range(len(unified)):
            total_syms_list_r1.append(rule_side[k])
            total_syms_list_r2.append(kb_side[k])

    print('symbolic time : ', datetime.now() - started)

    # convert symbolic into ids
    # debug : to check time
    started = datetime.now()

    total_ids_list_r1 = _symbols_to_ids(total_syms_list_r1)
    total_ids_list_r2 = _symbols_to_ids(total_syms_list_r2)

    print('converting time : ', datetime.now() - started)

    return total_ids_list_r1, total_ids_list_r2, total_syms_list_r1, total_syms_list_r2
#    return total_syms_list_r1, total_syms_list_r2
    
    
# Run symbolic unification for every test query against every rule template;
# keeps both the id-encoded and the symbolic form of the r1 / r2 lists.
total_ids_list_r1, total_ids_list_r2, total_syms_list_r1, total_syms_list_r2 = generate_batch(test, rules, neg_per_pos)
#total_ids_list_r1, total_ids_list_r2 = generate_batch(new_test, rules, predicate_set, neg_per_pos)

depth :  0
else
BART
BART
else
USA
USA
##[1],[2]##
substitution = 
{'Q':    subj         pred  obj
0  BART  nationality  USA,
 'X': ['BART',
       (   subj          pred      obj
0  BART   nationality      USA
2  BART  placeOfBirth  NEWYORK
4  BART     hasFather   HOMMER,
          subj        pred   obj
1  USA  hasCitizen  BART)],
 'Y': ['USA',
       (  subj        pred   obj
1  USA  hasCitizen  BART,
              subj         pred  obj
0     BART  nationality  USA
3  NEWYORK    locatedIn  USA
5   HOMMER  nationality  USA)]}
sub-goal = 
  subj        pred   obj
1  USA  hasCitizen  BART
depth :  1
merge body df = 
  subj        pred   obj
0  USA  hasCitizen  BART
final body df = 
  comVar                    triples
0   BART  [[USA, hasCitizen, BART]]
before join
  comVar                     triples
0   BART  [[BART, nationality, USA]]
  comVar                    triples
0   BART  [[USA, hasCitizen, BART]]
after join tmp_result_df = 
  comVar                triples_left              

### Symbolic Unification Result
- two lists 
    - r1 : rule template predicates
    - r2 : unified predicates
    

### Check Results

In [20]:
# print(colored('sim_syms', 'red', attrs=['bold']))
# for sim_syms_list in total_syms_list_r1:
#     for sim_syms_l in sim_syms_list:
#         for sim_syms in sim_syms_l:
#             for sim in sim_syms:
#                 print(sim)
#         print()

# for sim_syms_list in total_syms_list_r2:
#     for sim_syms_l in sim_syms_list:
#         for sim_syms in sim_syms_l:
#             for sim in sim_syms:
#                 print(sim)
#         print()

### Convert Symbolic Unification Result from string into index
- convert string into index using dictionary
- two lists
    - r1 : rule template predicates
    - r2 : unified predicates

In [21]:
# total_ids_list_r1

In [22]:
# total_ids_list_r2[:5]

### data filtering

In [23]:
def data_filter(data):
    """Return True when at least one template in *data* is non-empty.

    Used as a ``filter`` predicate to drop queries for which no rule
    template produced a proof path.
    """
    # A list (not a generator) so every template is inspected, as before.
    return any([len(template) != 0 for template in data])
# Sizes before filtering.
print(len(total_ids_list_r2))
print(len(total_ids_list_r1))
# Drop every query for which no template produced a proof path.
total_ids_list_r1 = [sample for sample in total_ids_list_r1 if data_filter(sample)]
total_ids_list_r2 = [sample for sample in total_ids_list_r2 if data_filter(sample)]
# Sizes after filtering.
print(len(total_ids_list_r1))
print(len(total_ids_list_r2))

1
1
1
1


### make neg data & append padding

In [24]:
#shin
# Number of atoms per rule template (the trailing augmentation count is not an atom).
atoms_each_template = [len(rule) - 1 for rule in rules]

atoms_each_template

[2, 3]

In [25]:
# Augmentation count, taken from the first template.
augment = rules[0][-1]
num_templates = len(total_ids_list_r1[0])

time1 = datetime.now()

# Number of proof paths for every (query, template) pair, and the largest one.
num_paths = [
    np.array(sample[j]).shape[0]
    for sample in total_ids_list_r1
    for j in range(num_templates)
]
max_path = max(num_paths)



In [26]:
# For every template slot ('<placeholder>_<template index>', augmentation
# index dropped) collect the distinct KB predicates it was unified with,
# in first-seen order.
checkdict_sym = {}

for rule_query, kb_query in zip(total_syms_list_r1, total_syms_list_r2):
    for rule_template, kb_template in zip(rule_query, kb_query):
        for rule_path, kb_path in zip(rule_template, kb_template):
            for rule_aug, kb_aug in zip(rule_path, kb_path):
                for rule_sym, kb_sym in zip(rule_aug, kb_aug):
                    parts = rule_sym.split('_')
                    slot = parts[0] + '_' + parts[1]
                    seen = checkdict_sym.setdefault(slot, [])
                    if kb_sym not in seen:
                        seen.append(kb_sym)

keys = list(checkdict_sym)

In [27]:
keys

['#1_0', '#2_0', '#1_1', '#2_1', '#3_1']

In [28]:
checkdict_sym

{'#1_0': ['nationality'],
 '#2_0': ['hasCitizen'],
 '#1_1': ['nationality'],
 '#2_1': ['placeOfBirth', 'hasFather'],
 '#3_1': ['locatedIn', 'nationality']}

### head body 랜덤 변경

In [29]:
# Build the training lists: every positive sample is followed by neg_per_pos
# corrupted copies of its KB-side (r2) ids. The rule-side (r1) entry is the
# same `query1` object for the positive and for each of its negatives.
new_total_ids_list_r1 = []
new_total_ids_list_r2 = []
# NOTE(review): overrides the neg_per_pos chosen in the configuration cell.
neg_per_pos = 2
for idx, (query1,query2) in enumerate(zip(total_ids_list_r1, total_ids_list_r2)):
    # progress output every 100 queries
    if idx % 100 ==0:
        print(idx)
    # positive sample, appended unchanged
    new_total_ids_list_r1.append(query1)
    new_total_ids_list_r2.append(query2)
    for i in range(neg_per_pos):
        # corrupt a deep copy so the positive sample stays intact
        new_query2 = copy.deepcopy(query2)
        for template_idx, template1 in enumerate(new_query2):
            # templates without any proof path have nothing to corrupt
            if len(template1) == 0:
                continue
            for path_idx, path1 in enumerate(template1):
                 for aug_idx, aug1 in enumerate(path1):
                    if aug_idx == 0:
                        # First augmentation: replace every atom of the path.
                        for atom_idx, atom1 in enumerate(aug1):
                            # find the corresponding rule-template predicate
                            rule_pred = id2sym_dict[query1[template_idx][path_idx][aug_idx][atom_idx]]
                            # reduce it to its slot key '<placeholder>_<template index>'
                            rule_pred = rule_pred.split('_')
                            key = rule_pred[0]+'_'+rule_pred[1]

                            # exclude the atom's own slot from the candidates
                            filtered_keys = copy.deepcopy(keys)
                            filtered_keys.remove(key)
                            # choose another slot at random
                            new_key = random.choice(filtered_keys)
                            # choose one of the KB predicates seen for that slot at random
                            new_id = sym2id_dict[random.choice(checkdict_sym[new_key])]
                            new_query2[template_idx][path_idx][aug_idx][atom_idx] = new_id
                    else :
                        # Remaining augmentations: rebuild the whole path as
                        # `augment` copies of the corrupted first augmentation,
                        # then stop iterating the (now replaced) path.
                        neg_path = np.full((augment,len(aug1)), new_query2[template_idx][path_idx][0]).tolist()
                        new_query2[template_idx][path_idx] = neg_path
                        break

        new_total_ids_list_r1.append(query1)
        new_total_ids_list_r2.append(new_query2)

0


In [30]:
id2sym_dict

{0: 'UNK',
 1: '#1_0_0',
 2: '#1_0_1',
 3: '#1_0_2',
 4: '#1_1_0',
 5: '#1_1_1',
 6: '#1_1_2',
 7: '#2_0_0',
 8: '#2_0_1',
 9: '#2_0_2',
 10: '#2_1_0',
 11: '#2_1_1',
 12: '#2_1_2',
 13: '#3_1_0',
 14: '#3_1_1',
 15: '#3_1_2',
 16: 'hasCitizen',
 17: 'hasFather',
 18: 'locatedIn',
 19: 'nationality',
 20: 'placeOfBirth'}

In [31]:
total_ids_list_r2[:5]

[[[[[19, 16], [19, 16], [19, 16]]],
  [[[19, 20, 18], [19, 20, 18], [19, 20, 18]],
   [[19, 17, 19], [19, 17, 19], [19, 17, 19]]]]]

In [32]:
new_total_ids_list_r2[:6]

[[[[[19, 16], [19, 16], [19, 16]]],
  [[[19, 20, 18], [19, 20, 18], [19, 20, 18]],
   [[19, 17, 19], [19, 17, 19], [19, 17, 19]]]],
 [[[[19, 19], [19, 19], [19, 19]]],
  [[[16, 19, 19], [16, 19, 19], [16, 19, 19]],
   [[19, 19, 16], [19, 19, 16], [19, 19, 16]]]],
 [[[[19, 19], [19, 19], [19, 19]]],
  [[[18, 19, 16], [18, 19, 16], [18, 19, 16]],
   [[16, 16, 19], [16, 16, 19], [16, 16, 19]]]]]

### padding

In [33]:
# Augmentation count, taken from the first template.
augment = rules[0][-1]
num_templates = len(total_ids_list_r1[0])

time1 = datetime.now()

# Largest number of proof paths over all (sample, template) pairs.
num_paths = [
    np.array(sample[j]).shape[0]
    for sample in new_total_ids_list_r1
    for j in range(num_templates)
]
max_path = max(num_paths)

# Pad every template of every sample, rule side first and then KB side, with
# all-zero paths (id 0 = 'UNK') until it holds max_path paths. The path count
# is re-read per entry, so a sample object that appears several times in a
# list is padded only once.
atom_counts = list(atoms_each_template)
for id_lists in (new_total_ids_list_r1, new_total_ids_list_r2):
    for sample in id_lists:
        for j in range(num_templates):
            padding = np.zeros((augment, atom_counts[j]), dtype=int)
            missing = max_path - np.array(sample[j]).shape[0]
            for _ in range(missing):
                sample[j].append(padding.tolist())


time2 = datetime.now()
print(time2-time1)

0:00:00


In [34]:
new_total_ids_list_r2

[[[[[19, 16], [19, 16], [19, 16]], [[0, 0], [0, 0], [0, 0]]],
  [[[19, 20, 18], [19, 20, 18], [19, 20, 18]],
   [[19, 17, 19], [19, 17, 19], [19, 17, 19]]]],
 [[[[19, 19], [19, 19], [19, 19]], [[0, 0], [0, 0], [0, 0]]],
  [[[16, 19, 19], [16, 19, 19], [16, 19, 19]],
   [[19, 19, 16], [19, 19, 16], [19, 19, 16]]]],
 [[[[19, 19], [19, 19], [19, 19]], [[0, 0], [0, 0], [0, 0]]],
  [[[18, 19, 16], [18, 19, 16], [18, 19, 16]],
   [[16, 16, 19], [16, 16, 19], [16, 16, 19]]]]]

### Check Converted Results

In [35]:
# for debug

# print(colored('sim_ids', 'red', attrs=['bold']))
# for sim_ids_list in total_ids_list_r1:
#     for sim_ids_l in sim_ids_list:
#         for sim_ids in sim_ids_l:
#             for sim in sim_ids:
#                 print(sim)
#             print()

# for sim_ids_list in total_ids_list_r2:
#     for sim_ids_l in sim_ids_list:
#         for sim_ids in sim_ids_l:
#             for sim in sim_ids:
#                 print(sim)
#             print()

### Define Functions
- l2_sim
- model

In [36]:
def l2_sim(a, b):
    """
    - function: similarity of two embedding tensors, exp(-L2 distance)
    :param a: tensor whose last dimension holds the embedding vectors; a 3-D
              input is treated as one un-batched item and gets a leading
              dimension of size 1
    :param b: tensor with the same shape as a

    :return: tensor shaped like a (after the optional unsqueeze) without its
             last dimension
    """
    if a.dim() == 3:  # batch_size = 1
        a = a.unsqueeze(0)
        b = b.unsqueeze(0)

    # Same epsilon that pairwise_distance adds to the difference by default;
    # it also keeps the gradient finite where a == b.
    eps = 1e-6
    diff = a - b + eps

    # Reduce explicitly over the last (embedding) dimension rather than calling
    # pairwise_distance on transposed inputs: the axis that function reduces
    # for inputs with more than two dimensions differs between PyTorch versions.
    dist = torch.sqrt(torch.sum(diff * diff, dim=-1))
    return torch.exp(-dist)


In [37]:
class NTP_(nn.Module):
    """
    Embedding-based scorer: looks up symbol ids, compares the two sides with
    l2_sim and aggregates the similarities into one score per batch element.
    """

    def __init__(self, vocab_size, embedding_size, batch_size, num_templates, max_path):
        """
        :param vocab_size: number of rows of the embedding matrix
        :param embedding_size: length of each embedding vector
        :param batch_size: number of examples forward() expects per call
        :param num_templates: number of rule templates (stored as template_size)
        :param max_path: stored on the module; not read by the methods below
        """
        super(NTP_, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.embedding_matrix = nn.Embedding(self.vocab_size, self.embedding_size)
        self.loss = torch.nn.BCELoss()
        self.batch_size = batch_size
        self.template_size = num_templates
        self.max_path = max_path

    def _embed(self, ids):
        """
        - function: embed ids on whatever device the embedding weights live on
        :param ids: nested list or tensor of integer symbol ids

        :return: embedding tensor with one extra trailing dimension
        """
        # Following the weights' device keeps the module usable on CPU as well
        # as after .cuda() / .to(device).
        device = self.embedding_matrix.weight.device
        lookup = torch.as_tensor(ids, dtype=torch.long, device=device)
        return self.embedding_matrix(lookup)

    def calculate_sim_avg(self, r1, r2):
        """
        - function: per-template similarity between r1 and r2, averaged over dim 2
        :param r1: sequence indexed by template; each entry holds integer ids
                   (presumably shaped (rows, augment, atoms) - TODO confirm)
        :param r2: sequence with the same layout as r1

        :return: the per-template averages concatenated along dim 0
        """
        sims_list = []

        for i in range(self.template_size):
            # Templates without any entry on either side contribute nothing.
            if len(r1[i]) == 0 or len(r2[i]) == 0:
                continue

            sims = l2_sim(self._embed(r1[i]), self._embed(r2[i]))
            avg_sims = torch.squeeze(torch.mean(sims, 2, True), dim=-1)

            if avg_sims.dim() == 1:
                avg_sims = avg_sims.unsqueeze(0)
            sims_list.append(avg_sims)

        return torch.cat(sims_list, dim=0)

    def forward(self, r1, r2):
        """
        - function: score every example of a batch
        :param r1: per-template id arrays, passed on to calculate_sim_avg
        :param r2: per-template id arrays, passed on to calculate_sim_avg

        :return: 1-D tensor with batch_size entries
        """
        avg_sims = self.calculate_sim_avg(r1, r2)

        # calculate_sim_avg stacks the templates along dim 0. The chunk/cat
        # sequence below regroups those rows so that the rows belonging to one
        # batch element, across all templates, end up next to each other.
        # NOTE(review): assumes every template contributes the same number of
        # rows and that this number is a multiple of batch_size - confirm.
        x = torch.chunk(avg_sims, self.template_size, dim=0)
        x = torch.cat(x, dim=1)
        x = list(torch.chunk(x, self.batch_size, dim=0))
        for i, t in enumerate(x):
            x[i] = torch.cat(torch.chunk(t, chunks=self.template_size, dim=1))  # template
        sims = torch.cat(x)

        # Highest similarity of each row, then the lowest of those per batch element.
        max_sims = torch.max(sims, dim=1)[0]
        max_sims = max_sims.reshape(self.batch_size, -1)
        min_sims = torch.min(max_sims, dim=1)[0]

        return min_sims

In [38]:
# Check for a usable CUDA device; the cells below move the model and tensors with .cuda().
torch.cuda.is_available()

True

In [43]:
# Model hyper-parameters.
pos_per_batch = 1
embedding_size = 100
vocab_size = len(sym2id_dict)

# A batch holds pos_per_batch positives plus neg_per_pos negatives for each of them.
batch_size = pos_per_batch * (1 + neg_per_pos)
print(batch_size)

# Build the model and move it to the GPU; the trailing expression displays it.
ntp = NTP_(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    batch_size=batch_size,
    num_templates=num_templates,
    max_path=max_path,
)
ntp.cuda()

3


NTP_(
  (embedding_matrix): Embedding(21, 100)
  (loss): BCELoss()
)

In [44]:
# Target labels of one batch: a 1 for each positive, directly followed by a 0
# for each of its neg_per_pos negatives.
labels = ([1] + [0] * neg_per_pos) * pos_per_batch
answer = torch.tensor(labels, dtype=torch.float32).cuda()
answer

tensor([1., 0., 0.], device='cuda:0')

In [45]:
from datetime import datetime, timedelta
# Start of the training timer; time1 is also reused later to stamp the output
# file names with the hour and minute.
time1 = datetime.now()

In [46]:
epochs = 10
report_interver_epoch = 10
report_interver_iter = 1000
optimizer = torch.optim.Adam(ntp.parameters(), lr = 0.001, weight_decay = 0.01)
data_size = len(total_ids_list_r1)


def _stack_by_template(batch):
    """
    - function: regroup a batch of per-example id lists by template
    :param batch: list of examples; batch[b][t] holds the ids of example b for
                  template t

    :return: list with one long tensor per template, the examples' ids
             concatenated along dim 0
    """
    return [
        torch.cat([torch.tensor(example[t], dtype=torch.long) for example in batch])
        for t in range(len(batch[0]))
    ]


for epoch in range(1, epochs+1):
    if epoch%report_interver_epoch == 0:
        print('epochs: ',epoch)

    ntp.train()
    for start in range(0, data_size, batch_size):
        # Report by iteration number, so the interval does not depend on how
        # the sample offset happens to divide.
        iteration = start // batch_size
        if iteration % report_interver_iter == 0:
            print('iteration : ', iteration)

        r1 = new_total_ids_list_r1[start:start+batch_size]
        r2 = new_total_ids_list_r2[start:start+batch_size]
        if len(r1) < batch_size:
            # Incomplete trailing batch: the model reshapes by its fixed batch_size.
            continue

        real_r = _stack_by_template(r1)
        template_r = _stack_by_template(r2)
        # Nothing to score when no template has any entry.
        if all(len(t) == 0 for t in real_r):
            continue

        optimizer.zero_grad()
        y_hat = ntp(real_r, template_r)
        # Labels follow the device of the predictions.
        target = answer.to(y_hat.device)
        loss = ntp.loss(y_hat, target)
        loss.backward()
        optimizer.step()
    if epoch%report_interver_epoch == 0:
        print('########################loss##################### : ', loss.item())

iteration :  0
iteration :  0
iteration :  0
iteration :  0
iteration :  0
iteration :  0
iteration :  0
iteration :  0
iteration :  0
epochs:  10
iteration :  0
########################loss##################### :  4.370722770690918




In [47]:
# Print the wall-clock time elapsed since time1 was taken.
time2 = datetime.now() 
print(time2-time1)

0:00:03.472502


In [48]:
def representation_match(x, emb):
    """
    - function: similarity between x and emb, exp(-pairwise L2 distance)
    :param x: tensor handed to pairwise_distance as its first argument
    :param emb: tensor handed to pairwise_distance as its second argument

    :return: exp(-distance) for every distance pairwise_distance returns
    """
    distance = F.pairwise_distance(x, emb)
    return (-distance).exp()

In [49]:
# Print every model parameter; `embeddings` ends up bound to the last one
# (presumably the embedding matrix - verify against the model definition).
for weight in ntp.parameters():
    print(weight)
    embeddings = weight

Parameter containing:
tensor([[ 0.5255,  0.9607,  2.0689,  ...,  0.7213,  0.5457, -1.0209],
        [ 0.1921, -0.9273, -0.0340,  ..., -1.3148,  1.4988, -0.5186],
        [-0.8215,  0.6012,  0.1793,  ..., -0.6662,  1.5138,  0.9429],
        ...,
        [ 1.4940,  0.1747, -0.7029,  ...,  1.1727,  0.6480,  1.2526],
        [-1.0773,  1.4312,  1.2800,  ...,  0.8377,  0.6742, -0.5619],
        [ 1.1327, -0.3858,  0.9704,  ...,  0.5682, -1.1591, -0.8872]],
       device='cuda:0', requires_grad=True)


In [50]:
rule_templates = {}
ids_rule_templates = {}
for rule_number, template in enumerate(rules):
    # The last entry of a template is its number of copies; the entries before
    # it are three-field atoms (predicate placeholder, first arg, second arg).
    atoms = template[:-1]
    num_copies = template[-1]

    # Key: each placeholder renamed to 'p<k-1>', where k is the digit found at
    # the second character of the placeholder.
    result_template_key = tuple(
        ('p' + str(int(atom[0][1]) - 1), atom[1], atom[2]) for atom in atoms
    )

    # Symbolic form: one list of atoms per copy, the placeholder suffixed with
    # the rule number and the copy number.
    rule_templates[result_template_key] = [
        [[atom[0] + '_' + str(rule_number) + '_' + str(aug), atom[1], atom[2]]
         for atom in atoms]
        for aug in range(num_copies)
    ]

    # Id form: same layout, the suffixed placeholder replaced by its id.
    ids_rule_templates[result_template_key] = [
        [[sym2id_dict[atom[0] + '_' + str(rule_number) + '_' + str(aug)], atom[1], atom[2]]
         for atom in atoms]
        for aug in range(num_copies)
    ]

ids_rule_templates


{(('p0', 'X', 'Y'), ('p1', 'Y', 'X')): [[[1, 'X', 'Y'], [7, 'Y', 'X']],
  [[2, 'X', 'Y'], [8, 'Y', 'X']],
  [[3, 'X', 'Y'], [9, 'Y', 'X']]],
 (('p0', 'X', 'Y'),
  ('p1', 'X', 'Z'),
  ('p2', 'Z', 'Y')): [[[4, 'X', 'Y'],
   [10, 'X', 'Z'],
   [13, 'Z', 'Y']], [[5, 'X', 'Y'],
   [11, 'X', 'Z'],
   [14, 'Z', 'Y']], [[6, 'X', 'Y'], [12, 'X', 'Z'], [15, 'Z', 'Y']]]}

In [51]:
# Ids that may never be chosen as a decoded symbol: the placeholder id of every
# template atom, plus id 0 (the value the padding code fills in). Built once,
# without duplicates, instead of being extended for every atom.
masking_index = sorted(
    {element[0]
     for value in ids_rule_templates.values()
     for rule in value
     for element in rule} | {0}
)

total_reuslt = []
# Decoding only reads the embeddings, so gradient tracking is switched off.
with open(data_name+'_rule.nl', 'w') as f, torch.no_grad():
    for key, value in ids_rule_templates.items():
        f.write(str(key)+'\n')
        for rule in value:
            confidence_score = []
            rule_result = []
            for element in rule:
                # Look the placeholder up on the device the embeddings live on.
                query = torch.tensor([element[0]], device=ntp.embedding_matrix.weight.device)
                x = ntp.embedding_matrix(query)
                match = representation_match(x, embeddings)
                match[masking_index] = 0
                top_k = torch.topk(match, 1)
                rule_result.append(id2sym_dict[top_k.indices.item()]+'('+element[1]+','+element[2]+')')
                confidence_score.append(top_k.values.item())

            # A rule is only as confident as its least confident atom.
            score = min(confidence_score)
            f.write(str(score)+'\t')
            head = rule_result[0]
            body = rule_result[1:]
            f.write(head + ' :- ' +", ".join(body)+'\n')
            total_reuslt.append([(key, score, rule_result)])
        f.write('\n')
total_reuslt

[[((('p0', 'X', 'Y'), ('p1', 'Y', 'X')),
   2.390730514889583e-06,
   ['hasCitizen(X,Y)', 'nationality(Y,X)'])],
 [((('p0', 'X', 'Y'), ('p1', 'Y', 'X')),
   2.2703427475789795e-06,
   ['nationality(X,Y)', 'hasCitizen(Y,X)'])],
 [((('p0', 'X', 'Y'), ('p1', 'Y', 'X')),
   1.1742851029339363e-06,
   ['nationality(X,Y)', 'placeOfBirth(Y,X)'])],
 [((('p0', 'X', 'Y'), ('p1', 'X', 'Z'), ('p2', 'Z', 'Y')),
   9.18394675863965e-07,
   ['hasCitizen(X,Y)', 'placeOfBirth(X,Z)', 'locatedIn(Z,Y)'])],
 [((('p0', 'X', 'Y'), ('p1', 'X', 'Z'), ('p2', 'Z', 'Y')),
   8.387845014112827e-07,
   ['placeOfBirth(X,Y)', 'hasCitizen(X,Z)', 'hasCitizen(Z,Y)'])],
 [((('p0', 'X', 'Y'), ('p1', 'X', 'Z'), ('p2', 'Z', 'Y')),
   1.0808163324327325e-06,
   ['nationality(X,Y)', 'hasCitizen(X,Z)', 'locatedIn(Z,Y)'])]]

In [52]:
def _leading_score(line):
    """
    - function: parse the score field of a rule line
    :param line: one line read from the rule file

    :return: the float in front of the first tab, or None when the line has no
             tab or that field is not a number
    """
    field, tab, _ = line.partition('\t')
    if not tab:
        return None
    try:
        # float() also accepts scientific notation such as 2.39e-06.
        return float(field)
    except ValueError:
        return None


def _write_sorted(out, scores, rule_texts):
    """
    - function: write one group of rules, highest score first
    :param out: writable text file
    :param scores: scores of the group
    :param rule_texts: rule strings (newline included), aligned with scores
    """
    order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    for k in order:
        out.write(str(scores[k])+'\t')
        out.write(rule_texts[k])


sorted_path = (data_name+'_rule_batch'+str(batch_size)+'_epoch'+str(epochs)+'_aug'+
               str(augment)+'_sorted_'+str(time1)[11:13]+str(time1)[14:16]+'.nl')
with open(sorted_path, 'w') as file, open(data_name+'_rule.nl', 'r') as f:
    scores = []
    rule = []
    total_scores = []
    total_rules = []
    for line in f:
        score = _leading_score(line)
        if score is not None:
            scores.append(round(score, 8))
            rule.append(line.split('\t')[-1])
            continue

        # A header or blank line closes the current group of score lines, so
        # templates with different numbers of rules are grouped correctly.
        if scores:
            total_scores.append(scores)
            total_rules.append(rule)
            _write_sorted(file, scores, rule)
            scores = []
            rule = []
        file.write(line)

    # Group left open at the end of the file.
    if scores:
        total_scores.append(scores)
        total_rules.append(rule)
        _write_sorted(file, scores, rule)
        scores = []
        rule = []

In [53]:
top = 5

# Ids that may never be chosen as a decoded symbol: the placeholder id of every
# template atom, plus id 0 (the value the padding code fills in). Built once,
# without duplicates, instead of being extended for every atom.
masking_index = sorted(
    {element[0]
     for value in ids_rule_templates.values()
     for rule in value
     for element in rule} | {0}
)

# topk cannot return more candidates than there are embedding rows.
num_candidates = min(top, embeddings.shape[0])

total_result = []
top_path = (data_name + '_rule_batch'+str(batch_size)+'_neg'+str(neg_per_pos)+'_epoch'+str(epochs)+
            '_aug'+str(augment)+'_top'+str(top)+'_'+str(time1)[11:13]+str(time1)[14:16]+'.nl')
# Decoding only reads the embeddings, so gradient tracking is switched off.
with open(top_path, 'w') as f, torch.no_grad():
    for key, value in ids_rule_templates.items():
        f.write(str(key)+'\n')
        for rule in value:
            confidence_score = []
            rule_result = []
            for element in rule:
                # Look the placeholder up on the device the embeddings live on.
                query = torch.tensor([element[0]], device=ntp.embedding_matrix.weight.device)
                x = ntp.embedding_matrix(query)
                match = representation_match(x, embeddings)
                match[masking_index] = 0
                top_k = torch.topk(match, num_candidates)

                # Candidate symbols of this atom, best match first, and their scores.
                rule_result.append([
                    id2sym_dict[symbol_id]+'('+element[1]+','+element[2]+')'
                    for symbol_id in top_k.indices.tolist()
                ])
                confidence_score.append(top_k.values.tolist())

            # Rank-wise confidence: the weakest atom among the rank-i candidates.
            confidence_score = np.min(np.array(confidence_score), axis=0)
            for i in range(num_candidates):
                atoms = [candidates[i] for candidates in rule_result]
                f.write(str(confidence_score[i])+'\t')
                # Every rule line ends with a newline, also when it has no body.
                f.write(atoms[0] + ' :- ' + ', '.join(atoms[1:]) + '\n')
            f.write('\n')
        f.write('\n')