In [1]:
import rdflib
import re
import numpy as np
from SPARQLWrapper import SPARQLWrapper, JSON
from pyjarowinkler import distance

# Load Word Embeddings

In [2]:
import logging
from six import iteritems

In [3]:
%%time
# Load the GloVe text file into a dict mapping each token to a NumPy vector.
# Each line is split once on the first space: the token, then the
# space-separated vector components.
glove_vectors_file = "glove.6B.300d.txt"
glove_wordmap = {}
with open(glove_vectors_file, "r", encoding="utf-8") as glove:
    for line in glove:
        name, vector = tuple(line.split(" ", 1))
        # Parse the remaining text as space-separated numbers.
        glove_wordmap[name] = np.fromstring(vector, sep=" ")

Wall time: 47.2 s


In [4]:
# Shared SPARQL client used by every query helper in this notebook.
# The commented-out line below is an alternative endpoint.
#sparql = SPARQLWrapper("http://134.117.101.79:8890/sparql/")
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

In [5]:
def evaluate_similarity_score(wemb, ar):
    """Score each word pair by the cosine similarity of its embeddings.

    Args:
        wemb: mapping from word to embedding vector.
        ar: iterable of pairs; items 0 and 1 of each pair are looked up in wemb.

    Returns:
        A list aligned with ``ar`` holding the cosine similarity of each pair,
        or -1 for a pair with at least one word missing from ``wemb``.
    """
    def cosine(first, second):
        return np.dot(first,second)/(np.linalg.norm(first)*(np.linalg.norm(second)))

    return [
        cosine(wemb[pair[0]], wemb[pair[1]])
        if pair[0] in wemb and pair[1] in wemb
        else -1
        for pair in ar
    ]

# isEntityInDatabase Method

In [6]:
def isEntityInDatabase(s):
    """Return True when ``s`` begins with "http", False otherwise."""
    hasHttpPrefix = s.startswith("http")
    return hasHttpPrefix

In [7]:
def countOutgoingEdges(obj):
    """Count the distinct subjects linked to ``obj`` on the shared endpoint.

    NOTE: despite the function name, the pattern is ``?s ?p <obj>``, so the
    number counts distinct subjects that have ``obj`` as their object.

    Args:
        obj: resource URI, inserted between angle brackets in the query.

    Returns:
        The count as an int, or 0 when the endpoint returns no binding
        (previously None, which made ``count < th`` in the caller fail).
    """
    queryString = "SELECT count(DISTINCT ?s) as ?c WHERE { ?s ?p <" + obj +">}"
    sparql.setQuery(queryString)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # The aggregate yields one row at most; only the first is read.
    for result in results["results"]["bindings"]:
        return int(result['c']['value'])
    return 0
# Quick check of countOutgoingEdges against the configured endpoint (needs network access).
countOutgoingEdges("http://dbpedia.org/resource/Viking_Press")

1019

# MakeQueryString Method

In [8]:
def makeQueryString(obj, filterStringsPredicates, filterStringSubjects, literals):
    """Build the SPARQL query that fetches the one-hop neighbourhood of ``obj``.

    Args:
        obj: a literal already formatted for SPARQL (when it is in
            ``literals``) or a resource URI without angle brackets.
        filterStringsPredicates: substrings; predicates matching any of them
            (case-insensitive regex) are excluded.
        filterStringSubjects: substrings; neighbouring nodes matching any of
            them (case-insensitive regex) are excluded.
        literals: collection of literals; decides which query shape is used.

    Returns:
        For a literal, a query selecting ``?s ?p`` with ``?s ?p obj``.
        Otherwise a UNION of incoming (``?s ?p <obj>``) and outgoing
        (``<obj> ?j ?k``) edges. With no filter strings at all, the FILTER
        clause is omitted (this case used to raise ValueError).
    """
    # Materialise once so one-shot iterables can feed both UNION branches.
    predicateFilters = list(filterStringsPredicates)
    nodeFilters = list(filterStringSubjects)

    def filterClause(predicateVar, nodeVar):
        conditions = ["!regex(str(" + predicateVar + "), '" + s + "' , 'i')" for s in predicateFilters]
        conditions += ["!regex(str(" + nodeVar + "), '" + s + "' , 'i')" for s in nodeFilters]
        if not conditions:
            return ""
        # The space before ")" keeps the text identical to the original output.
        return "FILTER (" + " && ".join(conditions) + " )"

    if obj in literals:
        return "SELECT distinct ?s ?p WHERE { {?s ?p " + obj + " . } " + filterClause("?p", "?s") + "}"

    incomingQuery = "{?s ?p <" + obj + "> . " + filterClause("?p", "?s") + "}"
    outgoingQuery = "{<" + obj + "> ?j ?k. " + filterClause("?j", "?k") + "}"
    return "SELECT distinct ?s ?p ?j ?k WHERE { " + incomingQuery + " UNION " + outgoingQuery + "}"

In [9]:
def makeQueryStringWithThreshold(obj, filterStringsPredicates, filterStringSubjects, literals, th=10000):
    """Build the one-hop SPARQL query for ``obj``, limited for very connected nodes.

    Args:
        obj: a literal already formatted for SPARQL (when it is in
            ``literals``) or a resource URI without angle brackets.
        filterStringsPredicates: substrings; predicates matching any of them
            (case-insensitive regex) are excluded.
        filterStringSubjects: substrings; neighbouring nodes matching any of
            them (case-insensitive regex) are excluded.
        literals: collection of literals; decides which query shape is used.
        th: when ``countOutgoingEdges(obj)`` reaches this value, only the
            ``<obj> ?j ?k`` edges are queried. Defaults to the former
            hard-coded 10000.

    Returns:
        The query string. With no filter strings at all, the FILTER clause is
        omitted (this case used to raise ValueError).
    """
    # Materialise once so one-shot iterables can feed both UNION branches.
    predicateFilters = list(filterStringsPredicates)
    nodeFilters = list(filterStringSubjects)

    def filterClause(predicateVar, nodeVar):
        conditions = ["!regex(str(" + predicateVar + "), '" + s + "' , 'i')" for s in predicateFilters]
        conditions += ["!regex(str(" + nodeVar + "), '" + s + "' , 'i')" for s in nodeFilters]
        if not conditions:
            return ""
        # The space before ")" keeps the text identical to the original output.
        return "FILTER (" + " && ".join(conditions) + " )"

    if obj in literals:
        return "SELECT distinct ?s ?p WHERE { {?s ?p " + obj + " . } " + filterClause("?p", "?s") + "}"

    outgoingQuery = "{<" + obj + "> ?j ?k. " + filterClause("?j", "?k") + "}"

    count = countOutgoingEdges(obj)
    # A missing count (None) is treated as "below the threshold".
    if count is None or count < th:
        incomingQuery = "{?s ?p <" + obj + "> . " + filterClause("?p", "?s") + "}"
        return "SELECT distinct ?s ?p ?j ?k WHERE { " + incomingQuery + " UNION " + outgoingQuery + "}"

    # Highly connected node: the ``?s ?p <obj>`` pattern is skipped.
    return "SELECT distinct ?j ?k WHERE { " + " " + outgoingQuery + "}"

# Triple Class 

In [10]:
class Triple:
    """One edge of the expanded graph: subject, predicate, object plus bookkeeping.

    Equality compares subject, predicate and object only; cost, seeds and the
    previous triple are ignored. Because ``__eq__`` is defined without
    ``__hash__``, instances are unhashable, which suits a mutable object.
    Ordering compares costs.
    """

    def __init__(self):
        self.subject = None
        self.object = None
        self.predicate = None
        self.cost = None
        self.previousTriple = None
        # Sorted list of the seeds connected through this triple.
        self.seeds = []

    def __str__(self):
        if self.subject is None or self.predicate is None or self.object is None:
            return "Not Well Defined Triple"
        # join also covers an empty seed list, which used to raise ValueError.
        connectSeeds = "[" + ", ".join(self.seeds) + "]"
        if self.cost is None:
            # A triple without a cost used to raise TypeError in abs().
            return "{0}  --  {1}  --  {2}  -- {3}  (n/a)".format(self.subject, self.predicate, self.object, connectSeeds)
        return "{0}  --  {1}  --  {2}  -- {3}  ({4:.2f})".format(self.subject, self.predicate, self.object, connectSeeds, abs(self.cost))

    def __eq__(self, other):
        # NotImplemented lets comparisons such as ``None == triple`` evaluate
        # to False instead of raising AttributeError.
        if not isinstance(other, Triple):
            return NotImplemented
        return (self.subject == other.getSubject()
                and self.object == other.getObject()
                and self.predicate == other.getPredicate())

    def __gt__(self, other):
        if not isinstance(other, Triple):
            return NotImplemented
        return self.cost > other.getCost()

    def __lt__(self, other):
        # list.sort() compares with "<"; defined explicitly rather than
        # relying on the reflected __gt__.
        if not isinstance(other, Triple):
            return NotImplemented
        return self.cost < other.getCost()

    def setSeeds(self, fs):
        """Add one seed if it is not present yet, keeping the list sorted."""
        if fs not in self.seeds:
            self.seeds.append(fs)
            self.seeds.sort()

    def getSeeds(self):
        return self.seeds

    def setPreviousTriple(self, pt):
        self.previousTriple = pt

    def getPreviousTriple(self):
        return self.previousTriple

    def updateSeeds(self, newSeeds):
        """Add every seed of ``newSeeds`` through setSeeds."""
        for seed in newSeeds:
            self.setSeeds(seed)

    def setSubject(self, sub):
        self.subject = sub
    def setObject(self, obj):
        self.object = obj
    def setPredicate(self, pre):
        self.predicate = pre
    def setCost(self, c):
        self.cost = c
    def getCost(self):
        return self.cost
    def getSubject(self):
        return self.subject
    def getObject(self):
        return self.object
    def getPredicate(self):
        return self.predicate

# Read most frequent predicates

In [11]:
%%time
# Read predicate frequencies from a tab-separated file: column 0 holds the
# predicate identifier, column 1 its integer frequency.
frequentPredicates = {}
with open("FreqP.txt", "r") as freqP:
    for line in freqP:
        li = line.split("\t")
        # Key by the text after the last '/' of the identifier.
        name = li[0][li[0].rindex('/')+1:]
        frequency = int(li[1])
        frequentPredicates[name] = int(frequency)

Wall time: 15.6 ms


# Expand Method

In [12]:
def expandWithThreshold(queryTriples, eg, di, predicatesToMatch, wemb):
    """Expand each query triple by one hop and score the triples found.

    Relies on the module-level names ``sparql``, ``filterStringPredicates``,
    ``filterStringSubjects`` and ``literalsToConnect``.

    Args:
        queryTriples: triples whose objects are queried.
        eg: expanded graph (list of Triple), extended in place. The previous
            code ignored this parameter and wrote to the global
            ``expandedGraph``.
        di: list of duplicated triples, extended in place via
            addDuplicatedItems when an object is reached more than once.
        predicatesToMatch: keywords forwarded to computeCosts.
        wemb: word-embedding mapping forwarded to computeCosts.

    Returns:
        ``(newQueryTriples, matchingTriples)``: the triples to query in the
        next round (first triple per newly reached resource) and the triples
        that survived computeCosts.
    """
    newQueryTriples = []
    matchingTriples = []
    # Objects already queued for the next round; a set makes the test O(1).
    seenObjects = set()

    print("queryTriple size : " + str(len(queryTriples)))
    for queryObject in queryTriples:
        queryString = makeQueryStringWithThreshold(queryObject.getObject(), filterStringPredicates,filterStringSubjects, literalsToConnect)
        print("----- Current Query ------")
        print(queryString)
        print("--------------------------")

        sparql.setQuery(queryString)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        for result in results["results"]["bindings"]:
            # Rows from the ``?s ?p <obj>`` pattern bind ?s/?p; rows from the
            # ``<obj> ?j ?k`` pattern bind ?j/?k.
            if 's' in result:
                predicateValue = result["p"]["value"]
                neighbourValue = result["s"]["value"]
            else:
                predicateValue = result["j"]["value"]
                neighbourValue = result["k"]["value"]

            # The queried node is always stored as subject and the neighbour
            # as object, whichever pattern produced the row.
            newTriple = Triple()
            newTriple.setSubject(queryObject.getObject())
            newTriple.setPredicate(predicateValue)
            newTriple.setObject(neighbourValue)
            newTriple.setPreviousTriple(queryObject)
            newTriple.updateSeeds(queryObject.getSeeds())

            if newTriple not in eg:
                eg.append(newTriple)
            if newTriple not in matchingTriples:
                matchingTriples.append(newTriple)

            # Only resources (values starting with "http") are expanded further.
            if isEntityInDatabase(neighbourValue):
                if neighbourValue in seenObjects:
                    addDuplicatedItems(newTriple, eg, di)
                else:
                    seenObjects.add(neighbourValue)
                    newQueryTriples.append(newTriple)

        # Kept inside the loop as before: scores and prunes after every query.
        computeCosts(predicatesToMatch, matchingTriples, wemb)
    return newQueryTriples, matchingTriples

In [13]:
def expand(queryTriples, eg, di, predicatesToMatch, wemb):
    """Expand each query triple by one hop and score the triples found (baseline costs).

    Relies on the module-level names ``sparql``, ``filterStringPredicates``,
    ``filterStringSubjects`` and ``literalsToConnect``.

    Args:
        queryTriples: triples whose objects are queried.
        eg: expanded graph (list of Triple), extended in place. The previous
            code ignored this parameter and wrote to the global
            ``expandedGraph``.
        di: list of duplicated triples, extended in place via
            addDuplicatedItems when an object is reached more than once.
        predicatesToMatch: keywords forwarded to computeCostsBaseline.
        wemb: word-embedding mapping forwarded to computeCostsBaseline.

    Returns:
        ``(newQueryTriples, matchingTriples)``: the triples to query in the
        next round (first triple per newly reached resource) and the triples
        that survived computeCostsBaseline.
    """
    newQueryTriples = []
    matchingTriples = []
    # Objects already queued for the next round; a set makes the test O(1).
    seenObjects = set()

    print("queryTriple size : " + str(len(queryTriples)))
    for queryObject in queryTriples:
        queryString = makeQueryString(queryObject.getObject(), filterStringPredicates,filterStringSubjects, literalsToConnect)
        print("----- Current Query ------")
        print(queryString)
        print("--------------------------")

        sparql.setQuery(queryString)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        for result in results["results"]["bindings"]:
            # Rows from the ``?s ?p <obj>`` pattern bind ?s/?p; rows from the
            # ``<obj> ?j ?k`` pattern bind ?j/?k.
            if 's' in result:
                predicateValue = result["p"]["value"]
                neighbourValue = result["s"]["value"]
            else:
                predicateValue = result["j"]["value"]
                neighbourValue = result["k"]["value"]

            # The queried node is always stored as subject and the neighbour
            # as object, whichever pattern produced the row.
            newTriple = Triple()
            newTriple.setSubject(queryObject.getObject())
            newTriple.setPredicate(predicateValue)
            newTriple.setObject(neighbourValue)
            newTriple.setPreviousTriple(queryObject)
            newTriple.updateSeeds(queryObject.getSeeds())

            if newTriple not in eg:
                eg.append(newTriple)
            if newTriple not in matchingTriples:
                matchingTriples.append(newTriple)

            # Only resources (values starting with "http") are expanded further.
            if isEntityInDatabase(neighbourValue):
                if neighbourValue in seenObjects:
                    addDuplicatedItems(newTriple, eg, di)
                else:
                    seenObjects.add(neighbourValue)
                    newQueryTriples.append(newTriple)

        # Kept inside the loop as before: scores and prunes after every query.
        computeCostsBaseline(predicatesToMatch, matchingTriples, wemb)
    return newQueryTriples, matchingTriples

# addDuplicatedItems Method

In [14]:
def addDuplicatedItems(ntp, eg, di):
    """Record every triple of ``eg`` that shares its object with ``ntp``.

    Each such triple not yet in ``di`` is appended to ``di``; when its seed
    list differs from that of ``ntp``, the seeds are merged through
    updateSeedsInExpandedGraph. Always returns the string "Dup".
    """
    for known in eg:
        if known.getObject() != ntp.getObject() or known in di:
            continue
        di.append(known)
        seedsDiffer = known.getSeeds() != ntp.getSeeds()
        if seedsDiffer:
            updateSeedsInExpandedGraph(known, ntp, eg)
    return "Dup"

In [15]:
def updateSeedsInExpandedGraph(tp,ntp,eg):
    """Merge the seeds of ``tp`` into ``ntp`` and propagate the merged list.

    Every triple of ``eg`` whose seed list equals what ``tp`` or ``ntp`` held
    before the merge receives the merged seeds.
    """
    # Snapshots taken before the merge, which mutates ntp's list in place.
    seedsBefore = (list(tp.getSeeds()), list(ntp.getSeeds()))
    ntp.updateSeeds(tp.getSeeds())
    mergedSeeds = ntp.getSeeds()

    for candidate in eg:
        if candidate.getSeeds() in seedsBefore:
            candidate.updateSeeds(mergedSeeds)

# Reduction Tests

In [16]:
def reductionTestsDegreeOne(eg):
    """Remove from ``eg`` (in place) the triples that dangle off the graph.

    A triple is removed when no triple of ``eg`` names it as its previous
    triple and no triple with a different subject shares its object.
    Prints how many triples were selected for removal.
    """
    def isDegreeOne(candidate):
        isLeaf = True
        for other in eg:
            if other.getPreviousTriple() == candidate:
                isLeaf = False
            sameObject = candidate.getObject() == other.getObject()
            if sameObject and not candidate.getSubject() == other.getSubject():
                # The object is reachable from another subject: keep it.
                return False
        return isLeaf

    degreeOneNodes = [candidate for candidate in eg if isDegreeOne(candidate)]
    print("Degree One Nodes size: " + str(len(degreeOneNodes)))
    for doomed in degreeOneNodes:
        eg.remove(doomed)

def keepMinEdge(eg):
    """Keep only the cheapest predicate between each subject/object pair.

    Triples are grouped by (subject, object). Within a group the triple with
    the lowest cost wins (the first one on ties) and triples carrying another
    predicate are removed from ``eg`` in place.

    Fixes two defects: the comparison used ``tp2.getObject`` without calling
    it, so nothing was ever removed; and with that corrected, equal costs
    would have removed both edges of a pair.
    """
    cheapest = {}
    for tp in eg:
        endpoints = (tp.getSubject(), tp.getObject())
        best = cheapest.get(endpoints)
        if best is None or tp.getCost() < best.getCost():
            cheapest[endpoints] = tp

    # Slice assignment mutates the caller's list, as the remove() loop did.
    eg[:] = [
        tp for tp in eg
        if tp.getPredicate() == cheapest[(tp.getSubject(), tp.getObject())].getPredicate()
    ]
    

# checkConnection Method

In [17]:
def checkConnection(ltc, di):
    """Tell whether some triple in ``di`` is connected to every literal of ``ltc``.

    The literals are lower-cased, de-duplicated and sorted. Returns False
    when exactly one distinct literal remains; otherwise returns True when
    any triple of ``di`` has a seed list equal to that sorted list.
    """
    wantedSeeds = sorted({literal.lower() for literal in ltc})
    if len(wantedSeeds) == 1:
        return False
    return any(tp.getSeeds() == wantedSeeds for tp in di)

    
"""
def checkConnection(ltc, di):
    if len(ltc) == 1:
        return False
    else:
        ltc.sort()
        for tp in di:
            if tp.getSeeds() == ltc:
                return True
        return False
"""

'\ndef checkConnection(ltc, di):\n    if len(ltc) == 1:\n        return False\n    else:\n        ltc.sort()\n        for tp in di:\n            if tp.getSeeds() == ltc:\n                return True\n        return False\n'

# ComputeCosts Method

In [27]:
def computeCosts(predicatesToMatch, matchingTriples, wemb, th=50):
    """Assign each triple its lowest matching cost and prune costly triples.

    For every keyword/predicate pair the similarity comes from
    evaluate_similarity_score; when that reports -1 (a word missing from
    ``wemb``) or the resulting cost exceeds ``th``, the ``dw`` string
    similarity is used instead. Cost is ``(1 - similarity) * 100`` and a
    triple keeps the lowest cost seen across keywords and earlier calls.

    Args:
        predicatesToMatch: keywords to compare with each predicate.
        matchingTriples: list of triples, updated and pruned in place.
        wemb: word-embedding mapping passed to evaluate_similarity_score.
        th: cost threshold for both the fallback and the pruning. Defaults
            to the former hard-coded 50.

    Raises:
        TypeError: when a triple still has no cost at pruning time (for
            example when ``predicatesToMatch`` is empty).
    """
    # Local name of each predicate: the text after the last '/', or the whole
    # string when there is no '/' (rindex used to raise ValueError there).
    predicateList = [tp.getPredicate().rsplit('/', 1)[-1] for tp in matchingTriples]

    def stringSimilarity(keyword, localName):
        # Same guard as computeCostsBaseline: empty strings score 0 and are
        # never passed to dw.
        if len(keyword) == 0 or len(localName) == 0:
            return 0
        return dw(keyword, localName)

    for y in predicatesToMatch:
        ar = [[y, p] for p in predicateList]
        result = evaluate_similarity_score(wemb, ar)
        for x, similarity in enumerate(result):
            # TODO (carried over): what if only one of the two words is in
            # the word embedding?
            if similarity == -1 or (1 - similarity) * 100 > th:
                similarity = stringSimilarity(y, predicateList[x])

            cost = (1 - similarity) * 100
            current = matchingTriples[x].getCost()
            if current is None or current > cost:
                matchingTriples[x].setCost(cost)

    # In-place pruning so callers holding the list see the result; replaces
    # the quadratic pop() loop.
    matchingTriples[:] = [tp for tp in matchingTriples if not tp.getCost() > th]

# Baseline 1: Using JW distance only

In [19]:
def computeCostsBaseline(predicatesToMatch, matchingTriples, wemb):
    """Baseline scoring: cost each triple with the ``dw`` string similarity only.

    The cost of a triple is ``(1 - similarity) * 100`` for the best-matching
    keyword, where similarity is 0 when either string is empty. A triple
    keeps the lowest cost seen, and triples costing more than 40 are removed
    from ``matchingTriples`` in place. ``wemb`` is accepted but not used.
    """
    th = 40

    localNames = []
    for triple in matchingTriples:
        uri = triple.getPredicate()
        localNames.append(uri[uri.rindex('/')+1:])

    for keyword in predicatesToMatch:
        for position, localName in enumerate(localNames):
            if len(keyword) == 0 or len(localName) == 0:
                similarity = 0
            else:
                similarity = dw(keyword, localName)

            candidateCost = (1 - similarity) * 100
            target = matchingTriples[position]
            currentCost = target.getCost()
            if currentCost is None or currentCost > candidateCost:
                target.setCost(candidateCost)

    matchingTriples[:] = [triple for triple in matchingTriples if not triple.getCost() > th]

# Test Version, Using Count(*)
## Assumption: if a keyword is a property, then it is not an entity

In [20]:
def isPredicateInDataset(literal):
    """Classify a keyword as property, entity or neither using count queries.

    Two count queries run on the shared ``sparql`` endpoint for
    ``"literal"@en``: first with the labelled resource in predicate position
    (``?ss ?s ?j``), then in subject or object position.

    Args:
        literal: keyword text without quotes or language tag.

    Returns:
        1 when the first count exceeds 100, else 0 when the second count
        exceeds 100, else -1.
    """
    count_th = 100

    def countExceedsThreshold(queryString, label):
        # Runs one count query and reports whether any row exceeds count_th.
        sparql.setQuery(queryString)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            res = int(result["c"]["value"])
            print(str(res) + label)
            if res > count_th:
                return True
        return False

    print("Current literal: " + literal)

    # Escape backslashes and double quotes so the keyword cannot end the
    # quoted SPARQL literal early; ordinary keywords are unchanged.
    quoted = '"' + literal.replace('\\', '\\\\').replace('"', '\\"') + '"@en'

    #Test for property
    if countExceedsThreshold('select distinct Count(?ss) as ?c where {?s ?p ' + quoted + '. ?ss ?s ?j}', " in property"):
        return 1

    #Test for entity
    if countExceedsThreshold('select distinct Count(?ss) as ?c where {{?s ?p ' + quoted + '. ?ss ?j ?s} UNION {?s ?p ' + quoted + '. ?s ?j ?ss}}', " in sub/obj"):
        return 0

    return -1
    

In [21]:
def indentifyEntitiesAndPredicates(keywords):
    """Split a comma-separated keyword string into entity literals and predicates.

    Every non-empty keyword is kept as a predicate candidate. Its variants
    (as written, lower-cased, and capitalised by convertFirstToCapital) are
    classified with isPredicateInDataset: once a variant is classified as a
    property (1), the keyword contributes no entity; otherwise each variant
    classified as an entity (0) is kept.

    Changes from the previous version: variants are checked in a fixed order
    (a set used to make the order arbitrary), the inner loops no longer
    reuse the outer index name, and empty keywords are skipped.

    Args:
        keywords: comma-separated keywords, e.g. ``"Alberta, admit, province"``.

    Returns:
        ``(entity, pre)``: entity literals formatted as ``"text"@en`` and the
        stripped keywords.
    """
    entity = []
    pre = []

    for rawKeyword in keywords.split(','):
        keyword = rawKeyword.strip()
        if not keyword:
            continue

        pre.append(keyword)

        # Ordered de-duplication of the spelling variants.
        toIdentify = []
        for variant in (keyword, keyword.lower(), convertFirstToCapital(keyword)):
            if variant not in toIdentify:
                toIdentify.append(variant)

        print("to identify-------------")
        for variant in toIdentify:
            print(variant)
        print("to identify-------------")

        possibleEnt = []
        for variant in toIdentify:
            res = isPredicateInDataset(variant)
            if res == 1:
                # A property keyword is not treated as an entity.
                possibleEnt.clear()
                break
            elif res == 0:
                possibleEnt.append(variant)

        entity.extend(possibleEnt)

    entity = ['"' + name + '"@en' for name in entity]
    return entity,pre

# Jaro_Winkler Distance

In [22]:
def dw(string1, string2):
    """Return pyjarowinkler's Jaro distance for the pair with the Winkler option on (scaling 0.1)."""
    winklerOptions = {"winkler": True, "scaling": 0.1}
    return distance.get_jaro_distance(string1, string2, **winklerOptions)

# Convert first letter in literals to capital case and lower case for the rest

In [23]:
def convertFirstToCapital(s):
    """Upper-case the first character of each whitespace-separated word and lower-case the rest.

    Words are re-joined with single spaces, so runs of whitespace collapse.
    """
    return " ".join(word[0].upper() + word[1:].lower() for word in s.split())

# Main Program

In [None]:
%%time

#get stop words list
# Stop-word list, one entry per line of StopWords.txt.
swl = []
with open('StopWords.txt','r') as sw:
    swlines = sw.readlines()
    for x in swlines:
        swl.append(x.strip())

# One comma-separated keyword question per line.
with open('qald5-3.txt','r',encoding='utf-8') as f:
    lines = f.readlines()

with open('test_res3.txt','w',encoding='utf-8') as fw:
    for line in lines:
        # Per-question state; the expand/query helpers read these as globals.
        filterStringPredicates = ["wikiPageWikiLink","wikiPageRedirects","wikiPageDisambiguates", "Thing","wikiPageUsesTemplate","rdf-syntax-ns#type"]
        filterStringSubjects = ["entity", "Category", "wikidata","owl#Thing", "http://wikidata.dbpedia.org/resource/Q"]
        expandedGraph = []
        duplicatedItems = []

        literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())
        fw.write("-----Question------\n")
        fw.write(line + '\n')
        fw.write("-------------------\n")
        qtps = []
        for literal in literalsToConnect:
            triple = Triple()
            triple.setObject(literal)
            # checkConnection compares seeds with the lower-cased literals,
            # so the seed must be lower-cased too; with the original casing
            # a connection could never be detected.
            triple.setSeeds(literal.lower())
            qtps.append(triple)

        fw.write('--entities--\n')
        print('---- entities ----')
        for tp in qtps:
            fw.write(str(tp.getObject()) + '\n')
            print(tp.getObject())
        print(predicates)
        # Test Purpose
        #"""
        # At most two expansion rounds, stopping early once the seeds connect.
        levelOfExpansion = 1
        while(levelOfExpansion < 3 and not checkConnection(literalsToConnect, duplicatedItems)):
            #add mtps to a new list
            qtps, mtps = expand(qtps, expandedGraph, duplicatedItems, predicates, glove_wordmap)
            levelOfExpansion += 1

        print("Matching Triples: -----   size: " + str(len(mtps)))
        # Triple ordering compares costs, so this sorts by cost.
        mtps.sort()
        for tp in mtps:
            fw.write(str(tp) + '\n')
            print(tp)
        if checkConnection(literalsToConnect, duplicatedItems):
            reductionTestsDegreeOne(expandedGraph)
            fw.write("Size of expanded graph: "+ str(len(expandedGraph)) + '\n')
            for tp in expandedGraph:
                fw.write(str(tp) + '\n')
        
        #for x in expandedGraph:
        #    print(x)
        #        """

In [None]:
# Debug aid: print the triples of the current expandedGraph whose predicate contains 'capital'.
for x in expandedGraph:
    if 'capital' in x.getPredicate():
        print(x)

# Baseline using exact match for predicates

In [31]:
%%time

# Run over ./Test2/st2.txt using expandWithThreshold (which scores with
# computeCosts) and up to three expansion rounds; results go to
# ./Test2/stree2_res.txt.
with open('./Test2/st2.txt','r',encoding='utf-8') as f:
    lines = f.readlines()

with open('./Test2/stree2_res.txt','w',encoding='utf-8') as fw:
    for line in lines:
        # Per-question state; the expand/query helpers read these as globals.
        filterStringPredicates = ["wikiPageWikiLink","wikiPageRedirects","wikiPageDisambiguates", "Thing","wikiPageUsesTemplate","rdf-syntax-ns#type"]
        filterStringSubjects = ["entity", "Category", "wikidata","owl#Thing", "http://wikidata.dbpedia.org/resource/Q"]
        expandedGraph = []
        duplicatedItems = []

        literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())
        fw.write("-----Question------\n")
        fw.write(line + '\n')
        fw.write("-------------------\n")
        # NOTE: setFirstSeedLiterals is defined in a cell further down the
        # notebook, so that cell must be executed before this one.
        qtps = setFirstSeedLiterals(literalsToConnect)
        """
        for literal in literalsToConnect:
            triple = Triple()
            triple.setObject(literal)
            triple.setSeeds(literal)
            qtps.append(triple)
        """
        fw.write('--entities--\n')
        print('---- entities ----')
        for tp in qtps:
            fw.write(str(tp.getObject()) + '\n')
            print(tp.getObject())
        print(predicates)
        # Test Purpose
        #"""
        # At most three expansion rounds, stopping early once the seeds connect.
        levelOfExpansion = 1
        while(levelOfExpansion < 4 and not checkConnection(literalsToConnect, duplicatedItems)):
            #add mtps to a new list
            qtps, mtps = expandWithThreshold(qtps, expandedGraph, duplicatedItems, predicates, glove_wordmap)
            levelOfExpansion += 1

        print("Matching Triples: -----   size: " + str(len(mtps)))
        # Triple ordering compares costs, so this sorts by cost.
        mtps.sort()
        for tp in mtps:
            fw.write(str(tp) + '\n')
            print(tp)
        # Reduce and dump the expanded graph only when all seeds got connected.
        if checkConnection(literalsToConnect, duplicatedItems):
            reductionTestsDegreeOne(expandedGraph)
            fw.write("Size of expanded graph: "+ str(len(expandedGraph)) + '\n')
            for tp in expandedGraph:
                fw.write(str(tp) + '\n')

to identify-------------
Secret Intelligence Service
secret intelligence service
to identify-------------
Current literal: Secret Intelligence Service
0 in property
3323 in sub/obj
Current literal: secret intelligence service
0 in property
0 in sub/obj
to identify-------------
United Kingdom
united kingdom
to identify-------------
Current literal: United Kingdom
0 in property
1692332 in sub/obj
Current literal: united kingdom
0 in property
415 in sub/obj
---- entities ----
"Secret Intelligence Service"@en
"United Kingdom"@en
"united kingdom"@en
['Secret Intelligence Service', 'United Kingdom']
queryTriple size : 3
----- Current Query ------
SELECT distinct ?s ?p WHERE { {?s ?p "Secret Intelligence Service"@en . } FILTER (!regex(str(?p), 'wikiPageWikiLink' , 'i') && !regex(str(?p), 'wikiPageRedirects' , 'i') && !regex(str(?p), 'wikiPageDisambiguates' , 'i') && !regex(str(?p), 'Thing' , 'i') && !regex(str(?p), 'wikiPageUsesTemplate' , 'i') && !regex(str(?p), 'rdf-syntax-ns#type' , 'i') &

----- Current Query ------
SELECT distinct ?s ?p ?j ?k WHERE { {?s ?p <http://dbpedia.org/resource/United_Kingdom_of_Great_Britain_and_Ireland> . FILTER (!regex(str(?p), 'wikiPageWikiLink' , 'i') && !regex(str(?p), 'wikiPageRedirects' , 'i') && !regex(str(?p), 'wikiPageDisambiguates' , 'i') && !regex(str(?p), 'Thing' , 'i') && !regex(str(?p), 'wikiPageUsesTemplate' , 'i') && !regex(str(?p), 'rdf-syntax-ns#type' , 'i') && !regex(str(?s), 'entity' , 'i') && !regex(str(?s), 'Category' , 'i') && !regex(str(?s), 'wikidata' , 'i') && !regex(str(?s), 'owl#Thing' , 'i') && !regex(str(?s), 'http://wikidata.dbpedia.org/resource/Q' , 'i') )} UNION {<http://dbpedia.org/resource/United_Kingdom_of_Great_Britain_and_Ireland> ?j ?k. FILTER (!regex(str(?j), 'wikiPageWikiLink' , 'i') && !regex(str(?j), 'wikiPageRedirects' , 'i') && !regex(str(?j), 'wikiPageDisambiguates' , 'i') && !regex(str(?j), 'Thing' , 'i') && !regex(str(?j), 'wikiPageUsesTemplate' , 'i') && !regex(str(?j), 'rdf-syntax-ns#type' , 'i

----- Current Query ------
SELECT distinct ?s ?p ?j ?k WHERE { {?s ?p <http://dbpedia.org/resource/Odeon_Leicester_Square> . FILTER (!regex(str(?p), 'wikiPageWikiLink' , 'i') && !regex(str(?p), 'wikiPageRedirects' , 'i') && !regex(str(?p), 'wikiPageDisambiguates' , 'i') && !regex(str(?p), 'Thing' , 'i') && !regex(str(?p), 'wikiPageUsesTemplate' , 'i') && !regex(str(?p), 'rdf-syntax-ns#type' , 'i') && !regex(str(?s), 'entity' , 'i') && !regex(str(?s), 'Category' , 'i') && !regex(str(?s), 'wikidata' , 'i') && !regex(str(?s), 'owl#Thing' , 'i') && !regex(str(?s), 'http://wikidata.dbpedia.org/resource/Q' , 'i') )} UNION {<http://dbpedia.org/resource/Odeon_Leicester_Square> ?j ?k. FILTER (!regex(str(?j), 'wikiPageWikiLink' , 'i') && !regex(str(?j), 'wikiPageRedirects' , 'i') && !regex(str(?j), 'wikiPageDisambiguates' , 'i') && !regex(str(?j), 'Thing' , 'i') && !regex(str(?j), 'wikiPageUsesTemplate' , 'i') && !regex(str(?j), 'rdf-syntax-ns#type' , 'i') && !regex(str(?k), 'entity' , 'i') && !

----- Current Query ------
SELECT distinct ?s ?p ?j ?k WHERE { {?s ?p <http://dbpedia.org/resource/Bell_Hotel,_Thetford> . FILTER (!regex(str(?p), 'wikiPageWikiLink' , 'i') && !regex(str(?p), 'wikiPageRedirects' , 'i') && !regex(str(?p), 'wikiPageDisambiguates' , 'i') && !regex(str(?p), 'Thing' , 'i') && !regex(str(?p), 'wikiPageUsesTemplate' , 'i') && !regex(str(?p), 'rdf-syntax-ns#type' , 'i') && !regex(str(?s), 'entity' , 'i') && !regex(str(?s), 'Category' , 'i') && !regex(str(?s), 'wikidata' , 'i') && !regex(str(?s), 'owl#Thing' , 'i') && !regex(str(?s), 'http://wikidata.dbpedia.org/resource/Q' , 'i') )} UNION {<http://dbpedia.org/resource/Bell_Hotel,_Thetford> ?j ?k. FILTER (!regex(str(?j), 'wikiPageWikiLink' , 'i') && !regex(str(?j), 'wikiPageRedirects' , 'i') && !regex(str(?j), 'wikiPageDisambiguates' , 'i') && !regex(str(?j), 'Thing' , 'i') && !regex(str(?j), 'wikiPageUsesTemplate' , 'i') && !regex(str(?j), 'rdf-syntax-ns#type' , 'i') && !regex(str(?k), 'entity' , 'i') && !rege

----- Current Query ------
SELECT distinct ?s ?p ?j ?k WHERE { {?s ?p <http://dbpedia.org/resource/The_London_Studios> . FILTER (!regex(str(?p), 'wikiPageWikiLink' , 'i') && !regex(str(?p), 'wikiPageRedirects' , 'i') && !regex(str(?p), 'wikiPageDisambiguates' , 'i') && !regex(str(?p), 'Thing' , 'i') && !regex(str(?p), 'wikiPageUsesTemplate' , 'i') && !regex(str(?p), 'rdf-syntax-ns#type' , 'i') && !regex(str(?s), 'entity' , 'i') && !regex(str(?s), 'Category' , 'i') && !regex(str(?s), 'wikidata' , 'i') && !regex(str(?s), 'owl#Thing' , 'i') && !regex(str(?s), 'http://wikidata.dbpedia.org/resource/Q' , 'i') )} UNION {<http://dbpedia.org/resource/The_London_Studios> ?j ?k. FILTER (!regex(str(?j), 'wikiPageWikiLink' , 'i') && !regex(str(?j), 'wikiPageRedirects' , 'i') && !regex(str(?j), 'wikiPageDisambiguates' , 'i') && !regex(str(?j), 'Thing' , 'i') && !regex(str(?j), 'wikiPageUsesTemplate' , 'i') && !regex(str(?j), 'rdf-syntax-ns#type' , 'i') && !regex(str(?k), 'entity' , 'i') && !regex(st

http://dbpedia.org/resource/United_Kingdom_of_Great_Britain_and_Ireland  --  http://dbpedia.org/ontology/birthPlace  --  http://dbpedia.org/resource/Jack_Riley_(ice_hockey,_born_1908)  -- ["secret intelligence service"@en, "united kingdom"@en]  (50.00)
http://dbpedia.org/resource/United_Kingdom_of_Great_Britain_and_Ireland  --  http://dbpedia.org/ontology/birthPlace  --  http://dbpedia.org/resource/James_Gamble_(industrialist)  -- ["secret intelligence service"@en, "united kingdom"@en]  (50.00)
http://dbpedia.org/resource/United_Kingdom_of_Great_Britain_and_Ireland  --  http://dbpedia.org/ontology/birthPlace  --  http://dbpedia.org/resource/Michael_Hardcastle  -- ["secret intelligence service"@en, "united kingdom"@en]  (50.00)
http://dbpedia.org/resource/United_Kingdom_of_Great_Britain_and_Ireland  --  http://dbpedia.org/ontology/birthPlace  --  http://dbpedia.org/resource/Thomas_MacDonald_(cricketer)  -- ["secret intelligence service"@en, "united kingdom"@en]  (50.00)
http://dbpedia.o

Degree One Nodes size: 3647
Wall time: 43.8 s


In [None]:
# Dump every triple of the expanded graph left by the last processed question.
for x in expandedGraph:
    print(x)

In [25]:
#qtps
def setFirstSeedLiterals(ltc):
    """Wrap each literal in a fresh ``Triple`` to start the expansion from.

    For every literal, the triple's object is set to the literal itself and
    its seeds are set to the lower-cased literal.

    Args:
        ltc: iterable of literal strings (each must support ``.lower()``).

    Returns:
        list of ``Triple`` objects, one per literal, in input order.
    """
    qtps = []
    for literal in ltc:
        triple = Triple()
        triple.setObject(literal)
        # The object keeps its original case; only the seed is lower-cased.
        triple.setSeeds(literal.lower())
        qtps.append(triple)
    return qtps

# Test Above

In [None]:
%%time
# Expand the query triples round by round until the literals are connected,
# but never beyond level 3.  The bound and the connection test are combined
# with `and` so the loop cannot run (and query the endpoint) forever when the
# literals never connect.
levelOfExpansion = 1
# Pre-bind mtps so the report below works even if no expansion round runs.
mtps = []
while levelOfExpansion < 3 and not checkConnection(literalsToConnect, duplicatedItems):
    # NOTE: mtps is overwritten each round, so only the last round's matching
    # triples are reported below (original TODO: add mtps to a new list).
    qtps, mtps = expand(qtps, expandedGraph, duplicatedItems, predicates, we)
    levelOfExpansion += 1

print("Matching Triples: -----   size: " + str(len(mtps)))
for tp in mtps:
    print(tp)

In [None]:
# Report how many entries the expanded graph holds, then list each one.
print("Size of expanded graph: "+ str(len(expandedGraph)))
for tp in expandedGraph:
    print(tp)

In [None]:
# Report how many entries duplicatedItems holds, then list each one.
print("Size of duplicatedItems: "+ str(len(duplicatedItems)))
for tp in duplicatedItems:
    print(tp)

In [None]:
# Run reductionTestsDegreeOne on the graph.  Its return value is ignored, so
# any effect must happen in place (presumably it prunes degree-one nodes --
# TODO confirm against its definition).  Then print the graph's size and
# contents to inspect the result.
reductionTestsDegreeOne(expandedGraph)
print("Size of expanded graph: "+ str(len(expandedGraph)))
for tp in expandedGraph:
    print(tp)

In [None]:
# Run keepMinEdge on the graph.  Its return value is ignored, so any effect
# must happen in place (presumably it keeps only a minimal edge per node pair
# -- TODO confirm against its definition).  Then print the graph's size and
# contents to inspect the result.
keepMinEdge(expandedGraph)
print("Size of expanded graph: "+ str(len(expandedGraph)))
for tp in expandedGraph:
    print(tp)

# Test Purpose -- Main

In [59]:
# Sanity check of the embedding similarity: evaluate_similarity_score returns
# the cosine similarity of the two GloVe vectors for each word pair, or -1
# when either word is missing from the embedding map.
k = [['mayor', 'leader'],['wife','spouse']]
result = evaluate_similarity_score(glove_wordmap, k)
print(result)

[0.2869392010279788, 0.5253383210285519]


In [56]:
# Score an entity label against a predicate name with dw (defined elsewhere in
# the notebook; presumably a string-similarity measure -- TODO confirm).
dw('Tom Cruise','starring')

0.56

In [None]:
# URI fragments to exclude from results, one list for the predicate side and
# one for the subject side (same strings as the regex filters in the generated
# SPARQL queries).
filterStringPredicates = [
    "wikiPageWikiLink",
    "wikiPageRedirects",
    "wikiPageDisambiguates",
    "Thing",
    "wikiPageUsesTemplate",
    "rdf-syntax-ns#type",
]
filterStringSubjects = [
    "entity",
    "Category",
    "wikidata",
    "owl#Thing",
    "http://wikidata.dbpedia.org/resource/Q",
]

# Start this run from an empty graph and an empty duplicate list.
expandedGraph, duplicatedItems = [], []

# Resolve the comma-separated test question into literals and predicates.
line = 'Alberta, admit, province'
literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())

# One Triple per literal; the literal serves as both object and seed.
qtps = []
for literal in literalsToConnect:
    triple = Triple()
    triple.setObject(literal)
    triple.setSeeds(literal)
    qtps.append(triple)

# Show the object of every seed triple.
for tp in qtps:
    print(tp.getObject())

# Test Purpose

# Expand the query triples round by round until the literals are connected,
# giving up once the expansion level reaches 3.
levelOfExpansion = 1
# Pre-bind mtps: when the literals are already connected the loop body never
# runs, and the report below would otherwise hit a NameError (or silently use
# a stale value left over from an earlier cell).
mtps = []
while levelOfExpansion < 3 and not checkConnection(literalsToConnect, duplicatedItems):
    # NOTE: mtps is overwritten each round, so only the last round's matching
    # triples are reported below (original TODO: add mtps to a new list).
    qtps, mtps = expand(qtps, expandedGraph, duplicatedItems, predicates, we)
    levelOfExpansion += 1

print("Matching Triples: -----   size: " + str(len(mtps)))
for tp in mtps:
    print(tp)

In [None]:
# Spot check: print what isPredicateInDataset (defined elsewhere in the
# notebook) returns for the word 'height'.
print(isPredicateInDataset('height'))

In [None]:
# Ad-hoc inspection: list every outgoing (predicate, value) pair DBpedia holds
# for the World_of_Warcraft resource.  Note that the query binds the *object*
# position to ?s, so result["s"] below is the value, not a subject.
queryString = 'select ?s ?p where {<http://dbpedia.org/resource/World_of_Warcraft> ?p ?s}'
sparql.setQuery(queryString)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Each binding maps a variable name to a dict whose "value" key holds the term.
for result in results["results"]["bindings"]:
    print(result["p"]["value"] + "  ---  " + result["s"]["value"])

In [None]:
# Spot check: dw score between a phrase and a camelCase predicate name
# (dw is defined elsewhere in the notebook).
print(dw('official color','officialSchoolColor'))

In [76]:
# Spot check on a single word: indentifyEntitiesAndPredicates returns a pair
# that callers in this notebook unpack as (literalsToConnect, predicates).
print(indentifyEntitiesAndPredicates("tall"))

Current literal: tall
0 in property
507 in sub/obj
Current literal: tall
0 in property
507 in sub/obj
Current literal: Tall
0 in property
1790 in sub/obj
(['"tall"@en'], ['tall'])


# Identify Entities and Predicates (E-P) on QALD-5

In [61]:
%%time

# Load the stop-word list: one whitespace-stripped entry per line of the file.
swl = []
with open('StopWords.txt','r') as sw:
    swlines = sw.readlines()
    swl.extend(entry.strip() for entry in swlines)

# Read all questions up front so the input file is closed before processing.
with open('ttt.txt','r',encoding='utf-8') as f:
    lines = list(f)

# For every question: identify its entities and predicates, build one seed
# Triple per entity, and log the question plus its entities to the result file.
with open('BaselineJWresult.txt','w',encoding='utf-8') as fw:
    for line in lines:
        # Per-question state, rebound on every iteration.
        filterStringPredicates = [
            "wikiPageWikiLink",
            "wikiPageRedirects",
            "wikiPageDisambiguates",
            "Thing",
            "wikiPageUsesTemplate",
            "rdf-syntax-ns#type",
        ]
        filterStringSubjects = [
            "entity",
            "Category",
            "wikidata",
            "owl#Thing",
            "http://wikidata.dbpedia.org/resource/Q",
        ]
        expandedGraph, duplicatedItems = [], []

        literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())

        # Question banner.  `line` is written unstripped with an extra '\n',
        # so a line read with its own newline is followed by a blank line.
        fw.writelines([
            "-----Question------\n",
            line + '\n',
            "-------------------\n",
        ])

        # One Triple per literal; the literal serves as both object and seed.
        qtps = []
        for literal in literalsToConnect:
            triple = Triple()
            triple.setObject(literal)
            triple.setSeeds(literal)
            qtps.append(triple)

        # Log the entities to the file and echo them to the cell output.
        fw.write('--entities--\n')
        print('---- entities ----')
        for tp in qtps:
            fw.write(str(tp.getObject()) + '\n')
            print(tp.getObject())
        print(predicates)

to identify-------------
Timezone
timezone
to identify-------------
Current literal: Timezone
0 in property
167 in sub/obj
Current literal: timezone
0 in property
701 in sub/obj
to identify-------------
San Pedro De Atacama
San Pedro de Atacama
san pedro de atacama
to identify-------------
Current literal: San Pedro De Atacama
0 in property
0 in sub/obj
Current literal: San Pedro de Atacama
0 in property
967 in sub/obj
Current literal: san pedro de atacama
0 in property
0 in sub/obj
---- entities ----
"Timezone"@en
"timezone"@en
"San Pedro de Atacama"@en
['timezone', 'San Pedro de Atacama']
to identify-------------
Salt Lake City
salt lake city
to identify-------------
Current literal: Salt Lake City
0 in property
11678 in sub/obj
Current literal: salt lake city
0 in property
0 in sub/obj
to identify-------------
time zone
Time Zone
to identify-------------
Current literal: time zone
757765 in property
---- entities ----
"Salt Lake City"@en
['Salt Lake City', 'time zone']
Wall time: 3.

# Entity Similarity

In [3]:
# Read every raw label line into memory.
with open("refinedLabels.dat",'r',encoding = "utf-8") as rl:
    rlLines = rl.readlines()

# Strip the wrapper around each label: stripped lines that start with a single
# quote lose 2 leading and 5 trailing characters, all others lose 1 leading
# and 4 trailing (presumably the quote + "@en" language-tag wrapper -- TODO
# confirm against the data file).
refinedLabels = [
    label[2:-5] if label.startswith("'") else label[1:-4]
    for label in map(str.strip, rlLines)
]
print(len(refinedLabels))

14966461


In [36]:
%%time
# Linear scan over all labels for the one that dw scores highest against
# testWord.  Only a strictly greater score replaces the current best, so the
# first label reaching the top score wins, and "Null" is kept when no label
# scores above 0.
testWord = "Jack Kerouac"
bestMatch = {"score":0, "word":"Null"}
for candidate in refinedLabels:
    candidateScore = dw(candidate, testWord)
    if candidateScore > bestMatch["score"]:
        bestMatch.update(score=candidateScore, word=candidate)
print(bestMatch["word"])

Jack Kerouac
Wall time: 2min 1s


In [44]:
# Spot check: dw score for two unrelated words (dw is defined elsewhere in
# the notebook).
print(dw('wife','starring'))

0.0
