In [32]:
import rdflib
import re
import numpy as np
from SPARQLWrapper import SPARQLWrapper, JSON

# Load Word Embeddings

In [33]:
import logging
from six import iteritems

In [34]:
%%time
# Path of the GloVe embedding text file that is parsed below.
glove_vectors_file = "glove.6B.300d.txt"
# Maps every token found in the file to its embedding (a numpy array).
glove_wordmap = {}
with open(glove_vectors_file, "r", encoding="utf-8") as glove:
    for line in glove:
        # Split on the first space only: the token comes first, the rest of
        # the line holds the space-separated vector components.
        name, vector = tuple(line.split(" ", 1))
        glove_wordmap[name] = np.fromstring(vector, sep=" ")

Wall time: 47.9 s


In [39]:
# Shared SPARQL client: every query helper in this notebook sends its queries
# through this module-level object.
sparql = SPARQLWrapper("http://134.117.101.79:8890/sparql/")
# Alternative endpoint (public DBpedia), kept for reference.
#sparql = SPARQLWrapper("http://dbpedia.org/sparql")

In [40]:
def evaluate_similarity_score(wemb, ar):
    """Score every word pair of ``ar`` by the cosine similarity of its embeddings.

    wemb -- mapping from word to numpy vector.
    ar   -- sequence of pairs; only elements 0 and 1 of each pair are read.

    Returns a list aligned with ``ar``. A pair for which either word is
    missing from ``wemb`` receives the sentinel value -1.
    """
    scores = []
    for pair in ar:
        if pair[0] not in wemb or pair[1] not in wemb:
            scores.append(-1)
            continue
        u = wemb[pair[0]]
        v = wemb[pair[1]]
        scores.append(np.dot(u, v) / (np.linalg.norm(u) * (np.linalg.norm(v))))
    return scores

# isEntityInDatabase Method

In [41]:
def isEntityInDatabase(s):
    """Tell whether ``s`` looks like a URI, i.e. whether it begins with "http"."""
    uriPrefix = "http"
    return s.startswith(uriPrefix)

In [42]:
def countOutgoingEdges(obj):
    """Count the distinct subjects that point at ``obj`` in the SPARQL endpoint.

    Despite its name, the query matches ``?s ?p <obj>``: it counts the
    distinct subjects of triples in which ``obj`` is the object.

    obj -- URI of the resource, without the surrounding angle brackets.

    Returns the count as an int, or 0 when the endpoint returns no binding.
    (The function used to fall through and return None in that case, which
    made the ``count < th`` comparison in makeQueryStringWithThreshold raise
    TypeError.)
    """
    queryString = "SELECT count(DISTINCT ?s) as ?c WHERE { ?s ?p <" + obj +">}"
    sparql.setQuery(queryString)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # Only the first binding is read.
    for result in results["results"]["bindings"]:
        return int(result['c']['value'])
    return 0
# Smoke test against the live endpoint; the number printed under this cell is
# the result of the last recorded run.
countOutgoingEdges("http://dbpedia.org/resource/Viking_Press")

591

# MakeQueryString Method

In [43]:
def makeQueryString(obj, filterStringsPredicates, filterStringSubjects, literals):
    """Build the SPARQL query that fetches the direct neighbourhood of ``obj``.

    obj                     -- a literal (already quoted, e.g. '"x"@en') or a
                               resource URI without angle brackets.
    filterStringsPredicates -- substrings; predicates matching any of them
                               (case-insensitive regex) are excluded.
    filterStringSubjects    -- substrings; neighbour nodes matching any of
                               them are excluded.
    literals                -- collection of the literals being connected;
                               membership decides which query shape is used.

    For a literal the query selects ``?s ?p`` of triples ending in the
    literal. For a resource it selects the UNION of incoming edges
    (``?s ?p``) and outgoing edges (``?j ?k``).

    When both filter lists are empty the FILTER clause is omitted (the
    previous implementation raised ValueError while trimming the trailing
    "&&").
    """
    def filterClause(predicateVar, nodeVar):
        # One "!regex(...)" condition per excluded substring, predicates first.
        conditions = ["!regex(str(" + predicateVar + "), '" + s + "' , 'i')" for s in filterStringsPredicates]
        conditions += ["!regex(str(" + nodeVar + "), '" + s + "' , 'i')" for s in filterStringSubjects]
        if not conditions:
            return ""
        return "FILTER (" + " && ".join(conditions) + " )"

    if obj in literals:
        return "SELECT distinct ?s ?p WHERE { {?s ?p " + obj + " . } " + filterClause("?p", "?s") + "}"

    incomingQuery = "{?s ?p <" + obj + "> . " + filterClause("?p", "?s") + "}"
    outgoingQuery = "{<" + obj + "> ?j ?k. " + filterClause("?j", "?k") + "}"
    return "SELECT distinct ?s ?p ?j ?k WHERE { " + incomingQuery + " UNION " + outgoingQuery + "}"

In [44]:
def makeQueryStringWithThreshold(obj, filterStringsPredicates, filterStringSubjects, literals, th=10000):
    """Build the neighbourhood query for ``obj``, skipping incoming edges of hubs.

    obj                     -- a literal (already quoted, e.g. '"x"@en') or a
                               resource URI without angle brackets.
    filterStringsPredicates -- substrings; predicates matching any of them
                               (case-insensitive regex) are excluded.
    filterStringSubjects    -- substrings; neighbour nodes matching any of
                               them are excluded.
    literals                -- collection of the literals being connected.
    th                      -- count returned by countOutgoingEdges at or
                               above which only outgoing edges are queried
                               (default 10000, the former hard-coded value).

    For a literal, or a resource whose count is below ``th``, the query is
    the same as the one makeQueryString builds. For a resource at or above
    ``th`` only the outgoing edges (``?j ?k``) are selected.

    When both filter lists are empty the FILTER clause is omitted (the
    previous implementation raised ValueError while trimming the trailing
    "&&").
    """
    def filterClause(predicateVar, nodeVar):
        # One "!regex(...)" condition per excluded substring, predicates first.
        conditions = ["!regex(str(" + predicateVar + "), '" + s + "' , 'i')" for s in filterStringsPredicates]
        conditions += ["!regex(str(" + nodeVar + "), '" + s + "' , 'i')" for s in filterStringSubjects]
        if not conditions:
            return ""
        return "FILTER (" + " && ".join(conditions) + " )"

    if obj in literals:
        return "SELECT distinct ?s ?p WHERE { {?s ?p " + obj + " . } " + filterClause("?p", "?s") + "}"

    outgoingQuery = "{<" + obj + "> ?j ?k. " + filterClause("?j", "?k") + "}"
    count = countOutgoingEdges(obj)
    # An unknown count (None) is treated like a small one: run the full query.
    if count is None or count < th:
        incomingQuery = "{?s ?p <" + obj + "> . " + filterClause("?p", "?s") + "}"
        return "SELECT distinct ?s ?p ?j ?k WHERE { " + incomingQuery + " UNION " + outgoingQuery + "}"

    # Hub resource: the incoming side is not queried at all.
    return "SELECT distinct ?j ?k WHERE { " + " " + outgoingQuery + "}"

# Triple Class 

In [45]:
class Triple:
    """An edge of the expanded graph: subject -- predicate -- object.

    Besides its three components a triple carries a matching ``cost``, a
    reference to the triple it was expanded from, and the sorted list of
    seeds it is connected to.

    Equality compares subject, predicate and object only; ordering compares
    ``cost`` only. Because ``__eq__`` is defined without ``__hash__``,
    instances are unhashable.
    """

    def __init__(self):
        self.subject = None
        self.object = None
        self.predicate = None
        self.cost = None
        self.previousTriple = None
        #the list of connecting seeds
        self.seeds = []

    def __str__(self):
        if self.subject is None or self.predicate is None or self.object is None:
            return "Not Well Defined Triple"
        # join() also handles a triple without seeds, for which trimming the
        # trailing comma used to raise ValueError.
        connectSeeds = "[" + ", ".join(self.seeds) + "]"
        # A triple that has not been scored yet has no number to format.
        cost = "n/a" if self.cost is None else "{0:.2f}".format(abs(self.cost))
        return "{0}  --  {1}  --  {2}  -- {3}  ({4})".format(self.subject, self.predicate, self.object, connectSeeds, cost)

    def __eq__(self, other):
        # Duck-typed on purpose so instances created before the class cell was
        # re-run still compare equal. Objects without the getters (e.g. None,
        # as in ``previousTriple == tp``) are simply "not equal".
        try:
            return (self.subject == other.getSubject()
                    and self.object == other.getObject()
                    and self.predicate == other.getPredicate())
        except AttributeError:
            return NotImplemented

    def __gt__(self, other):
        try:
            return self.cost > other.getCost()
        except AttributeError:
            return NotImplemented

    def __lt__(self, other):
        # list.sort() and sorted() compare with "<"; define it explicitly
        # instead of relying on the reflected __gt__.
        try:
            return self.cost < other.getCost()
        except AttributeError:
            return NotImplemented

    def setSeeds(self, fs):
        """Add seed ``fs`` if it is not present yet, keeping the list sorted."""
        if fs not in self.seeds:
            self.seeds.append(fs)
            self.seeds.sort()

    def getSeeds(self):
        return self.seeds

    def setPreviousTriple(self, pt):
        self.previousTriple = pt

    def getPreviousTriple(self):
        return self.previousTriple

    def updateSeeds(self, newSeeds):
        """Add every seed of ``newSeeds`` that is not present yet."""
        for seed in newSeeds:
            self.setSeeds(seed)

    def setSubject(self, sub):
        self.subject = sub
    def setObject(self, obj):
        self.object = obj
    def setPredicate(self, pre):
        self.predicate = pre
    def setCost(self, c):
        self.cost = c
    def getCost(self):
        return self.cost
    def getSubject(self):
        return self.subject
    def getObject(self):
        return self.object
    def getPredicate(self):
        return self.predicate

# Read most frequent predicates

In [46]:
%%time
# Maps the text after the last '/' of each entry's first column to its count.
frequentPredicates = {}
with open("FreqP.txt", "r") as freqP:
    for line in freqP:
        # The parsing below expects tab-separated lines: column 0 is a
        # '/'-containing name (presumably a predicate URI), column 1 an integer.
        li = line.split("\t")
        name = li[0][li[0].rindex('/')+1:]
        frequency = int(li[1])
        frequentPredicates[name] = int(frequency)

Wall time: 20 ms


# Expand Method

In [47]:
def expandWithThreshold(queryTriples, eg, di, predicatesToMatch, wemb):
    """Expand the graph by one level, using the hub-aware query builder.

    For the object of every triple in ``queryTriples`` the neighbourhood is
    fetched with makeQueryStringWithThreshold and each returned edge becomes
    a new Triple whose subject is the queried object.

    queryTriples      -- triples whose objects are expanded.
    eg                -- expanded graph (list of Triple), extended in place.
    di                -- duplicated items (list of Triple), extended in place
                         through addDuplicatedItems when a node is reached a
                         second time during this call.
    predicatesToMatch -- keywords handed to computeCostsBaseline.
    wemb              -- word embeddings, forwarded to computeCostsBaseline.

    Reads the module-level names ``filterStringPredicates``,
    ``filterStringSubjects``, ``literalsToConnect`` and ``sparql``.

    Returns ``(newQueryTriples, matchingTriples)``: the triples to expand at
    the next level (one per newly reached URI node) and the triples that
    survived the cost threshold of computeCostsBaseline.

    Changes from the previous version: the graph is updated through the
    ``eg`` parameter rather than the global ``expandedGraph``; the two
    copy-pasted result branches are merged; costs are computed once after
    all queries instead of after every query (same final result, since
    scoring a triple does not depend on the other triples).
    """
    newQueryTriples = []
    matchingTriples = []

    # Objects already queued for the next level during this call.
    tmp = []
    print("queryTriple size : " + str(len(queryTriples)))
    for queryObject in queryTriples:
        queryString = makeQueryStringWithThreshold(queryObject.getObject(), filterStringPredicates,filterStringSubjects, literalsToConnect)
        print("----- Current Query ------")
        print(queryString)
        print("--------------------------")

        sparql.setQuery(queryString)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        for result in results["results"]["bindings"]:
            # Incoming edges bind ?s/?p, outgoing edges bind ?j/?k.
            if 's' in result:
                predicateKey, nodeKey = "p", "s"
            else:
                predicateKey, nodeKey = "j", "k"

            newTriple = Triple()
            newTriple.setSubject(queryObject.getObject())
            newTriple.setPredicate(result[predicateKey]["value"])
            newTriple.setObject(result[nodeKey]["value"])
            newTriple.setPreviousTriple(queryObject)
            newTriple.updateSeeds(queryObject.getSeeds())

            if newTriple not in eg:
                eg.append(newTriple)

            if newTriple not in matchingTriples:
                matchingTriples.append(newTriple)

            # Only URI nodes can be expanded further.
            if isEntityInDatabase(newTriple.getObject()):
                if newTriple.getObject() in tmp:
                    # Node reached a second time: record the connection.
                    addDuplicatedItems(newTriple, eg, di)
                else:
                    newQueryTriples.append(newTriple)
                    tmp.append(newTriple.getObject())

    computeCostsBaseline(predicatesToMatch, matchingTriples, wemb)
    return newQueryTriples, matchingTriples

In [48]:
def expand(queryTriples, eg, di, predicatesToMatch, wemb):
    """Expand the graph by one level around the objects of ``queryTriples``.

    For the object of every triple in ``queryTriples`` the neighbourhood is
    fetched with makeQueryString and each returned edge becomes a new Triple
    whose subject is the queried object.

    queryTriples      -- triples whose objects are expanded.
    eg                -- expanded graph (list of Triple), extended in place.
    di                -- duplicated items (list of Triple), extended in place
                         through addDuplicatedItems when a node is reached a
                         second time during this call.
    predicatesToMatch -- keywords handed to computeCostsBaseline.
    wemb              -- word embeddings, forwarded to computeCostsBaseline.

    Reads the module-level names ``filterStringPredicates``,
    ``filterStringSubjects``, ``literalsToConnect`` and ``sparql``.

    Returns ``(newQueryTriples, matchingTriples)``: the triples to expand at
    the next level (one per newly reached URI node) and the triples that
    survived the cost threshold of computeCostsBaseline.

    Changes from the previous version: the graph is updated through the
    ``eg`` parameter rather than the global ``expandedGraph``; the two
    copy-pasted result branches are merged; costs are computed once after
    all queries instead of after every query (same final result, since
    scoring a triple does not depend on the other triples).
    """
    newQueryTriples = []
    matchingTriples = []

    # Objects already queued for the next level during this call.
    tmp = []
    print("queryTriple size : " + str(len(queryTriples)))
    for queryObject in queryTriples:
        queryString = makeQueryString(queryObject.getObject(), filterStringPredicates,filterStringSubjects, literalsToConnect)
        print("----- Current Query ------")
        print(queryString)
        print("--------------------------")

        sparql.setQuery(queryString)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        for result in results["results"]["bindings"]:
            # Incoming edges bind ?s/?p, outgoing edges bind ?j/?k.
            if 's' in result:
                predicateKey, nodeKey = "p", "s"
            else:
                predicateKey, nodeKey = "j", "k"

            newTriple = Triple()
            newTriple.setSubject(queryObject.getObject())
            newTriple.setPredicate(result[predicateKey]["value"])
            newTriple.setObject(result[nodeKey]["value"])
            newTriple.setPreviousTriple(queryObject)
            newTriple.updateSeeds(queryObject.getSeeds())

            if newTriple not in eg:
                eg.append(newTriple)

            if newTriple not in matchingTriples:
                matchingTriples.append(newTriple)

            # Only URI nodes can be expanded further.
            if isEntityInDatabase(newTriple.getObject()):
                if newTriple.getObject() in tmp:
                    # Node reached a second time: record the connection.
                    addDuplicatedItems(newTriple, eg, di)
                else:
                    newQueryTriples.append(newTriple)
                    tmp.append(newTriple.getObject())

    computeCostsBaseline(predicatesToMatch, matchingTriples, wemb)
    return newQueryTriples, matchingTriples

# addDuplicatedItems Method

In [49]:
def addDuplicatedItems(ntp, eg, di):
    """Record in ``di`` every triple of ``eg`` that ends in the same object as ``ntp``.

    Each triple of ``eg`` whose object equals ``ntp``'s and that is not yet
    in ``di`` is appended to ``di``. If its seeds differ from ``ntp``'s, the
    seed lists are merged through updateSeedsInExpandedGraph.

    Always returns the string "Dup".
    """
    for candidate in eg:
        if candidate.getObject() != ntp.getObject():
            continue
        if candidate in di:
            continue
        di.append(candidate)
        if candidate.getSeeds() == ntp.getSeeds():
            continue
        updateSeedsInExpandedGraph(candidate, ntp, eg)
    return "Dup"

In [50]:
def updateSeedsInExpandedGraph(tp,ntp,eg):
    """Merge the seeds of ``tp`` into ``ntp`` and spread the result through ``eg``.

    After ``ntp`` has absorbed ``tp``'s seeds, every triple of ``eg`` whose
    seed list equals what ``tp`` or ``ntp`` had before the merge receives the
    merged seeds as well.
    """
    # Snapshots are needed because the lists are modified while spreading.
    seedsBefore = (tp.getSeeds()[:], ntp.getSeeds()[:])
    ntp.updateSeeds(tp.getSeeds())
    merged = ntp.getSeeds()

    for member in eg:
        if member.getSeeds() in seedsBefore:
            member.updateSeeds(merged)

# Reduction Tests

In [51]:
def reductionTestsDegreeOne(eg):
    """Remove from ``eg`` the triples that lead to a dead end.

    A triple is removed when no triple of ``eg`` was expanded from it (it is
    a leaf) and no triple with a different subject ends in the same object.
    The number of removed triples is printed; ``eg`` is modified in place.
    """
    prunable = []

    for candidate in eg:
        hasChild = False
        for other in eg:
            if other.getPreviousTriple() == candidate:
                hasChild = True
            sharesObject = candidate.getObject() == other.getObject()
            if sharesObject and not candidate.getSubject() == other.getSubject():
                # Object reachable from another subject: keep the triple.
                break
        else:
            if not hasChild:
                prunable.append(candidate)

    print("Degree One Nodes size: " + str(len(prunable)))
    for triple in prunable:
        eg.remove(triple)

def keepMinEdge(eg):
    """Among parallel edges of ``eg`` keep only the cheapest one, in place.

    Triples sharing subject and object but differing in predicate are
    parallel edges. For every (subject, object) pair the triple with the
    lowest cost is kept (the first one wins a tie) and the triples with any
    other predicate are removed from ``eg``. Triples without a cost are
    treated as infinitely expensive.

    Fixes two defects of the previous version: the object comparison used
    ``tp2.getObject`` without calling it, so nothing was ever removed; and
    with that corrected, two edges of equal cost would have removed each
    other.
    """
    def costOf(tp):
        cost = tp.getCost()
        return float("inf") if cost is None else cost

    # Cheapest triple seen so far for every (subject, object) pair.
    cheapest = {}
    for tp in eg:
        key = (tp.getSubject(), tp.getObject())
        if key not in cheapest or costOf(tp) < costOf(cheapest[key]):
            cheapest[key] = tp

    # Slice assignment keeps the caller's list object.
    eg[:] = [tp for tp in eg
             if tp.getPredicate() == cheapest[(tp.getSubject(), tp.getObject())].getPredicate()]
    

# checkConnection Method

In [52]:
def checkConnection(ltc, di):
    """Tell whether some triple of ``di`` connects all the literals in ``ltc``.

    A single literal is never considered connected. Otherwise ``ltc`` is
    sorted in place and the result is True when a triple of ``di`` has
    exactly that seed list.
    """
    if len(ltc) == 1:
        return False
    ltc.sort()
    return any(tp.getSeeds() == ltc for tp in di)

# ComputeCosts Method

In [53]:
def computeCosts(predicatesToMatch, matchingTriples, wemb):
    """Score every triple against the keywords and drop the poor matches.

    For each keyword in ``predicatesToMatch`` and each triple, a similarity
    is obtained from evaluate_similarity_score (word embeddings) or, in the
    fallback cases below, from ``dw``. The cost ``(1 - similarity) * 100`` is
    stored on the triple when it is lower than the triple's current cost.
    Finally, triples whose cost exceeds ``th`` (30) are removed from
    ``matchingTriples`` in place.

    Reads the module-level stop-word list ``swl``. Returns None.
    """
    th = 30
    predicateList = []
    
    # Keep only the part of each predicate after its last '/'.
    for tp in matchingTriples:
        predicate = tp.getPredicate()
        predicateList.append(predicate[predicate.rindex('/')+1:])
        
    for y in predicatesToMatch:
        ar = []
        for p in predicateList:
            # Remove StopWords from predicates
            # A space is inserted before every capital letter so that the
            # name can be split into words.
            newp = re.sub( r"([A-Z])", r" \1", p).split()
            i = 0
            while i < len(newp):
                if newp[i].lower() in swl:
                    newp.pop(i)
                else:
                    i = i + 1
            # The remaining words are glued back together without separator.
            newp = "".join(newp)
            ar.append([y, newp])
            
        #ar = np.array(ar)
        
        result = evaluate_similarity_score(wemb, ar)
        #print("size of result :" + str(len(result)) + '  ----  size of matching Triples : ' + str(len(matchingTriples)))
        for x in range(len(result)):
            #when two comparing predicates are both not in the word embedding, then using JW distance instead
            #TODO: What if one of them is in word embedding?
            # NOTE(review): evaluate_similarity_score returns -1 (not 1.0) for
            # words missing from the embedding; such pairs give a cost of 200
            # and therefore reach the elif branch below.
            # Both fallbacks score against the original local name, not the
            # stop-word-stripped one.
            if result[x] == 1.0 and y != predicateList[x]:
                #print(y + "  ---  " + predicateList[x])
                result[x] = dw(y, predicateList[x])
            elif (1 - result[x]) * 100 > th:
                result[x] = dw(y, predicateList[x])
                
            # Keep the lowest cost seen over all keywords.
            if matchingTriples[x].getCost() is None:
                matchingTriples[x].setCost((1 - result[x]) * 100)
            elif matchingTriples[x].getCost() > (1 - result[x]) * 100:
                matchingTriples[x].setCost((1 - result[x]) * 100)
    
    index = 0
    
    # In-place removal: the index only advances when nothing was popped.
    while index < len(matchingTriples):
        if matchingTriples[index].getCost() > th:
            matchingTriples.pop(index)
        else:
            index += 1

# Baseline 1: Using JW distance only

In [54]:
def computeCostsBaseline(predicatesToMatch, matchingTriples, wemb, th=50):
    """Score every triple against the keywords with ``dw`` and drop poor matches.

    predicatesToMatch -- keywords to compare the predicates with.
    matchingTriples   -- list of triples; costs are set on its elements and
                         the list itself is filtered in place.
    wemb              -- unused; kept so the signature matches computeCosts.
    th                -- highest cost a triple may have to be kept
                         (default 50, the former hard-coded value).

    The part of each predicate after its last '/' is split before capital
    letters, stop words (module-level ``swl``) are removed and the remaining
    words are glued together. The cost of a keyword/predicate pair is
    ``(1 - dw(keyword, stripped)) * 100``, or 100 when either string is
    empty; each triple keeps the lowest cost it has seen.

    Triples that end up without a cost (e.g. ``predicatesToMatch`` is empty)
    are dropped; previously that case raised TypeError while comparing None
    with the threshold. Returns None.
    """
    # Stop-word stripping does not depend on the keyword, so do it once.
    strippedPredicates = []
    for tp in matchingTriples:
        predicate = tp.getPredicate()
        localName = predicate[predicate.rindex('/')+1:]
        words = re.sub(r"([A-Z])", r" \1", localName).split()
        strippedPredicates.append("".join(w for w in words if w.lower() not in swl))

    for y in predicatesToMatch:
        for tp, newp in zip(matchingTriples, strippedPredicates):
            # dw cannot compare empty strings; count them as a total mismatch.
            if len(y) == 0 or len(newp) == 0:
                result = 0
            else:
                result = dw(y, newp)

            cost = (1 - result) * 100
            if tp.getCost() is None or tp.getCost() > cost:
                tp.setCost(cost)

    # Slice assignment filters the caller's list object in a single pass.
    matchingTriples[:] = [tp for tp in matchingTriples
                          if tp.getCost() is not None and tp.getCost() <= th]

# Identify Entities and predicates in keywords
###### 1. check if the lowercased literal is a property/ontology in dataset. If yes, then it is a predicate.
###### 2. check if the convertFirstToCapital literal is a property in dataset. If yes, then it is a predicate.
###### 3. check if the convertFirstToCapital literal is a resource in dataset. If yes, then it is an entity
###### 4. Treat every literal as a predicate.

In [55]:
def isResourceInDataset3(literal):
    """Decide whether ``literal`` names a resource (entity) rather than a property.

    1. The lower-cased literal is looked up as an ``@en`` label. If a subject
       containing 'property' has a URI tail similar to the literal
       (``dw`` > 0.7), the literal is a property: return False.
    2. The literal is capitalised with convertFirstToCapital and looked up
       again. Return True only when a similar 'resource' subject is found
       and no similar 'property' subject is.

    Uses the module-level ``sparql`` client. Prints a message and returns
    False when a URI tail cannot be extracted or compared.
    """
    jw_th = 0.7

    def subjectsLabelled(label):
        """Return the subject URIs of all triples whose object is "label"@en."""
        sparql.setQuery('select distinct ?s where {?s ?p "'+ label + '"@en}')
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        return [result["s"]["value"] for result in results["results"]["bindings"]]

    for sub in subjectsLabelled(literal.lower()):
        if 'property' in sub:
            if dw(sub[sub.rindex('/')+1:],literal) > jw_th:
                return False

    literal = convertFirstToCapital(literal)

    propertyInSub = False
    resourceInSub = False
    for sub in subjectsLabelled(literal):
        try:
            p = dw(sub[sub.rindex('/') +1:],literal)
        except Exception:
            # Not a bare "except": KeyboardInterrupt/SystemExit must still
            # propagate. Exception (not just ValueError) is required because
            # dj raises a plain Exception for empty strings.
            print("Substring not found, treat as a property")
            return False

        if 'property' in sub:
            if p > jw_th:
                propertyInSub = True
        elif 'resource' in sub:
            if p > jw_th:
                resourceInSub = True

    # A matching property always wins over a matching resource.
    return resourceInSub and not propertyInSub

# Test version
## Using only resource as entity

In [56]:
def isResourceInDataset2(literal):
    """Decide whether ``literal`` is the ``@en`` label of a resource.

    Three spellings are tried in turn, each with its own endpoint query: the
    literal as given, lower-cased, and capitalised with
    convertFirstToCapital. The function returns True as soon as a subject
    containing 'resource' has a URI tail similar to the reference string
    (``dw`` > 0.7), and False when no spelling produces such a subject.

    Uses the module-level ``sparql`` client.

    Fixes over the previous version: a subject whose tail cannot be compared
    is now skipped. Before, ``p`` was either unbound (NameError) or still
    held the score of the previous subject. The bare ``except`` no longer
    swallows KeyboardInterrupt/SystemExit.
    """
    jw_th = 0.7

    def attempts():
        # (label sent to the endpoint, string the URI tail is compared with,
        #  tag used in the warning message). Generated lazily so the
        #  capitalised form is only computed when the first two fail.
        yield literal, literal, "lowercased"
        # The lower-cased lookup is still scored against the original literal.
        yield literal.lower(), literal, "lowercased"
        capitalised = convertFirstToCapital(literal)
        yield capitalised, capitalised, "CFTC"

    for label, reference, tag in attempts():
        queryString = 'select distinct ?s where {?s ?p "'+ label + '"@en}'
        sparql.setQuery(queryString)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        for result in results["results"]["bindings"]:
            sub = result["s"]["value"]
            try:
                p = dw(sub[sub.rindex('/') +1:], reference)
            except Exception:
                # Exception (not just ValueError) because dj raises a plain
                # Exception for empty strings.
                print("Find a record with no / in " + tag)
                continue

            if 'resource' in sub and p > jw_th:
                return True

    return False

# Test Version, Using Count(*)
## Assumption: if a keyword is a property, then it is not an entity

In [70]:
def isPredicateInDataset(literal):
    """Classify ``literal`` as a predicate (True) or an entity (False) by usage counts.

    The literal is first looked up as the ``@en`` label of something used in
    predicate position; more than 100 uses means predicate. Otherwise it is
    looked up as the label of something used as subject or object; more than
    100 uses means entity. If neither count exceeds 100 the literal is
    treated as a predicate.

    Uses the module-level ``sparql`` client and prints the counts it reads.
    """
    count_th = 100

    def counts(queryString):
        # Runs the query and yields the ?c value of each binding as an int.
        sparql.setQuery(queryString)
        sparql.setReturnFormat(JSON)
        for binding in sparql.query().convert()["results"]["bindings"]:
            yield int(binding["c"]["value"])

    print("Current literal: " + literal)

    # Test for property
    asPredicate = 'select distinct Count(?ss) as ?c where {?s ?p "'+ literal + '"@en. ?ss ?s ?j}'
    for res in counts(asPredicate):
        print(str(res) + " in property")
        if res > count_th:
            return True

    # Test for entity
    asNode = 'select distinct Count(?ss) as ?c where {{?s ?p "'+ literal + '"@en. ?ss ?j ?s} UNION {?s ?p "'+ literal + '"@en. ?s ?j ?ss}}'
    for res in counts(asNode):
        print(str(res) + " in sub/obj")
        if res > count_th:
            return False

    return True
    

In [75]:
def indentifyEntitiesAndPredicates(keywords):
    """Split a comma-separated keyword string into entities and predicates.

    Every keyword is stripped and tested with isPredicateInDataset in three
    spellings: as given, lower-cased, and capitalised with
    convertFirstToCapital. The first spelling recognised as a predicate is
    added to the predicate list. A keyword not recognised in any spelling is
    added, as given, to both the entity list and the predicate list.

    Returns ``(entity, pre)``; the entities are wrapped as ``"..."@en``.
    """
    entity = []
    pre = []

    for rawKeyword in keywords.split(','):
        keyword = rawKeyword.strip()

        if isPredicateInDataset(keyword):
            pre.append(keyword)
            continue

        lowered = keyword.lower()
        if isPredicateInDataset(lowered):
            pre.append(lowered)
            continue

        capitalised = convertFirstToCapital(keyword)
        if isPredicateInDataset(capitalised):
            pre.append(capitalised)
            continue

        # Not a known predicate in any spelling: an entity, but it is still
        # offered as a predicate candidate.
        entity.append(keyword)
        pre.append(keyword)

    return ['"' + name + '"@en' for name in entity], pre

In [73]:
def indentifyEntitiesAndPredicates2(keywords):
    """Split a comma-separated keyword string into entities and predicates.

    Every stripped keyword is added to the predicate list. A keyword that
    isPredicateInDataset does not recognise is added to the entity list as
    well.

    Returns ``(entity, pre)``; the entities are wrapped as ``"..."@en``.

    The previous version followed ``if isPredicateInDataset(k)`` with
    ``elif not isPredicateInDataset(k)``, so its lower-case and capitalised
    branches could never run and the same endpoint query was issued twice
    for every entity. The effective behaviour is kept, with a single query
    per keyword. Use indentifyEntitiesAndPredicates when the other spellings
    should be tried too.
    """
    entity = []
    pre = []

    for rawKeyword in keywords.split(','):
        keyword = rawKeyword.strip()

        # we treat every keyword as a predicate
        pre.append(keyword)
        if not isPredicateInDataset(keyword):
            entity.append(keyword)

    return ['"' + name + '"@en' for name in entity], pre


# Jaro_Winkler Distance

In [59]:
def make1shorter(word1, word2):
    """Return the two words ordered so that the first is not longer than the second.

    Words of equal length are returned in the order given.
    """
    if len(word1) > len(word2):
        return word2, word1
    return word1, word2

def dj(word1, word2):
    """Jaro-style similarity of two non-empty words.

    With ``m`` the number of characters of the shorter word that can be
    paired with a distinct equal character of the longer word, and ``t`` the
    number of positions (over the length of the shorter word) at which the
    two words differ, the result is
    ``1/3 * (m/len(shorter) + m/len(longer) + (m - t/2)/m)``, or 0 when
    ``m`` is 0.

    NOTE(review): unlike the textbook Jaro similarity, no match window is
    applied and ``t`` counts positional mismatches, so values differ from
    standard implementations.

    Raises Exception when either word is empty.
    """
    if len(word1) == 0 or len(word2) == 0:
        raise Exception("Not words, mate")

    shorter, longer = (word2, word1) if len(word1) > len(word2) else (word1, word2)

    # Consume one character of the longer word for every match so that each
    # character can be paired at most once.
    unmatched = list(longer)
    m = 0
    for char in shorter:
        try:
            unmatched.remove(char)
        except ValueError:
            continue
        m += 1

    if m == 0:
        return 0

    t = sum(1 for a, b in zip(shorter, longer) if a != b)
    return 1/3*(m/len(shorter) + m/len(longer) + (m - t / 2)/m)

def dw(word1, word2, p=0.1, lmax=4):
    """Winkler-style prefix boost on top of ``dj``.

    The ``dj`` similarity of the two words is raised by
    ``l * p * (1 - dj)``, where ``l`` is the length of their common prefix,
    capped at ``lmax`` and at the length of the shorter word.

    p    -- weight of each matching prefix character (default 0.1).
    lmax -- maximum number of prefix characters considered (default 4).
    """
    if len(word1) > len(word2):
        word1, word2 = word2, word1

    dj_ = dj(word1, word2)

    prefix = 0
    limit = min(len(word1), lmax)
    while prefix < limit and word1[prefix] == word2[prefix]:
        prefix += 1

    return dj_ + prefix * p * (1 - dj_)

# Capitalize the first letter of every word in a literal and lowercase the rest

In [60]:
def convertFirstToCapital(s):
    """Return ``s`` with every word's first character upper-cased and the rest lower-cased.

    Words are split on whitespace and re-joined with single spaces.
    """
    return " ".join(word[0].upper() + word[1:].lower() for word in s.split())

# Main Program

In [None]:
%%time

#get stop words list
# Driver: for every line of 'qald5-3.txt', identify entities and predicates,
# expand the graph around the entities with expand() and write the matching
# triples (and, if the entities got connected, the reduced graph) to
# 'test_res3.txt'.
# NOTE: swl, filterStringPredicates, filterStringSubjects, expandedGraph and
# literalsToConnect are read as module-level names by the helper functions,
# so these names must not be changed here.
swl = []
with open('StopWords.txt','r') as sw:
    swlines = sw.readlines()
    for x in swlines:
        swl.append(x.strip())

with open('qald5-3.txt','r',encoding='utf-8') as f:
    lines = f.readlines()

with open('test_res3.txt','w',encoding='utf-8') as fw:
    for line in lines:
        # Substrings of predicates / neighbour nodes excluded from the queries.
        filterStringPredicates = ["wikiPageWikiLink","wikiPageRedirects","wikiPageDisambiguates", "Thing","wikiPageUsesTemplate","rdf-syntax-ns#type"]
        filterStringSubjects = ["entity", "Category", "wikidata","owl#Thing", "http://wikidata.dbpedia.org/resource/Q"]
        # Per-question state, reset for every input line.
        expandedGraph = []
        duplicatedItems = []

        literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())
        fw.write("-----Question------\n")
        fw.write(line + '\n')
        fw.write("-------------------\n")
        # One seed triple per entity: only object and seed are set.
        qtps = []
        for literal in literalsToConnect:
            triple = Triple()
            triple.setObject(literal)
            triple.setSeeds(literal)
            qtps.append(triple)

        fw.write('--entities--\n')
        print('---- entities ----')
        for tp in qtps:
            fw.write(str(tp.getObject()) + '\n')
            print(tp.getObject())
        print(predicates)
        # Test Purpose
        #"""
        # At most two expansion levels; stops early once the entities are connected.
        levelOfExpansion = 1
        while(levelOfExpansion < 3 and not checkConnection(literalsToConnect, duplicatedItems)):
            #add mtps to a new list
            # mtps is overwritten at every level, so only the last level's
            # matching triples are reported below.
            qtps, mtps = expand(qtps, expandedGraph, duplicatedItems, predicates, glove_wordmap)
            levelOfExpansion += 1

        print("Matching Triples: -----   size: " + str(len(mtps)))
        # Triple defines its ordering on cost.
        mtps.sort()
        for tp in mtps:
            fw.write(str(tp) + '\n')
            print(tp)
        if checkConnection(literalsToConnect, duplicatedItems):
            reductionTestsDegreeOne(expandedGraph)
            fw.write("Size of expanded graph: "+ str(len(expandedGraph)) + '\n')
            for tp in expandedGraph:
                fw.write(str(tp) + '\n')
        
        #for x in expandedGraph:
        #    print(x)
        #        """

In [None]:
# Debug helper: print the triples of the last processed question whose
# predicate contains 'capital'.
for x in expandedGraph:
    if 'capital' in x.getPredicate():
        print(x)

# Baseline using exact match for predicates

In [29]:
%%time

# Load the stop-word list: one entry per line, surrounding whitespace removed.
with open('StopWords.txt','r') as sw:
    swlines = sw.readlines()
swl = [entry.strip() for entry in swlines]

# Read every question of the QALD-5 file into memory (newlines are kept).
with open('qald5.txt','r',encoding='utf-8') as f:
    lines = f.readlines()

with open('test_res.txt','w',encoding='utf-8') as fw:
    # For every question: identify its entities/predicates, expand the graph
    # with expandWithThreshold, and log the matching triples plus (when the
    # seeds end up connected) the reduced expanded graph.
    for line in lines:
        # Not referenced directly in this cell; presumably read as globals by the
        # query/expansion helpers -- TODO confirm before moving or removing them.
        filterStringPredicates = ["wikiPageWikiLink","wikiPageRedirects","wikiPageDisambiguates", "Thing","wikiPageUsesTemplate","rdf-syntax-ns#type"]
        filterStringSubjects = ["entity", "Category", "wikidata","owl#Thing", "http://wikidata.dbpedia.org/resource/Q"]
        expandedGraph = []
        duplicatedItems = []

        literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())
        fw.write("-----Question------\n")
        fw.write(line + '\n')
        fw.write("-------------------\n")

        # One seed triple per identified literal; the literal is both the
        # triple's object and its seed.
        qtps = []
        for literal in literalsToConnect:
            triple = Triple()
            triple.setObject(literal)
            triple.setSeeds(literal)
            qtps.append(triple)

        fw.write('--entities--\n')
        print('---- entities ----')
        for tp in qtps:
            fw.write(str(tp.getObject()) + '\n')
            print(tp.getObject())
        print(predicates)
        # Test Purpose
        #"""
        levelOfExpansion = 1
        # Reset per question: without this, a question whose loop body never
        # runs would raise NameError (first question) or silently report the
        # previous question's matching triples.
        mtps = []
        # At most two expansion rounds, stopping early once the seeds connect.
        while(levelOfExpansion < 3 and not checkConnection(literalsToConnect, duplicatedItems)):
            #add mtps to a new list
            qtps, mtps = expandWithThreshold(qtps, expandedGraph, duplicatedItems, predicates, glove_wordmap)
            levelOfExpansion += 1

        print("Matching Triples: -----   size: " + str(len(mtps)))
        mtps.sort()
        for tp in mtps:
            fw.write(str(tp) + '\n')
            print(tp)
        # The expanded graph is only reduced and logged when the seeds are connected.
        if checkConnection(literalsToConnect, duplicatedItems):
            reductionTestsDegreeOne(expandedGraph)
            fw.write("Size of expanded graph: "+ str(len(expandedGraph)) + '\n')
            for tp in expandedGraph:
                fw.write(str(tp) + '\n')

0 in property


KeyboardInterrupt: 

In [None]:
# Print every item currently held in expandedGraph, one per line.
for x in expandedGraph:
    print(x)

# Test Above

In [None]:
%%time
levelOfExpansion = 1
# Start empty so the report below still works when no expansion round runs.
mtps = []
# Expand at most twice and stop as soon as the seeds are connected. The
# condition previously used `or`, which keeps expanding forever whenever the
# seeds never become connected; `and` matches the other expansion loops in
# this notebook.
while(levelOfExpansion < 3 and not checkConnection(literalsToConnect, duplicatedItems)):
    #add mtps to a new list
    # NOTE(review): `we` is not defined in the visible part of the notebook;
    # the executed cells pass glove_wordmap here -- confirm which is intended.
    qtps, mtps = expand(qtps, expandedGraph, duplicatedItems, predicates, we)
    levelOfExpansion += 1

print("Matching Triples: -----   size: " + str(len(mtps)))
for tp in mtps:
    print(tp)

In [None]:
# Report how many items expandedGraph holds, then list them one per line.
print("Size of expanded graph: {}".format(len(expandedGraph)))
for tp in expandedGraph:
    print(tp)

In [None]:
# Report how many items duplicatedItems holds, then list them one per line.
print("Size of duplicatedItems: {}".format(len(duplicatedItems)))
for tp in duplicatedItems:
    print(tp)

In [None]:
# Run reductionTestsDegreeOne on the graph (presumably prunes it in place --
# confirm against its definition), then report the graph's size and contents.
reductionTestsDegreeOne(expandedGraph)
print("Size of expanded graph: {}".format(len(expandedGraph)))
for tp in expandedGraph:
    print(tp)

In [None]:
# Run keepMinEdge on the graph (presumably modifies it in place -- confirm
# against its definition), then report the graph's size and contents.
keepMinEdge(expandedGraph)
print("Size of expanded graph: {}".format(len(expandedGraph)))
for tp in expandedGraph:
    print(tp)

# Test Purpose -- Main

In [None]:
# Word pairs to compare; each inner list is one [word, word] pair.
k = [['influence', 'affect'],['wife','spouse']]
# One cosine-similarity score per pair; a pair with a word missing from the
# embedding map is scored -1.
result = evaluate_similarity_score(glove_wordmap, k)
print(result)

In [None]:
# Print what dw returns for a single word pair (dw is defined elsewhere in
# the notebook).
w1 = 'influence'
w2 = 'affect'
print(dw(w1,w2))

In [None]:
# Scratch check: two different strings compared with != print True.
print('1' != '2')

In [None]:
# Bare expression: the notebook displays whatever dw returns for this pair.
dw('develop','developer')

In [None]:
# Single-question run of the pipeline: identify entities/predicates, build the
# seed triples, expand, and print the matching triples.

# Not referenced directly in this cell; presumably read as globals by the
# query/expansion helpers -- TODO confirm before moving or removing them.
filterStringPredicates = ["wikiPageWikiLink","wikiPageRedirects","wikiPageDisambiguates", "Thing","wikiPageUsesTemplate","rdf-syntax-ns#type"]
filterStringSubjects = ["entity", "Category", "wikidata","owl#Thing", "http://wikidata.dbpedia.org/resource/Q"]
expandedGraph = []
duplicatedItems = []
line = 'Alberta, admit, province'
literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())

# One seed triple per identified literal; the literal is both the triple's
# object and its seed.
qtps = []
for literal in literalsToConnect:
    triple = Triple()
    triple.setObject(literal)
    triple.setSeeds(literal)
    qtps.append(triple)

for tp in qtps:
    print(tp.getObject())

# Test Purpose

levelOfExpansion = 1
# Start empty: if the loop body never runs, the report below would otherwise
# raise NameError or print triples left over from an earlier cell.
mtps = []
# At most two expansion rounds, stopping early once the seeds connect.
while(levelOfExpansion < 3 and not checkConnection(literalsToConnect, duplicatedItems)):
    #add mtps to a new list
    # NOTE(review): `we` is not defined in the visible part of the notebook;
    # the executed cells pass glove_wordmap here -- confirm which is intended.
    qtps, mtps = expand(qtps, expandedGraph, duplicatedItems, predicates, we)
    levelOfExpansion += 1

print("Matching Triples: -----   size: " + str(len(mtps)))
for tp in mtps:
    print(tp)

In [None]:
# Print what isPredicateInDataset returns for the keyword 'height'
# (the helper is defined elsewhere in the notebook).
print(isPredicateInDataset('height'))

In [None]:
# Ad-hoc SPARQL probe: list every (predicate, value) pair whose subject is the
# World_of_Warcraft resource, using the module-level `sparql` endpoint wrapper.
# NOTE(review): `literal` is assigned but not used anywhere in this cell.
literal = 'film'
queryString = 'select ?s ?p where {<http://dbpedia.org/resource/World_of_Warcraft> ?p ?s}'
# Alternative (disabled) query: both edge directions around Canada with the
# predicate/subject regex filters applied.
#queryString = "SELECT distinct ?s ?p ?j ?k WHERE { {?s ?p <http://dbpedia.org/resource/Canada> . FILTER (!regex(str(?p), 'wikiPageWikiLink' , 'i') && !regex(str(?p), 'wikiPageRedirects' , 'i') && !regex(str(?p), 'wikiPageDisambiguates' , 'i') && !regex(str(?p), 'Thing' , 'i') && !regex(str(?p), 'wikiPageUsesTemplate' , 'i') && !regex(str(?p), 'rdf-syntax-ns#type' , 'i') && !regex(str(?s), 'entity' , 'i') && !regex(str(?s), 'Category' , 'i') && !regex(str(?s), 'wikidata' , 'i') && !regex(str(?s), 'owl#Thing' , 'i') && !regex(str(?s), 'http://wikidata.dbpedia.org/resource/Q' , 'i') )} UNION {<http://dbpedia.org/resource/Canada> ?j ?k. FILTER (!regex(str(?j), 'wikiPageWikiLink' , 'i') && !regex(str(?j), 'wikiPageRedirects' , 'i') && !regex(str(?j), 'wikiPageDisambiguates' , 'i') && !regex(str(?j), 'Thing' , 'i') && !regex(str(?j), 'wikiPageUsesTemplate' , 'i') && !regex(str(?j), 'rdf-syntax-ns#type' , 'i') && !regex(str(?k), 'entity' , 'i') && !regex(str(?k), 'Category' , 'i') && !regex(str(?k), 'wikidata' , 'i') && !regex(str(?k), 'owl#Thing' , 'i') && !regex(str(?k), 'http://wikidata.dbpedia.org/resource/Q' , 'i') )}}"
sparql.setQuery(queryString)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Each binding is one result row; print its predicate and value side by side.
for result in results["results"]["bindings"]:
    print(result["p"]["value"] + "  ---  " + result["s"]["value"])
#print(results["results"]["bindings"])

In [None]:
print(dw('official color','officialSchoolColor'))

In [76]:
# Run entity/predicate identification on a single keyword and print the
# (literalsToConnect, predicates) pair it returns.
print(indentifyEntitiesAndPredicates("tall"))

Current literal: tall
0 in property
507 in sub/obj
Current literal: tall
0 in property
507 in sub/obj
Current literal: Tall
0 in property
1790 in sub/obj
(['"tall"@en'], ['tall'])


# Identify Entities and Predicates (E-P) on QALD-5

In [77]:
%%time

# Load the stop-word list: one entry per line, surrounding whitespace removed.
with open('StopWords.txt','r') as sw:
    swlines = sw.readlines()
swl = [entry.strip() for entry in swlines]

# Read every question of the QALD-5 file into memory (newlines are kept).
with open('qald5.txt','r',encoding='utf-8') as f:
    lines = f.readlines()

with open('test_res.txt','w',encoding='utf-8') as fw:
    # For every question, run entity/predicate identification only and log
    # the resulting seed entities to the file and to stdout.
    for line in lines:
        # Per-question state, rebuilt on every iteration.
        filterStringPredicates = ["wikiPageWikiLink","wikiPageRedirects","wikiPageDisambiguates", "Thing","wikiPageUsesTemplate","rdf-syntax-ns#type"]
        filterStringSubjects = ["entity", "Category", "wikidata","owl#Thing", "http://wikidata.dbpedia.org/resource/Q"]
        expandedGraph, duplicatedItems = [], []

        literalsToConnect, predicates = indentifyEntitiesAndPredicates(line.strip())

        # Question banner, written as one chunk.
        fw.write("-----Question------\n" + line + '\n' + "-------------------\n")

        # One seed triple per identified literal; the literal is both the
        # triple's object and its seed.
        qtps = []
        for literal in literalsToConnect:
            triple = Triple()
            triple.setObject(literal)
            triple.setSeeds(literal)
            qtps += [triple]

        fw.write('--entities--\n')
        print('---- entities ----')
        for tp in qtps:
            fw.write(str(tp.getObject()) + '\n')
            print(tp.getObject())
        print(predicates)

Current literal: cosmonauts
0 in property
1652 in sub/obj
Current literal: cosmonauts
0 in property
1652 in sub/obj
Current literal: Cosmonauts
0 in property
1152 in sub/obj
---- entities ----
"cosmonauts"@en
['cosmonauts']
Current literal: Ganges
0 in property
4559 in sub/obj
Current literal: ganges
0 in property
1114 in sub/obj
Current literal: Ganges
0 in property
4559 in sub/obj
Current literal: start
6060 in property
Current literal: country
215095 in property
---- entities ----
"Ganges"@en
['Ganges', 'start', 'country']
Current literal: city
98506 in property
Current literal: Germany
0 in property
453894 in sub/obj
Current literal: germany
0 in property
67840 in sub/obj
Current literal: Germany
0 in property
453894 in sub/obj
Current literal: inhabitants
0 in property
11269 in sub/obj
Current literal: inhabitants
0 in property
11269 in sub/obj
Current literal: Inhabitants
0 in property
1371 in sub/obj
Current literal: more than 250000
0 in property
0 in sub/obj
---- entities ----

13147 in sub/obj
Current literal: car
2 in property
13147 in sub/obj
Current literal: Car
0 in property
10703 in sub/obj
Current literal: produce
0 in property
16627 in sub/obj
Current literal: produce
0 in property
16627 in sub/obj
Current literal: Produce
0 in property
7904 in sub/obj
Current literal: Germany
0 in property
453894 in sub/obj
Current literal: germany
0 in property
67840 in sub/obj
Current literal: Germany
0 in property
453894 in sub/obj
---- entities ----
"car"@en
"produce"@en
"Germany"@en
['car', 'produce', 'Germany']
Current literal: people
3399 in property
Current literal: born
0 in property
4987 in sub/obj
Current literal: born
0 in property
4987 in sub/obj
Current literal: Born
1177 in property
Current literal: Vienna
0 in property
66729 in sub/obj
Current literal: vienna
0 in property
0 in sub/obj
Current literal: die
0 in property
6729 in sub/obj
Current literal: die
0 in property
6729 in sub/obj
Current literal: Die
0 in property
3364 in sub/obj
Current literal

7593 in sub/obj
Current literal: maribor
0 in property
0 in sub/obj
Current literal: inhabitants
0 in property
11269 in sub/obj
Current literal: inhabitants
0 in property
11269 in sub/obj
Current literal: Inhabitants
0 in property
1371 in sub/obj
---- entities ----
"inhabitants"@en
['maribor', 'inhabitants']
Current literal: company
26677 in property
Current literal: Munich
0 in property
43702 in sub/obj
Current literal: munich
0 in property
7715 in sub/obj
Current literal: Munich
0 in property
43702 in sub/obj
---- entities ----
"Munich"@en
['company', 'Munich']
Current literal: Claudia Schiffer
0 in property
1014 in sub/obj
Current literal: claudia schiffer
0 in property
0 in sub/obj
Current literal: tall
0 in property
507 in sub/obj
Current literal: tall
0 in property
507 in sub/obj
Current literal: Tall
0 in property
1790 in sub/obj
---- entities ----
"tall"@en
['claudia schiffer', 'tall']
Current literal: game
462 in property
Current literal: GMT
0 in property
26531 in sub/obj
Cur

0 in sub/obj
---- entities ----
"influence"@en
['socrates', 'influence', 'aristotle']
Current literal: movie
2 in property
98891 in sub/obj
Current literal: movie
2 in property
98891 in sub/obj
Current literal: Movie
0 in property
12679 in sub/obj
Current literal: Denmark
0 in property
104207 in sub/obj
Current literal: denmark
0 in property
0 in sub/obj
---- entities ----
"movie"@en
['movie', 'denmark']
Current literal: launch pad
31 in property
894 in sub/obj
Current literal: launch pad
31 in property
894 in sub/obj
Current literal: Launch Pad
0 in property
205 in sub/obj
Current literal: operate
0 in property
2085 in sub/obj
Current literal: operate
0 in property
2085 in sub/obj
Current literal: Operate
0 in property
189 in sub/obj
Current literal: NASA
0 in property
9643 in sub/obj
Current literal: nasa
0 in property
35 in sub/obj
---- entities ----
"launch pad"@en
"operate"@en
['launch pad', 'operate', 'nasa']
Current literal: Cat Stevens
0 in property
3490 in sub/obj
Current lite

1 in property
22962 in sub/obj
Current literal: High
0 in property
28679 in sub/obj
---- entities ----
"high"@en
['mount everest', 'high']
Current literal: comic
3 in property
24350 in sub/obj
Current literal: comic
3 in property
24350 in sub/obj
Current literal: Comic
0 in property
10387 in sub/obj
Current literal: Captain America
0 in property
5372 in sub/obj
Current literal: captain america
0 in property
0 in sub/obj
Current literal: create
0 in property
3993 in sub/obj
Current literal: create
0 in property
3993 in sub/obj
Current literal: Create
0 in property
560 in sub/obj
---- entities ----
"comic"@en
"create"@en
['comic', 'captain america', 'create']
Current literal: Australia
0 in property
367957 in sub/obj
Current literal: australia
0 in property
54921 in sub/obj
Current literal: Australia
0 in property
367957 in sub/obj
Current literal: capital
0 in property
477685 in sub/obj
Current literal: capital
0 in property
477685 in sub/obj
Current literal: Capital
6427 in property
Cu

564734 in sub/obj
Current literal: illinois
0 in property
21298 in sub/obj
Current literal: Illinois
0 in property
564734 in sub/obj
---- entities ----
"Illinois"@en
['state', 'border', 'Illinois']
Current literal: Limerick Lake
0 in property
228 in sub/obj
Current literal: limerick lake
0 in property
0 in sub/obj
Current literal: country
215095 in property
---- entities ----
['limerick lake', 'country']
Current literal: television show
0 in property
20470 in sub/obj
Current literal: television show
0 in property
20470 in sub/obj
Current literal: Television Show
0 in property
30 in sub/obj
Current literal: create
0 in property
3993 in sub/obj
Current literal: create
0 in property
3993 in sub/obj
Current literal: Create
0 in property
560 in sub/obj
Current literal: John Cleese
0 in property
3288 in sub/obj
Current literal: john cleese
0 in property
0 in sub/obj
---- entities ----
"create"@en
['Television Show', 'create', 'john cleese']
Current literal: mountain
27 in property
25244 in s

0 in property
266075 in sub/obj
Current literal: soccer club
0 in property
5500 in sub/obj
Current literal: soccer club
0 in property
5500 in sub/obj
Current literal: Soccer Club
0 in property
982 in sub/obj
---- entities ----
"Spain"@en
"soccer club"@en
['Spain', 'soccer club']
Current literal: Suriname
0 in property
10371 in sub/obj
Current literal: suriname
0 in property
0 in sub/obj
Current literal: official language
0 in property
8968 in sub/obj
Current literal: official language
0 in property
8968 in sub/obj
Current literal: Official Language
27 in property
122 in sub/obj
---- entities ----
"official language"@en
['suriname', 'official language']
Current literal: Tel Aviv
0 in property
14082 in sub/obj
Current literal: tel aviv
0 in property
0 in sub/obj
Current literal: mayor
8023 in property
---- entities ----
['tel aviv', 'mayor']
Current literal: Brooklyn Bridge
0 in property
2770 in sub/obj
Current literal: brooklyn bridge
0 in property
0 in sub/obj
Current literal: design
3

25244 in sub/obj
Current literal: mountain
27 in property
25244 in sub/obj
Current literal: Mountain
0 in property
22726 in sub/obj
Current literal: higher
394 in property
Current literal: Nanga Parbat
0 in property
1349 in sub/obj
Current literal: nanga parbat
0 in property
0 in sub/obj
---- entities ----
"mountain"@en
['mountain', 'higher', 'nanga parbat']
Current literal: Wikipedia
0 in property
8451 in sub/obj
Current literal: wikipedia
1 in property
754 in sub/obj
Current literal: Wikipedia
0 in property
8451 in sub/obj
Current literal: created
771 in property
---- entities ----
"Wikipedia"@en
['Wikipedia', 'created']
Current literal: actor
19 in property
246390 in sub/obj
Current literal: actor
19 in property
246390 in sub/obj
Current literal: Actor
0 in property
395299 in sub/obj
Current literal: starring
470004 in property
Current literal: Last Action Hero
0 in property
1159 in sub/obj
Current literal: last action hero
0 in property
0 in sub/obj
---- entities ----
"actor"@en
['

45241 in sub/obj
Current literal: Music Album
0 in property
83 in sub/obj
Current literal: song
0 in property
85179 in sub/obj
Current literal: song
0 in property
85179 in sub/obj
Current literal: Song
1350 in property
Current literal: Last Christmas
0 in property
1635 in sub/obj
Current literal: last christmas
0 in property
0 in sub/obj
---- entities ----
['Music Album', 'Song', 'last christmas']
Current literal: book
1976 in property
Current literal: written
96 in property
42492 in sub/obj
Current literal: written
96 in property
42492 in sub/obj
Current literal: Written
0 in property
22383 in sub/obj
Current literal: Danielle Steel
0 in property
1960 in sub/obj
Current literal: danielle steel
0 in property
0 in sub/obj
---- entities ----
"written"@en
['book', 'written', 'danielle steel']
Current literal: airport
0 in property
75390 in sub/obj
Current literal: airport
0 in property
75390 in sub/obj
Current literal: Airport
3 in property
69893 in sub/obj
Current literal: located
2 in p

1590715 in sub/obj
Current literal: united states
0 in property
336 in sub/obj
Current literal: United States
0 in property
1590715 in sub/obj
---- entities ----
"United States"@en
['natalie portman', 'Born', 'United States']
Current literal: inhabitants
0 in property
11269 in sub/obj
Current literal: inhabitants
0 in property
11269 in sub/obj
Current literal: Inhabitants
0 in property
1371 in sub/obj
Current literal: largest city
501 in property
Current literal: Canada
0 in property
483759 in sub/obj
Current literal: canada
4 in property
82319 in sub/obj
Current literal: Canada
0 in property
483759 in sub/obj
---- entities ----
"inhabitants"@en
"Canada"@en
['inhabitants', 'largest city', 'Canada']
Current literal: first
589576 in property
Current literal: climb
0 in property
2660 in sub/obj
Current literal: climb
0 in property
2660 in sub/obj
Current literal: Climb
0 in property
1156 in sub/obj
Current literal: mount Everest
0 in property
0 in sub/obj
---- entities ----
"climb"@en
['f

0 in property
366 in sub/obj
Current literal: james bond movies
0 in property
0 in sub/obj
---- entities ----
['how many', 'james bond movies']
Current literal: rockets
180 in property
Current literal: launched
67 in property
2589 in sub/obj
Current literal: launched
67 in property
2589 in sub/obj
Current literal: Launched
0 in property
954 in sub/obj
Current literal: Baikonur
0 in property
1701 in sub/obj
Current literal: baikonur
0 in property
0 in sub/obj
---- entities ----
"launched"@en
['rockets', 'launched', 'baikonur']
Current literal: pope
0 in property
73799 in sub/obj
Current literal: pope
0 in property
73799 in sub/obj
Current literal: Pope
30 in property
56700 in sub/obj
Current literal: succeed
0 in property
1322 in sub/obj
Current literal: succeed
0 in property
1322 in sub/obj
Current literal: Succeed
0 in property
90 in sub/obj
Current literal: John Paul II
0 in property
7619 in sub/obj
Current literal: john paul ii
0 in property
0 in sub/obj
---- entities ----
"pope"@en

4987 in sub/obj
Current literal: Born
1177 in property
Current literal: Paris
0 in property
304219 in sub/obj
Current literal: paris
0 in property
31106 in sub/obj
Current literal: Paris
0 in property
304219 in sub/obj
Current literal: after 1950
0 in property
0 in sub/obj
---- entities ----
"actors"@en
"Paris"@en
['actors', 'Born', 'Paris', 'after 1950']
Current literal: date
971961 in property
Current literal: Carlo Giuliani
0 in property
326 in sub/obj
Current literal: carlo giuliani
0 in property
0 in sub/obj
Current literal: shot
1 in property
19614 in sub/obj
Current literal: shot
1 in property
19614 in sub/obj
Current literal: Shot
0 in property
2403 in sub/obj
---- entities ----
"shot"@en
['date', 'carlo giuliani', 'shot']
Current literal: four
0 in property
46327 in sub/obj
Current literal: four
0 in property
46327 in sub/obj
Current literal: Four
0 in property
12556 in sub/obj
Current literal: youngest
37 in property
634 in sub/obj
Current literal: youngest
37 in property
634

5 in property
4333 in sub/obj
Current literal: Bridges
0 in property
15483 in sub/obj
Current literal: cross
3 in property
18201 in sub/obj
Current literal: cross
3 in property
18201 in sub/obj
Current literal: Cross
0 in property
36523 in sub/obj
Current literal: Seine
0 in property
4295 in sub/obj
Current literal: seine
0 in property
962 in sub/obj
Current literal: Seine
0 in property
4295 in sub/obj
---- entities ----
"bridges"@en
"cross"@en
"Seine"@en
['bridges', 'cross', 'Seine']
Current literal: mayor
8023 in property
Current literal: capital
0 in property
477685 in sub/obj
Current literal: capital
0 in property
477685 in sub/obj
Current literal: Capital
6427 in property
Current literal: French Polynesia
0 in property
5874 in sub/obj
Current literal: french polynesia
0 in property
0 in sub/obj
---- entities ----
['mayor', 'Capital', 'french polynesia']
Current literal: Dracula
0 in property
10225 in sub/obj
Current literal: dracula
0 in property
594 in sub/obj
Current literal: Dr

3127 in sub/obj
Current literal: eating disorders
0 in property
987 in sub/obj
Current literal: eating disorders
0 in property
987 in sub/obj
Current literal: Eating Disorders
0 in property
709 in sub/obj
---- entities ----
"types"@en
"eating disorders"@en
['types', 'eating disorders']
Current literal: married to
3 in property
2 in sub/obj
Current literal: president Chirac
0 in property
0 in sub/obj
---- entities ----
['married to', 'president Chirac']
Current literal: largest metropolitan area
1 in property
716 in sub/obj
Current literal: largest metropolitan area
1 in property
716 in sub/obj
Current literal: Largest Metropolitan Area
0 in property
118 in sub/obj
Current literal: Washington state
0 in property
14206 in sub/obj
Current literal: washington state
0 in property
0 in sub/obj
---- entities ----
"largest metropolitan area"@en
['largest metropolitan area', 'washington state']
Current literal: produced
1637 in property
Current literal: France
0 in property
617601 in sub/obj
Cu