The following code is largely taken from first_QA_Attempt.py and put in a question answering function ('search function'). The function is modified so that it can be benchmarked. This notebook shows the further development and improvement of this function.

It should be noted that the benchmark queries in this notebook were manually made using several approaches involving python pandas, for the purpose of showing problems with the search function.

In [1]:
import pandas as pd
import statistics
pd.options.mode.chained_assignment = None
allPredicates = ['isLeaderOf', 'owns', 'isCitizenOf', 'isLocatedIn', 'hasMusicalRole', 'hasOfficialLanguage', 'edited', 'isConnectedTo', 'actedIn', 'imports', 'participatedIn', 'wasBornIn', 'dealsWith', 'created', 'diedIn', 'isPoliticianOf', 'wroteMusicFor', 'hasNeighbor', 'isMarriedTo', 'hasChild', 'isInterestedIn', 'isAffiliatedTo', 'hasCurrency', 'exports', 'happenedIn', 'hasGender', 'playsFor', 'directed', 'worksAt', 'graduatedFrom', 'hasCapital', 'influences', 'hasWonPrize', 'hasWebsite', 'livesIn', 'hasAcademicAdvisor', 'isKnownFor']
df = pd.read_csv('yagoFactsCleaned.csv')
df.columns = ['Subject', 'Predicate', 'Object']

In [2]:
def search(queryList, goldenTriple):
    rpr = 0
    mainIndexList = []
    importance = []
    predicateListFrequencies = []
    for i in range(0, len(queryList)):
        mainIndexList.extend(df.index[df.loc[:, "Subject"] == queryList[i]].tolist())
        mainIndexList.extend(df.index[df.loc[:, "Predicate"] == queryList[i]].tolist())
        mainIndexList.extend(df.index[df.loc[:, "Object"] == queryList[i]].tolist())
    mainIndexListUnique = list(dict.fromkeys(mainIndexList))
    for m in mainIndexListUnique:
        importance.append(100**mainIndexList.count(m))
    df2 = df.iloc[mainIndexListUnique]
    df2.loc[:, "Relevance"] = importance
    predicateList = df2.loc[:, 'Predicate'].tolist()
    predicateListUnique = list(dict.fromkeys(predicateList))
    for k in predicateList:
        predicateListFrequencies.append(predicateList.count(k))
    lowestUniquePredicateFrequency = min(predicateListFrequencies)
    for n in predicateListUnique:
        df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance'] = (df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance']/predicateList.count(n)) * lowestUniquePredicateFrequency
    df2 = df2.sort_values(by=['Relevance'], ascending=False)
    if len(df2) >= 10:
        for p in range(0, len(df2)):
            if df2.iloc[p]["Subject"] == goldenTriple[0] and df2.iloc[p]["Predicate"] == goldenTriple[1] and df2.iloc[p]["Object"] == goldenTriple[2]:
                rpr = 1/(p+1)
                break
    else:
        for p in range(0, 10):
            if df2.iloc[p]["Subject"] == goldenTriple[0] and df2.iloc[p]["Predicate"] == goldenTriple[1] and df2.iloc[p]["Object"] == goldenTriple[2]:
                rpr = 1/(p+1)
                break
    return rpr

In [3]:
%%time
RR = []
RR.append(search(['Rocky_Johnson', 'Dwayne_Johnson'], ['Rocky_Johnson', 'hasChild', 'Dwayne_Johnson']))
RR.append(search(['Rocky_Johnson', 'hasChild'], ['Rocky_Johnson', 'hasChild', 'Dwayne_Johnson']))
RR.append(search(['Roman_Empire', 'hasCurrency'], ['Roman_Empire', 'hasCurrency', 'Sestertius']))
RR.append(search(['Rome', 'http://www.comune.roma.it/'], ['Rome', 'hasWebsite', 'http://www.comune.roma.it/']))
RR.append(search(['directed', 'San_Andreas_(film)'], ['Brad_Peyton', 'directed', 'San_Andreas_(film)']))
RR.append(search(['wroteMusicFor', 'Cosmopolitan_(film)'], ['Andrew_Lockington', 'wroteMusicFor', 'Cosmopolitan_(film)']))
RR.append(search(['Kiribati', 'hasCapital'], ['Kiribati', 'hasCapital', 'South_Tarawa']))
RR.append(search(['Charles_the_Fat', 'East_Francia'], ['Charles_the_Fat', 'wasBornIn', 'East_Francia']))
RR.append(search(['hasChild', 'Michelle_Obama'], ['Marian_Shields_Robinson', 'hasChild', 'Michelle_Obama']))
RR.append(search(['Greenland', 'hasCurrency'], ['Greenland', 'hasCurrency', 'Danish_krone']))
print(statistics.mean(RR))

0.35900429088611807
Wall time: 3min 14s


The MRR is worse than expected and the search function takes a lot of time. This is largely due to the inclusion of triples that only have a matching predicate in the results table.

In [4]:
def search(queryList, goldenTriple):
    rpr = 0
    mainIndexList = []
    importance = []
    predicateListFrequencies = []
    for i in range(0, len(queryList)):
        mainIndexList.extend(df.index[df.loc[:, "Subject"] == queryList[i]].tolist()) # Omit search for
        mainIndexList.extend(df.index[df.loc[:, "Object"] == queryList[i]].tolist()) # rows with matching predicate
    mainIndexListUnique = list(dict.fromkeys(mainIndexList))
    for m in mainIndexListUnique:
        if df.iloc[m]["Predicate"] in queryList:
            importance.append(100**mainIndexList.count(m)*10) # Add value to rows with matching predicates
        else:
            importance.append(100**mainIndexList.count(m))
    df2 = df.iloc[mainIndexListUnique]
    df2.loc[:, "Relevance"] = importance
    predicateList = df2.loc[:, 'Predicate'].tolist()
    predicateListUnique = list(dict.fromkeys(predicateList))
    for k in predicateList:
        predicateListFrequencies.append(predicateList.count(k))
    lowestUniquePredicateFrequency = min(predicateListFrequencies)
    for n in predicateListUnique:
        df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance'] = (df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance']/predicateList.count(n)) * lowestUniquePredicateFrequency
    df2 = df2.sort_values(by=['Relevance'], ascending=False)
    if len(df2) >= 10:
        for p in range(0, len(df2)):
            if df2.iloc[p]["Subject"] == goldenTriple[0] and df2.iloc[p]["Predicate"] == goldenTriple[1] and df2.iloc[p]["Object"] == goldenTriple[2]:
                rpr = 1/(p+1)
                break
    else:
        for p in range(0, 10):
            if df2.iloc[p]["Subject"] == goldenTriple[0] and df2.iloc[p]["Predicate"] == goldenTriple[1] and df2.iloc[p]["Object"] == goldenTriple[2]:
                rpr = 1/(p+1)
                break
    return rpr

In [5]:
%%time
RR = []
RR.append(search(['Rocky_Johnson', 'Dwayne_Johnson'], ['Rocky_Johnson', 'hasChild', 'Dwayne_Johnson']))
RR.append(search(['Rocky_Johnson', 'hasChild'], ['Rocky_Johnson', 'hasChild', 'Dwayne_Johnson']))
RR.append(search(['Roman_Empire', 'hasCurrency'], ['Roman_Empire', 'hasCurrency', 'Sestertius']))
RR.append(search(['Rome', 'http://www.comune.roma.it/'], ['Rome', 'hasWebsite', 'http://www.comune.roma.it/']))
RR.append(search(['directed', 'San_Andreas_(film)'], ['Brad_Peyton', 'directed', 'San_Andreas_(film)']))
RR.append(search(['wroteMusicFor', 'Cosmopolitan_(film)'], ['Andrew_Lockington', 'wroteMusicFor', 'Cosmopolitan_(film)']))
RR.append(search(['Kiribati', 'hasCapital'], ['Kiribati', 'hasCapital', 'South_Tarawa']))
RR.append(search(['Charles_the_Fat', 'East_Francia'], ['Charles_the_Fat', 'wasBornIn', 'East_Francia']))
RR.append(search(['hasChild', 'Michelle_Obama'], ['Marian_Shields_Robinson', 'hasChild', 'Michelle_Obama']))
RR.append(search(['Greenland', 'hasCurrency'], ['Greenland', 'hasCurrency', 'Danish_krone']))
print(statistics.mean(RR))

0.95
Wall time: 25.4 s


One of the benchmark queries has an RR of 0.5 despite the golden triple having the highest relevance score. This is a limitation of the benchmarking algorithm.

In [6]:
print(search(['wroteMusicFor', 'Cosmopolitan_(film)'], ['Andrew_Lockington', 'wroteMusicFor', 'Cosmopolitan_(film)']))

0.5


The following function has a different ranking algorithm. It assumes that the query contains only subjects and objects (nodes). It only returns the rarest predicate (edge) that connects two nodes in the query.

In [7]:
def search(queryList, goldenAnswer): # Change goldenTriple to goldenAnswer
    rpr = 0
    mainIndexList = []
    importance = []
    predicateListFrequencies = []
    for i in range(0, len(queryList)):
        mainIndexList.extend(df.index[df.loc[:, "Subject"] == queryList[i]].tolist())
        mainIndexList.extend(df.index[df.loc[:, "Object"] == queryList[i]].tolist())
    mainIndexListUnique = list(dict.fromkeys(mainIndexList))
    for m in mainIndexListUnique:
        importance.append(100**(mainIndexList.count(m))) # Ignore predicates in query (change ranking)
    df2 = df.iloc[mainIndexListUnique]
    df2.loc[:, "Relevance"] = importance
    predicateList = df2.loc[:, 'Predicate'].tolist()
    predicateListUnique = list(dict.fromkeys(predicateList))
    for k in predicateList:
        predicateListFrequencies.append(predicateList.count(k))
    lowestUniquePredicateFrequency = min(predicateListFrequencies)
    for n in predicateListUnique:
        df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance'] = (df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance']/predicateList.count(n)) * lowestUniquePredicateFrequency
    df2 = df2.sort_values(by=['Relevance'], ascending=False)
    df2 = df2.loc[:, "Predicate"] # Make result table consist of predicates only
    if len(df2) >= 10:
        for p in range(0, len(df2)):
            if df2.iloc[p] == goldenAnswer: # Compare with golden answer predicate
                rpr = 1/(p+1)
                break
    else:
        for p in range(0, 10):
            if df2.iloc[p] == goldenAnswer: # "
                rpr = 1/(p+1)
                break
    return rpr

In [8]:
%%time
# Changed benchmark queries
RR = []
RR.append(search(['Barack_Obama', 'Marian_Shields_Robinson'], 'hasChildisMarriedTo'))
RR.append(search(['Sigmund_Freud', 'Kesswil'], 'influenceswasBornIn'))
RR.append(search(['Battle_of_Talas', 'Tajikistan'], 'happenedInhasNeighbor'))
RR.append(search(['Ricochet_(TV_production_company)', 'Sahara'], 'createdisLocatedIn'))
RR.append(search(['Rocky_Johnson', 'Dwayne_Johnson'], 'hasChild'))
RR.append(search(['Rome', 'http://www.comune.roma.it/'], 'hasWebsite'))
RR.append(search(['Charles_the_Fat', 'East_Francia'], 'wasBornIn'))
RR.append(search(['Luxembourg', 'Luxembourg_City'], 'hasCapital'))
RR.append(search(['Toby_Barrett', 'Long_Point,_Ontario'], 'isLeaderOf'))
RR.append(search(['Metra', 'North_Central_Service'], 'owns'))
RR.append(search(['Gordon_Ramsay', 'Culinary_Genius_(TV_series)'], 'created'))
RR.append(search(['Kugelmugel', 'German_language'], 'hasOfficialLanguage'))
RR.append(search(['Yoshitami_Kuroiwa', 'Godzilla_1985'], 'edited'))
RR.append(search(['Gisborne_Airport', 'Auckland_Airport'], 'isConnectedTo'))
RR.append(search(['Macedonia_(ancient_kingdom)', 'Siege_of_Cyropolis'], 'participatedIn'))
RR.append(search(['Aristotle', 'Euboea'], 'diedIn'))
RR.append(search(['Latvia', 'Belarus'], 'hasNeighbor'))
RR.append(search(['Luigi_Ambrosio', 'Ennio_de_Giorgi'], 'hasAcademicAdvisor'))
RR.append(search(['Jeff_Bezos', 'Amazon.com'], 'created'))
RR.append(search(['Tatsuro_Yamashita', 'Ride_On_Time_(album)'], 'created'))
print(statistics.mean(RR))

0.75
Wall time: 49.2 s


Two of the benchmark queries have a score of 0.5 which is due to the ranking algorithm in the first case and due to a mistake regarding the golden answer in the second.

In [9]:
print(search(['Gisborne_Airport', 'Auckland_Airport'], 'isConnectedTo'))
print(search(['Macedonia_(ancient_kingdom)', 'Siege_of_Cyropolis'], 'participatedIn'))

0.5
0.5


The function below is the same as above, with slightly different ranking, and it is benchmarked correctly.

In [10]:
def search(queryList, goldenAnswer):
    rpr = 0
    mainIndexList = []
    importance = []
    predicateListFrequencies = []
    for i in range(0, len(queryList)):
        mainIndexList.extend(df.index[df.loc[:, "Subject"] == queryList[i]].tolist())
        mainIndexList.extend(df.index[df.loc[:, "Object"] == queryList[i]].tolist())
    mainIndexListUnique = list(dict.fromkeys(mainIndexList))
    for m in mainIndexListUnique:
        importance.append(100**(mainIndexList.count(m)*2)) # Fine-tune ranking
    df2 = df.iloc[mainIndexListUnique]
    df2.loc[:, "Relevance"] = importance
    predicateList = df2.loc[:, 'Predicate'].tolist()
    predicateListUnique = list(dict.fromkeys(predicateList))
    for k in predicateList:
        predicateListFrequencies.append(predicateList.count(k))
    lowestUniquePredicateFrequency = min(predicateListFrequencies)
    for n in predicateListUnique:
        df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance'] = (df2.loc[df2.loc[:, 'Predicate'] == n, 'Relevance']/predicateList.count(n)) * lowestUniquePredicateFrequency
    df2 = df2.sort_values(by=['Relevance'], ascending=False)
    df2 = df2.loc[:, "Predicate"]
    if len(df2) >= 10:
        for p in range(0, len(df2)):
            if df2.iloc[p] == goldenAnswer:
                rpr = 1/(p+1)
                break
    else:
        for p in range(0, 10):
            if df2.iloc[p] == goldenAnswer:
                rpr = 1/(p+1)
                break
    return rpr

In [11]:
%%time
RR = []
RR.append(search(['Barack_Obama', 'Marian_Shields_Robinson'], 'hasChildisMarriedTo'))
RR.append(search(['Sigmund_Freud', 'Kesswil'], 'influenceswasBornIn'))
RR.append(search(['Battle_of_Talas', 'Tajikistan'], 'happenedInhasNeighbor'))
RR.append(search(['Ricochet_(TV_production_company)', 'Sahara'], 'createdisLocatedIn'))
RR.append(search(['Rocky_Johnson', 'Dwayne_Johnson'], 'hasChild'))
RR.append(search(['Rome', 'http://www.comune.roma.it/'], 'hasWebsite'))
RR.append(search(['Charles_the_Fat', 'East_Francia'], 'wasBornIn'))
RR.append(search(['Luxembourg', 'Luxembourg_City'], 'hasCapital'))
RR.append(search(['Toby_Barrett', 'Long_Point,_Ontario'], 'isLeaderOf'))
RR.append(search(['Metra', 'North_Central_Service'], 'owns'))
RR.append(search(['Gordon_Ramsay', 'Culinary_Genius_(TV_series)'], 'created'))
RR.append(search(['Kugelmugel', 'German_language'], 'hasOfficialLanguage'))
RR.append(search(['Yoshitami_Kuroiwa', 'Godzilla_1985'], 'edited'))
RR.append(search(['Gisborne_Airport', 'Auckland_Airport'], 'isConnectedTo'))
RR.append(search(['Macedonia_(ancient_kingdom)', 'Siege_of_Cyropolis'], 'happenedIn')) # Changed golden answer
RR.append(search(['Aristotle', 'Euboea'], 'diedIn'))
RR.append(search(['Latvia', 'Belarus'], 'hasNeighbor'))
RR.append(search(['Luigi_Ambrosio', 'Ennio_de_Giorgi'], 'hasAcademicAdvisor'))
RR.append(search(['Jeff_Bezos', 'Amazon.com'], 'created'))
RR.append(search(['Tatsuro_Yamashita', 'Ride_On_Time_(album)'], 'created'))
print(statistics.mean(RR))

0.8
Wall time: 49.5 s


In the function below, if there is no edge incident to two nodes in the query, the function will return a directed length-2 path that connects two nodes in the query.

In [12]:
def search(queryList, goldenAnswer):
    rpr = 0
    df2 = df.loc[df['Subject'].isin(queryList) | df['Object'].isin(queryList)]
    tf = df2.merge(right=df2, left_on='Object', right_on='Subject')
    tf['Subject'], tf['Predicate'], tf['Object'] = tf['Subject_x'], tf['Predicate_x']+tf['Predicate_y'], tf['Object_y']
    tf = tf.loc[:, ['Subject', 'Predicate', 'Object']]
    df2 = pd.concat([df2, tf])
    predicateList = df2['Predicate'].tolist()
    predicateListUnique = list(dict.fromkeys(predicateList))
    df2['Relevance'] = 0
    df2.loc[df2['Subject'].isin(queryList), 'Relevance'] += 1
    df2.loc[df2['Object'].isin(queryList), 'Relevance'] += 1
    df2.loc[df2['Predicate'].isin(allPredicates), 'Relevance'] += 0.1
    for n in predicateListUnique:
        df2.loc[df2['Predicate'] == n, 'Relevance'] += 1/(predicateList.count(n)*10)
    df2.sort_values(by=['Relevance'], ascending=False, inplace=True)
    for p in range(0, len(df2['Predicate'].head())):
        if df2['Predicate'].head().iloc[p] == goldenAnswer:
            rpr = 1/(p+1)
            break
    return rpr

In [13]:
%%time
RR = []
RR.append(search(['Barack_Obama', 'Marian_Shields_Robinson'], 'hasChildisMarriedTo'))
RR.append(search(['Sigmund_Freud', 'Kesswil'], 'influenceswasBornIn'))
RR.append(search(['Battle_of_Talas', 'Tajikistan'], 'happenedInhasNeighbor'))
RR.append(search(['Ricochet_(TV_production_company)', 'Sahara'], 'createdisLocatedIn'))
RR.append(search(['Rocky_Johnson', 'Dwayne_Johnson'], 'hasChild'))
RR.append(search(['Rome', 'http://www.comune.roma.it/'], 'hasWebsite'))
RR.append(search(['Charles_the_Fat', 'East_Francia'], 'wasBornIn'))
RR.append(search(['Luxembourg', 'Luxembourg_City'], 'hasCapital'))
RR.append(search(['Toby_Barrett', 'Long_Point,_Ontario'], 'isLeaderOf'))
RR.append(search(['Metra', 'North_Central_Service'], 'owns'))
RR.append(search(['Gordon_Ramsay', 'Culinary_Genius_(TV_series)'], 'created'))
RR.append(search(['Kugelmugel', 'German_language'], 'hasOfficialLanguage'))
RR.append(search(['Yoshitami_Kuroiwa', 'Godzilla_1985'], 'edited'))
RR.append(search(['Gisborne_Airport', 'Auckland_Airport'], 'isConnectedTo'))
RR.append(search(['Macedonia_(ancient_kingdom)', 'Siege_of_Cyropolis'], 'happenedIn'))
RR.append(search(['Aristotle', 'Euboea'], 'diedIn'))
RR.append(search(['Latvia', 'Belarus'], 'hasNeighbor'))
RR.append(search(['Luigi_Ambrosio', 'Ennio_de_Giorgi'], 'hasAcademicAdvisor'))
RR.append(search(['Jeff_Bezos', 'Amazon.com'], 'created'))
RR.append(search(['Tatsuro_Yamashita', 'Ride_On_Time_(album)'], 'created'))
print(statistics.mean(RR))

0.8916666666666667
Wall time: 17.5 s


The RR in the following benchmark queries can be improved by ignoring cyclic paths in the ranking.

In [14]:
print(search(['Barack_Obama', 'Marian_Shields_Robinson'], 'hasChildisMarriedTo'))
print(search(['Battle_of_Talas', 'Tajikistan'], 'happenedInhasNeighbor'))

0.5
0


The following function represents the final version of the search function in this development notebook. It is capable of doing a 2-hop search, i.e. finding answers that are length-1 or length-2 paths connecting the search terms.

In [15]:
def search(queryList, goldenAnswer):
    rpr = 0
    df2 = df.loc[df['Subject'].isin(queryList) | df['Object'].isin(queryList)]
    tf = df2.merge(right=df2, left_on='Object', right_on='Subject')
    tf['Subject'], tf['Predicate'], tf['Object'] = tf['Subject_x'], tf['Predicate_x']+tf['Predicate_y'], tf['Object_y']
    tf = tf.loc[:, ['Subject', 'Predicate', 'Object']]
    df2 = pd.concat([df2, tf])
    predicateList = df2['Predicate'].tolist()
    predicateListUnique = list(dict.fromkeys(predicateList))
    df2['Relevance'] = 0
    df2.loc[df2['Subject'].isin(queryList), 'Relevance'] += 1
    df2.loc[df2['Object'].isin(queryList), 'Relevance'] += 1
    df2.loc[df2['Predicate'].isin(allPredicates), 'Relevance'] += 0.1
    df2.loc[df2['Subject'] == df2['Object'], 'Relevance'] = 0 # Ignore cyclic paths
    for n in predicateListUnique:
        df2.loc[df2['Predicate'] == n, 'Relevance'] += 1/(predicateList.count(n)*10)
    df2.sort_values(by=['Relevance'], ascending=False, inplace=True)
    for p in range(0, len(df2['Predicate'].head())):
        if df2['Predicate'].head().iloc[p] == goldenAnswer:
            rpr = 1/(p+1)
            break
    return rpr

In [16]:
%%time
RR = []
RR.append(search(['Barack_Obama', 'Marian_Shields_Robinson'], 'hasChildisMarriedTo'))
RR.append(search(['Sigmund_Freud', 'Kesswil'], 'influenceswasBornIn'))
RR.append(search(['Battle_of_Talas', 'Tajikistan'], 'happenedInhasNeighbor'))
RR.append(search(['Ricochet_(TV_production_company)', 'Sahara'], 'createdisLocatedIn'))
RR.append(search(['Rocky_Johnson', 'Dwayne_Johnson'], 'hasChild'))
RR.append(search(['Rome', 'http://www.comune.roma.it/'], 'hasWebsite'))
RR.append(search(['Charles_the_Fat', 'East_Francia'], 'wasBornIn'))
RR.append(search(['Luxembourg', 'Luxembourg_City'], 'hasCapital'))
RR.append(search(['Toby_Barrett', 'Long_Point,_Ontario'], 'isLeaderOf'))
RR.append(search(['Metra', 'North_Central_Service'], 'owns'))
RR.append(search(['Gordon_Ramsay', 'Culinary_Genius_(TV_series)'], 'created'))
RR.append(search(['Kugelmugel', 'German_language'], 'hasOfficialLanguage'))
RR.append(search(['Yoshitami_Kuroiwa', 'Godzilla_1985'], 'edited'))
RR.append(search(['Gisborne_Airport', 'Auckland_Airport'], 'isConnectedTo'))
RR.append(search(['Macedonia_(ancient_kingdom)', 'Siege_of_Cyropolis'], 'happenedIn'))
RR.append(search(['Aristotle', 'Euboea'], 'diedIn'))
RR.append(search(['Latvia', 'Belarus'], 'hasNeighbor'))
RR.append(search(['Luigi_Ambrosio', 'Ennio_de_Giorgi'], 'hasAcademicAdvisor'))
RR.append(search(['Jeff_Bezos', 'Amazon.com'], 'created'))
RR.append(search(['Tatsuro_Yamashita', 'Ride_On_Time_(album)'], 'created'))
print(statistics.mean(RR))

1.0
Wall time: 17.6 s
