# Semantic Search Engine

## Installs, Imports, and Data Initialization

### Imports

In [3]:
!pip install rdflib
!pip install igraph
!pip install python-louvain
!pip install pykeen

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting igraph
  Downloading igraph-0.9.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 29.4 MB/s 
[?25hCollecting texttable>=1.6.2
  Downloading texttable-1.6.4-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.9.11 texttable-1.6.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Imports

import pandas as pd
from sklearn.metrics import ndcg_score
from sklearn import metrics
from scipy import stats
from sklearn.model_selection import train_test_split
import igraph as ig
import networkx as nx
import community
from community import community_louvain
import torch
import pykeen
from pykeen.pipeline import pipeline
import matplotlib.pyplot as plt
import seaborn as sn
import rdflib
from rdflib import Literal, URIRef
from google.colab import drive
import warnings

### Data Initialisation

In [5]:
# Initialising Google Drive

drive.mount('/content/drive')
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [6]:
# Load Dataset

wikidata_path = 'drive/MyDrive/data/wikidata-20211122-all.has-sitelinks.spo.parquet'
data = pd.read_parquet(wikidata_path, engine='pyarrow')

In [7]:
queries_path = 'drive/MyDrive/data/queries-v2.tsv'
answers_path = 'drive/MyDrive/data/qrels-v2.tsv'

queries = pd.read_csv(queries_path, sep='\t', header=None)
answers = pd.read_csv(answers_path, sep='\t', header=None)

In [None]:
# create tuples containing id's; format (query id of topic, wikidata id of topic)
vietnam_war_id = ('INEX_LD-20120112', 6584901)
vietnam_cuisine_id = ('INEX_LD-20120121', 826059)
bike_races_id = ('INEX_LD-20120411', 8667665)
music_conferences_id = ('INEX_LD-20120531', 8652300)
roman_architecture_id = ('INEX_LD-2009039', 7156062)
d_day_id = ('INEX_LD-2009063', 206106)
eiffel_id = ('INEX_LD-2009096', 20882)
virtual_museum_id = ('INEX_LD-2009115', 1225034)
indian_food_id = ('INEX_LD-2010004', 192087)
surrealist_movies_id = ('INEX_LD-2010043', 6837722)
relativity_theory_id = ('INEX_LD-2010057', 43514)
summer_flowers_id =('INEX_LD-2010069', 111650790) 
starting_over_id = ('INEX_LD-2012311', 158560)
german_language_id = ('INEX_LD-2012353', 7142706)
telescope_id = ('INEX_LD-2012367', 4213)
railway_stations_id = ('INEX_LD-2012371', 8627668)
olympic_sailing_id = ('INEX_XER-60', 8698126)
circus_animals_id = ('INEX_XER-74', 2526578)
ww2_participants_id = ('INEX_XER-86', 109473326)
axis_powers_id = ('INEX_XER-87', 15059644)
hybrid_cars_id = ('INEX_XER-94', 193075)
tom_hanks_movies_id = ('INEX_XER-95', 7816075)
object_oriented_programming_languages_id = ('INEX_XER-96', 16511217)
us_state_capitals_id = ('INEX_XER-108', 6808007)
nobel_prize_id = ('INEX_XER-110', 37922)
european_grand_prix_id = ('INEX_XER-114', 8168658)
f1_constructor_champions_id = ('INEX_XER-115', 177410)
blues_brothers_id = ('INEX_XER-117', 109767)
academy_award_winners_id = ('INEX_XER-122', 6468204)
world_cup_winners_id = ('INEX_XER-125', 19317)
bond_girls_id = ('INEX_XER-128', 1649366)
eu_countries_id = ('INEX_XER-133', 4587626)
kurosawa_films_id = ('INEX_XER-139', 8452687)
germany_airports_id = ('INEX_XER-140', 27670218)
catalunya_universities = ('INEX_XER-141', 219615)
chess_champions_id = ('INEX_XER-144', 10290970)

In [None]:
eval_vietnam_war = pd.Series([742687, 858887, 482402, 384062, 5183698, 7360646, 7928438, 5047436, 5029467, 7258665], name='query_wd').to_frame()
eval_vietnam_cuisine = pd.Series([5004795, 5004793, 5004813, 2213650, 4733266, 867426, 826031, 404114, 96475732, 826059, 943935], name='query_wd').to_frame()
eval_bike_races = pd.Series([4546217, 5352269, 7003377, 2408541, 33951, 13647403, 15091377, 506911, 6459597, 6394001], name='query_wd').to_frame()
eval_roman_architecture = pd.Series([366134, 1223230, 753791, 430792, 2459870, 152339, 46261, 1862851, 1570519, 265384], name='query_wd').to_frame()
eval_d_day = pd.Series([206106, 6764103, 16470, 8641370, 162873, 2810850], name='query_wd').to_frame()
eval_eiffel = pd.Series([4304521, 3049347, 20882, 508362], name='query_wd').to_frame()
eval_virtual_museum = pd.Series([7714545, 7935003, 4745379, 16977112, 16240941, 20855697, 7864903, 7094081, 207694, 1225034, 7934942, 7934967, 8196926, 20237199, 3330826, 7935002, 433276, 3330818, 7864103, 7935051], name='query_wd').to_frame()
eval_indian_food = pd.Series([2985347, 188420, 6943844, 2025907, 3349224, 3595467, 956595, 5556748], name='query_wd').to_frame()
eval_surrealist_movies = pd.Series([211693, 5990977, 3820137, 6620019, 340300, 1922418, 4478964], name='query_wd').to_frame()
eval_relativity_theory = pd.Series([719048, 7731676, 13636406, 7310836, 1188682, 2996088, 133327, 1684776, 11455, 43514, 51497, 18362], name='query_wd').to_frame()
eval_starting_over = pd.Series([158560, 2708064, 117012, 7981113], name='query_wd').to_frame()
eval_german_language = pd.Series([3347183, 55899405, 183, 2166752, 387066, 279350], name='query_wd').to_frame()
eval_railway_stations = pd.Series([6633431, 1466941, 994118, 6668115, 801282, 34696, 115753], name='query_wd').to_frame()
eval_olympic_sailing = pd.Series([1361557, 238060, 2707322, 767909], name='query_wd').to_frame()
eval_ww2_participants = pd.Series([6604328, 55, 189, 796, 153660, 38, 17, 211, 1014, 37, 32, 96, 664, 41, 20, 419, 218, 334, 15180, 869], name='query_wd').to_frame()
eval_axis_powers = pd.Series([153660, 219, 7318, 183, 1357609, 17, 38, 214], name='query_wd').to_frame()
eval_tom_hanks_movies = pd.Series([284229, 21010849], name='query_wd').to_frame()
eval_object_oriented_programming_languages = pd.Series([7072276, 6506388, 188531, 161053, 4049196, 1549370], name='query_wd').to_frame()
eval_us_state_capitals = pd.Series([24861, 43788, 6346, 28198, 28180, 28260, 33405, 34863], name='query_wd').to_frame()
eval_european_grand_prix = pd.Series([1548846, 173271, 7994, 7876, 7954], name='query_wd').to_frame()
eval_f1_constructor_champions = pd.Series([173103, 172030, 17596, 169898, 173663, 171506], name='query_wd').to_frame()
eval_academy_award_winners = pd.Series([44578, 131074, 212775, 207588, 63026, 211372, 201215, 103618], name='query_wd').to_frame()
eval_world_cup_winners = pd.Series([713750, 79800, 142, 21, 19820229, 38, 155, 96], name='query_wd').to_frame()
eval_bond_girls = pd.Series([262674, 269832, 951379, 1033016, 273351, 255323, 7595876, 123849, 2696235, 2756439], name='query_wd').to_frame()
eval_eu_countries = pd.Series([27, 218, 38, 55, 142, 33, 36, 45, 32, 35], name='query_wd').to_frame()
eval_kurosawa_films = pd.Series([575014, 257818, 2637288, 1990204, 1475995, 1194546, 39946, 2005041, 678972], name='query_wd').to_frame()
eval_germany_airports = pd.Series([49746647, 551868, 529143, 1431861, 706450, 473067], name='query_wd').to_frame()
eval_chess_champions = pd.Series([178865, 68664, 322888, 260172, 1657753, 445661, 161092, 45747, 1591494, 154586, 16225673, 1850959, 172798, 106807, 173714, 17037389, 102664, 260100, 4359259, 104148], name='query_wd').to_frame()

In [9]:
def fetch_nodes(query_id):
  all_nodes = answers[(answers[0] == query_id)]
  relevant_nodes = all_nodes[(all_nodes[3] == 1) | (all_nodes[3] == 2)]
  relevant_nodes = relevant_nodes[[2,3]]
  nodes = pd.Series(relevant_nodes.sort_values(3, ascending=False)[2]).reset_index()[2]
  
  return nodes

In [13]:
def create_dataset(node_id):
  
  dataframe = data[(data['s'] == node_id) | (data['o'] == node_id)]
  
  while len(dataframe) <= 5000000:
    subjects = dataframe['s']
    objects = dataframe['o']
    dataframe_candidate = data[(data['s'].isin(subjects)) | (data['o'].isin(objects))].drop_duplicates()
    
    if len(dataframe_candidate) > 5000000:
      return dataframe

    dataframe = dataframe_candidate

  return dataframe

In [None]:
def create_subclass_dataset(node_id):

  dataframe_base = create_dataset(node_id)
  subjects = dataframe_base['s']
  objects = dataframe_base['o']
  
  subclass_s = data[(data['o'].isin(subjects)) & (data['p'] == 279)]
  subclass_o = data[(data['s'].isin(objects)) & (data['p'] == 279)]
  
  subclass = pd.concat([subclass_s, subclass_o], ignore_index=True).drop_duplicates()
  dataframe = pd.concat([dataframe_base, subclass], ignore_index=True).drop_duplicates()
  
  return dataframe

In [None]:
def get_frequency_scores(node_id, dataframe, dataframe_subclass):
  edges = dataframe[['s','o','p']]
  edges_s = dataframe_subclass[['s','o','p']]

  g = ig.Graph.DataFrame(edges, directed=True) # graph in igraph
  g_s = ig.Graph.DataFrame(edges_s, directed=True) # subclass graph in igraph

  G = nx.from_pandas_edgelist(dataframe, 's', 'o') # graph in networkx
  G_s = nx.from_pandas_edgelist(dataframe_subclass, 's', 'o') # subclass graph in networkx


  # Term Frequency
  query_node = [node for node in g.vs if node['name'] == node_id][0].index
  neighbor_ids = g.neighbors(query_node)
  term_frequency = sorted([(node['name'], g.degree(node)) for node in g.vs if node.index in neighbor_ids], key=lambda x: x[1], reverse=True)
  df_tf = pd.Series([item[0] for item in term_frequency])
  
  # Term Frequency + Subclasses
  query_node = [node for node in g_s.vs if node['name'] == node_id][0].index
  neighbor_ids = g_s.neighbors(query_node)
  term_frequency_subclass = sorted([(node['name'], g_s.degree(node)) for node in g_s.vs if node.index in neighbor_ids], key=lambda x: x[1], reverse=True)
  df_tfs = pd.Series([item[0] for item in term_frequency_subclass])
  
  # Term Frequency + Community Detection
  partitions = community_louvain.best_partition(G)
  in_community = {key: partitions[key] for key in partitions if partitions[key] == partitions[node_id]}
  neighbors = [neighbor for neighbor in G.neighbors(node_id)]
  neighbors_in_community = {key: in_community[key] for key in in_community if key in neighbors}
  term_frequency_community = [i for i in term_frequency if i[0] in neighbors_in_community]
  df_tfc = pd.Series([item[0] for item in term_frequency_community])
  
  # Term Frequency + Subclasses + Community Detection
  partitions = community_louvain.best_partition(G_s)
  in_community = {key: partitions[key] for key in partitions if partitions[key] == partitions[node_id]}
  neighbors = [neighbor for neighbor in G.neighbors(node_id)]
  neighbors_in_community = {key: in_community[key] for key in in_community if key in neighbors}
  term_frequency_subclass_community = [i for i in term_frequency_subclass if i[0] in neighbors_in_community]
  df_tfsc = pd.Series([item[0] for item in term_frequency_subclass_community])
  
  
  data = pd.concat([df_tf, df_tfs, df_tfc, df_tfsc], axis=1)
  lengths = [len(df_tf), len(df_tfs), len(df_tfc), len(df_tfsc)]
  shortest = sorted(lengths)[0]

  return data.head(shortest)

In [None]:
def get_centrality_scores(node_id, dataframe, dataframe_subclass):
  edges = dataframe[['s','o','p']]
  edges_s = dataframe_subclass[['s','o','p']]

  g = ig.Graph.DataFrame(edges, directed=True) # graph in igraph
  g_s = ig.Graph.DataFrame(edges_s, directed=True) # subclass graph in igraph

  G = nx.from_pandas_edgelist(dataframe, 's', 'o') # graph in networkx
  G_s = nx.from_pandas_edgelist(dataframe_subclass, 's', 'o') # subclass graph in networkx


  # Centrality
  betweenness = g.betweenness()
  centrality_scores = sorted(betweenness, reverse=True)
  nodes = [(g.vs[node], centrality) for node, centrality in enumerate(centrality_scores)]
  node_index = [node for node in nodes if node[0]['name'] == node_id][0][0].index
  neighbor_ids = g.neighbors(node_index)
  neighbors = [(g.vs[node], centrality) for node, centrality in enumerate(centrality_scores) if g.vs[node].index in neighbor_ids]
  centrality = sorted([(node[0]['name'], node[1]) for node in neighbors], key=lambda x: x[1], reverse=True)
  df_c = pd.Series([item[0] for item in centrality])
  
  # Centrality + Subclasses
  betweenness = g_s.betweenness()
  centrality_scores = sorted(betweenness, reverse=True)
  nodes = [(g_s.vs[node], centrality) for node, centrality in enumerate(centrality_scores)]
  node_index = [node for node in nodes if node[0]['name'] == node_id][0][0].index
  neighbor_ids = g_s.neighbors(node_index)
  neighbors = [(g_s.vs[node], centrality) for node, centrality in enumerate(centrality_scores) if g_s.vs[node].index in neighbor_ids]
  centrality_subclass = sorted([(node[0]['name'], node[1]) for node in neighbors], key=lambda x: x[1], reverse=True)
  df_cs = pd.Series([item[0] for item in centrality_subclass])
  
  # Centrality + Community Detection
  partitions = community_louvain.best_partition(G)
  in_community = {key: partitions[key] for key in partitions if partitions[key] == partitions[node_id]}
  neighbors = [neighbor for neighbor in G.neighbors(node_id)]
  neighbors_in_community = {key: in_community[key] for key in in_community if key in neighbors}
  centrality_community = [i for i in centrality if i[0] in neighbors_in_community]
  df_cc = pd.Series([item[0] for item in centrality_community])

  # Centrality + Subclasses + Community Detection
  partitions = community_louvain.best_partition(G_s)
  in_community = {key: partitions[key] for key in partitions if partitions[key] == partitions[node_id]}
  neighbors = [neighbor for neighbor in G_s.neighbors(node_id)]
  neighbors_in_community = {key: in_community[key] for key in in_community if key in neighbors}
  centrality_subclass_community = [i for i in centrality_subclass if i[0] in neighbors_in_community]
  df_csc = pd.Series([item[0] for item in centrality_subclass_community])


  data = pd.concat([df_c, df_cs, df_cc, df_csc], axis=1)
  lengths = [len(df_c), len(df_cs), len(df_cc), len(df_csc)]
  shortest = sorted(lengths)[0]

  return data.head(shortest)

In [None]:
def evaluate_methods(topic_tuple):
  query_id = topic_tuple[0]
  node_id = topic_tuple[1]
  
  evaluation_nodes = fetch_nodes(query_id)
  dataframe = create_dataset(node_id)
  dataframe_subclass = create_subclass_dataset(node_id)

  node_frequency = get_frequency_scores(node_id, dataframe, dataframe_subclass)
  centrality = get_centrality_scores(node_id, dataframe, dataframe_subclass)

  data = pd.concat([evaluation_nodes, node_frequency, centrality], axis=1)
  lengths = [20, len(evaluation_nodes), len(node_frequency), len(centrality)]
  shortest = sorted(lengths)[0]
  data.columns = ['query', 'tf', 'tf_s', 'tf_c', 'tf_sc', 'c', 'c_s', 'c_c', 'c_sc']

  return data.head(shortest)

In [None]:
def compute_ndcg(dataframe):

  ndcg_columns = ['tf', 'tf_s', 'tf_c', 'tf_sc', 'c', 'c_s', 'c_c', 'c_sc']
  target = [dataframe['query_wd'].to_list()]
  results = []

  for column in ndcg_columns:
    score = [dataframe[column].to_list()]
    ndcg = ndcg_score(score, target)
    results.append(ndcg)

  return results

In [None]:
def full_evaluation(node_id, target_df):
  
  score_df = evaluate_methods(node_id)
  eval_dataframe = pd.concat([score_df, target_df], axis=1)
  result = pd.Series(compute_ndcg(eval_dataframe))

  return result

### Running Examples

In [None]:
# Base Running Examples
df_europe_base =  data[(data['s'] == 46) | (data['o'] == 46)]
df_caesar_base =  data[(data['s'] == 1048) | (data['o'] == 1048)]
df_f1_base =  data[(data['s'] == 1968) | (data['o'] == 1968)]

In [None]:
# Small Dataset - 31754 triples
europe_subject = df_europe_base['s']
europe_object = df_europe_base['o']
df_europe_small = data[(data['s'].isin(europe_subject)) & (data['o'].isin(europe_object))].drop_duplicates()

In [None]:
# Medium Dataset - 1071478 triples
europe_subject = df_europe_small['s']
europe_object = df_europe_small['o']
df_europe_medium = data[(data['s'].isin(europe_subject)) | (data['o'].isin(europe_object))].drop_duplicates()

europe_subject = df_europe_medium['s']
europe_object = df_europe_medium['o']
df_europe_medium = data[(data['s'].isin(europe_subject)) & (data['o'].isin(europe_object))].drop_duplicates()

In [None]:
# Large Dataset - 62656360 triples
df_europe_large = data[(data['s'].isin(europe_subject)) | (data['o'].isin(europe_object))].drop_duplicates()

europe_subject = df_europe_large['s']
europe_object = df_europe_large['o']
df_europe_large = data[(data['s'].isin(europe_subject)) & (data['o'].isin(europe_object))].drop_duplicates()

In [None]:
# Small Dataset - 574 triples
caesar_subject = df_caesar_base['s']
caesar_object = df_caesar_base['o']
df_caesar_small = data[(data['s'].isin(caesar_subject)) & (data['o'].isin(caesar_object))].drop_duplicates()

In [None]:
# Medium Dataset - 14730768 triples
caesar_subject = df_caesar_small['s']
caesar_object = df_caesar_small['o']
df_caesar_medium = data[(data['s'].isin(caesar_subject)) | (data['o'].isin(caesar_object))].drop_duplicates()

caesar_subject = df_caesar_medium['s']
caesar_object = df_caesar_medium['o']
df_caesar_medium = data[(data['s'].isin(caesar_subject)) & (data['o'].isin(caesar_object))].drop_duplicates()

In [None]:
# Large Dataset - 56709277 triples (NOT FINISHED)
df_caesar_large = data[(data['s'].isin(caesar_subject)) | (data['o'].isin(caesar_object))].drop_duplicates()

caesar_subject = df_caesar_large['s']
caesar_object = df_caesar_large['o']
df_caesar_large = data[(data['s'].isin(caesar_subject)) & (data['o'].isin(caesar_object))].drop_duplicates()

In [None]:
# Small Dataset - 578 triples
f1_subject = df_f1_base['s']
f1_object = df_f1_base['o']
df_f1_small = data[(data['s'].isin(f1_subject)) & (data['o'].isin(f1_object))].drop_duplicates()

In [None]:
# Medium Dataset - 14565 triples
f1_subject = df_f1_small['s']
f1_object = df_f1_small['o']
df_f1_medium = data[(data['s'].isin(f1_subject)) | (data['o'].isin(f1_object))].drop_duplicates()

f1_subject = df_f1_medium['s']
f1_object = df_f1_medium['o']
df_f1_medium = data[(data['s'].isin(f1_subject)) & (data['o'].isin(f1_object))].drop_duplicates()

In [None]:
# Large Dataset - 37010056 triples
df_f1_large = data[(data['s'].isin(f1_subject)) | (data['o'].isin(f1_object))].drop_duplicates()

f1_subject = df_f1_large['s']
f1_object = df_f1_large['o']
df_f1_large = data[(data['s'].isin(f1_subject)) & (data['o'].isin(f1_object))].drop_duplicates()

## Term Frequency

In [None]:
dataframe = df_caesar_small
query_id = 1048

In [None]:
edges = dataframe[['s','o','p']]
g = ig.Graph.DataFrame(edges, directed=True)

In [None]:
query_node = [node for node in g.vs if node['name'] == query_id][0].index
neighbor_ids = g.neighbors(query_node, mode='out')
degrees = sorted([(node['name'], g.degree(node)) for node in g.vs if node.index in neighbor_ids], key=lambda x: x[1], reverse=True)
degrees

[(1405, 44),
 (1747689, 44),
 (5, 26),
 (314710, 25),
 (241143, 21),
 (635, 20),
 (235087, 19),
 (2743448, 19),
 (1138524, 18),
 (2815472, 17),
 (220, 14),
 (232725, 14),
 (397, 13),
 (106199, 13),
 (30059240, 13),
 (39589, 12),
 (1102155, 12),
 (235423, 11),
 (602358, 11),
 (6581097, 11),
 (232229, 10),
 (40185, 9),
 (154668, 9),
 (234825, 9),
 (235627, 9),
 (273616, 8),
 (202161, 7),
 (432130, 7),
 (510193, 7),
 (544948, 7),
 (3181656, 7),
 (11942503, 7),
 (172248, 6),
 (440420, 6),
 (867541, 6),
 (3268376, 6),
 (11002436, 6),
 (71887839, 6),
 (4173137, 5),
 (12271314, 5),
 (16744001, 5),
 (19180675, 5),
 (35497, 4),
 (207370, 4),
 (294846, 4),
 (725434, 4),
 (1025414, 4),
 (1238338, 4),
 (5517401, 4),
 (20056508, 4),
 (29871072, 4),
 (67311526, 4),
 (97667506, 4),
 (98103687, 4),
 (40779, 3),
 (172907, 3),
 (657438, 3),
 (658992, 3),
 (944814, 3),
 (1190123, 3),
 (1231621, 3),
 (1243545, 3),
 (5460604, 3),
 (17505974, 3),
 (46002746, 3),
 (106727050, 3),
 (49757, 2),
 (82955, 2),
 (

## Node Centrality

In [None]:
dataframe = df_f1_medium
query_id = 1968

In [None]:
edges = dataframe[['s','o','p']]
g = ig.Graph.DataFrame(edges, directed=True)

In [None]:
centrality = g.betweenness()
centrality_scores = sorted(centrality, reverse=True)
nodes = [(g.vs[node], centrality) for node, centrality in enumerate(centrality)]

node_index = [node for node in nodes if node[0]['name'] == query_id][0][0].index
neighbor_ids = g.neighbors(node_index, mode='out')
neighbors = [(g.vs[node], centrality) for node, centrality in enumerate(centrality) if g.vs[node].index in neighbor_ids]

neighbor_centralities = sorted([(node[0]['name'], node[1]) for node in neighbors], key=lambda x: x[1], reverse=True)
neighbor_centralities

[(1405, 50.95),
 (635, 28.25),
 (314710, 14.583333333333334),
 (235087, 13.666666666666664),
 (106199, 6.916666666666666),
 (241143, 6.416666666666666),
 (232229, 3.1666666666666665),
 (2743448, 3.083333333333333),
 (235627, 3.0),
 (40185, 2.9166666666666665),
 (39589, 2.333333333333333),
 (232725, 1.3333333333333333),
 (544948, 0.3333333333333333),
 (235423, 0.25),
 (5, 0.0),
 (220, 0.0),
 (397, 0.0),
 (35497, 0.0),
 (40779, 0.0),
 (49757, 0.0),
 (58373, 0.0),
 (82955, 0.0),
 (149086, 0.0),
 (154668, 0.0),
 (172248, 0.0),
 (172907, 0.0),
 (189430, 0.0),
 (202161, 0.0),
 (207370, 0.0),
 (212943, 0.0),
 (213322, 0.0),
 (234825, 0.0),
 (236885, 0.0),
 (242174, 0.0),
 (271108, 0.0),
 (273616, 0.0),
 (294846, 0.0),
 (432130, 0.0),
 (440420, 0.0),
 (510193, 0.0),
 (580995, 0.0),
 (602358, 0.0),
 (657438, 0.0),
 (658992, 0.0),
 (715380, 0.0),
 (725434, 0.0),
 (731126, 0.0),
 (842337, 0.0),
 (849288, 0.0),
 (867541, 0.0),
 (928994, 0.0),
 (944814, 0.0),
 (952844, 0.0),
 (1025414, 0.0),
 (1097

## Subclass Iteration

In [None]:
g=rdflib.Graph()
g.load('http://dbpedia.org/resource/Amsterdam')
g.parse('https://www.wikidata.org/wiki/P31')
qres = g.query(
   """
    PREFIX wdq: <http://www.wikidata.org/wiki/Q>
    PREFIX wdp: <http://www.wikidata.org/wiki/P>
    SELECT ?s
        WHERE {
            wdq:46 wdp:31 ?s .
        }
        LIMIT 3
       """)
for row in qres:
    print("%s" % row)

# SELECT ?item ?itemLabel
# WHERE 
# {
#   wd:Q46 wdt:P31/wdt:P279 ?item  . #regular retrieval
#   wd:Q46 p:P31/ps:P31/wdt:P279* ?item  . #subclass retrieval
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } 
# }


HTTPError: ignored

## Community Detection

In [None]:
dataframe = df_f1_medium
query_id = 1968

In [None]:
G = nx.from_pandas_edgelist(dataframe, 's', 'o') #, create_using=nx.DiGraph
partitions = community_louvain.best_partition(G)

In [None]:
partitions = community_louvain.best_partition(G)
in_community = {key: partitions[key] for key in partitions if partitions[key] == partitions[query_id]}
neighbors = [neighbor for neighbor in G.neighbors(query_id)]
neighbors_in_community = {key: in_community[key] for key in in_community if key in neighbors}
in_community_centrality = [degree for degree in neighbor_centralities if degree[0] in neighbors_in_community]


out_community = {key: partitions[key] for key in partitions if partitions[key] != partitions[query_id]}

neighbors = [neighbor for neighbor in G.neighbors(query_id)]
neighbors_in_community = {key: in_community[key] for key in in_community if key in neighbors}
neighbors_out_community = {key: out_community[key] for key in out_community if key in neighbors}

In [None]:
neighbors_in_community
# neighbor_degrees = [degree[0] for degree in degrees]
# in_community_degree = {key: in_community[key] for key in neighbors_in_community if key in neighbor_degrees}
in_community_degree = [degree for degree in degrees if degree[0] in neighbors_in_community]
in_community_degree

[(657438, 3),
 (1231621, 3),
 (82955, 2),
 (242174, 2),
 (1097498, 2),
 (5517436, 2),
 (7329059, 2),
 (30752931, 2),
 (88009196, 2),
 (58373, 1),
 (149086, 1),
 (189430, 1),
 (213322, 1),
 (236885, 1),
 (271108, 1),
 (580995, 1),
 (715380, 1),
 (731126, 1),
 (952844, 1),
 (1133288, 1),
 (1402561, 1),
 (1426927, 1),
 (3409374, 1),
 (4228595, 1),
 (5413489, 1),
 (11774156, 1),
 (11942281, 1),
 (17768966, 1),
 (46997079, 1),
 (51955019, 1),
 (86215133, 1),
 (99899510, 1)]

In [None]:
in_community_centrality = [degree for degree in neighbor_centralities if degree[0] in neighbors_in_community]
in_community_centrality

[(58373, 0.0),
 (82955, 0.0),
 (149086, 0.0),
 (189430, 0.0),
 (213322, 0.0),
 (236885, 0.0),
 (242174, 0.0),
 (271108, 0.0),
 (580995, 0.0),
 (657438, 0.0),
 (715380, 0.0),
 (731126, 0.0),
 (952844, 0.0),
 (1097498, 0.0),
 (1133288, 0.0),
 (1231621, 0.0),
 (1402561, 0.0),
 (1426927, 0.0),
 (3409374, 0.0),
 (4228595, 0.0),
 (5413489, 0.0),
 (5517436, 0.0),
 (7329059, 0.0),
 (11774156, 0.0),
 (11942281, 0.0),
 (17768966, 0.0),
 (30752931, 0.0),
 (46997079, 0.0),
 (51955019, 0.0),
 (86215133, 0.0),
 (88009196, 0.0),
 (99899510, 0.0)]

## Graph Embeddings

In [None]:
embedding_data = df_f1_medium
train, test = train_test_split(embedding_data, test_size=0.2)
train_tsv = train.to_csv(sep='\t', header=False, index=False)
test_tsv = test.to_csv(sep='\t', header=False, index=False)

In [None]:
result = pipeline(
    training = train_tsv,
    testing = test_tsv,
    model = 'TransE',
    epochs = 5
)

No random seed is specified. Setting to 2890236583.
No cuda devices were available. The model runs on CPU


OSError: ignored