# Creating HAS-model embeddings for entities in a KG
The purpose of these will be for assessing entities' similarities to one another. This measure of similarity between entities will in turn be used to assess the distinctiveness of labels that have previously been created and filtered for this KG.

In [72]:
import os
import random
import numpy as np
#from gensim.models import Word2Vec
import graph_tool.all as gt
from kgtk.gt.gt_load import load_graph_from_kgtk
from kgtk.io.kgtkreader import KgtkReader
import pathlib

## parameters

**Embedding model parameters**  
*undirected*: Treat graph as undirected  
*number_walks*: Number of random walks to start at each node  
*walk_length*: Length of random walk started at each node  
*seed*: Seed for random walk generator
*representation_size*: Number of latent dimensions to learn from each node  
*window_size*: Window size of skipgram model  
*workers*: Number of parallel processes  

**File/Directory parameters**  
*output_filename*: Name for output representation file.  
*item_file*: File path for the file that contains entity to entity relationships (e.g. wikibase-item).  
*label_file*: File path for the file that contains wikidata labels.  
*work_dir*: Path to work_dir that was specified in candidate_label_creation notebook. This should contain a folder called label_creation that has a type_mapping.tsv file within it.  
*store_dir*: Path to folder containing the sqlite3.db file that we will use for our queries. We will reuse an existing file if there is one in this folder. Otherwise we will create a new one.

In [3]:
# Embedding model params
undirected = True
number_walks = 10
walk_length = 10
seed = 0
representation_size = 64
window_size = 5
workers = 32

# File/Directory params
output_filename = "HAS_vec"
item_file = "../../Q44/data/Q44.part.wikibase-item.tsv"
label_file = "../../Q44/data/Q44.label.en.tsv"
work_dir = "../../Q44/profiler_work_string_and_untrimmed_quantity"
store_dir = "../../Q44"

### Process parameters and set up variables / file names

In [4]:
# Ensure paths are absolute
item_file = os.path.abspath(item_file)
label_file = os.path.abspath(label_file)
work_dir = os.path.abspath(work_dir)
store_dir = os.path.abspath(store_dir)
    
# Create directories
if not os.path.exists(work_dir):
    os.makedirs(work_dir)
output_dir = "{}/HAS_embeddings".format(work_dir)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
output_file = "{}/{}".format(output_dir, output_filename)

# Setting up environment variables 
os.environ['ITEM_FILE'] = item_file
os.environ['LABEL_FILE'] = label_file
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(store_dir)
os.environ['OUT'] = output_dir
os.environ['WORK'] = work_dir
os.environ['kgtk'] = "kgtk" # Need to do this for kgtk to be recognized as a command when passing it through a subprocess call

### 1. H-Paths
These random walks are intended to detect similarity due to homophily. Random walks are performed in a DFS manner

#### 1.1 H-Paths - loading entity to entity edges into a Graph-Tool graph

In [70]:
kr = KgtkReader.open(pathlib.Path(item_file))
g = load_graph_from_kgtk(kr, directed=True, hashed=True)

#### 1.2 H-Paths - Build corpus of random walks

In [73]:
# Performs a random walk from each start node given
# Returns a 2d array where each row is a walk made up of vertex indexes
def random_walk(g, start_nodes, walk_length=10):
    # pre-allocate walks array with initial values of -1
    walks = np.ones((len(start_nodes),walk_length), dtype=int) * -1
    
    walks[:,0] = start_nodes
    cur_length = 1
    while cur_length < walk_length:
        cur_nodes = walks[:,cur_length - 1]
        # if we've previously hit a dead end, then we could have -1 as a 
        # current node value in this case, we want to continue filling in -1.
        neighbors = [np.array([-1]) if v < 0 else g.get_out_neighbors(v) for v in cur_nodes]
        # if there are no outbound edges we can take for 
        # some vertex, we'll make the next vertex we visit = -1
        neighbors = [np.arrray([-1]) if len(arr) < 0 else arr for arr in neighbors]
        next_nodes = [np.random.choice(arr) for arr in neighbors]
        walks[:,cur_length] = next_nodes
        cur_length += 1
    # trim -1's from any walks that reached a dead end
    walks = [arr[arr >= 0] for arr in walks]
    return walks

# Performs num_walks random walks at each node in the graph
# i.e. if there are 10 nodes in the gaph and num_walks=10, then we'll do 10*10 = 100 random walks.
# Returns a list of walks
def build_corpus(g, walk_length=10, num_walks=10):
    start_nodes = np.repeat(g.get_vertices(), num_walks)
    walks_by_index = random_walk(g, start_nodes)
    walks_by_qnode = [[g.vp.name[v_ix] for v_ix in walk] for walk in walks_by_index]
    return walks_by_qnode

In [74]:
walks = build_corpus(g)

ValueError: 'a' cannot be empty unless no samples are taken

### 2. A-Paths (TODO)

### 3. S-Paths (TODO)

### 4. Use Skip-Gram model on the combined set of random walks to learn representations for the entities

In [None]:
model = Word2Vec(walks, size=representation_size, window=window_size, min_count=0, sg=1, hs=1,
                 workers=workers)
model.wv.save_word2vec_format(output)