# Creating HAS-model embeddings for entities in a KG
The purpose of these will be for assessing entities' similarities to one another. This measure of similarity between entities will in turn be used to assess the distinctiveness of labels that have previously been created and filtered for this KG.

In [1]:
import os
import random
#from gensim.models import Word2Vec
from src import graph

## parameters

**Embedding model parameters**  
*undirected*: Treat graph as undirected  
*number_walks*: Number of random walks to start at each node  
*walk_length*: Length of random walk started at each node  
*seed*: Seed for the random walk generator  
*representation_size*: Number of latent dimensions to learn for each node  
*window_size*: Window size of skipgram model  
*workers*: Number of parallel processes  

**File/Directory parameters**  
*output_filename*: Name for output representation file.  
*item_file*: File path for the file that contains entity to entity relationships (e.g. wikibase-item).  
*label_file*: File path for the file that contains wikidata labels.  
*work_dir*: Path to work_dir that was specified in candidate_label_creation notebook. This should contain a folder called label_creation that has a type_mapping.tsv file within it.  
*store_dir*: Path to folder containing the sqlite3.db file that we will use for our queries. We will reuse an existing file if there is one in this folder. Otherwise we will create a new one.

In [2]:
# Embedding model params
# These values are consumed by the random-walk and skip-gram cells further down.
undirected = True  # treat the graph as undirected
number_walks = 10  # number of random walks to start at each node
walk_length = 10  # length of each random walk
seed = 0  # seed for the random walk generator
representation_size = 64  # number of latent dimensions to learn per node
window_size = 5  # window size of the skip-gram model
workers = 32  # number of parallel processes

# File/Directory params
# Relative paths are converted to absolute paths in the next cell.
output_filename = "HAS_vec"  # name of the output representation file
item_file = "../../Q44/data/Q44.part.wikibase-item.tsv"  # entity-to-entity relationships
label_file = "../../Q44/data/Q44.label.en.tsv"  # wikidata labels
work_dir = "../../Q44/profiler_work_string_and_untrimmed_quantity"  # must contain label_creation/type_mapping.tsv
store_dir = "../../Q44"  # folder holding (or that will hold) the sqlite3 graph cache

### Process parameters and set up variables / file names

In [3]:
# Ensure paths are absolute so they stay valid when passed to shell commands
item_file = os.path.abspath(item_file)
label_file = os.path.abspath(label_file)
work_dir = os.path.abspath(work_dir)
store_dir = os.path.abspath(store_dir)

# Create directories.
# exist_ok=True replaces the check-then-create pattern: it is safe to re-run
# and cannot fail if the directory appears between the check and the create.
# Creating output_dir also creates work_dir, since it is nested inside it.
output_dir = os.path.join(work_dir, "HAS_embeddings")
os.makedirs(output_dir, exist_ok=True)

# Full path (without extension) of the embedding file written at the end
output_file = os.path.join(output_dir, output_filename)

# Setting up environment variables read by the `!kgtk ...` shell cells below
os.environ['ITEM_FILE'] = item_file
os.environ['LABEL_FILE'] = label_file
os.environ['STORE'] = os.path.join(store_dir, "wikidata.sqlite3.db")
os.environ['OUT'] = output_dir
os.environ['WORK'] = work_dir
os.environ['kgtk'] = "kgtk" # Need to do this for kgtk to be recognized as a command when passing it through a subprocess call

### 1. H-Paths
These random walks are intended to detect similarity due to homophily. The random walks are performed in a depth-first search (DFS) manner.

#### 1.1 H-Paths - creating input file to send to embedding code
First we need to create a file to use as input where each line has the format: entity1 entity2 entity1-type entity2-type

***TODO: decide whether we want to use "distinct" in the query below (it currently does).***

In [5]:
# Build the H-path input file: for every edge (e1)->(e2) in ITEM_FILE, look up
# a type for e1 and a type for e2 and return the distinct
# (entity1, entity2, entity1_type, entity2_type) rows.
# Output is written to $OUT/h_file_in.tsv; $STORE is passed as the graph cache.
# NOTE(review): the `type:` handle presumably resolves to type_mapping.tsv — confirm.
!kgtk query -i $WORK/label_creation/type_mapping.tsv -i $ITEM_FILE \
-o $OUT/h_file_in.tsv --graph-cache $STORE \
--match '`'"$ITEM_FILE"'`: (e1)-[]->(e2), type: (e1)-[]->(e1_type), type: (e2)-[]->(e2_type)' \
--return 'distinct e1 as entity1, e2 as entity2, e1_type as entity1_type, e2_type as entity2_type' \

In [6]:
# Remove the header row so the file contains only edge rows.
# The first line is dropped only when it really is the header produced by the
# query above; this keeps the cell idempotent, so re-running it can no longer
# silently delete a data row.
h_file_path = '{}/h_file_in.tsv'.format(output_dir)
expected_header = ['entity1', 'entity2', 'entity1_type', 'entity2_type']

with open(h_file_path, 'r') as fin:
    data = fin.read().splitlines(True)  # keepends=True so rows are written back unchanged

if data and data[0].rstrip('\r\n').split('\t') == expected_header:
    with open(h_file_path, 'w') as fout:
        fout.writelines(data[1:])

In [7]:
# Preview the first rows of the H-path input file, aligned on tab-separated columns
!head $OUT/h_file_in.tsv | column -t -s $'\t'

Q153546    Q1726  Q131734   Q1066984
Q153546    Q1726  Q167270   Q1066984
Q19237092  Q1726  Q191067   Q1066984
Q1524      Q61    Q1549591  Q1093829
Q1524      Q61    Q5119     Q1093829
Q1524      Q61    Q515      Q1093829
Q30        Q61    Q1489259  Q1093829
Q30        Q61    Q1520223  Q1093829
Q30        Q61    Q3624078  Q1093829
Q30        Q61    Q43702    Q1093829


#### 1.2 H-Paths - calling embedding code

In [8]:
# Load the typed edge list produced above and generate the H-path random walks.
h_input_path = '{}/h_file_in.tsv'.format(output_dir)
G = graph.load_edgelist_with_nodetype(h_input_path, undirected=undirected)

# A dedicated, seeded generator is handed to the walk builder.
walk_rng = random.Random(seed)
walks = graph.build_corpus(
    G,
    num_paths=number_walks,
    path_length=walk_length,
    alpha=0,
    rand=walk_rng,
)

NameError: name 'db_path' is not defined

### TODO: The cells below appear to load random walks from precomputed files (`file_S`, `file_P`), which are not yet defined in this notebook. We will need to call into some other code to produce them.

In [None]:
# Append the walks stored in file_S to the shared `walks` list.
# Each line is stripped and split on single spaces into one walk.
# The `with` block guarantees the file is closed even if reading fails.
# NOTE(review): file_S is not defined anywhere in the visible notebook — confirm its source.
with open(file_S, "r") as f:
    for line in f:
        walks.append(line.strip().split(' '))

In [None]:
# Append the walks stored in file_P to the shared `walks` list.
# Each line is stripped and split on single spaces into one walk.
# The `with` block guarantees the file is closed even if reading fails.
# NOTE(review): file_P is not defined anywhere in the visible notebook — confirm its source.
with open(file_P, "r") as f:
    for line in f:
        walks.append(line.strip().split(' '))

### Use Skip-Gram model on the combined set of random walks to learn representations for the entities

In [None]:
# Train a skip-gram model (sg=1) with hierarchical softmax (hs=1) on the
# combined walks; min_count=0 keeps every node, however rarely it is visited.
# NOTE(review): the `Word2Vec` import at the top of the notebook is commented
# out and must be restored before this cell can run.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` — confirm the
# installed version.
model = Word2Vec(walks, size=representation_size, window=window_size, min_count=0, sg=1, hs=1,
                 workers=workers)

# Save to the path built in the setup cell; the previous name `output` was
# never defined and raised a NameError.
model.wv.save_word2vec_format(output_file)