In [1]:
import os
from utility import run_command
from tqdm.notebook import tqdm

## Pre-reqs:
- We use tqdm. You can install it with `pip install tqdm` or `conda install -c conda-forge tqdm`

## parameters 
 
*edge_file_in*: File path for the kgtk file that contains the graph you want to perform walks on.  
*work_dir*: Path to directory where files created by this notebook will be saved.  
*store_dir*: Path to folder containing the sqlite3.db file that we will use for our queries. We will reuse an existing file if there is one in this folder. Otherwise we will create a new one.  

*num_walks_per_node*: Number of walks we should perform starting from each node in the graph (i.e. 10 --> for each node in the graph do 10 walks)
*undirected*: Whether we should treat the graph we are doing walks on as directed or undirected.

In [12]:
edge_file_in = "../../wikidata_films/data/claims.wikibase-item.tsv.gz"
work_dir = "../../wikidata_films/profiler_work"
store_dir = "../../wikidata_films"

# edge_file_in = "./data/toy_numbers/toy_numbers.tsv"
# work_dir = "./output/toy_numbers"
# store_dir = "./data/toy_numbers"

num_walks_per_node = 10
walk_length = 10
undirected = True #TODO - we don't support directed walks yet. need to deal with dead ends.

### Process parameters and set up variables / file names

In [13]:
# Ensure paths are absolute
edge_file_in = os.path.abspath(edge_file_in)
work_dir = os.path.abspath(work_dir)
store_dir = os.path.abspath(store_dir)
    
# Create directories
if not os.path.exists(work_dir):
    os.makedirs(work_dir)
output_dir = "{}/kypher_random_walks".format(work_dir)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Setting up environment variables 
os.environ['EDGE_FILE_IN'] = edge_file_in
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(store_dir)
os.environ['OUT'] = output_dir
os.environ['kgtk'] = "kgtk" # Need to do this for kgtk to be recognized as a command when passing it through a subprocess call

### Pre-processing step to make the graph undirected
If the graph is undirected, create a new kgtk file that has a backward edge for every forward edge in the input file.

In [14]:
!kgtk query -i $EDGE_FILE_IN -o $OUT/backwards_edge_file.tsv.gz --graph-cache $STORE \
--match '`'"$EDGE_FILE_IN"'`: (n1)-[l]->(n2)' \
--return 'n2 as node1, printf("%s-bwd", l.label) as label, n1 as node2, printf("%s-bwd", l) as id'

In [15]:
!kgtk cat -i $OUT/backwards_edge_file.tsv.gz -i $EDGE_FILE_IN -o $OUT/undirected_edge_file.tsv.gz

In [16]:
if undirected:
    edge_file = "{}/undirected_edge_file.tsv.gz".format(output_dir)
else:
    edge_file = edge_file_in
os.environ["EDGE_FILE"] = edge_file

### Set up files for performing walks
one file to store the final walks, another kgtk file for building them up

In [22]:
walks_file = "{}/walks.txt".format(output_dir)
os.environ["WALKS_FILE"] = walks_file
open(walks_file, 'w').close()

#### note - for directed walks, if we want to include walks of length one (i.e. start nodes that have no outbound edges), need to do some extra work here. Not doing this now since we don't currently have a need for this.

In [33]:
!kgtk query -i $EDGE_FILE -o $OUT/walks_in_progress.tsv.gz --graph-cache $STORE \
--match '`'"$EDGE_FILE"'`: (n1)-[]->()' \
--return 'distinct n1 as node1, "" as label, n1 as node2, "_" as id'

In [34]:
cat_command = "$kgtk cat " + " ".join(["-i $OUT/walks_in_progress.tsv.gz" for i in range(num_walks_per_node)]) + " -o $OUT/walks_in_progress_temp.tsv.gz \
                && mv $OUT/walks_in_progress_temp.tsv.gz $OUT/walks_in_progress.tsv.gz"
run_command(cat_command)

In [35]:
!kgtk add-id -i $OUT/walks_in_progress.tsv.gz --overwrite-id -o $OUT/walks_in_progress_temp.tsv.gz \
&& mv $OUT/walks_in_progress_temp.tsv.gz $OUT/walks_in_progress.tsv.gz

### Take steps
If we have a directed representation, first we need to check for any walks that have reached a dead end and save them to the final walks file. (todo - implement this)

In [None]:
command = "$kgtk query -i EDGE_FILE -i $OUT/walks_in_progress.tsv.gz -o $OUT/walks_in_progress_next.tsv.gz --graph-cache $STORE \
            --match 'progress: (path)-[l]->(cur), `EDGE_FILE`: (cur)-[]->(next)' \
            --return 'l as id, max(random()) as rand, printf(\"%s %s\", path, next) as node1, \"\" as label, next as node2' \
            && $kgtk remove-columns -i $OUT/walks_in_progress_next.tsv.gz --columns rand -o $OUT/walks_in_progress.tsv.gz"
for i in tqdm(range(walk_length - 1)):
    run_command(command, {"EDGE_FILE" : edge_file})

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))

### Write walks to file

In [148]:
!kgtk query -i $OUT/walks_in_progress.tsv -o $WALKS_FILE --graph-cache $STORE \
--match 'walks: (n1)-[]->()' \
--return 'n1 as walks'

In [149]:
!head -5 $WALKS_FILE

walks
24 12 24 8 24 8 24 8 16 2 26
3 12 2 8 24 4 24 8 24 4 24
3 24 3 18 6 24 2 24 4 24 6
4 12 24 3 18 3 18 6 12 3 24


remove header line

In [150]:
!tail -n +2 $WALKS_FILE > $WALKS_FILE.tmp && mv $WALKS_FILE.tmp $WALKS_FILE

In [151]:
!head -5 $WALKS_FILE

24 12 24 8 24 8 24 8 16 2 26
3 12 2 8 24 4 24 8 24 4 24
3 24 3 18 6 24 2 24 4 24 6
4 12 24 3 18 3 18 6 12 3 24
6 18 3 15 3 24 12 3 24 6 12
