In [None]:
"""
Vignette 2

Import the .JSON file containing all building and node information
Create node embeddings using node2vec

Save embeddings and models to files

"""

import json
import os
import networkx as nx
from networkx.readwrite import json_graph
import numpy as np
from node2vec import Node2Vec
import csv

# Locate the combined building/node feature file under <parent of cwd>/data/all.
# The path components are joined one by one on purpose: inside a single literal such as
# 'data\all\...' Python reads '\a' as the BEL control character, which corrupts the path.
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_buildings_features.json')

# Read the node-link JSON into `data`; later cells embed it and add vectors to it.
with open(path, encoding='utf-8') as f:
    data = json.load(f)

print('done')

In [None]:
"""
Embed the nodes of the loaded graph with node2vec and add the embeddings to the JSON

# https://github.com/eliorc/node2vec/blob/master/example.py
"""

# node2vec parameters (active set) - discover nodes with similar structural roles
# These values are passed positionally/by keyword to embed() further down.
d  = 64   # dimensions: length of each embedding vector
wl = 100   # walk_length: number of nodes visited per random walk
nw  = 200  # num_walks: random walks started from every node
w  = 4    # workers: parallel workers used by Node2Vec
# p_val / q_val are forwarded as node2vec's p and q walk-bias parameters.
p_val = 1.0
q_val = 2.0

# Alternative parameter set - discovers homophily - clusters of nodes that frequently interact with each other
# Identical to the active set except for q_val; uncomment to use it instead.
# d  = 64   # dimensions
# wl = 100   # walk_length
# nw  = 200  # num_walks
# w  = 4    # workers
# p_val = 1.0
# q_val = 0.5

# convert node_link JSON data to a networkx graph and fit node2vec embeddings on it
def embed(data, p_val, q_val, d, wl, nw, w):
    """Build a graph from node-link data and train a node2vec model on it.

    Parameters
    ----------
    data : dict
        Graph in node-link form, as accepted by ``json_graph.node_link_graph``.
    p_val, q_val : float
        Passed to ``Node2Vec`` as ``p`` and ``q``.
    d : int
        Passed as ``dimensions``.
    wl : int
        Passed as ``walk_length``.
    nw : int
        Passed as ``num_walks``.
    w : int
        Passed as ``workers``.

    Returns
    -------
    The model object returned by ``Node2Vec.fit``.
    """
    # Only the node-name key is spelled out; the full default mapping would be
    # {"source": "source", "target": "target", "name": "id", "key": "key", "link": "links"}.
    # This was added to work around a pydot bug and may not be necessary.
    attr_names = {"name": "id"}
    graph = json_graph.node_link_graph(data, attrs=attr_names)

    # Constructing Node2Vec precomputes transition probabilities and generates the walks.
    # For large graphs Node2Vec can also be given temp_folder=<dir with enough disk space>;
    # that triggers "sharedmem" in Parallel, which is slow on smaller graphs.
    walker = Node2Vec(
        graph,
        p=p_val,
        q=q_val,
        dimensions=d,
        walk_length=wl,
        num_walks=nw,
        workers=w,
    )

    # Any keyword accepted by gensim.Word2Vec may be passed to fit(); `dimensions` and
    # `workers` are forwarded automatically from the Node2Vec constructor.
    return walker.fit(window=10, min_count=1, batch_words=4)

# add embeddings to building json
def add_embed(data, model):
    """Attach an embedding vector to every node and a centroid to every building.

    Parameters
    ----------
    data : dict
        Graph data. Every entry of ``data['nodes']`` must have ``'id'`` and
        ``'building_name'``; every entry of ``data['graph']`` must have
        ``'building_name'``.
    model
        Fitted model exposing ``model.wv.vectors`` plus either
        ``model.wv.key_to_index`` (gensim >= 4) or ``model.wv.vocab``
        (gensim 3.x, entries carrying ``.index``).

    Returns
    -------
    dict
        The same ``data`` object, modified in place: each node gains a
        ``'vector'`` list of floats and each building gains a ``'centroid'``
        list holding the component-wise mean of its nodes' vectors.

    Raises
    ------
    KeyError
        If a node id is missing from the model vocabulary, or a building in
        ``data['graph']`` has no nodes in ``data['nodes']``.
    """
    keyed_vectors = model.wv
    model_vectors = keyed_vectors.vectors

    # gensim 4 replaced `wv.vocab[key].index` with the plain dict `wv.key_to_index`;
    # prefer the new attribute and fall back to the old one for gensim 3.x models.
    model_indices = getattr(keyed_vectors, 'key_to_index', None)
    if model_indices is None:
        model_indices = {key: entry.index for key, entry in keyed_vectors.vocab.items()}

    # add embedding vector to each node and group the vectors by building
    bldgs = {}
    for node in data['nodes']:
        # Vocabulary keys are looked up as strings (node2vec is understood to stringify
        # node ids when generating walks); str() is a no-op for ids that are already str.
        i = model_indices[str(node['id'])]
        vector = [e.item() for e in model_vectors[i]]  # numpy scalars -> JSON-serialisable floats
        node['vector'] = vector
        bldgs.setdefault(node['building_name'], []).append(vector)

    # calculate centroid of each building and add to building
    for bldg in data['graph']:
        centroid_array = np.mean(np.array(bldgs[bldg['building_name']]), axis=0)
        bldg['centroid'] = [e.item() for e in centroid_array]

    return data

# create node2vec model
model = embed(data, p_val, q_val, d, wl, nw, w)

# update json with embeddings (add_embed modifies `data` in place and returns it)
updated = add_embed(data, model)

# save new json with embeddings added
# Path components are joined one by one: inside a single literal such as 'data\all\...'
# Python reads '\a' as the BEL control character, which corrupts the path.
# path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_buildings_embeddings.json')
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_buildings_embeddings2.json')

with open(path, 'w', encoding='utf8') as json_file:
    json.dump(updated, json_file, indent=4)

print('done')


In [None]:
"""
Add embeddings to CSV files
"""

def _first_value_by_key(records, key_field, value_field):
    """Map ``str(record[key_field])`` to ``record[value_field]``, keeping the first record per key."""
    index = {}
    for record in records:
        # setdefault keeps the earliest record, matching a first-match scan of `records`.
        # Keys are stringified because csv.reader yields every cell as str.
        index.setdefault(str(record[key_field]), record[value_field])
    return index


def _append_vectors_to_csv(src_path, dst_path, key_column, vectors_by_key):
    """Copy a CSV to ``dst_path`` with a vector appended to every row.

    Each row's vector is ``vectors_by_key[row[key_column]]``. Header names
    ``c0 .. c{n-1}`` are appended, where ``n`` is the length of the last vector
    written (0 columns if the file has no data rows).

    Returns the ``(headers, rows)`` that were written.
    Raises ``KeyError`` if a row's key has no entry in ``vectors_by_key``.
    """
    with open(src_path, 'r', encoding='utf-8', newline='') as in_file:
        reader = csv.reader(in_file)
        headers = next(reader)
        rows = list(reader)

    key_index = headers.index(key_column)
    comps = 0
    for row in rows:
        key = row[key_index]
        if key not in vectors_by_key:
            raise KeyError('no embedding found for %s=%r in %s' % (key_column, key, src_path))
        vector = vectors_by_key[key]
        comps = len(vector)
        row.extend(vector)
    headers.extend('c' + str(n) for n in range(comps))

    with open(dst_path, 'w', newline="", encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(headers)
        writer.writerows(rows)

    return headers, rows


# All paths are built from separate components: inside a single literal such as
# 'data\all\...' Python reads '\a' as the BEL control character, which corrupts the path.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all')

# Update buildings csv: append each building's centroid to its row
buildings = os.path.join(data_dir, 'all_buildings_features.csv')
# updated_buildings = os.path.join(data_dir, 'all_buildings_embeddings.csv')
updated_buildings = os.path.join(data_dir, 'all_buildings_embeddings2.csv')
centroids_by_building = _first_value_by_key(updated['graph'], 'building_name', 'centroid')
headers, rows = _append_vectors_to_csv(buildings, updated_buildings, 'building_name', centroids_by_building)

# Update nodes csv: append each node's embedding vector to its row
nodes = os.path.join(data_dir, 'all_nodes_features.csv')
# updated_nodes = os.path.join(data_dir, 'all_nodes_embeddings.csv')
updated_nodes = os.path.join(data_dir, 'all_nodes_embeddings2.csv')
vectors_by_node = _first_value_by_key(updated['nodes'], 'id', 'vector')
headers, rows = _append_vectors_to_csv(nodes, updated_nodes, 'id', vectors_by_node)

print('done')