In [54]:
"""
Vignette 2

Join all buildings into single graph and create a single model

"""

import json
import os

# list paths for all monastery .JSONs
# Directory names are passed to os.path.join as separate components: a literal
# such as 'data\monastaries\jsons_features' relies on invalid backslash escapes
# and only resolves on Windows.
mon_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'monastaries', 'jsons_features')

# os.listdir returns entries in arbitrary order; sorting keeps the file order
# (and therefore the node order of the combined graph) stable between runs.
mon_files = [
    os.path.join(mon_path, name)
    for name in sorted(os.listdir(mon_path))
    if name.endswith('.json')
]

# list of paths for all mosque .JSONs
mos_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'mosques', 'jsons_features')

mos_files = [
    os.path.join(mos_path, name)
    for name in sorted(os.listdir(mos_path))
    if name.endswith('.json')
]

print("monasteries: ", len(mon_files))
print("mosques :", len(mos_files))

monasteries:  19
mosques : 20


In [55]:
"""
Combine all jsons into single json in which all buildings are connected through a single ground node
"""

# Shared ground node: every building's own exterior node is replaced by this one
ground = {'id': "GR0000_000", 'area': 0, 'iso_area': 0} # need to add features to this

# node-link container that accumulates all buildings
all_buildings_ground = {
    'directed': False,
    'multigraph': False,
    'graph': [],
    'nodes': [ground],
    'links': []
}

# loop through all buildings to create single JSON
files = mon_files + mos_files
for file in files:
    with open(file) as f:
        data = json.load(f)

    # Collect the exterior node id(s) (suffix '000') and drop those nodes.
    # The node list is rebuilt instead of deleting while enumerating it,
    # because deleting during iteration skips the element after each removal.
    ext_ids = {node['id'] for node in data['nodes'] if node['id'][-3:] == '000'}
    data['nodes'] = [node for node in data['nodes'] if node['id'] not in ext_ids]

    # swap edges that connect to exterior for edges that connect to new ground;
    # both endpoints are checked independently so no link keeps a reference
    # to a removed exterior node
    for link in data['links']:
        if link['source'] in ext_ids:
            link['source'] = ground['id']
        if link['target'] in ext_ids:
            link['target'] = ground['id']

    all_buildings_ground['graph'].append(data['graph'])
    all_buildings_ground['nodes'].extend(data['nodes'])
    all_buildings_ground['links'].extend(data['links'])

# save JSON
# The path is built from separate components: inside a normal string literal
# 'data\all\...' the sequence '\a' is the BEL control character, not a separator.
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_buildings_ground_features.json')

with open(path, 'w', encoding ='utf8') as json_file:
    json.dump(all_buildings_ground, json_file, indent=4)

print('nodes: ', len(all_buildings_ground['nodes']))
print('edges: ', len(all_buildings_ground['links']))

nodes:  1531
edges:  1957


In [56]:
"""
Create node2vec model
"""

import numpy as np
from node2vec import Node2Vec

# node2vec parameters
# (an earlier configuration, kept for reference, was identical except q_val = 2.0)
d, wl, nw, w = 64, 100, 100, 4   # dimensions, walk_length, num_walks, workers
p_val, q_val = 1.0, 0.5          # forwarded to Node2Vec as p and q

# convert node_link JSON file to networkx graph and create a node2vec embeddings
def embed(data, p_val, q_val, d, wl, nw, w):
    """Build a graph from node-link data and fit a node2vec model on it.

    Parameters
    ----------
    data : dict
        Graph in node-link form (as produced for ``all_buildings_ground``).
    p_val, q_val : float
        Passed to ``Node2Vec`` as ``p`` and ``q``.
    d : int
        Embedding dimensions.
    wl : int
        Walk length.
    nw : int
        Number of walks.
    w : int
        Number of workers.

    Returns
    -------
    The fitted model returned by ``Node2Vec.fit``.
    """
    # json_graph is not imported anywhere else in the notebook, so import it
    # here to keep the function usable after a kernel restart.
    from networkx.readwrite import json_graph

    # full attrs mapping would be:
    # {"source": "source", "target": "target", "name": "id", "key": "key", "link": "links"}
    # Only "name" is overridden; the original author added this as a workaround
    # for a pydot bug and noted it may not be necessary.
    # NOTE(review): newer networkx releases appear to replace `attrs=` with
    # keyword arguments -- confirm against the installed version.
    default = {"name": "id"}
    G = json_graph.node_link_graph(data, attrs=default)

    # Precompute probabilities and generate walks
    # (Node2Vec also accepts temp_folder="..." to spill to disk for large
    # graphs; per the original note this is slow on smaller graphs.)
    node2vec = Node2Vec(G, p=p_val, q=q_val, dimensions=d, walk_length=wl, num_walks=nw, workers=w)

    # Embed. Any keyword accepted by gensim.Word2Vec can be passed here;
    # `dimensions` and `workers` are forwarded from the Node2Vec constructor.
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    return model

# add embeddings to building json
def add_embed(data, model):
    """Attach an embedding vector to every node and a centroid to every building.

    Parameters
    ----------
    data : dict
        Node-link graph. Each entry of ``data['nodes']`` needs an ``'id'`` and,
        unless it is the ground node ``"GR0000_000"``, a ``'building_name'``.
        Each entry of ``data['graph']`` needs a ``'building_name'``.
        The dictionary is modified in place.
    model : object
        Fitted model exposing ``model.wv.vectors`` and either
        ``model.wv.key_to_index`` (gensim 4) or ``model.wv.vocab`` (gensim 3).

    Returns
    -------
    dict
        The same ``data`` object, with ``'vector'`` added to each node and
        ``'centroid'`` (mean of the building's node vectors) added to each
        building.

    Raises
    ------
    KeyError
        If a node id is missing from the model vocabulary, or a building in
        ``data['graph']`` has no nodes in ``data['nodes']``.
    """
    model_vectors = model.wv.vectors

    # gensim 4 exposes the key -> row mapping directly; gensim 3 only has the
    # `vocab` dict of entries carrying an `.index`.
    model_indices = getattr(model.wv, 'key_to_index', None)
    if model_indices is None:
        model_indices = {node: model.wv.vocab[node].index for node in model.wv.vocab}

    # add embedding vector to each node and group vectors per building
    bldgs = {}
    for node in data['nodes']:
        # plain Python floats so the result stays JSON-serialisable
        vector = model_vectors[model_indices[node['id']]].tolist()
        node['vector'] = vector

        # the shared ground node belongs to no building
        if node['id'] != "GR0000_000":
            bldgs.setdefault(node['building_name'], []).append(vector)

    # calculate centroid of each building and add to building
    for bldg in data['graph']:
        centroid_array = np.mean(np.array(bldgs[bldg['building_name']]), axis=0)
        bldg['centroid'] = centroid_array.tolist()

    return data

# create node2vec model
model = embed(all_buildings_ground, p_val, q_val, d, wl, nw, w)
print('model done')

# update json with embeddings
# (add_embed modifies its argument in place and returns it, so `updated`
# and `all_buildings_ground` refer to the same dictionary)
updated = add_embed(all_buildings_ground, model)

# save new json with embeddings added
# The path is built from separate components: inside a normal string literal
# 'data\all\...' the sequence '\a' is the BEL control character, not a separator.
# (an earlier run wrote to all_buildings_ground_embeddings.json)
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_buildings_ground_embeddings2.json')

with open(path, 'w', encoding ='utf8') as json_file:
    json.dump(updated, json_file, indent=4)

print('done')


Computing transition probabilities: 100%|█| 1531/1531 [00:00<00:00, 3458.42it/s


model done
done


In [57]:
"""
Save building and node CSV files

"""

import csv

# Update buildings csv
# Paths are built from separate components: inside a normal string literal
# 'data\all\...' the sequence '\a' is the BEL control character, not a separator.
buildings = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_buildings_features.csv')

# newline='' is what the csv module expects for files it reads or writes
with open(buildings, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    headers = next(reader)
    rows = list(reader)

# building name -> centroid, built once instead of scanning the building list
# for every row; setdefault keeps the first entry if a name repeats
centroids = {}
for bldg in updated['graph']:
    centroids.setdefault(bldg['building_name'], bldg['centroid'])

# update each row of the building csv with the building centroid
# (a building name absent from the graph raises KeyError naming it)
i = headers.index('building_name')
comps = 0
for row in rows:
    centroid = centroids[row[i]]
    comps = len(centroid)
    row.extend(centroid)

# one extra header per centroid component: c0, c1, ...
headers.extend(['c' + str(k) for k in range(comps)])

# (an earlier run wrote to all_buildings_ground_embeddings.csv)
updated_buildings = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_buildings_ground_embeddings2.csv')

with open(updated_buildings, 'w', newline="", encoding='utf-8') as outFile:
    writer = csv.writer(outFile)
    writer.writerow(headers)
    writer.writerows(rows)

# Update nodes csv
# Paths are built from separate components: inside a normal string literal
# 'data\all\...' the sequence '\a' is the BEL control character, not a separator.
nodes = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_nodes_features.csv')

# newline='' is what the csv module expects for files it reads or writes
with open(nodes, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    headers = next(reader)
    rows = list(reader)

# node id -> embedding vector, built once instead of scanning the node list
# for every row; setdefault keeps the first entry if an id repeats
vectors = {}
for node in updated['nodes']:
    vectors.setdefault(node['id'], node['vector'])

# update each row of the node csv with node vector
i = headers.index('id')
comps = 0
new_rows = []
for row in rows:
    # Exterior nodes (id suffix '000') were removed when the buildings were
    # merged, so they have no vector. The full three-character suffix is
    # tested, matching the merge step, so ids such as '..._100' are kept.
    if row[i][-3:] == '000':
        continue
    vector = vectors[row[i]]
    comps = len(vector)
    row.extend(vector)
    new_rows.append(row)

# one extra header per vector component: c0, c1, ...
headers.extend(['c' + str(k) for k in range(comps)])

# (an earlier run wrote to all_nodes_ground_embeddings.csv)
updated_nodes = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all', 'all_nodes_ground_embeddings2.csv')

with open(updated_nodes, 'w', newline="", encoding='utf-8') as outFile:
    writer = csv.writer(outFile)
    writer.writerow(headers)
    writer.writerows(new_rows)

print('done')

done
