In [87]:
"""
Vignette 2

Import all .JSON files for mosques and monasteries
Create node embeddings using node2vec
Save embeddings and models to files

"""

import json
import os

# Collect the per-building feature JSONs stored under ../data (relative to the
# notebook's working directory).
#
# Path components are handed to os.path.join one by one. The original literals
# ('data\monastaries\jsons_features', ...) embedded backslashes in non-raw
# strings: '\m' and '\j' are invalid escape sequences, and the resulting paths
# only resolved on Windows.
def list_json_files(folder):
    """Return the full paths of all '.json' files directly inside `folder`.

    Names are sorted so that the processing order is the same on every run
    (os.listdir gives no ordering guarantee).
    """
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.endswith('.json')
    ]


data_root = os.path.join(os.path.dirname(os.getcwd()), 'data')

# monastery .JSONs ('monastaries' is the folder name used by the original path)
mon_path = os.path.join(data_root, 'monastaries', 'jsons_features')
mon_files = list_json_files(mon_path)

# mosque .JSONs
mos_path = os.path.join(data_root, 'mosques', 'jsons_features')
mos_files = list_json_files(mos_path)

print("monasteries: ", len(mon_files))
print("mosques :", len(mos_files))

monasteries:  19
mosques : 20


In [91]:
"""
Loop through both lists of files and use node2vec to embed the nodes

# https://github.com/eliorc/node2vec/blob/master/example.py
"""

import matplotlib.pyplot as plt
import networkx as nx
from networkx.readwrite import json_graph
import numpy as np
from node2vec import Node2Vec

# node2vec hyper-parameters shared by every building graph.
#
# Active preset - discover nodes with similar structural roles.
p_val, q_val = 1.0, 2.0
d = 64     # dimensions
wl = 100   # walk_length
nw = 200   # num_walks
w = 4      # workers

# Alternative preset - discovers homophily (clusters of nodes that frequently
# interact with each other). Identical to the values above except:
# q_val = 0.5


# convert node_link JSON file to networkx graph and create a node2vec embeddings
def embed(data, p_val, q_val, d, wl, nw, w):
    """Build a graph from node-link JSON and fit a node2vec model on it.

    Parameters
    ----------
    data : dict
        Graph in node-link form (node names under "id", edge list under
        "links"), e.g. the result of ``json.load`` on a building file.
    p_val, q_val : float
        Passed to ``Node2Vec`` as ``p`` and ``q``.
    d : int
        Passed as ``dimensions``.
    wl : int
        Passed as ``walk_length``.
    nw : int
        Passed as ``num_walks``.
    w : int
        Passed as ``workers``.

    Returns
    -------
    The fitted model returned by ``Node2Vec.fit``.
    """
    # The original call passed attrs={"name": "id"}. That keyword was deprecated
    # in NetworkX 2.7 and later removed, and the value only restated the default.
    # Newer releases select the edge-list key with `edges=`; releases that do not
    # know that keyword raise TypeError but already default to "id" / "links".
    try:
        G = json_graph.node_link_graph(data, edges="links")
    except TypeError:
        G = json_graph.node_link_graph(data)

    # Precompute probabilities and generate walks
    node2vec = Node2Vec(G, p=p_val, q=q_val, dimensions=d, walk_length=wl, num_walks=nw, workers=w)

    # For graphs too large for memory, Node2Vec also accepts temp_folder=<dir with
    # enough disk space>. Note: that triggers "sharedmem" in Parallel, which is
    # slow on smaller graphs, so it is not used here.

    # Embed. Any keyword accepted by gensim.Word2Vec can be passed; `dimensions`
    # and `workers` are forwarded automatically from the Node2Vec constructor.
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    return model

# add embeddings to building json
def add_embed(data, model):
    """Attach node2vec vectors to a building's node-link JSON.

    Each entry of ``data['nodes']`` gains a ``'vector'`` key holding its
    embedding as a list of Python floats, and ``data['graph']['centroid']`` is
    set to the element-wise mean of all node vectors (one vector for the whole
    building).

    Parameters
    ----------
    data : dict
        Node-link JSON with a ``'nodes'`` list (each node has an ``'id'``) and a
        ``'graph'`` dict.
    model :
        Fitted model exposing its keyed vectors as ``model.wv``.

    Returns
    -------
    dict
        The same ``data`` object, modified in place.
    """
    keyed_vectors = model.wv

    node_vectors = []
    for node in data['nodes']:
        # Look the vector up by key instead of going through `wv.vocab`, which
        # is no longer available in gensim 4. node2vec is understood to store
        # node names as strings, hence str(); it is a no-op for string ids.
        embedding = keyed_vectors[str(node['id'])]
        # .item() converts numpy scalars to plain floats so json.dump accepts them
        node['vector'] = [component.item() for component in embedding]
        node_vectors.append(embedding)

    centroid_array = np.mean(np.array(node_vectors), axis=0)
    data['graph']['centroid'] = [component.item() for component in centroid_array]

    return data
    
# Embed every monastery and mosque graph and save the enriched JSON files.
def embed_buildings(files, out_dir):
    """Run node2vec on each JSON in `files` and write '<name>.json' to `out_dir`.

    For every input file the graph is embedded with the module-level node2vec
    parameters, the vectors and centroid are added to the JSON, and the result
    is saved under the same file name inside `out_dir` (created if missing).
    """
    os.makedirs(out_dir, exist_ok=True)

    for file in files:
        with open(file, encoding='utf-8') as f:
            data = json.load(f)

        # create node2vec model
        model = embed(data, p_val, q_val, d, wl, nw, w)

        # update json with embeddings
        updated = add_embed(data, model)

        # Derive the building name from the path itself. The original searched
        # for a literal backslash and re-used the trailing '.' of the stem; on
        # systems whose separator is '/', that produced the *input* path and the
        # source file was overwritten.
        building = os.path.splitext(os.path.basename(file))[0]

        # Optional exports, left disabled as in the original run:
        # model.wv.save_word2vec_format(os.path.join(out_dir, building + '.emb'))
        # model.save(os.path.join(out_dir, building + '.model'))

        json_filename = os.path.join(out_dir, building + '.json')
        with open(json_filename, 'w', encoding ='utf8') as json_file:
            json.dump(updated, json_file, indent=4)


data_root = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Output folders (the original also kept a commented 'jsons_embeddings2'
# variant of each for a second parameter set).
mon_embed_path = os.path.join(data_root, 'monastaries', 'jsons_embeddings')
mos_embed_path = os.path.join(data_root, 'mosques', 'jsons_embeddings')

embed_buildings(mon_files, mon_embed_path)
embed_buildings(mos_files, mos_embed_path)

print('done')


Computing transition probabilities: 100%|████| 25/25 [00:00<00:00, 2275.70it/s]
Computing transition probabilities: 100%|████| 30/30 [00:00<00:00, 3345.81it/s]
Computing transition probabilities: 100%|████| 33/33 [00:00<00:00, 5533.60it/s]
Computing transition probabilities: 100%|████| 38/38 [00:00<00:00, 5430.63it/s]
Computing transition probabilities: 100%|████| 30/30 [00:00<00:00, 4992.82it/s]
Computing transition probabilities: 100%|████| 25/25 [00:00<00:00, 5014.47it/s]
Computing transition probabilities: 100%|████| 23/23 [00:00<00:00, 3805.33it/s]
Computing transition probabilities: 100%|████| 37/37 [00:00<00:00, 6238.26it/s]
Computing transition probabilities: 100%|████| 31/31 [00:00<00:00, 2590.37it/s]
Computing transition probabilities: 100%|████| 35/35 [00:00<00:00, 5013.17it/s]
Computing transition probabilities: 100%|████| 42/42 [00:00<00:00, 2339.64it/s]
Computing transition probabilities: 100%|████| 74/74 [00:00<00:00, 6773.27it/s]
Computing transition probabilities: 100%

done


In [92]:
"""
Add embeddings to CSV files
"""

import json
import csv
import os

# Return vector of building centroid
def get_centroid(bldg):
    """Return the centroid vector stored in a building's embedding JSON.

    Parameters
    ----------
    bldg : str
        Building name. Names starting with 'MN' are looked up in the monastery
        folder, all others in the mosque folder.

    Returns
    -------
    The value of ``['graph']['centroid']`` from ``<bldg>.json``.
    """
    kind = 'monastaries' if bldg.startswith('MN') else 'mosques'
    # Components are joined individually: the original 'data\...\...' literals
    # contained invalid escape sequences and only resolved on Windows.
    # (Use 'jsons_embeddings2' here to read the alternative embedding run.)
    path = os.path.join(os.path.dirname(os.getcwd()), 'data', kind, 'jsons_embeddings')
    file = os.path.join(path, bldg + '.json')
    # the embedding files are written with encoding 'utf8', so read them the same way
    with open(file, encoding='utf-8') as f:
        data = json.load(f)
    return data['graph']['centroid']

# Update buildings csv: append every building's centroid as columns c0..c{n-1}.
#
# The paths are built from separate components. In the original literal
# 'data\all\buildings.csv' Python interprets '\a' and '\b' as the BEL and
# backspace control characters, so the file name was corrupted.
all_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all')

buildings = os.path.join(all_dir, 'buildings.csv')
with open(buildings, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    headers = next(reader)
    rows = list(reader)

# update each row of the building csv with the building centroid
name_col = headers.index('building_name')
comps = 0  # number of centroid components; stays 0 when the csv has no data rows
for row in rows:
    centroid = get_centroid(row[name_col])
    comps = len(centroid)
    row.extend(centroid)
headers.extend(['c' + str(k) for k in range(comps)])

updated_buildings = os.path.join(all_dir, 'buildings_embeddings.csv')
# updated_buildings = os.path.join(all_dir, 'buildings_embeddings2.csv')
with open(updated_buildings, 'w', newline="", encoding='utf-8') as outFile:
    writer = csv.writer(outFile)
    writer.writerow(headers)
    writer.writerows(rows)

def get_nodes(bldg):
    """Return the node list stored in a building's embedding JSON.

    Parameters
    ----------
    bldg : str
        Building name. Names starting with 'MN' are looked up in the monastery
        folder, all others in the mosque folder.

    Returns
    -------
    The value of ``['nodes']`` from ``<bldg>.json``.
    """
    kind = 'monastaries' if bldg.startswith('MN') else 'mosques'
    # Components are joined individually: the original 'data\...\...' literals
    # contained invalid escape sequences and only resolved on Windows.
    # (Use 'jsons_embeddings2' here to read the alternative embedding run.)
    path = os.path.join(os.path.dirname(os.getcwd()), 'data', kind, 'jsons_embeddings')
    file = os.path.join(path, bldg + '.json')
    # the embedding files are written with encoding 'utf8', so read them the same way
    with open(file, encoding='utf-8') as f:
        data = json.load(f)
    return data['nodes']

# Update nodes csv: append every node's embedding vector as columns c0..c{n-1}.
#
# The paths are built from separate components. In the original literal
# 'data\all\nodes.csv' Python interprets '\a' as BEL and '\n' as a newline,
# so the file name was corrupted.
all_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'all')

nodes = os.path.join(all_dir, 'nodes.csv')
with open(nodes, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    headers = next(reader)
    rows = list(reader)

# update each row of the node csv with node vector
name_col = headers.index('building_name')
id_col = headers.index('id')
comps = 0  # number of vector components; stays 0 when the csv has no data rows
prev_bldg = None
vectors_by_id = {}
for row in rows:
    bldg = row[name_col]
    if bldg != prev_bldg:
        # Reload the JSON only when the building changes between consecutive
        # rows. The original never updated its "previous" marker, so the file
        # was re-read for every row.
        vectors_by_id = {}
        for node in get_nodes(bldg):
            # csv ids are always strings, so index by str(id); setdefault keeps
            # the first node when an id is repeated, as the original lookup did
            vectors_by_id.setdefault(str(node['id']), node['vector'])
        prev_bldg = bldg
    if row[id_col] not in vectors_by_id:
        raise KeyError('node %r not found in embeddings of building %r' % (row[id_col], bldg))
    vector = vectors_by_id[row[id_col]]
    comps = len(vector)
    row.extend(vector)
headers.extend(['c' + str(k) for k in range(comps)])

updated_nodes = os.path.join(all_dir, 'nodes_embeddings.csv')
# updated_nodes = os.path.join(all_dir, 'nodes_embeddings2.csv')
with open(updated_nodes, 'w', newline="", encoding='utf-8') as outFile:
    writer = csv.writer(outFile)
    writer.writerow(headers)
    writer.writerows(rows)

print('done')

done
