# Similarity Measures
### Functions measuring similarity using graph edit distance.
#### The graph edit distance is the number of edge/node changes needed to make two graphs isomorphic.

In [64]:
import os, glob
import time, datetime
import networkx as nx
import numpy as np
import simplejson as json
from pathlib import Path
from node2vec import Node2Vec

def save_graph(graph, fname):
    nodes = [{'id': n, 'group': graph.nodes[n]['group'], 'degree': str(graph.nodes[n]['degree'])} for n in graph.nodes()]
    links = [{'source': u, 'target': v, 'label': d['label']} for u, v, d in graph.edges(data=True)]
    with open(fname, 'w') as f:
        json.dump({'nodes': nodes, 'links': links}, f, indent=4,)
    #print('Graph file %s is created.', fname)

def load_graph(filename, name):
    d = json.load(open(filename))
    g = nx.DiGraph(name=name)
    for n in d['nodes']:
        if n['group'] != 'OTHER':
            g.add_node(n['id'], group = n['group'], degree = n['degree'])
    for n in d['links']:
        g.add_edge(n['source'], n['target'], label = n['label'])
    return g

#start_all_time = time.time()
#print('STARTED: ', datetime.datetime.now())
#hours, rem = divmod(time.time() - start_all_time, 3600)
#minutes, seconds = divmod(rem, 60)
#print("Total processing time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
#print('FINISHED: ', datetime.datetime.now())

In [65]:
def get_graph_embedding(graph):
    # # Precompute probabilities and generate walks
    node2vec = Node2Vec(graph, dimensions=32, walk_length=10, num_walks=10, workers = 4)
    # Learn embeddings 
    model = node2vec.fit(window=5, min_count=1)
    print('Num words = ', len(model.wv.vocab))
    graph_vectors = [model.wv.get_vector(v) for v in model.wv.vocab]
    
    return np.mean(graph_vectors, axis=0) 

#### 1. Generate word embeddings and get the mean vector for each graph in the given folder

In [66]:
def calculate_emb(folder_name):
    files = glob.glob(os.path.join(folder_name, '*.json'))
    num_files = len(files)
    print(f"Total number of input files in '{folder_name}' folder is {num_files}.")

    files = glob.glob(os.path.join(folder_name, '*.json'))
    num_files = len(files)
    print(f"Total number of input files in '{folder_name}' folder is {num_files}.")
    graph_embs = {}

    for filename in files:
        G = load_graph(filename, 'G')
        graph_embs[filename] = get_graph_embedding(G)

    print('\nNum files = ', len(files), ', Num scores = ', len(graph_embs)) #, graph_embs)    
    return graph_embs

#### 2. Rank by the cosine similarity between graph vectors

In [69]:
from scipy.spatial.distance import cosine

def rank_by_similarity(graph_embs):    
    print('Cosine similarity:\n')
    similarity_scores = []
    pairs = []
    
    for i, (f1, emb1) in enumerate(graph_embs.items()):
        for j, (f2, emb2) in enumerate(graph_embs.items()):            
            if i is not j:
                #print('1 - ', str(i)+','+str(j))
                #if (str(i)+','+str(j) not in pairs or str(j)+','+str(i) not in pairs):
                #if [i,j] in pairs or [j,i] in pairs:
                    #continue
                    #pairs.append([i,j])
                    #print('2 - ', str(i)+','+str(j))
                    #pairs.append(str(i)+','+str(j))
                    similarity_scores.append([round(1-cosine(emb1, emb2), 2), Path(f1).name, Path(f2).name])
    #print(*sorted(similarity_scores, reverse=True), sep='\n')
    return sorted(similarity_scores, reverse=True)

#### 3. Run for the folder 'gold'

In [70]:
graph_embs = calculate_emb(r"../wamex_data/wamex_graphs_fastest/gold/")
ranks = rank_by_similarity(graph_embs)
print(*ranks, sep='\n') # should be 6 pairs for 4 files

Computing transition probabilities: 100%|██████████| 28/28 [00:00<00:00, 16185.30it/s]
Computing transition probabilities: 100%|██████████| 178/178 [00:00<00:00, 3934.99it/s]

Total number of input files in '../wamex_data/wamex_graphs_fastest/gold/' folder is 4.
Total number of input files in '../wamex_data/wamex_graphs_fastest/gold/' folder is 4.
Num words =  28



Computing transition probabilities: 100%|██████████| 152/152 [00:00<00:00, 5888.92it/s]

Num words =  178



Computing transition probabilities: 100%|██████████| 29/29 [00:00<00:00, 25532.08it/s]

Num words =  152
Num words =  29

Num files =  4 , Num scores =  4
Cosine similarity:

[0.94, 'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json', 'gold_a072821_coolgardie_combined_annual_2005_12942716.json']
[0.94, 'gold_a072821_coolgardie_combined_annual_2005_12942716.json', 'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json']
[0.89, 'gold_a084492_coolgardie_annual_report_2009_15818118.json', 'gold_a076168_coolgardie annual report 2007_9968549.json']
[0.89, 'gold_a076168_coolgardie annual report 2007_9968549.json', 'gold_a084492_coolgardie_annual_report_2009_15818118.json']
[0.48, 'gold_a084492_coolgardie_annual_report_2009_15818118.json', 'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json']
[0.48, 'gold_a076168_coolgardie annual report 2007_9968549.json', 'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json']
[0.48, 'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json', 'gold_a084492_coolgardie_annual_report_2009




#### 3. Run for the folder 'iron ore'

In [71]:
graph_embs = calculate_emb(r"../wamex_data/wamex_graphs_fastest/iron_ore/")
ranks = rank_by_similarity(graph_embs)
print(*ranks, sep='\n')

Computing transition probabilities: 100%|██████████| 90/90 [00:00<00:00, 9138.80it/s]
Computing transition probabilities: 100%|██████████| 203/203 [00:00<00:00, 5124.70it/s]

Total number of input files in '../wamex_data/wamex_graphs_fastest/iron_ore/' folder is 7.
Total number of input files in '../wamex_data/wamex_graphs_fastest/iron_ore/' folder is 7.
Num words =  90



Computing transition probabilities: 100%|██████████| 68/68 [00:00<00:00, 26384.15it/s]
Computing transition probabilities: 100%|██████████| 74/74 [00:00<00:00, 20224.05it/s]
Computing transition probabilities: 100%|██████████| 59/59 [00:00<00:00, 14853.78it/s]

Num words =  203
Num words =  68
Num words =  74



Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 18188.66it/s]
Computing transition probabilities: 100%|██████████| 109/109 [00:00<00:00, 7208.07it/s]


Num words =  59
Num words =  45
Num words =  109

Num files =  7 , Num scores =  7
Cosine similarity:

[0.81, 'a086152_c125_1997_2010a_14952469.json', 'a078606_c125-1997-2008a_12703370.json']
[0.81, 'a078606_c125-1997-2008a_12703370.json', 'a086152_c125_1997_2010a_14952469.json']
[0.61, 'a075345_c125_2004_2007a_14419765.json', 'a072391_c125_2004_2006a_12728776.json']
[0.61, 'a072391_c125_2004_2006a_12728776.json', 'a075345_c125_2004_2007a_14419765.json']
[0.5, 'a096981_c125_2010.2012a_15102718.json', 'a075345_c125_2004_2007a_14419765.json']
[0.5, 'a078348_c125_2004_2008a_15517251.json', 'a072391_c125_2004_2006a_12728776.json']
[0.5, 'a075345_c125_2004_2007a_14419765.json', 'a096981_c125_2010.2012a_15102718.json']
[0.5, 'a072391_c125_2004_2006a_12728776.json', 'a078348_c125_2004_2008a_15517251.json']
[0.43, 'a078348_c125_2004_2008a_15517251.json', 'a075345_c125_2004_2007a_14419765.json']
[0.43, 'a075345_c125_2004_2007a_14419765.json', 'a078348_c125_2004_2008a_15517251.json']
[0.39, 'a07