# Similarity Measures
### Functions measuring similarity using graph edit distance.
#### The graph edit distance is the number of edge/node changes needed to make two graphs isomorphic.

In [82]:
import os, glob
import time, datetime
import csv
import networkx as nx
import simplejson as json
from pathlib import Path

def save_graph(graph, fname):
    """Write `graph` to `fname` as a JSON document with 'nodes' and 'links' lists.

    Each node record carries its id, its 'group' attribute and its 'degree'
    attribute converted to a string; each link record carries the source,
    the target and the edge's 'label' attribute. Every node must therefore
    have 'group' and 'degree' attributes and every edge a 'label' attribute.
    """
    # Build both record lists before touching the file system, so a missing
    # attribute raises before any output file is created.
    node_records = []
    for node_id in graph.nodes():
        attrs = graph.nodes[node_id]
        node_records.append({
            'id': node_id,
            'group': attrs['group'],
            'degree': str(attrs['degree']),
        })

    link_records = []
    for source, target, attrs in graph.edges(data=True):
        link_records.append({'source': source, 'target': target, 'label': attrs['label']})

    payload = {'nodes': node_records, 'links': link_records}
    with open(fname, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

def load_graph(filename, name):
    """Load a directed graph from a JSON file with 'nodes' and 'links' lists.

    Args:
        filename: Path of the JSON file to read.
        name: Value stored as the graph's `name` attribute.

    Returns:
        An `nx.DiGraph` whose nodes carry 'group' and 'degree' attributes and
        whose edges carry a 'label' attribute.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(filename) as f:
        d = json.load(f)

    g = nx.DiGraph(name=name)
    for n in d['nodes']:
        # Nodes in the 'OTHER' group are not added explicitly.
        if n['group'] != 'OTHER':
            g.add_node(n['id'], group=n['group'], degree=n['degree'])
    # NOTE(review): add_edge creates any endpoint that is not in the graph yet,
    # so a skipped 'OTHER' node referenced by a link reappears here without
    # 'group'/'degree' attributes - confirm whether that is intended.
    for n in d['links']:
        g.add_edge(n['source'], n['target'], label=n['label'])
    return g

# Wall-clock reference point set when this cell runs; calculate_similarity()
# reads this module-level value to report the elapsed time.
start_all_time = time.time()
print('STARTED: ', datetime.datetime.now())

STARTED:  2020-06-30 13:04:15.679673


#### 1. Identifying new nodes, removed nodes, new links and removed links

In [90]:
def _describe_balance(kind, only_g1, only_g2, shared_total):
    """Return the summary line comparing how many `kind` items each graph has.

    `only_g1` / `only_g2` are the counts of items found in just one graph and
    `shared_total` is the count of items found in both.
    """
    if only_g1 > only_g2:
        return f'--- G1 has {only_g1 - only_g2} more {kind} than G2.'
    if only_g2 > only_g1:
        return f'--- G2 has {only_g2 - only_g1} more {kind} than G1.'
    if only_g1 == 0:
        # Nothing is unique to either side, so the shared count is the full count.
        return f'--- G1 = G2, has {shared_total} {kind}.'
    return f'--- G1 and G2 have the same number of {kind}, but {only_g1} differ on each side.'


def calculate_similarity(folder_name):
    """Score every pair of graph JSON files in `folder_name` by node/link overlap.

    For each unordered pair of '*.json' files the two graphs are loaded with
    `load_graph`, a report of shared and differing nodes and links is printed,
    and a score is computed as

        (shared nodes + shared links) / (distinct nodes + distinct links)

    rounded to two decimals. A pair that raises while being compared is
    reported, counted and skipped so the remaining pairs are still scored.

    Args:
        folder_name: Directory searched (non-recursively) for '*.json' files.

    Returns:
        A list of `[score, file_1_name, file_2_name]` entries sorted in
        descending order.
    """
    files = glob.glob(os.path.join(folder_name, '*.json'))
    num_files = len(files)
    print(f"Total number of input files in '{folder_name}' folder is {num_files}.")
    similarity_scores = []
    # Must exist before the loop: the except handler below increments it.
    count_files_error = 0

    for i in range(0, num_files-1):
        G1 = load_graph(files[i], 'G1')
        # G1 is fixed for the whole inner loop, so build its sets only once.
        g1_nodes = set(G1.nodes())
        g1_links = set(G1.edges())

        for j in range(i+1, num_files):
            try:
                # NOTE(review): nx.info is not available in recent NetworkX 3.x
                # releases - confirm the pinned NetworkX version.
                print(f'\n====================================\nFILE 1: {Path(files[i]).name} - Graph', nx.info(G1), '\n')

                G2 = load_graph(files[j], 'G2')
                print(f'FILE 2: {Path(files[j]).name} - Graph', nx.info(G2), '\n')

                g2_nodes = set(G2.nodes())
                g2_links = set(G2.edges())

                # Node overlap
                total_nodes = g1_nodes | g2_nodes
                same_nodes = g1_nodes & g2_nodes
                node_difference1 = g1_nodes - g2_nodes
                node_difference2 = g2_nodes - g1_nodes
                print('NODE INFORMATION:\nTotal nodes in both: ', len(total_nodes))
                print('Same nodes in both: ', len(same_nodes))
                print('Different nodes in G1: ', len(node_difference1))
                print('Different nodes in G2: ', len(node_difference2))
                print(_describe_balance('nodes', len(node_difference1), len(node_difference2), len(same_nodes)))

                # Link overlap (links are compared as (source, target) pairs)
                total_links = g1_links | g2_links
                same_links = g1_links & g2_links
                link_difference1 = g1_links - g2_links
                link_difference2 = g2_links - g1_links
                print('\nLINK INFORMATION:\nTotal links in both: ', len(total_links))
                print('Same links in both: ', len(same_links))
                print('Different links in G1: ', len(link_difference1))
                print('Different links in G2: ', len(link_difference2))
                print(_describe_balance('links', len(link_difference1), len(link_difference2), len(same_links)))

                # Scoring: two empty graphs make the denominator zero; the
                # resulting ZeroDivisionError is handled like any other
                # per-pair failure below.
                shared = len(same_nodes) + len(same_links)
                distinct = len(total_nodes) + len(total_links)
                sim_score = round(shared / distinct, 2)
                print(f"\n*** Similarity score = {sim_score:.2f}")
                similarity_scores.append([sim_score, Path(files[i]).name, Path(files[j]).name])

            except Exception as ex:
                # Best effort: report the failing pair and keep going.
                print("Error msg: " + str(ex))
                count_files_error += 1

    sorted_scores = sorted(similarity_scores, reverse=True)
    print('\nNum scores = ', len(similarity_scores))
    if count_files_error:
        print('Num pairs with errors = ', count_files_error)

    # Elapsed time is measured from the module-level start_all_time, i.e. from
    # when that value was set, not from the start of this call.
    hours, rem = divmod(time.time() - start_all_time, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Total processing time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
    print('FINISHED: ', datetime.datetime.now())

    return sorted_scores

In [91]:
# Score all pairs of graph files in the 'gold' folder; the sorted result is the cell output.
calculate_similarity(r"../wamex_data/wamex_graphs_fastest/gold/")

Total number of input files in '../wamex_data/wamex_graphs_fastest/gold/' folder is 4.

FILE 1: gold_a084492_coolgardie_annual_report_2009_15818118.json - Graph Name: G1
Type: DiGraph
Number of nodes: 28
Number of edges: 36
Average in degree:   1.2857
Average out degree:   1.2857 

FILE 2: gold_a074513_coolgardie_combined_annual_2006_final_13997651.json - Graph Name: G2
Type: DiGraph
Number of nodes: 178
Number of edges: 678
Average in degree:   3.8090
Average out degree:   3.8090 

NODE INFORMATION:
Total nodes in both:  185
Same nodes in both:  21
Different nodes in G1:  7
Different nodes in G2:  157
--- G2 has 150 more nodes than G1.

LINK INFORMATION:
Total links in both:  705
Same links in both:  9
Different links in G1:  27
Different links in G2:  669
--- G2 has 642 more links than G1.

*** Similarity score = 0.03

FILE 1: gold_a084492_coolgardie_annual_report_2009_15818118.json - Graph Name: G1
Type: DiGraph
Number of nodes: 28
Number of edges: 36
Average in degree:   1.2857
Ave

[[0.78,
  'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json',
  'gold_a072821_coolgardie_combined_annual_2005_12942716.json'],
 [0.66,
  'gold_a084492_coolgardie_annual_report_2009_15818118.json',
  'gold_a076168_coolgardie annual report 2007_9968549.json'],
 [0.04,
  'gold_a084492_coolgardie_annual_report_2009_15818118.json',
  'gold_a072821_coolgardie_combined_annual_2005_12942716.json'],
 [0.04,
  'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json',
  'gold_a076168_coolgardie annual report 2007_9968549.json'],
 [0.04,
  'gold_a072821_coolgardie_combined_annual_2005_12942716.json',
  'gold_a076168_coolgardie annual report 2007_9968549.json'],
 [0.03,
  'gold_a084492_coolgardie_annual_report_2009_15818118.json',
  'gold_a074513_coolgardie_combined_annual_2006_final_13997651.json']]

In [85]:
# Score all pairs of graph files in the 'iron_ore' folder; the sorted result is the cell output.
calculate_similarity(r"../wamex_data/wamex_graphs_fastest/iron_ore/")

Total number of input files in '../wamex_data/wamex_graphs_fastest/iron_ore/' folder is 7.

G1: a078606_c125-1997-2008a_12703370.json - Graph Name: G1
Type: DiGraph
Number of nodes: 90
Number of edges: 169
Average in degree:   1.8778
Average out degree:   1.8778 

G2: a072391_c125_2004_2006a_12728776.json - Graph Name: G2
Type: DiGraph
Number of nodes: 203
Number of edges: 730
Average in degree:   3.5961
Average out degree:   3.5961 

NODE INFORMATION:
Total nodes in both:  258
Same nodes in both:  35
Different nodes in G1:  55
Different nodes in G2:  168
--- G2 has 113 more nodes than G1.

LINK INFORMATION:
Total links in both:  889
Same links in both:  10
Different links in G1:  159
Different links in G2:  720
--- G2 has 561 more links than G1.

*** Similarity score = 0.04

G1: a078606_c125-1997-2008a_12703370.json - Graph Name: G1
Type: DiGraph
Number of nodes: 90
Number of edges: 169
Average in degree:   1.8778
Average out degree:   1.8778 

G2: a086152_c125_1997_2010a_14952469.jso

[[0.55,
  'a078606_c125-1997-2008a_12703370.json',
  'a086152_c125_1997_2010a_14952469.json'],
 [0.2,
  'a078348_c125_2004_2008a_15517251.json',
  'a075345_c125_2004_2007a_14419765.json'],
 [0.2,
  'a072391_c125_2004_2006a_12728776.json',
  'a075345_c125_2004_2007a_14419765.json'],
 [0.11,
  'a072391_c125_2004_2006a_12728776.json',
  'a078348_c125_2004_2008a_15517251.json'],
 [0.06,
  'a096981_c125_2010.2012a_15102718.json',
  'a086072_c125_2008_2009a_12463362.json'],
 [0.05,
  'a086152_c125_1997_2010a_14952469.json',
  'a096981_c125_2010.2012a_15102718.json'],
 [0.05,
  'a086152_c125_1997_2010a_14952469.json',
  'a086072_c125_2008_2009a_12463362.json'],
 [0.05,
  'a078606_c125-1997-2008a_12703370.json',
  'a096981_c125_2010.2012a_15102718.json'],
 [0.04,
  'a086152_c125_1997_2010a_14952469.json',
  'a078348_c125_2004_2008a_15517251.json'],
 [0.04,
  'a078606_c125-1997-2008a_12703370.json',
  'a086072_c125_2008_2009a_12463362.json'],
 [0.04,
  'a078606_c125-1997-2008a_12703370.json',
 