To reproduce our results, you can download the matrices we generated from the following link:
https://drive.google.com/file/d/18HsH8Kk1XNEtbYxdsJ_sK10d-gJQHYiX



In [119]:
from glob import glob
import os
%load_ext autoreload
%autoreload 2
from generate_groups import panel_provenance_groups, document_provenance
import json
from metrics import panel_evaluation, doc_evaluation, rp_eval, rr_eval, global_precision, global_recall, f1, gp_eval, gr_eval
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
%pylab inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


### CODE FOR CHECKING THE RESULTS

In [99]:
def digest_panel_results(method_input, our_annotation):
    """Evaluate one method's panel-level predictions and summarise them.

    Parameters
    ----------
    method_input : iterable of 4-tuples
        ``(method_name, matrix_path, class_name, dataset_version)``, one per
        panel class. ``matrix_path`` is loaded with ``np.load`` and the panel
        dataset is read from
        ``../dataset/<dataset_version>/<class_name>-dataset.json``.
        ``method_name`` is not used by the evaluation.
    our_annotation
        Ground-truth annotation, forwarded unchanged to the ``metrics`` helpers.

    Returns
    -------
    dict
        ``results[class_name]`` holds the ``'pairing'`` and ``'grouping'``
        qualities reported by ``panel_evaluation`` for that class.
        ``results['global']`` holds three ``f1`` values:
        ``'classification'`` (from ``global_precision`` / ``global_recall`` over
        the predicted groups of every class), ``'pairing'`` (mean precision of
        ``rp_eval`` and mean recall of ``rr_eval``) and ``'grouping'`` (same
        with ``gp_eval`` / ``gr_eval``).
    """
    results = {'global': {}}
    all_predicted_graphs = {}
    # Per-class metric frames are collected and concatenated once at the end,
    # instead of growing a DataFrame inside the loop.
    rp_frames, rr_frames, gp_frames, gr_frames = [], [], [], []

    for _method_name, matrix_path, _class, dataset_version in method_input:
        adj_matrix = np.load(matrix_path)
        with open(f'../dataset/{dataset_version}/{_class}-dataset.json') as f:
            dataset = json.load(f)
        predicted_graphs = panel_provenance_groups(dataset, adj_matrix)
        metrics = panel_evaluation(predicted_graphs, our_annotation, dataset, adj_matrix, _class)

        # Re-key the groups so that groups from different classes never clash.
        for pred in predicted_graphs.values():
            all_predicted_graphs[f'GROUP-{len(all_predicted_graphs) + 1}'] = pred

        results[_class] = {'pairing': metrics["relationship_quality"],
                           'grouping': metrics["grouping_quality"],
                           }

        # Relationship eval (only the rows of the class being processed)
        rp = rp_eval(predicted_graphs, our_annotation, dataset, adj_matrix)
        rr = rr_eval(predicted_graphs, our_annotation, dataset, adj_matrix)
        rp_frames.append(rp[rp['Class'] == _class])
        rr_frames.append(rr[rr['Class'] == _class])

        # Grouping eval (only the rows of the class being processed)
        gp = gp_eval(predicted_graphs, our_annotation)
        gr = gr_eval(predicted_graphs, our_annotation)
        gp_frames.append(gp[gp['Class'] == _class])
        gr_frames.append(gr[gr['Class'] == _class])

    def _stack(frames):
        # pd.concat rejects an empty list; fall back to an empty frame as before.
        return pd.concat(frames) if frames else pd.DataFrame()

    results_rp, results_rr = _stack(rp_frames), _stack(rr_frames)
    results_gp, results_gr = _stack(gp_frames), _stack(gr_frames)

    precision = global_precision(all_predicted_graphs, our_annotation, ignore_graphs=True)
    recall = global_recall(all_predicted_graphs, our_annotation, ignore_graphs=True)
    results['global']['classification'] = f1(precision, recall)
    results['global']['pairing'] = f1(results_rp['Precision'].mean(), results_rr['Recall'].mean())
    results['global']['grouping'] = f1(results_gp['Precision'].mean(), results_gr['Recall'].mean())
    return results

# Panel Evaluation

In [112]:
# Load Data
# Each method is described by one (method, adjacency-matrix path, panel class,
# dataset version) tuple per panel class; the methods only differ in the folder
# and in the file-name pattern of their matrices.
_PANEL_CLASSES = ('Microscopy', 'Blots', 'FlowCytometry')
_OURS_PREFIX = 'vlfeat_sift_heq-BF-CV_MAGSAC-400-20-300-0.01-'


def _panel_inputs(method, matrix_dir, folder, dataset_version, prefix='', suffix=''):
    """Build the per-class input tuples for one method on one dataset version."""
    return [
        (method,
         f'adjacency-matrices/{matrix_dir}/{folder}/{prefix}{_class}{suffix}.npy',
         _class,
         dataset_version)
        for _class in _PANEL_CLASSES
    ]


sila_spp = _panel_inputs('SILA', 'SPP', 'sila', 'spm')
sila_spp_v1 = _panel_inputs('SILA', 'SPP-v1', 'sila', 'spm-v1')
sila_spp_v2 = _panel_inputs('SILA', 'SPP-v2', 'sila', 'spm-v2')

ours_spp = _panel_inputs('Ours', 'SPP', 'ours', 'spm', prefix=_OURS_PREFIX)
ours_spp_v1 = _panel_inputs('Ours', 'SPP-v1', 'ours', 'spm-v1', prefix=_OURS_PREFIX)
ours_spp_v2 = _panel_inputs('Ours', 'SPP-v2', 'ours', 'spm-v2', prefix=_OURS_PREFIX)

acuna_spp = _panel_inputs('Acuna', 'SPP', 'acuna', 'spm', suffix='-adjacency_matrix')
acuna_spp_v1 = _panel_inputs('Acuna', 'SPP-v1', 'acuna', 'spm-v1', suffix='-adjacency_matrix')
acuna_spp_v2 = _panel_inputs('Acuna', 'SPP-v2', 'acuna', 'spm-v2', suffix='-adjacency_matrix')


In [None]:
# Load Ground Truth
# Panel-level annotation that digest_panel_results receives as `our_annotation`.
our_annotation = json.loads(Path("our_annotation.json").read_text())


In [113]:
# SPP results
# Keep only the cross-class 'global' summary of every method.
spp_results = {
    method: digest_panel_results(inputs, our_annotation)["global"]
    for method, inputs in (('SILA', sila_spp), ('Ours', ours_spp), ('Acuna', acuna_spp))
}


In [114]:
# Display the per-method 'global' scores computed for SPP.
spp_results

{'SILA': {'classification': 0.8447136563876652,
  'pairing': 0.7282079889302411,
  'grouping': 0.8069200272343928},
 'Ours': {'classification': 0.8690026954177897,
  'pairing': 0.7410617928984826,
  'grouping': 0.8363649092931222},
 'Acuna': {'classification': 0.8736532810969637,
  'pairing': 0.5360969926234008,
  'grouping': 0.8129067016626514}}

In [115]:
# SPP-v1 results
# Keep only the cross-class 'global' summary of every method.
spp_v1_results = {
    method: digest_panel_results(inputs, our_annotation)["global"]
    for method, inputs in (('SILA', sila_spp_v1), ('Ours', ours_spp_v1), ('Acuna', acuna_spp_v1))
}


In [116]:
# Display the per-method 'global' scores computed for SPP-v1.
spp_v1_results

{'SILA': {'classification': 0.7842535787321063,
  'pairing': 0.6717441386663093,
  'grouping': 0.7496533466649101},
 'Ours': {'classification': 0.8707409410492158,
  'pairing': 0.7412034305335525,
  'grouping': 0.8380612987267677},
 'Acuna': {'classification': 0.4731125827814569,
  'pairing': 0.2713730280298792,
  'grouping': 0.26283854517445004}}

In [117]:
# SPP-v2 results
# Keep only the cross-class 'global' summary of every method.
spp_v2_results = {
    method: digest_panel_results(inputs, our_annotation)["global"]
    for method, inputs in (('SILA', sila_spp_v2), ('Ours', ours_spp_v2), ('Acuna', acuna_spp_v2))
}


In [118]:
# Display the per-method 'global' scores computed for SPP-v2.
spp_v2_results

{'SILA': {'classification': 0.649724692926726,
  'pairing': 0.5486731369747352,
  'grouping': 0.6219617835071215},
 'Ours': {'classification': 0.8698481561822126,
  'pairing': 0.7339883530120476,
  'grouping': 0.8365592769815049},
 'Acuna': {'classification': 0.1293461153240807,
  'pairing': 0.06787681886978132,
  'grouping': 0.02595565842507837}}

# Document-Level Evaluation

In [None]:

# NOTE(review): this re-declares `sila_spp` with exactly the same value as the
# panel-level "Load Data" cell — presumably kept so the document-level section
# can be run on its own; confirm before removing.
sila_spp = [('SILA', 'adjacency-matrices/SPP/sila/Microscopy.npy','Microscopy', 'spm'),
('SILA', 'adjacency-matrices/SPP/sila/Blots.npy','Blots', 'spm'),
('SILA', 'adjacency-matrices/SPP/sila/FlowCytometry.npy','FlowCytometry', 'spm')]

In [144]:
# Creates document dataset
def create_doc_dataset(data_version,
                       classes=('Microscopy', 'Blots', 'FlowCytometry', 'BodyImaging')):
    """Collect the sorted, de-duplicated document ids of one dataset version.

    Parameters
    ----------
    data_version : str
        Folder name under ``../dataset`` (e.g. ``'spm'``).
    classes : iterable of str, optional
        Panel classes whose ``<class>-dataset.json`` files are read. Classes
        without a dataset file for this version are skipped.

    Returns
    -------
    list
        Sorted list of the distinct ``'doc_id'`` values found in the dataset
        files; empty when no file exists.
    """
    doc_ids = set()
    for _class in classes:
        dataset_path = f'../dataset/{data_version}/{_class}-dataset.json'
        if not os.path.isfile(dataset_path):
            continue
        # The path contains no wildcard, so it is opened directly.
        with open(dataset_path) as f:
            dataset = json.load(f)
        doc_ids.update(entry['doc_id'] for entry in dataset.values())
    return sorted(doc_ids)

In [155]:
# Load Data
# One list of document ids per dataset version.
spp_doc_dataset, spp_v1_doc_dataset, spp_v2_doc_dataset = [
    create_doc_dataset(version) for version in ('spm', 'spm-v1', 'spm-v2')
]

# Load annotation
doc_annotation = json.loads(Path("document-level-annotation.json").read_text())

In [163]:
def create_doc_adj_matrix(method_input, doc_dataset):
    """Aggregate panel-level adjacency matrices into a document-level one.

    Parameters
    ----------
    method_input : iterable of (matrix_path, dataset_path)
        ``matrix_path`` is a ``.npy`` panel adjacency matrix and
        ``dataset_path`` a JSON file mapping ``str(panel_index)`` to a record
        with a ``'doc_id'`` key.
    doc_dataset : list
        Document ids; the position of an id is its row/column in the result.
        Ids must be hashable.

    Returns
    -------
    numpy.ndarray
        ``(len(doc_dataset), len(doc_dataset))`` float matrix. For every
        non-zero entry ``[i, j]`` with ``i < j`` of a panel matrix, the cells
        ``[doc_i, doc_j]`` and ``[doc_j, doc_i]`` are incremented by one (so a
        pair of panels from the same document adds 2 to the diagonal cell).

    Raises
    ------
    ValueError
        If a linked panel refers to a ``doc_id`` that is not in ``doc_dataset``.
    """
    doc_adjacency_matrix = np.zeros((len(doc_dataset), len(doc_dataset)))

    # Position of the first occurrence of every id (what list.index returns),
    # computed once instead of scanning the list for every linked panel pair.
    doc_position = {}
    for position, doc_id in enumerate(doc_dataset):
        doc_position.setdefault(doc_id, position)

    def _position(doc_id):
        if doc_id not in doc_position:
            raise ValueError(f'{doc_id!r} is not in doc_dataset')
        return doc_position[doc_id]

    for matrix_path, dataset_path in method_input:
        panel_adjacency_matrix = np.load(matrix_path)
        with open(dataset_path) as f:
            panel_dataset = json.load(f)

        # Only pairs (i, j) with i < j < number of rows are inspected.
        n_panels = len(panel_adjacency_matrix)
        upper = np.triu(panel_adjacency_matrix[:n_panels, :n_panels], k=1)
        for i, j in np.argwhere(upper):
            doc_i = _position(panel_dataset[str(i)]['doc_id'])
            doc_j = _position(panel_dataset[str(j)]['doc_id'])

            # Mark that the documents share elements
            doc_adjacency_matrix[doc_i, doc_j] += 1
            doc_adjacency_matrix[doc_j, doc_i] += 1
    return doc_adjacency_matrix

# Create Doc Provenance Groups
import networkx as nx
from pyvis.network import Network

def create_doc_provenance(doc_adjacency_matrix, doc_dataset):
    """Group documents into provenance groups via connected components.

    Parameters
    ----------
    doc_adjacency_matrix : numpy.ndarray
        Square document adjacency matrix; it is turned into a graph with
        ``nx.from_numpy_array``.
    doc_dataset : sequence
        Document ids; node ``k`` of the graph stands for ``doc_dataset[k]``.

    Returns
    -------
    dict
        ``{'GROUP-1': [doc_id, ...], 'GROUP-2': [...], ...}`` with one entry per
        connected component that has more than one node. Ids inside a group are
        unique and listed in node order.
    """
    G = nx.from_numpy_array(doc_adjacency_matrix)

    doc_provenance = {}
    for component in nx.connected_components(G):
        # A document connected to no other document does not form a group.
        if len(component) <= 1:
            continue
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        members = list(dict.fromkeys(doc_dataset[node] for node in sorted(component)))
        doc_provenance['GROUP-%d' % (len(doc_provenance) + 1)] = members

    return doc_provenance

In [154]:
def _as_doc_input(panel_inputs):
    """Pair every adjacency-matrix path with the path of its panel dataset JSON."""
    return [
        (matrix_path, f'../dataset/{dataset_version}/{_class}-dataset.json')
        for _method, matrix_path, _class, dataset_version in panel_inputs
    ]


sila_doc_spp = _as_doc_input(sila_spp)
ours_doc_spp = _as_doc_input(ours_spp)
acuna_doc_spp = _as_doc_input(acuna_spp)

sila_doc_spp_v1 = _as_doc_input(sila_spp_v1)
ours_doc_spp_v1 = _as_doc_input(ours_spp_v1)
acuna_doc_spp_v1 = _as_doc_input(acuna_spp_v1)

sila_doc_spp_v2 = _as_doc_input(sila_spp_v2)
ours_doc_spp_v2 = _as_doc_input(ours_spp_v2)
acuna_doc_spp_v2 = _as_doc_input(acuna_spp_v2)


In [164]:
# Build the SILA / SPP document adjacency matrix in this cell: `m` is otherwise
# only assigned by a cell further down, which breaks Restart & Run All.
m = create_doc_adj_matrix(sila_doc_spp, spp_doc_dataset)
doc_provenance = create_doc_provenance(m, spp_doc_dataset)

In [140]:
# Document-level adjacency matrix built from the SILA inputs on the SPP dataset.
m = create_doc_adj_matrix(sila_doc_spp, spp_doc_dataset)

In [168]:
# doc_evaluation receives the documents as {"<position>": {"doc_id": ...}}.
doc_dataset = {
    str(position): {'doc_id': doc_id}
    for position, doc_id in enumerate(spp_doc_dataset)
}
doc_evaluation(doc_provenance, doc_annotation, doc_dataset, m)

{'relationship_precision': 0.8603636501633343,
 'relationship_recall': 0.8263715852869296,
 'relationship_quality': 0.8430251038408313,
 'grouping_precision': 1.0,
 'grouping_recall': 0.9835393757257019,
 'grouping_quality': 0.9917013876932612,
 'global_precision': 1.0,
 'global_recall': 0.9917355371900827,
 'global_quality': 0.995850622406639}

In [175]:
def digest_doc_results(method_input, doc_dataset, doc_annotation):
    """Run the document-level evaluation for one method.

    Parameters
    ----------
    method_input : iterable of (matrix_path, dataset_path)
        Forwarded to ``create_doc_adj_matrix``.
    doc_dataset : list
        Document ids; positions define the rows/columns of the document matrix.
    doc_annotation
        Document-level ground truth, forwarded to ``doc_evaluation``.

    Returns
    -------
    dict
        ``'pairing'``, ``'grouping'`` and ``'classification'``, taken from the
        ``relationship_quality``, ``grouping_quality`` and ``global_quality``
        entries returned by ``doc_evaluation``.
    """
    adjacency = create_doc_adj_matrix(method_input, doc_dataset)
    groups = create_doc_provenance(adjacency, doc_dataset)

    # doc_evaluation expects {"<position>": {"doc_id": ...}} rather than a list.
    indexed_docs = {
        str(position): {'doc_id': doc_id}
        for position, doc_id in enumerate(doc_dataset)
    }

    scores = doc_evaluation(groups, doc_annotation, indexed_docs, adjacency)
    return {
        'pairing': scores["relationship_quality"],
        'grouping': scores["grouping_quality"],
        'classification': scores['global_quality'],
    }

In [177]:
# SPP doc results
spp_doc_results = {
    method: digest_doc_results(inputs, spp_doc_dataset, doc_annotation)
    for method, inputs in (('SILA', sila_doc_spp), ('Ours', ours_doc_spp), ('Acuna', acuna_doc_spp))
}


In [178]:
# Display the per-method document-level scores for SPP.
spp_doc_results

{'SILA': {'pairing': 0.8430251038408313,
  'grouping': 0.9917013876932612,
  'classification': 0.995850622406639},
 'Ours': {'pairing': 0.8383047259088356,
  'grouping': 1.0,
  'classification': 1.0},
 'Acuna': {'pairing': 0.7182050135612684,
  'grouping': 1.0,
  'classification': 1.0}}

In [180]:
# SPP-v1 doc results
spp_v1_doc_results = {
    method: digest_doc_results(inputs, spp_v1_doc_dataset, doc_annotation)
    for method, inputs in (('SILA', sila_doc_spp_v1), ('Ours', ours_doc_spp_v1), ('Acuna', acuna_doc_spp_v1))
}


In [181]:
# Display the per-method document-level scores for SPP-v1.
spp_v1_doc_results

{'SILA': {'pairing': 0.8430251038408313,
  'grouping': 0.9917013876932612,
  'classification': 0.995850622406639},
 'Ours': {'pairing': 0.8349512500546418,
  'grouping': 1.0,
  'classification': 1.0},
 'Acuna': {'pairing': 0.3295940423477277,
  'grouping': 0.21517117726160476,
  'classification': 0.5127118644067797}}

In [179]:
# SPP-v2 doc results
spp_v2_doc_results = {
    method: digest_doc_results(inputs, spp_v2_doc_dataset, doc_annotation)
    for method, inputs in (('SILA', sila_doc_spp_v2), ('Ours', ours_doc_spp_v2), ('Acuna', acuna_doc_spp_v2))
}


In [182]:
# Display the per-method document-level scores for SPP-v2.
spp_v2_doc_results

{'SILA': {'pairing': 0.8361972230693725,
  'grouping': 0.983572965404187,
  'classification': 0.9876543209876544},
 'Ours': {'pairing': 0.8259535613674006,
  'grouping': 1.0,
  'classification': 1.0},
 'Acuna': {'pairing': 0.05373074202548507,
  'grouping': 0.0077622064155237775,
  'classification': 0.11713455953533398}}