# Setup

In [1]:
%load_ext autoreload
%autoreload
from pprint import pprint as print
from notebook_utils import *
from taxonomy_metrics import *
from comparative_measures import *
import scipy
import qgrid 
import time

# Reading data
Read the CSVs, remove the underscores and stop words.
Construct the networkx graphs then write the result as GEXF.

## Reading JRC data

In [2]:
# Dataset access - JRC-ISPRA legal document mining
NETWORK1_CONCEPTS_PATH=pathlib.Path("data/leg-doc-mining/network-1-concepts.csv")
NETWORK1_WORDS_PATH=pathlib.Path("data/leg-doc-mining/network-1-words.csv")
NETWORK2_CONCEPTS_PATH=pathlib.Path("data/leg-doc-mining/network-2-concepts.csv")
NETWORK2_WORDS_PATH=pathlib.Path("data/leg-doc-mining/network-2-words.csv")

# Target dtype and NA replacement value for each known CSV column.
COLUMN_TYPES = {"count":np.int32,"concept":str,"node1":str,"node2":str,"node3":str}
COLUMN_NAS = {"count":0,"concept":"","node1":"","node2":"","node3":""}


def _read_jrc_csv(csv_path):
    """Read one semicolon-delimited JRC network CSV and normalise its columns.

    Missing values are replaced per column using COLUMN_NAS, then the columns
    are cast to the dtypes in COLUMN_TYPES.

    Parameters
    ----------
    csv_path : pathlib.Path
        Location of the CSV file; it is resolved before reading.

    Returns
    -------
    pandas.DataFrame
        A new frame with NAs filled and dtypes applied.
    """
    frame = pd.read_csv(csv_path.resolve(), delimiter=";").fillna(COLUMN_NAS)
    # astype() has no in-place mode: it always returns a new frame, so the
    # result must be kept. The mapping is limited to columns that are actually
    # present because astype() raises KeyError for unknown column names.
    present_types = {
        column: dtype for column, dtype in COLUMN_TYPES.items() if column in frame.columns
    }
    return frame.astype(present_types)


NETWORK1_CONCEPTS = _read_jrc_csv(NETWORK1_CONCEPTS_PATH)
NETWORK1_WORDS = _read_jrc_csv(NETWORK1_WORDS_PATH)
NETWORK2_CONCEPTS = _read_jrc_csv(NETWORK2_CONCEPTS_PATH)
NETWORK2_WORDS = _read_jrc_csv(NETWORK2_WORDS_PATH)

# Output locations for the GEXF exports of the merged graphs.
CONCEPT_GRAPH_PATH=pathlib.Path("data/leg-doc-mining/network-concepts.gexf")
WORD_GRAPH_PATH=pathlib.Path("data/leg-doc-mining/network-words.gexf")
MERGED_GRAPH_PATH=pathlib.Path("data/leg-doc-mining/network-merged.gexf")

# Stack network 1 and network 2 row-wise, separately for concepts and words.
CONCEPT_GRAPH_DF = pd.concat([NETWORK1_CONCEPTS, NETWORK2_CONCEPTS])
WORD_GRAPH_DF = pd.concat([NETWORK1_WORDS, NETWORK2_WORDS])

# Build the graphs from the stop-word-filtered frames (helpers are defined outside this cell).
CONCEPT_GRAPH = create_grah_from_jrc_dataset(filter_stop_words(CONCEPT_GRAPH_DF))
WORD_GRAPH = create_grah_from_jrc_dataset(filter_stop_words(WORD_GRAPH_DF))

## Reading EuroVoc data

In [4]:
# Absolute paths of the EuroVoc dataset file and of the label-query result set.
EUROVOC_DATASET_PATH = pathlib.Path("data/eurovoc-4.9/EVAP49.ttl").resolve()
QUERY_LABELS_RESULTSET_PATH = pathlib.Path(
    "data/eurovoc-4.9/QUERY_LABELS_RESULTSET.csv"
).resolve()

# Load the comma-separated label-query result set.
QUERY_LABELS_RESULTSET = pd.read_csv(QUERY_LABELS_RESULTSET_PATH, sep=",")

# Creating the conceptual graphs from the result set: one overall graph and,
# judging by the helper name, one graph per concept scheme (helpers are defined
# outside this cell).
EUROVOC_GRAPH = create_grah_from_eurovoc_label_query(QUERY_LABELS_RESULTSET)
CONCEPT_SCHEME_GRAPHS = create_grah_per_scheme_from_eurovoc_label_query(
    QUERY_LABELS_RESULTSET
)

## Writing graphs to disk (uncomment if needed)

In [5]:
# # write individual graphs
# nx.write_gexf(create_grah_from_jrc_dataset(filter_stop_words(NETWORK1_CONCEPTS)), str(NETWORK1_CONCEPTS_PATH) + ".gexf")
# nx.write_gexf(create_grah_from_jrc_dataset(filter_stop_words(NETWORK1_WORDS)), str(NETWORK1_WORDS_PATH) + ".gexf")
# nx.write_gexf(create_grah_from_jrc_dataset(filter_stop_words(NETWORK2_CONCEPTS)), str(NETWORK2_CONCEPTS_PATH) + ".gexf")
# nx.write_gexf(create_grah_from_jrc_dataset(filter_stop_words(NETWORK2_WORDS)), str(NETWORK2_WORDS_PATH) + ".gexf")

# # write merged graphs
# nx.write_gexf(create_grah_from_jrc_dataset(filter_stop_words(CONCEPT_GRAPH_DF)), str(CONCEPT_GRAPH_PATH))
# nx.write_gexf(create_grah_from_jrc_dataset(filter_stop_words(WORD_GRAPH_DF)), str(WORD_GRAPH_PATH))

# # write the total graph
# nx.write_gexf(create_grah_from_jrc_dataset(filter_stop_words( pd.concat([CONCEPT_GRAPH_DF, WORD_GRAPH_DF] ) )), str(MERGED_GRAPH_PATH))

# nx.write_gexf(EUROVOC_GRAPH, str(QUERY_LABELS_RESULTSET_PATH) + ".gexf")
# for name, graph in CONCEPT_SCHEME_GRAPHS:
#     nx.write_gexf(graph, str(QUERY_LABELS_RESULTSET_PATH)+"."+name+ ".gexf")

# Taxonomy metrics 

## Primary statistics

In [6]:
# euro_voc_stats = concept_scheme_stats(  [("EuroVoc",EUROVOC_GRAPH),] )
# euro_voc_stats.head()

In [7]:
# Compute the statistics for every concept scheme graph, export them to Excel,
# and report how long both steps took (in seconds).
# perf_counter() is used for the interval because it is monotonic, whereas the
# wall clock behind time.time() can jump if the system time is adjusted.
t0 = time.perf_counter()
concept_scheme_measurements = concept_scheme_stats(CONCEPT_SCHEME_GRAPHS)
# The file name is appended directly to the dataset path, so the workbook ends
# up next to the dataset as "EVAP49.ttlCS_statistics.xlsx".
to_excel(concept_scheme_measurements, str(EUROVOC_DATASET_PATH) + "CS_statistics.xlsx" )
t1 = time.perf_counter()
# `print` is pprint in this notebook, so the elapsed time is shown as a quoted string.
print( str(t1-t0) )


PermissionError: [Errno 13] Permission denied: 'D:\\Work\\workspace_notebook\\taxonomy-metrics\\data\\eurovoc-4.9\\EVAP49.ttlCS_statistics.xlsx'

## EuroVoc measures

In [None]:
# Node table and centrality metrics for the full EuroVoc graph
# (both helpers are defined outside this cell).
ev_nodes = eurovoc_nodes_as_pd(EUROVOC_GRAPH)
ev_nodes_centrality_metrics = nodes_centrality_metrics(EUROVOC_GRAPH)

# Combine both tables, export the result next to the dataset file, and end the
# cell with the correlation matrix so that it is displayed.
ev_combined = ev_nodes.join(ev_nodes_centrality_metrics)
to_excel(ev_combined, f"{EUROVOC_DATASET_PATH}nodes_centrality_metrics.xlsx")
ev_combined.corr()

## Bivariate scatter plots
TODO: continue from [here](https://scikit-learn.org/stable/auto_examples/plot_anomaly_comparison.html#sphx-glr-auto-examples-plot-anomaly-comparison-py)

In [None]:


## plot each centrality and depth of the node
# ev_combined.plot(kind='scatter',x='depth',y='degree centrality',color='b')
# ev_combined.plot(kind='scatter',x='depth',y='eigenvector centrality',color='b')
# ev_combined.plot(kind='scatter',x='depth',y='authorities',color='b')
# ev_combined.plot(kind='scatter',x='depth',y='hubs',color='b')
# ev_combined.plot(kind='scatter',x='depth',y='page rank',color='b')

# 
# ev_combined.plot(kind='scatter',x='eigenvector centrality',y='degree centrality',color='b')

## Concept network measures

In [None]:
# Node table and centrality metrics for the JRC concept graph
# (both helpers are defined outside this cell).
cm_nodes = cm_nodes_as_pd(CONCEPT_GRAPH)
cm_centrality = nodes_centrality_metrics(CONCEPT_GRAPH)

# Combine both tables, export the result next to the GEXF path, and end the
# cell with the correlation matrix so that it is displayed.
cm_combined = cm_nodes.join(cm_centrality)
to_excel(cm_combined, f"{CONCEPT_GRAPH_PATH}nodes_centrality_metrics.xlsx")
cm_combined.corr()

In [None]:
# plot each centrality and depth of the node
cm_combined.plot(kind='scatter',x='weight',y='degree centrality',color='b')
cm_combined.plot(kind='scatter',x='weight',y='eigenvector centrality',color='b')
cm_combined.plot(kind='scatter',x='weight',y='authorities',color='b')
cm_combined.plot(kind='scatter',x='weight',y='hubs',color='b')
cm_combined.plot(kind='scatter',x='weight',y='page rank',color='b')

# Comparative measurements

## Show the neighbours in JRC and EuroVoc CN (for each exact match)
note: 294 discovered exact matches

In [None]:
# Compare the EuroVoc graph with the JRC concept graph; the two strings are the
# labels handed to the helper for each graph (helper defined outside this cell).
concept_adjacency = common_concept_acjacency(
    EUROVOC_GRAPH,
    CONCEPT_GRAPH,
    "EuroVoc",
    "JRC CG",
)
# Export the result next to the EuroVoc dataset file.
to_excel(concept_adjacency, f"{EUROVOC_DATASET_PATH}common_concept_acjacency.xlsx")

## Show the paths between concepts in JRC and EuroVoc CN (for each exact match)

In [None]:
# Compute the paths between the concepts common to the EuroVoc graph and the
# JRC concept graph; the two strings are the labels handed to the helper for
# each graph (helper defined outside this cell).
common_concept_paths = paths_between_common_concepts(
    EUROVOC_GRAPH,
    CONCEPT_GRAPH,
    "EuroVoc CG",
    "JRC CG",
)
# Export the result next to the EuroVoc dataset file.
to_excel(common_concept_paths, f"{EUROVOC_DATASET_PATH}paths_between_common_concepts.xlsx")

# Some graph debugging  

In [None]:
# Debug check: compare the number of top concepts found in one concept scheme
# with the number expected for it.
expected_top_concepts = 33
target_cs_name = "7231 economic geography"

# CONCEPT_SCHEME_GRAPHS is iterated as (name, graph) pairs; keep the graphs
# whose name matches and use the first one.
t_cs = [
    scheme_graph
    for scheme_name, scheme_graph in CONCEPT_SCHEME_GRAPHS
    if scheme_name == target_cs_name
]
graph = t_cs[0]

# A node counts as a top concept here when its "hierarchical_in" attribute is
# positive and its "hierarchical_out" attribute is zero.
top_nodes = [
    label
    for label, attrs in graph.nodes(data=True)
    if attrs["hierarchical_in"] > 0 and attrs["hierarchical_out"] == 0
]
print("Difference between found and expected concepts")
print(expected_top_concepts - len(top_nodes))

In [50]:
# Debug check: compare the number of leaf concepts found in one concept scheme
# with the number expected for it.
expected_leaf_concepts = 180
target_cs_name = "7236 political geography"

# CONCEPT_SCHEME_GRAPHS is iterated as (name, graph) pairs; keep the graphs
# whose name matches and use the first one, reduced by the hierarchical-nodes
# helper (defined outside this cell).
t_cs = [
    scheme_graph
    for scheme_name, scheme_graph in CONCEPT_SCHEME_GRAPHS
    if scheme_name == target_cs_name
]
graph = subgraph_of_hierarchical_nodes(t_cs[0])

# A node counts as a leaf here when its "hierarchical_out" attribute is positive.
# NOTE(review): "hierarchical_in" is not checked, so nodes that also have
# incoming hierarchical links are counted too - confirm this is the intended
# definition of a leaf.
leaf_nodes = [
    label
    for label, attrs in graph.nodes(data=True)
    if attrs["hierarchical_out"] > 0
]
print("Difference between found and expected concepts")
print(expected_leaf_concepts - len(leaf_nodes))

'Difference between found and expected concepts'
60
