In [None]:
import glob
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
print(nx.__version__)

from matplotlib import pyplot as plt

import sys
sys.path.append('../.')


from comap.mapper import CoMap
from comap.graph_utils import (compute_graph_deltas)
from comap.helper_utils import (get_reduced_categories)


%load_ext autoreload
%reload_ext autoreload
%autoreload 2

### Read data

In [None]:
# List the Excel workbooks in the analysis folder. The character class in
# the pattern matches file names starting with a letter or a literal comma.
excel_pattern = '../../../Analyse/[A-Z,a-z]*.xlsx'
files = glob.glob(excel_pattern)

print("** Listing files in directory: **", files)

# NOTE(review): glob.glob returns matches in arbitrary, OS-dependent order,
# so position 2 may point at a different workbook on another machine —
# confirm which file is intended and consider selecting it by name.
input_excel = files[2]
print("---> Reading (user):", input_excel)

# Read excel file into dataframe
#sheetlist_usr = pd.read_excel(input_excel)
#pd.read_excel(sheetname='Friteskt') # fetch one specific sheet
#sheetlist_usr = list(np.unique(sheetlist_usr['Kandidatnummer'].dropna())) 

#cat_sheet = 'Kategorier' # name of the sheet that holds the categories

### Recategorise and aggregate

In [None]:
# Build the reduced category mapping from the 'Kategorier' sheet, passing
# the ids below as `exclude` (presumably category ids to leave out — the
# helper lives in comap.helper_utils and is not visible here).
#display(Markdown("**Aggregating user maps**\n"))
drop_list_usr = [38, 43, 44, 45, 46, 47, 48, 49]
cat_dict_usr, cat_list_usr = get_reduced_categories(
    input_excel,
    'Kategorier',
    exclude=drop_list_usr,
)

print(cat_dict_usr)

#usrG, deltas_usr = build_aggregate_graph(file_usr, drop_nodes=drop_list_usr)

In [None]:
def clean_up(df, node_list=[]):
    """Turn one raw map sheet into a raw and a category-reduced adjacency matrix.

    Layout of ``df`` that the slicing below relies on:

    * first column: original node names (discarded);
    * first row: the category label of every remaining column;
    * second row: raw node values (discarded);
    * remaining rows: the adjacency cells, one row per node in the same
      order as the columns, so that the block is square.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet as read from the workbook. It is not modified.
    node_list : container, optional
        Category labels to keep. It is only queried with ``in`` and never
        mutated. With the default empty list every node is dropped.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``raw``: the kept nodes, NaNs replaced by 0, with rows and columns
        relabelled ``0..n-1``.
        ``reduced``: the same cells summed per category, indexed by category
        on both axes and in the same row/column orientation as ``raw``.

    Raises
    ------
    ValueError
        If the adjacency block (all rows but the first two, all columns but
        the first) is not square.
    """
    # Category of each node column; the first cell of that row belongs to
    # the node-name column and is skipped.
    categories = df.iloc[0].values[1:]

    # Strip the two leading rows and the node-name column by position.
    matrix = df.iloc[2:, 1:].copy()
    if matrix.shape[0] != matrix.shape[1]:
        raise ValueError(
            "Length mismatch: adjacency block has %d rows but %d columns"
            % matrix.shape
        )

    # Label both axes positionally. Renaming columns one at a time by name
    # corrupts labels whenever a header tag equals a category label that was
    # already assigned to an earlier column.
    matrix.columns = categories
    matrix.index = categories

    # Keep only the nodes whose category is in the include list.
    keep = np.array([label in node_list for label in matrix.columns], dtype=bool)
    print('len dropnodes:', int((~keep).sum()))
    print(matrix.shape)
    matrix = matrix.loc[keep, keep]
    print(matrix.shape)

    # Empty cells mean "no link".
    matrix = matrix.fillna(0)

    # Sum nodes that share a category: first the rows, then the columns.
    # The final transpose restores the original orientation; without it the
    # reduced matrix comes out transposed and every directed edge built from
    # it points the wrong way relative to the raw matrix.
    rows_collapsed = matrix.groupby(level=0).sum()
    reduced = rows_collapsed.T.groupby(level=0).sum().T

    # Raw matrix with plain positional labels on both axes.
    positions = list(range(matrix.shape[1]))
    raw = matrix.copy()
    raw.index = positions
    raw.columns = positions

    return raw, reduced

In [None]:
# Open the workbook once and reuse the handle for every sheet instead of
# re-opening the file for each read.
with pd.ExcelFile(input_excel) as workbook:
    # IDs of the individual maps: the unique, non-missing 'Kandidatnummer'
    # values on the first sheet. Each ID is then used as a sheet name.
    sheetlist = list( np.unique( pd.read_excel( workbook )['Kandidatnummer'].dropna() ) )
    map_frames = [ pd.read_excel(workbook, sheet_name=sheet) for sheet in sheetlist ]
num_maps = len(sheetlist)

# dictionary mapping each raw DiGraph to its recategorised DiGraph
maps = {}
# first node of the smallest component of every disconnected recategorised
# graph (should stay empty!)
disconnects = []

# loop over the per-map sheets, build both graphs and register them in maps
for counter, (sheet, df_map_raw) in enumerate(zip(sheetlist, map_frames)):

    print("Map #", counter,":",len(df_map_raw))

    # clean up to produce square adjacency matrices (raw and category-reduced)
    df_map_clean_raw, df_map_clean_red = clean_up(df_map_raw, node_list=cat_dict_usr)
    print("raw shape:", df_map_clean_raw.shape,"reduced shape:", df_map_clean_red.shape)

    # create DiGraphs of both raw and recategorised adjacency matrices
    g_raw   = nx.from_pandas_adjacency(df_map_clean_raw, create_using=nx.DiGraph)
    g_recat = nx.from_pandas_adjacency(df_map_clean_red, create_using=nx.DiGraph)

    # check that all nodes are accounted for
    if len(df_map_clean_raw) != len(g_raw):
        print("Mismatch in raw:",counter, len(df_map_clean_raw), len(g_raw))
    if len(df_map_clean_red) != len(g_recat):
        print("Mismatch in recat:",counter, len(df_map_clean_red), len(g_recat))

    # connectivity is judged on the undirected versions; compute them once
    und_raw = g_raw.to_undirected()
    und_recat = g_recat.to_undirected()
    raw_connected = nx.is_connected(und_raw)
    recat_connected = nx.is_connected(und_recat)

    if not (raw_connected and recat_connected):
        print("Connected? ", raw_connected, recat_connected )

        # nx.connected_component_subgraphs was removed in networkx 2.4;
        # subgraph views over connected_components are the documented
        # replacement.
        raw_components = [ und_raw.subgraph(c) for c in nx.connected_components(und_raw) ]
        recat_components = [ und_recat.subgraph(c) for c in nx.connected_components(und_recat) ]

        largest_cc_raw = len(max(raw_components, key=len))
        largest_cc_recat = len(max(recat_components, key=len))
        smallest_cc_recat = min(recat_components, key=len)
        print("Largest raw: ", len(g_raw), largest_cc_raw )
        print("Larget recat:", len(g_recat), largest_cc_recat )
        print("Smallest recat:", smallest_cc_recat.nodes())

        if not recat_connected:
            Gc = smallest_cc_recat
            print("Connected components:", Gc.nodes())
            disconnects.append( list(Gc.nodes())[0] )

    # Add networkx graphs to dictionary (raw graph -> recategorised graph)
    maps[g_raw] = g_recat


print("Number of maps:", len(maps))
print("disconnects:", disconnects)

In [None]:
# Display the category dictionary returned by get_reduced_categories.
cat_dict_usr

In [None]:
# Build the aggregated user map from the recategorised graphs and attach the
# deltas computed from the raw -> recategorised pairs in `maps`.
G_bruker = CoMap(name='Agg_bruker')
deltas = compute_graph_deltas( maps )
G_bruker.aggregate_maps(maps.values(), cat_dict_usr)
G_bruker.set_deltas(deltas)

# Pass the colours as a list: a dict_values view is not a sequence and is
# rejected by some matplotlib versions.
# NOTE(review): this assumes node_colors is ordered like G_bruker.map's nodes — confirm.
nx.draw_circular(G_bruker.map, node_color=list(G_bruker.node_colors.values()), with_labels=True)

In [None]:
# Plot the map deltas of the aggregate (presumably those stored by
# set_deltas; the method is defined in comap.mapper — TODO confirm).
G_bruker.plot_map_deltas()

In [None]:
# Draw the aggregated user map with CoMap's own plotting method.
G_bruker.plot_map()

In [None]:
# Show the aggregate's map properties, requesting sorting by 'Pagerank'.
G_bruker.map_properties(sort_by=['Pagerank'])

In [None]:
# Quadrant scatter plot for the non-synthetic aggregate; uncomment the
# savefig line to write the figure to disk.
G_bruker.plot_quadrant_scatter()
#plt.savefig('non-synthetic.png')

## Create synthetic aggregate


In [None]:
# Generate a synthetic graph from the aggregate using 'laplace' smearing
# with noise_scale 0.5; the call returns three values, of which S is the
# object used by the cells that follow.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# this result is presumably not reproducible between runs — confirm which
# RNG generate_synthetic_graph uses and seed it.
S, m_diff, arr = G_bruker.generate_synthetic_graph(noise_scale=.5, smear_func='laplace')
#nx.draw_circular(S.map, node_color=S.node_colors.values(), with_labels=True)


In [None]:
# Scratch check: does the name 'sl' match either of the two smear-function
# names tested here? (Evaluates to False.)
smear_func = 'sl'
func = smear_func in ('laplace', 'normal')
func

In [None]:
# Draw the synthetic graph S with the same plotting method used for the aggregate.
S.plot_map()


In [None]:
# Request the 10 highest-ranking nodes of the synthetic graph (the ranking
# criterion is defined in comap.mapper — not visible here).
S.get_n_highest_ranking_nodes(n=10)

In [None]:
# Quadrant scatter plot for the synthetic graph, for comparison with the
# non-synthetic one; uncomment the savefig line to write it to disk.
S.plot_quadrant_scatter()
#plt.savefig('synthetic.png')