In [1]:
%load_ext autoreload
%autoreload 2

import subprocess
import os
import glob
from multiprocessing import Pool

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from infomap import Infomap

from main import *

In [2]:
# Build the Rust simulator in release mode; a later cell runs the resulting
# binary at simulate/target/release/simulate.
!cd simulate && cargo build --release

[0m[0m[1m[32m    Finished[0m release [optimized] target(s) in 0.00s


In [3]:
# Column labels for the attribute table. They are passed via `names=`, so
# pandas treats every line of the file as data.
columns = [
    "seniority",
    "status",
    "gender",
    "office",
    "years with firm",
    "age",
    "practice",
    "law school",
]

# One row per lawyer, indexed 0..N-1 by file order.
attr = pd.read_csv(
    "data/LazegaLawyers/ELattr.dat",
    names=columns,
    sep=" ",
)
attr

Unnamed: 0,seniority,status,gender,office,years with firm,age,practice,law school
0,1,1,1,1,31,64,1,1
1,2,1,1,1,32,62,2,1
2,3,1,1,2,13,67,1,1
3,4,1,1,1,31,59,2,3
4,5,1,1,2,31,59,1,2
...,...,...,...,...,...,...,...,...
66,67,2,2,1,1,35,1,2
67,68,2,1,1,1,26,1,3
68,69,2,2,1,1,38,1,3
69,70,2,1,1,1,31,2,2


In [4]:
# Load the friendship matrix and make it undirected: a tie in either direction
# counts, and entries that become 2 after adding the transpose are clamped to 1.
A = np.loadtxt("data/LazegaLawyers/ELfriend.dat")
A = np.minimum(A + A.T, 1)
A

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [None]:
# Write the network as a link list: one "<source> <target>" line for every
# positive entry of A, in row-major order (both directions of each tie appear
# because A is symmetric).
rows, cols = np.nonzero(A > 0)

with open("lazega.net", "w") as f:
    f.writelines(f"{source} {target}\n" for source, target in zip(rows, cols))

In [None]:
# Node id (row index of `attr`) -> gender code.
gender = attr["gender"].to_dict()

# Human-readable label for each gender code.
gender_name = {
    1: "male",
    2: "female"
}

def get_node_name(node):
    """Return the display label "<node id> <gender name>" for `node`.

    Looks the node up in the module-level `gender` dict, so that name must
    keep referring to the dict.
    """
    return f"{node} {gender_name[gender[node]]}"

# Write one "<node id> <gender code>" line per node.
# The loop variable is deliberately NOT called `gender`: reusing that name
# would rebind the global dict to the last gender code (an int) and break
# `get_node_name` and every later cell that indexes `gender`.
with open("lazega.meta", "w") as f:
    for node, gender_code in gender.items():
        f.write(f"{node} {gender_code}\n")

In [None]:
# Build the friendship graph from the adjacency matrix.
# `from_numpy_array` replaces `from_numpy_matrix`, which was removed in
# NetworkX 3.0; both create one edge per non-zero entry.
H = nx.from_numpy_array(A)

# Store each node's gender code under the "type" attribute.
nx.set_node_attributes(H, attr["gender"], "type")

# Drop nodes without any tie so they do not appear in the drawings.
H.remove_nodes_from(list(nx.isolates(H)))

# Fixed seed so every figure uses the same node positions.
pos = nx.spring_layout(H, seed=27)

In [None]:
# Remove artefacts of earlier runs (.tree files written via Infomap, simulator
# .temp files; .clu/.json presumably also Infomap outputs).
# NOTE(review): the globs match every file with these extensions in the
# working directory, not only files created by this notebook.
!rm -rf *.tree *.clu *.json *.temp

In [None]:
# Settings for the simulator runs; each value is passed to the simulator
# binary as a string.
# presumably: coding probability for same-gender vs different-gender links — confirm against the simulator
same_code_prob = 1
# Multipliers applied to same_code_prob; one simulator run per entry.
diff_code_prob = [1, 0.25, 0.125]
n_samples = 10_000_000

In [None]:
%%time
simulate = "simulate/target/release/simulate"

# One command line per entry of diff_code_prob; run i writes lazega_{i}.temp.
# presumably the positional arguments are: network file, metadata file, output
# file, same-code probability, diff-code probability, sample count — confirm
# against the simulator's CLI.
args = [[simulate, '-c',
         "lazega.net", "lazega.meta", f"lazega_{i}.temp",
         str(same_code_prob),
         str(diff * same_code_prob),
         str(n_samples)]
        for i, diff in enumerate(diff_code_prob)]

# Run the simulations in parallel, one worker process per command.
with Pool() as p:
    results = p.map(subprocess.run, args)

# subprocess.run does not raise on a non-zero exit status by default, so a
# failed simulation would otherwise go unnoticed and later cells would read
# stale or missing .temp files.
failed = [result.args for result in results if result.returncode != 0]
if failed:
    raise RuntimeError(f"simulate exited with a non-zero status for: {failed}")

In [None]:
# One panel per simulated diff-code probability.
num_figs = len(diff_code_prob)
fig, axs = plt.subplots(1, num_figs, figsize=(num_figs*8, 8))
fig.tight_layout()

# Panel index -> {module id found by Infomap: module id to draw it as}.
# Used to relabel modules in panels 1 and 2.
module_map = {
    1: {2: 3, 3: 2},
    2: {2: 3, 3: 2},
}

temp_files = sorted(glob.glob("lazega_*.temp"))

for i, file in enumerate(temp_files):
    ax = axs[i]

    # Partition the simulated network and keep the tree file next to it.
    im = Infomap(two_level=True, num_trials=100, flow_model="rawdir", silent=True)
    im.read_file(file)
    im.run()
    im.write_tree(f"lazega_{i}.tree")

    modules = dict(im.modules)

    if i == 0:
        # Fix simulation noise
        modules[60] = 4

    # Relabel modules for this panel; ids without an entry stay unchanged.
    relabel = module_map.get(i, {})
    modules = {node: relabel.get(module, module)
               for node, module in modules.items()}

    nx.set_node_attributes(H, modules, "modules")

    ax.axis("off")
    draw_network(H, pos, ax=ax)

    nx.draw_networkx_labels(H, pos=pos, ax=ax)

axs[-1].axis("off")

fig.savefig("figures/lazega_friends_color.svg")

# Scott's metadata map equation

In [None]:
# Metadata rates to compare; one panel per rate.
etas = [0, 0.7, 1.25]

# Size the figure from the list that is actually iterated below, not from a
# panel count left over from another cell.
num_panels = len(etas)
fig, axs = plt.subplots(1, num_panels, figsize=(num_panels*8, 8))
fig.tight_layout()

# Panel index -> {module id found by Infomap: module id to draw it as}.
module_map = {
    1: {2: 3, 3: 2, 6: 5, 5: 4, 4: 7},
    2:             {6: 5, 5: 4, 4: 7, 7: 6}
}

for i, eta in enumerate(etas):
    im = Infomap(two_level=True, meta_data_rate=eta, silent=True)
    im.add_networkx_graph(H)

    # Attach each node's "type" attribute as its Infomap metadata.
    for node, meta in H.nodes.data("type"):
        im.set_meta_data(node, meta)

    im.run()
    im.write_tree(f"eta_{eta}.tree")

    modules = dict(im.modules)

    # Relabel modules for this panel; ids without an entry stay unchanged.
    # Every lookup uses the original id, so swaps such as {2: 3, 3: 2} work.
    relabel = module_map.get(i, {})
    modules = {node: relabel.get(module, module)
               for node, module in modules.items()}

    nx.set_node_attributes(H, modules, "modules")
    axs[i].axis("off")
    draw_network(H, pos, ax=axs[i])

    nx.draw_networkx_labels(H, pos=pos, ax=axs[i])

axs[-1].axis("off")

fig.savefig("figures/lazega_friends_meta.svg")

# Analysis

In [None]:
%%time

same_code_prob = 1
# Sweep the diff-code multiplier from 2**0 down to 2**-4 on a log scale,
# using np.logspace's default number of samples.
diff_code_prob = np.logspace(0, -4, base=2, endpoint=True)
n_samples = 10**6

# One dict per swept value: node -> (module, gender index, flow).
data = []

for diff in diff_code_prob:
    diff_coding_prob = diff * same_code_prob

    # NOTE(review): `edgelist` is not defined in any visible cell; presumably
    # it comes from `main` via the star import — confirm.
    # Assumes `gender` is still the node -> gender-code dict.
    G = generate_links(edgelist,
                       gender,
                       same_coding_prob=same_code_prob,
                       diff_coding_prob=diff_coding_prob,
                       n_samples=n_samples)

    im = run_infomap(G, get_node_name=get_node_name)

    # Flow of every leaf node in the Infomap tree, keyed by node id.
    flow_ = {node.node_id: node.data.flow for node in im.tree if node.is_leaf}

    # Gender codes 1/2 are shifted to 0/1 so they can index array columns.
    data_ = {node: (module, gender[node] - 1, flow_[node])
             for (node, module) in G.nodes.data("modules")}

    data.append(data_)

In [None]:
# Imported explicitly: `defaultdict` is used below and must not depend on
# whatever `from main import *` happens to export.
from collections import defaultdict

from scipy.stats import entropy

num_genders = 2

# Row = swept network, column = gender index (0 or 1).
eff_assignments = np.zeros((len(data), num_genders))

for idx, network in enumerate(data):
    N_i_u = defaultdict(lambda: defaultdict(float)) # module -> gender -> flow
    N_u = defaultdict(float)                        # gender -> total flow
    N_i = defaultdict(float)                        # module -> total flow

    for i, u, flow in network.values():
        N_i_u[i][u] += flow
        N_u[u] += flow
        N_i[i] += flow

    for i, assignments in N_i_u.items():
        # Perplexity (2**entropy) of the gender mix of the flow in module i.
        p_i = 2**entropy([n/N_i[i] for n in assignments.values()], base=2)

        # Weight the perplexity by the share of gender u's flow in module i.
        for u, n in assignments.items():
            eff_assignments[idx, u] += n/N_u[u] * p_i

eff_assignments

In [None]:
from math import log2

# Effective assignments per gender (column 0 = male, column 1 = female)
# against the swept probability on a -log2 scale.
df = pd.DataFrame({
    "male": eff_assignments[:, 0],
    "female": eff_assignments[:, 1],
    '-log2(p_diff)': [-log2(p) for p in diff_code_prob],
})
df.plot(x='-log2(p_diff)', figsize=(12, 8))

In [None]:
# Idea:
# Seniority is correlated to pagerank
# Maybe not a good idea in friendship network.

# Flow of every node in the first swept network, keyed by node id.
flow_by_node = {node_id: node_flow for node_id, (_, _, node_flow) in data[0].items()}

# add missing nodes
flow_by_node.update({43: 0, 46: 0})

# Flows ordered by node id so they line up with the rows of `attr`.
flow = [flow_by_node[node_id] for node_id in sorted(flow_by_node)]

seniority = attr["seniority"]

sns.lineplot(x=seniority, y=flow)

In [None]:
# Compare Color Map Equation where we encode on each step to when we add metadata
from scipy.stats import pearsonr

# Use the same node-id range 0..max_id for every network. Filling only up to
# each network's own largest id can give vectors of different lengths, which
# makes pearsonr raise.
num_nodes = max((max(each) for each in data if each), default=-1) + 1

corr = []
first = None

for each in data:
    # Flow per node id; nodes absent from this network get zero flow.
    flow = [each[node_id][2] if node_id in each else 0
            for node_id in range(num_nodes)]

    # The first network is the reference every network is correlated with.
    if first is None:
        first = flow

    r = pearsonr(first, flow)
    corr.append(r[0])

df["pearson r"] = corr
ax = df.plot(x='-log2(p_diff)', y=["pearson r"], figsize=(12, 8))
ax.set_ylabel("PCC flow")