In [1]:
from datetime import datetime

import numpy as np
import pandas as pd

from lassort import load, localAssortF

In [3]:
# Input files: the Reed98 network and its per-node attribute table.
networkfile = '../../problem_sets/2/facebook100txt/Reed98.txt'
metadatafile = '../../problem_sets/2/facebook100txt/Reed98_attr.txt'

# Results of localAssortF, one entry per metadata column, in loop order.
Ms, Ts, Zs = [], [], []

# Re-load the data once per metadata column (meta_col = 1..5) and time each run.
for col in range(1, 6):
    start = datetime.now()

    E, M = load(networkfile, metadatafile,
                zero_index=1, meta_col=col,
                header=True, reindex=True, missing_value=0)
    assortM, assortT, Z = localAssortF(E, M, pr=np.arange(0, 1, 0.1))

    Ms.append(assortM)
    Ts.append(assortT)
    Zs.append(Z)

    end = datetime.now()
    print(f"Ran multiscale mixing for attribute {col} in {(end-start).total_seconds()} seconds.")

Ran multiscale mixing for attribute 1 in 6.455181 seconds.
Ran multiscale mixing for attribute 2 in 7.242667 seconds.
Ran multiscale mixing for attribute 3 in 8.76179 seconds.
Ran multiscale mixing for attribute 4 in 7.974282 seconds.
Ran multiscale mixing for attribute 5 in 7.12952 seconds.


# TODO:
Play with some toy sample networks to build intuition.  

What happens with one mismatched node in an assortative neighborhood?  How about one assortative node in a disassortative neighborhood?  How about a node with a null value?  Nulls will become very important here.

In [62]:
from networkx.generators.community import stochastic_block_model as sbm
import networkx as nx

In [113]:
# Two-block stochastic block model with unequal block sizes; the diagonal of p
# (within-block edge probability) is twice the off-diagonal (between-block).
sizes = [200, 100]
p = [[.1, .05],
     [.05, .1]]
G = sbm(sizes, p)

# Edge array plus one label per node: 0.0 for the first sizes[0] entries,
# 1.0 for the remaining sizes[1] entries.
E = nx.to_pandas_edgelist(G).values
M = np.repeat([0.0, 1.0], sizes)

assortM, assortT, Z = localAssortF(E, M, pr=np.arange(0, 1, 0.1))

In [114]:
# Column 0 holds the per-node assortT values, column 1 the node labels M.
tm = pd.DataFrame([assortT, M]).transpose()
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.858052
1,-0.858052,1.0


In [115]:
# Mean of column 0 (assortT) over the nodes labelled 0.
tm.loc[tm[1] == 0, 0].mean()

0.4289589320654478

In [116]:
# Mean of column 0 (assortT) over the nodes labelled 1.
tm.loc[tm[1] == 1, 0].mean()

0.02883230660439054

In [117]:
# Same block probabilities as before, with both blocks halved in size.
sizes = [100, 50]
p = [[.1, .05],
     [.05, .1]]
G = sbm(sizes, p)

# Edge array plus one label per node: 0.0 for the first sizes[0] entries,
# 1.0 for the remaining sizes[1] entries.
E = nx.to_pandas_edgelist(G).values
M = np.repeat([0.0, 1.0], sizes)

assortM, assortT, Z = localAssortF(E, M, pr=np.arange(0, 1, 0.1))

# Column 0: per-node assortT; column 1: node label.
tm = pd.DataFrame([assortT, M]).transpose()
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.78771
1,-0.78771,1.0


In [118]:
# Mean of column 0 (assortT) over the nodes labelled 0.
tm.loc[tm[1] == 0, 0].mean()

0.38448229887830687

In [119]:
# Mean of column 0 (assortT) over the nodes labelled 1.
tm.loc[tm[1] == 1, 0].mean()

-0.07788745829983985

In [127]:
# Fresh random draw with the same sizes and probabilities as the previous
# experiment, to see how much the numbers move between samples.
sizes = [100, 50]
p = [[.1, .05],
     [.05, .1]]
G = sbm(sizes, p)

# Edge array plus one label per node: 0.0 for the first sizes[0] entries,
# 1.0 for the remaining sizes[1] entries.
E = nx.to_pandas_edgelist(G).values
M = np.repeat([0.0, 1.0], sizes)

assortM, assortT, Z = localAssortF(E, M, pr=np.arange(0, 1, 0.1))

# Column 0: per-node assortT; column 1: node label.
tm = pd.DataFrame([assortT, M]).transpose()
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.795238
1,-0.795238,1.0


In [128]:
# Mean of column 0 (assortT) over the nodes labelled 0.
tm.loc[tm[1] == 0, 0].mean()

0.40274211331718734

In [129]:
# Mean of column 0 (assortT) over the nodes labelled 1.
tm.loc[tm[1] == 1, 0].mean()

-0.06171927218209498

When there are unbalanced classes, the smaller class comes out with a lower multiscale mixing coefficient even when there is a higher probability of pairing.  How does this change in disassortative networks?

TODO: what happens when we only look at smaller scales?

In [131]:
# Disassortative variant: the between-block edge probability (0.2) is now
# larger than the within-block probability (0.05).
sizes = [100, 50]
p = [[.05, .2],
     [.2, .05]]
G = sbm(sizes, p)

# Edge array plus one label per node: 0.0 for the first sizes[0] entries,
# 1.0 for the remaining sizes[1] entries.
E = nx.to_pandas_edgelist(G).values
M = np.repeat([0.0, 1.0], sizes)

assortM, assortT, Z = localAssortF(E, M, pr=np.arange(0, 1, 0.1))

# Column 0: per-node assortT; column 1: node label.
tm = pd.DataFrame([assortT, M]).transpose()
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.654095
1,-0.654095,1.0


In [132]:
# Mean of column 0 (assortT) over the nodes labelled 0.
tm.loc[tm[1] == 0, 0].mean()

-0.4374436736675371

In [133]:
# Mean of column 0 (assortT) over the nodes labelled 1.
tm.loc[tm[1] == 1, 0].mean()

-0.6598697622429177

The smaller group here will receive a lower score because its nodes will, on average, have more neighbors in the other group.  This makes sense I think...

What happens if we add a few nulls?

In [185]:
# Disassortative two-block SBM (between-block probability 0.2, within-block
# probability 0.05) in which a few nodes have their label replaced by -1.
sizes = [100, 50]

p = [[.05,.2], [.2, .05]]

G = sbm(sizes, p)

# Pick three *distinct* nodes to relabel. np.random.choice samples with
# replacement by default, so without replace=False the same node could be
# drawn more than once and fewer than three nodes would end up relabelled.
nulls = np.random.choice(sum(sizes), 3, replace=False)
for n in nulls:
    G.nodes[n]["block"] = -1

E = nx.convert_matrix.to_pandas_edgelist(G).values
# Labels: 0 for the first sizes[0] entries, 1 for the rest, then -1 at the
# chosen positions.
M = np.hstack([np.zeros(sizes[0]), np.ones(sizes[1])])
M[nulls] = -1

assortM, assortT, Z = localAssortF(E,M,pr=np.arange(0,1,0.1))
# Column 0: per-node assortT; column 1: node label (including the -1 entries).
tm = pd.DataFrame([assortT, M]).T
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.625614
1,-0.625614,1.0


In [186]:
# Rows of tm (assortT, label) for the nodes in `nulls`, whose labels were set to -1.
tm.iloc[nulls]

Unnamed: 0,0,1
130,-0.490576,-1.0
133,-0.514159,-1.0
19,-0.583676,-1.0


In [187]:
# Mean of column 0 (assortT) over the nodes labelled 0; rows labelled -1 are excluded.
tm.loc[tm[1] == 0, 0].mean()

-0.44282499452653956

In [188]:
# Mean of column 0 (assortT) over the nodes labelled 1; rows labelled -1 are excluded.
tm.loc[tm[1] == 1, 0].mean()

-0.669816683009928

In [189]:
# Entries of Z (third value returned by localAssortF) at the positions in `nulls`.
Z[nulls]

array([0.46433425, 0.46631966, 0.4689766 ])

Nulls appear to be less assortative than the baseline for their group.  What about in an assortative network?

In [193]:
# Assortative two-block SBM (within-block probability 0.3, between-block
# probability 0.03) in which a few nodes have their label replaced by -1.
sizes = [100, 50]

p = [[.3,.03], [.03, .3]]

G = sbm(sizes, p)

# Pick three *distinct* nodes to relabel. np.random.choice samples with
# replacement by default, so without replace=False the same node could be
# drawn more than once and fewer than three nodes would end up relabelled.
nulls = np.random.choice(sum(sizes), 3, replace=False)
for n in nulls:
    G.nodes[n]["block"] = -1

E = nx.convert_matrix.to_pandas_edgelist(G).values
# Labels: 0 for the first sizes[0] entries, 1 for the rest, then -1 at the
# chosen positions.
M = np.hstack([np.zeros(sizes[0]), np.ones(sizes[1])])
M[nulls] = -1

assortM, assortT, Z = localAssortF(E,M,pr=np.arange(0,1,0.1))
# Column 0: per-node assortT; column 1: node label (including the -1 entries).
tm = pd.DataFrame([assortT, M]).T
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.756486
1,-0.756486,1.0


In [194]:
# Mean of column 0 (assortT) over the nodes labelled 0; rows labelled -1 are excluded.
tm.loc[tm[1] == 0, 0].mean()

0.8295097824295542

In [195]:
# Mean of column 0 (assortT) over the nodes labelled 1; rows labelled -1 are excluded.
tm.loc[tm[1] == 1, 0].mean()

0.5479452937462972

In [196]:
# Rows of tm (assortT, label) for the nodes in `nulls`, whose labels were set to -1.
tm.iloc[nulls]

Unnamed: 0,0,1
92,0.802403,-1.0
7,0.78713,-1.0
118,0.671871,-1.0


In [208]:
# Entries of Z (third value returned by localAssortF) at the positions in `nulls`.
Z[nulls]

array([0.47983764, 0.47918146, 0.47429308])

In [207]:
# Display the whole Z array so the values at the `nulls` positions can be
# compared with those of every other node.
Z

array([0.98392421, 0.98672734, 0.96393149, 0.96661903, 0.95246534,
       0.98663789, 0.98610764, 0.47918146, 0.98490224, 0.98562941,
       0.98600621, 0.96080906, 0.96176982, 0.96541365, 0.98537786,
       0.94438043, 0.96695719, 0.98682698, 0.96799251, 0.98541582,
       0.96621717, 0.98480794, 0.96926452, 0.98551886, 0.98573399,
       0.98556829, 0.96394408, 0.98369197, 0.9829494 , 0.94445679,
       0.96104915, 0.98720283, 0.96337437, 0.98571397, 0.95429427,
       0.96466672, 0.98575611, 0.96120573, 0.98507701, 0.98706352,
       0.96365208, 0.96703355, 0.98579286, 0.96692636, 0.96503826,
       0.98644305, 0.96510599, 0.98433744, 0.98512257, 0.96439358,
       0.98541253, 0.96282514, 0.96345924, 0.9602817 , 0.98639021,
       0.98473054, 0.98427973, 0.96075897, 0.9852566 , 0.9855253 ,
       0.98495613, 0.98516022, 0.98650375, 0.98590403, 0.96540248,
       0.9857383 , 0.96970803, 0.94123719, 0.96934322, 0.96680045,
       0.98571165, 0.9582131 , 0.98593271, 0.98446372, 0.98564

Okay I think I get it... nulls are pulled closer to the global mean because their label is missing from the sample of the neighborhood.

What about when you put one mismatched node in the middle of a VERY assortative neighborhood?

In [217]:
# Strongly assortative two-block SBM: within-block edge probability 0.4,
# between-block probability 0.01.
sizes = [100, 50]
p = [[.4, .01],
     [.01, .4]]
G = sbm(sizes, p)

# Baseline run: no label is flipped here. The following experiment repeats
# this setup with a single node's label swapped.

# Edge array plus one label per node: 0.0 for the first sizes[0] entries,
# 1.0 for the remaining sizes[1] entries.
E = nx.to_pandas_edgelist(G).values
M = np.repeat([0.0, 1.0], sizes)

assortM, assortT, Z = localAssortF(E, M, pr=np.arange(0, 1, 0.1))

# Column 0: per-node assortT; column 1: node label.
tm = pd.DataFrame([assortT, M]).transpose()
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.619492
1,-0.619492,1.0


In [218]:
# Mean of column 0 (assortT) over the nodes labelled 0.
tm.loc[tm[1] == 0, 0].mean()

0.9654350111766228

In [219]:
# Mean of column 0 (assortT) over the nodes labelled 1.
tm.loc[tm[1] == 1, 0].mean()

0.889745396634061

In [220]:
# Strongly assortative two-block SBM (0.4 within, 0.01 between) in which one
# node drawn from the first sizes[0] indices is given label 1 instead of 0.
sizes = [100, 50]
p = [[.4, .01],
     [.01, .4]]
G = sbm(sizes, p)

# Index of the node whose label gets flipped.
swap = np.random.choice(sizes[0])
G.nodes[swap]["block"] = 1

# Edge array plus one label per node (0.0 then 1.0), with the chosen
# position overwritten by 1.
E = nx.to_pandas_edgelist(G).values
M = np.repeat([0.0, 1.0], sizes)
M[swap] = 1

assortM, assortT, Z = localAssortF(E, M, pr=np.arange(0, 1, 0.1))

# Column 0: per-node assortT; column 1: node label.
tm = pd.DataFrame([assortT, M]).transpose()
tm.corr()

Unnamed: 0,0,1
0,1.0,-0.344674
1,-0.344674,1.0


In [221]:
# Mean of column 0 (assortT) over the nodes labelled 0.
tm.loc[tm[1] == 0, 0].mean()

0.9053527816001847

In [222]:
# Mean of column 0 (assortT) over the nodes labelled 1 (this includes the swapped node).
tm.loc[tm[1] == 1, 0].mean()

0.808790090200564

In [223]:
# Row of tm (assortT, label) for the node whose label was flipped to 1.
tm.iloc[swap]

0   -0.550017
1    1.000000
Name: 73, dtype: float64

In [225]:
# Entry of Z (third value returned by localAssortF) for the swapped node.
Z[swap]

1.0000000000000007

In [224]:
# First rows of tm (column 0: assortT, column 1: label) for comparison with the swapped node.
tm.head()

Unnamed: 0,0,1
0,0.91452,0.0
1,0.905355,0.0
2,0.91988,0.0
3,0.886172,0.0
4,0.861936,0.0


In [227]:
# The five rows with the smallest column-0 (assortT) values.
tm.sort_values(by=0).head(5)

Unnamed: 0,0,1
73,-0.550017,1.0
142,0.654003,1.0
107,0.690184,1.0
122,0.704483,1.0
102,0.708737,1.0


Okay looks like the one node in an assortative neighborhood with mismatched labels does come out very disassortative.  This is reassuring.

So the one big question is how do you deal with unbalanced classes (as usual... FML)

Options:

1. Just use it and understand the impact of class imbalance.
2. Take the final T_assort score and divide by the frequency of the starting node's class.  This is wrong (I think!) because the bias will be different at every node in the neighborhood depending on all of their classes.
3. Alter the source code to normalize by class frequency at every step.  Look for and deal with any unanticipated consequences of doing this.
4. Ditch multiscale mixing altogether and just use the node's immediate neighbors.  Consider only working with networks of high mean degree or excluding low-degree nodes.

In [22]:
# Peek at the first line of the attribute file to see the column names.
with open("../../problem_sets/2/facebook100txt/Reed98_attr.txt") as f:
    first_line = f.readline()
print(first_line)

id	status	gender	major	dorm	year



In [29]:
# `T` is not defined anywhere in this notebook, so this cell fails on a fresh
# kernel. NOTE(review): it is presumably the per-attribute assortT results in
# Ts stacked into a single array — confirm against the original session.
np.isnan(np.asarray(Ts)).any()

False

In [32]:
# `T` is not defined anywhere in this notebook, so this cell fails on a fresh
# kernel. NOTE(review): the five-value output suggests it was the five
# per-attribute assortT arrays in Ts stacked row-wise — confirm.
# Mean assortT per attribute (one value per entry of Ts).
np.asarray(Ts).mean(axis=1)

array([0.29234465, 0.02837804, 0.05417487, 0.14063715, 0.37408552])

In [35]:
import pandas as pd
# Summarise Z from the last attribute of the Reed98 loop. Use Zs[-1] rather
# than the bare name Z: the toy-network cells earlier in the notebook reassign
# Z, so in top-to-bottom order Z no longer refers to the Reed98 result.
pd.Series(Zs[-1]).describe()

count    962.000000
mean       0.790133
std        0.184279
min        0.231184
25%        0.797505
50%        0.854555
75%        0.895501
max        0.967966
dtype: float64

In [41]:
# Preview: first 5 rows and first 20 columns of the per-attribute assortM
# arrays placed side by side.
np.hstack([m[:5] for m in Ms])[:, :20]

array([[ 0.68605323,  0.65406055,  0.62011382,  0.58402034,  0.54554005,
         0.50437737,  0.46017216,  0.41248988,  0.3608109 ,  0.30451898,
        -0.00819243, -0.02786717, -0.04674454, -0.06472269, -0.08166825,
        -0.09740664, -0.11171044, -0.12428589, -0.13475748, -0.14265056],
       [ 0.68169286,  0.66001166,  0.63689971,  0.61181461,  0.58405399,
         0.55267835,  0.51640707,  0.47348203,  0.42149384,  0.35716519,
         0.10557929,  0.06653648,  0.03019543, -0.00338379, -0.03409618,
        -0.06178186, -0.08621237, -0.10707248, -0.12393716, -0.13624287],
       [ 1.        ,  0.71688624,  0.47544154,  0.27618432,  0.12004964,
         0.00848091, -0.0564428 , -0.07183812, -0.03380137,  0.06286065,
        -1.0443902 , -1.03607233, -1.01100212, -0.96892756, -0.9094827 ,
        -0.83216877, -0.73632812, -0.62111023, -0.48542856, -0.32790727],
       [ 1.        ,  0.98656291,  0.9699976 ,  0.94944189,  0.92360981,
         0.89049364,  0.84681463,  0.78701976,  

In [52]:
# Column-wise mean (over rows) of the side-by-side assortM arrays.
np.mean(np.hstack(Ms), axis=0)

array([ 0.30588893,  0.30222484,  0.29911297,  0.29655946,  0.2945505 ,
        0.29302089,  0.29177978,  0.29034386,  0.2875844 ,  0.28102227,
       -0.15131465, -0.14974378, -0.14811274, -0.14642603, -0.14469543,
       -0.14294721, -0.14123736, -0.13968363, -0.13853005, -0.13827156,
        0.03428544,  0.03347926,  0.03265541,  0.03180951,  0.03093733,
        0.03003584,  0.0291057 ,  0.02815749,  0.02722511,  0.02639316,
        0.04107555,  0.03978362,  0.03837452,  0.03680697,  0.03501856,
        0.03290881,  0.03030457,  0.02688895,  0.02206003,  0.01465982,
        0.23807638,  0.23900372,  0.23995328,  0.2408909 ,  0.24176108,
        0.24246728,  0.24283044,  0.24250216,  0.24078977,  0.23631787])

In [53]:
# All per-attribute assortM arrays placed side by side in one array.
np.hstack(Ms)

array([[ 0.68605323,  0.65406055,  0.62011382, ...,  0.15396345,
         0.17027345,  0.18921269],
       [ 0.68169286,  0.66001166,  0.63689971, ...,  0.30731533,
         0.28169637,  0.25418847],
       [ 1.        ,  0.71688624,  0.47544154, ..., -0.05933412,
         0.01683835,  0.10661169],
       ...,
       [ 0.75619027,  0.72714125,  0.6964998 , ...,  0.22613851,
         0.22690235,  0.227655  ],
       [-2.81968572, -2.56246783, -2.30515339, ...,  0.02768302,
         0.08086656,  0.14008465],
       [-0.24139786, -0.2473983 , -0.24920557, ...,  0.1961651 ,
         0.19410676,  0.19637812]])