In [None]:
import os
from collections import Counter

import graphistry
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from ml.dgl_utils import *
from ml.utils import *

In [None]:
import umap
import umap.plot

In [None]:
import logging

# Install the default root handler, then create the notebook's own 'demo'
# logger and let DEBUG-level records through it.
logging.basicConfig()
logger = logging.getLogger(name='demo')
logger.setLevel(level=logging.DEBUG)

In [None]:
def scatterplot(ux, color_labels=None):
    """Draw a quick 2-D scatter of `ux` on a fresh 10x8 matplotlib figure.

    Parameters
    ----------
    ux : array-like exposing `.T`; `ux.T[0]` is used as the x coordinates and
        `ux.T[1]` as the y coordinates.
    color_labels : optional
        Forwarded unchanged to matplotlib's `c=` argument to colour the points.

    Returns
    -------
    None. The figure is left as the current pyplot figure.
    """
    # Imported lazily so the helper is self-contained.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 8))
    xs = ux.T[0]
    ys = ux.T[1]
    # Large, translucent markers so overlapping points stay visible.
    plt.scatter(xs, ys, c=color_labels, s=100, alpha=0.4)

In [None]:
# Credentials come from the environment, never from the notebook itself.
# Set GRAPHISTRY_USERNAME and GRAPHISTRY_PASSWORD before starting the kernel.
# The generic USERNAME variable is still honoured as a fallback for existing
# setups, but the OS commonly sets it to the login name, so the dedicated
# variable takes precedence.
graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    username=os.environ.get('GRAPHISTRY_USERNAME') or os.environ['USERNAME'],
    password=os.environ['GRAPHISTRY_PASSWORD'],
)

# We import a subgraph from the LittleSis dataset centered around BlackRock, Inc

In [None]:
# Load the edge table and the node table of the BlackRock subgraph.
# The first CSV column of each file is used as the DataFrame index.
edf = pd.read_csv('data/edges_blackrock.csv', index_col=0)
ndf = pd.read_csv('data/nodes_blackrock.csv', index_col=0)

In [None]:
# Edges: cast every column to str, discard the `src`/`dst` columns and move the
# CSV index into an ordinary column (leaving a fresh 0..n-1 index).
# NOTE: this cell overwrites `edf` in place, so it is meant to run exactly once
# after the load cell; a second run fails because `src`/`dst` are already gone.
edf = edf.astype(str).drop(columns=['src', 'dst']).reset_index()

# Nodes: only the cast to str is needed.
ndf = ndf.astype(str)

In [None]:
# Sanity check: displays whether the edge index and the node index are unique.
edf.index.is_unique, ndf.index.is_unique

# Explore subgraphs
`get_graphistry_from_search` is a useful way to do fuzzy search over the dataframes to retrieve useful information

In [None]:
%%timeit
# Time `search_to_df` for the term 'Bank' against the `to_node` column of the edge table.
search_to_df('Bank', 'to_node', edf)

In [None]:
# g = get_graphistry_from_search('Bank', 'to_node', 'from_node', 'Node', edf, ndf)
# g.plot()

In [None]:
# g = get_graphistry_from_search('climate', 'to_node', 'from_node', 'Node', edf, ndf)
# g.plot()

# Explore Milieu
`get_graphistry_from_milieu_search` is a useful way to do a fuzzy search over the dataframes and retrieve information within one and two connections of `search_term`

In [None]:
# # this works much better on full LittleSis data, than just the small BlackRock sample above...
# g = get_graphistry_from_milieu_search('meta', 'to_node', 'from_node', 'Node', edf, ndf, both=True)
# g.plot()

# Let's encode the graph as a DGL graph for use in Machine Learning

In [None]:

# Switch between the two flavours of node-level target:
#   True  -> keep only the part of each `Types` string before the first comma
#   False -> keep the full `Types` string
simple_target = False

# Either way the target is a one-column frame named 'Types' that shares the
# node table's index.
node_target = pd.DataFrame(
    {
        'Types': (
            ndf.Types.apply(lambda x: x.split(',')[0]) if simple_target else ndf.Types
        ).values
    },
    index=ndf.index,
)

In [None]:
# Display the node-level target frame.
node_target

In [None]:
# Tally how many nodes carry each distinct `Types` value.
Counter(node_target.Types)  # we have a complex/simple target defined here

In [None]:
# List the node table's columns; a subset is hand-picked in the next cell.
ndf.columns # not all of these are useful for building a model

In [None]:
# Hand-picked node columns; passed to `build_dgl_graph` as `use_node_columns` further down.
good_node_columns = ['Node', 'link', 'Blurb', 'Summary', 'Types', 'Start date', 'Revenue',
       'Website', 'Aliases', 'Gender', 'Birthday', 'Region',
       'End date'] # just removed Date of Death

In [None]:
# Column names describing the graph: edges use `from_node` as source and
# `to_node` as destination; nodes are identified by the `Node` column.
src, dst = 'from_node', 'to_node' #backwards due to the way we scraped the data
node_column = 'Node'

# Graph helper object from the ml package; edges and nodes are bound to it in the next cell.
graph = BaseDGLGraphMixin()

In [None]:
# Bind the edge table (with its source/destination column names) and the node
# table (with its id column name) to the graph helper.
g = graph.edges(edf, src, dst).nodes(ndf, node_column)

In [None]:
#g._convert_edgeDF_to_DGL(node_column, None) # works
# Node target: reuse `node_target` so the `simple_target` switch actually
# reaches the model. It used to be rebuilt from `ndf.Types` here, which silently
# ignored the switch. With `simple_target = False` the two are identical.
# A copy is taken so the graph helper cannot modify the frame displayed earlier.
y_nodes = node_target.copy()
# Edge target: the relationship type of every edge, aligned on the edge index.
y_edges = pd.DataFrame({'relationship_type': edf.relationship_type.values}, index=edf.index)

In [None]:
# Build the DGL graph with the node/edge targets attached.
# `use_node_columns` presumably limits node featurization to the hand-picked columns — confirm in ml.dgl_utils.
g.build_dgl_graph(node_column, y_nodes=y_nodes, y_edges=y_edges, use_node_columns=good_node_columns)#runs entire pipeline

In [None]:
# Inspect the helper's `_MASK` attribute (used further down to index `edf.relationship_type`).
g._MASK

In [None]:
# now we have a DGL graph with ndata and edata built via our featurization tools
# (displaying it prints the graph's summary)
g.DGL_graph

In [None]:
# Run the helper's UMAP step on the nodes; `y` is the argmax of each
# node-target row, i.e. one integer class id per node.
g2 = g.umap(kind='nodes', y=g.node_target.values.argmax(1))

In [None]:
# Inspect the node target held by the graph helper; its `.values.argmax(1)` supplies the class ids used for UMAP.
g.node_target

In [None]:
# Plot the result of the node UMAP call (`g2`), coloured by node class id.
# This used to pass the graph helper `g` itself and left `g2` unused, while the
# matching edge plot further down passes the edge result `g3`.
umap.plot.points(g2, theme='fire', labels=g.node_target.values.argmax(1))

In [None]:
# Relationship types of the edges selected by the helper's mask.
edf.relationship_type[g._MASK]

In [None]:
# Same UMAP step for the edges; `y` is the argmax of each edge-target row.
g3 = g.umap(kind='edges', y = g.edge_target.values.argmax(1))

In [None]:
# Plot the result of the edge UMAP call, coloured by edge class id.
umap.plot.points(g3, theme='fire', labels=g.edge_target.values.argmax(1))#, labels=edf.relationship_type[g._MASK])

In [None]:
g[0] # indexing works through the helper's __getitem__ method

In [None]:
# Weighted edge table exposed by the helper; used below as the graphistry edge list via its `_src`/`_dst` columns.
wdf = g.weighted_edges_df_from_nodes

In [None]:
# now we have two adjacency matrices, one from standard edgelist, and another from UMAP
# Two stacked panels on one tall figure; both matrices are densified with .toarray() before drawing.
fig, ax = plt.subplots(2, 1, figsize=(15,30))

# Top panel: the weighted node adjacency (presumably the UMAP-derived one — confirm).
ax[0].imshow(g.weighted_adjacency_nodes.toarray(), aspect='auto') # super pretty
# Bottom panel: the helper's `_adjacency` (presumably built from the edge list — confirm).
ax[1].imshow(g._adjacency.toarray(), aspect='auto')

In [None]:
# let's try to add the graphistry plottable with the umap coords and edgelist from umap
e2i = g.entity_to_index
ndf['n'] = ndf.Node.apply(lambda x: e2i[x])

In [None]:
# Display the weighted edge table.
wdf

In [None]:
# Graphistry plottable: nodes keyed by column `n`, edges taken from the weighted table (`_src` -> `_dst`).
gg = graphistry.nodes(ndf, 'n').edges(wdf, '_src', '_dst')

In [None]:
# Render the plottable built in the previous cell.
gg.plot()

# Now, with this in hand, we can train a model

In [None]:
from ml.networks import GCN  # this under the hood, only works for ndata
# this `logits = model(g, features)` breaks it if we switch to edata in training call. 
# TODO: understand why GCN is breaking this

In [None]:
# get the DGL graph object
G = g.DGL_graph

In [None]:
# Show the attribute schemes of both the nodes and the edges.
# A cell only displays its last expression, so with the two calls on separate
# lines the node schemes were computed and then discarded.
G.node_attr_schemes(), G.edge_attr_schemes()

In [None]:
# First ten entries of the node-level training mask.
G.ndata['train_mask'][:10]

In [None]:
def train_node_model(g, model, n_epochs=100):
    """Train `model` for node classification on `g`, logging progress every 20 epochs.

    Parameters
    ----------
    g : graph object exposing an `ndata` mapping with the keys 'feature',
        'target', 'train_mask' and 'test_mask'.
    model : torch module called as `model(g, features)`; must return one row
        of class logits per node.
    n_epochs : int, default 100
        Number of full-graph optimisation steps.

    Returns
    -------
    None. `model` is updated in place; progress is printed to stdout.

    Notes
    -----
    'target' is reduced to integer class ids with `argmax(1)`, so it is treated
    as a (nodes, classes) matrix (presumably one-hot — confirm with the
    featurizer). There is no validation split: the "best" accuracy reported is
    tracked on the test mask.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    best_test_acc = 0.0

    # The cast is loop-invariant, so do it once instead of on every epoch.
    features = g.ndata['feature'].float()
    labels = g.ndata['target'].argmax(1)  # a bit of a hack
    train_mask = g.ndata['train_mask']
    test_mask = g.ndata['test_mask']

    for e in range(n_epochs):
        # Forward
        logits = model(g, features)

        # Only the nodes in the training set contribute to the loss.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Metrics are bookkeeping only: keep them out of the autograd graph
        # and store them as plain Python floats.
        with torch.no_grad():
            pred = logits.argmax(1)
            test_acc = (pred[test_mask] == labels[test_mask]).float().mean().item()

        if best_test_acc < test_acc:
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 20 == 0:
            print('In epoch {}, loss: {:.3f}, test acc: {:.3f} (best {:.3f})'.format(
                e, loss.item(), test_acc, best_test_acc))



In [None]:
# Display everything stored under the DGL graph's node data.
G.ndata

In [None]:
# to get a sense of the different parts in training loop above
# NOTE: the names are swapped relative to train_node_model: here `labels` is the
# raw target matrix and `targets` holds the argmax class ids. Later cells
# compare predictions against `targets`.
features = G.ndata['feature']
labels = G.ndata['target']
train_mask = G.ndata['train_mask']
test_mask = G.ndata['test_mask']
targets = labels.argmax(1)

In [None]:
# Shapes of the feature matrix, the target matrix and the class-id vector.
features.shape, labels.shape, targets.shape

# Define the Model 

In [None]:
# Sizes for the network: the first comes from the second dimension of the node
# features, the last from the second dimension of the node target.
num_features = G.ndata['feature'].shape[1]
latent_dim = 32
num_classes = G.ndata['target'].shape[1]

# here is the model
model = GCN(num_features, latent_dim, num_classes)
model

In [None]:
# Forward pass with the still-untrained model; the shape of the result is displayed.
logits = model(G, features.float()) # have to call .float, or it gives a type(DOUBLE) error.
logits.shape

In [None]:
# untrained comparison: fraction of all nodes (no mask applied) whose argmax
# prediction matches the class id. Computed as a vectorised mean; the builtin
# sum() walked the tensor one element at a time.
pred = logits.argmax(1)
(pred == targets).float().mean()

## Train the Model

In [None]:
# Train on the full graph for 621 epochs; a progress line is printed every 20 epochs.
train_node_model(G, model, 621)

In [None]:
# trained comparison: same metric as for the untrained model, i.e. the fraction
# of all nodes (no mask applied) whose argmax prediction matches the class id.
logits = model(G, features.float())
pred = logits.argmax(1)

# Vectorised mean instead of the builtin sum(), which iterated the tensor element by element.
(pred == targets).float().mean()

In [None]:
# Registry of captured layer outputs, filled in by the hooks produced below.
activation = {}


def get_activation(name):
    """Return a forward hook that records a layer's output under `name`.

    The returned callable has the (module, inputs, output) signature of a
    forward hook. Each time it fires it stores `output.detach()` in the
    module-level `activation` dict, overwriting any previous entry for `name`,
    and returns None.
    """
    def _record(module, inputs, output):
        # Detach so the stored tensor no longer tracks gradients.
        activation[name] = output.detach()

    return _record

In [None]:
# Attach a recording hook to each conv layer, run one forward pass so the hooks
# fill the `activation` dict, then detach the hooks again. Without the removal,
# every re-run of this cell stacked another pair of hooks on the model and every
# later forward pass kept overwriting `activation`.
hook_handles = [
    model.conv1.register_forward_hook(get_activation('conv1')),
    model.conv2.register_forward_hook(get_activation('conv2')),
]
try:
    # now call model to do forward
    logits = model(G, features.float())
finally:
    for handle in hook_handles:
        handle.remove()

# the forward pass above has loaded the dictionary
print(activation['conv1'])
print(activation['conv2'])

In [None]:
# just a pretty graph: covariance of the boolean "activation > 0" pattern of
# conv1 (np.cov treats each row as one variable), drawn with the plasma colormap
plt.figure()
plt.imshow(
    np.cov(activation['conv1']>0),
    aspect='auto',
    cmap=plt.get_cmap('plasma'),
)

# Let's UMAP it

In [None]:
# Node coordinates exposed by the graph helper; used below as 2-D scatter positions.
ux = g.xy_nodes

In [None]:
# Store the two coordinate columns, scaled by 100, on the node table as `x` and `y`.
# Note: this adds columns to `ndf` in place.
ndf['x'] = ux.T[0]*100
ndf['y'] = ux.T[1]*100

# Colour each point by the index of its largest conv2 activation
# (presumably the model's predicted class — confirm against the GCN definition).
scatterplot(ux, activation['conv2'].argmax(1))

In [None]:
# let's just make some targets, then we can use it in umap
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Dense one-hot encoder for a clean target; unknown categories are ignored at
# transform time. Newer scikit-learn releases name the dense/sparse switch
# `sparse_output` and no longer accept `sparse`, so try the new keyword first
# and fall back to the old one for older installations.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False) # get a clean target

In [None]:
# One-hot encode the `Types` column (reshaped to a single-column 2-D array, as the encoder expects).
N = one_hot.fit_transform(ndf.Types.values.reshape(-1,1))

In [None]:
# Same node coordinates, now coloured by the integer id of each node's `Types` value.
scatterplot(ux, N.argmax(1))

In [None]:
# Fit-transform the node features through the helper, passing the class ids as a column vector.
res = g.fit_transform(g.DGL_graph.ndata['feature'], N.argmax(1).reshape(-1,1))

In [None]:
# Scatter the transformed node features, coloured by class id.
scatterplot(res, N.argmax(1))

## Let's reduce using Louvain Embedding

In [None]:
# Louvain-based embedding from scikit-network, created with its default settings.
from sknetwork.embedding import LouvainEmbedding
louvain = LouvainEmbedding()

In [None]:
# Embed the helper's adjacency matrix (converted to CSR first) and show the shape of the result.
embedding = louvain.fit_transform(g._adjacency.tocsr())
embedding.shape

In [None]:
# Fit-transform the Louvain embedding through the helper, again with the class ids as a column vector.
emb = g.fit_transform(embedding, N.argmax(1).reshape(-1,1))

In [None]:
# Compare the shape of the reduced embedding with that of the one-hot target.
emb.shape, N.shape

In [None]:
# Scatter the reduced Louvain embedding, coloured by class id.
scatterplot(emb, N.argmax(1)) #meh but cool that it sorts by Type

# Let's reduce the edata

In [None]:
# Copy the edge features and edge targets out of the DGL graph into NumPy arrays.
X = np.array(g.DGL_graph.edata['feature']) # bring it out of torch
t = np.array(g.DGL_graph.edata['target'])

In [None]:
# Shapes of the edge feature matrix and the edge target matrix.
X.shape, t.shape

In [None]:
# Fit-transform the edge features, with the argmax edge class ids as a column vector.
# Note: this rebinds `res`, which previously held the node result.
res = g.fit_transform(X, t.argmax(1).reshape(-1, 1))

In [None]:
# Scatter the transformed edge features, coloured by edge class id.
scatterplot(res, color_labels=t.argmax(1))

In [None]:
## sums the feature along 0 axis
# Only the shape of the readout is displayed here.
dgl.readout_nodes(G, 'feature').shape

In [None]:
# Mean readout of the node 'feature' field over the graph.
dgl.mean_nodes(G, 'feature')

In [None]:
# Index the DGL graph with the key '_E' (presumably the default edge type name — TODO confirm).
G['_E']

In [None]:
# Build the line graph of G.
lg = G.line_graph()

In [None]:
# Display the line graph's summary.
lg

In [None]:
# Adjacency matrix of the line graph.
lg.adjacency_matrix()