In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
torch.set_printoptions(sci_mode=False)
import time

In [2]:
import warnings
warnings.filterwarnings("ignore") 

In [3]:
data = pd.read_csv('data/zips_merged.csv', delimiter=',')
data = data.rename(columns={'total': 'weight', 'w_zip':'origin', 'h_zip':'destination'})
data = data[data.destination.isin(data.origin.unique())]
data.head()

Unnamed: 0,origin,destination,weight,initialFeat,true_label
0,11436,10009,1,4.0,4
1,11436,10011,1,4.0,4
2,11436,10013,1,4.0,4
3,11436,10019,1,4.0,4
4,11436,10021,1,4.0,4


In [4]:
cuda = torch.cuda.is_available()
weight_decay = 10e-4
epochs = 10001
seed = 165
hidden = 10
lr = 0.0001

In [5]:
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

In [6]:
def normalize(adj):

    adj = torch.FloatTensor(adj)
    adj_id = torch.FloatTensor(torch.eye(adj.shape[1]))
    adj_id = adj_id.reshape((1, adj.shape[1], adj.shape[1]))
    adj_id = adj_id.repeat(adj.shape[0], 1, 1)
    adj = adj + adj_id
    rowsum = torch.FloatTensor(adj.sum(2))
    degree_mat_inv_sqrt = torch.diag_embed(torch.float_power(rowsum,-0.5), dim1=-2, dim2=-1).float()
    adj_norm = torch.bmm(torch.transpose(torch.bmm(adj,degree_mat_inv_sqrt),1,2),degree_mat_inv_sqrt)

    return adj_norm


def doublerelu(x):
    return torch.clamp(x, 0, 1)

class GNN1Layer(Module):

    def __init__(self, batch_size, in_features, out_features, first):
        super(GNN1Layer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.batch_size = batch_size

        weight1_eye = torch.FloatTensor(torch.eye(in_features, out_features))
        weight1_eye = weight1_eye.reshape((1, in_features, out_features))
        weight1_eye = weight1_eye.repeat(batch_size, 1, 1)
        self.weight1 = Parameter(weight1_eye)
        if not first:
            self.weight2 = Parameter(torch.zeros(batch_size, in_features, out_features))
        else:
            self.weight2 = Parameter(torch.empty(batch_size, in_features, out_features))
            nn.init.kaiming_normal_(self.weight2, mode='fan_out')

    def forward(self, input, adj):
        v1 = torch.bmm(input, self.weight1)
        v2 = torch.bmm(torch.bmm(adj, input), self.weight2)
        output = v1 + v2
        return output

In [7]:
class GNN1(nn.Module):

    def __init__(self, batch_size, nfeat, ndim, hidden, first):
        super(GNN1, self).__init__()

        self.gc1 = GNN1Layer(batch_size, nfeat, ndim, first)

    def forward(self, x, adj, random_indices):
        f = torch.clone(x)
        x = doublerelu(self.gc1(x, adj))
        x = x/x.sum(axis=2).unsqueeze(2) #normalize st sum = 1

        f[0][random_indices, :x.shape[2]] = x[0][random_indices, :]
        
        return f[:,:,:x.shape[2]]

In [8]:
def train(adj,features,labels,random_indices,first=False):
    
    adj_norm = normalize(adj)
    
    labels = labels - 1
    
    adj = torch.FloatTensor(adj)
    adj_norm = torch.FloatTensor(adj_norm)
    features = torch.FloatTensor(features)
    labels = torch.FloatTensor(labels)
    
    embedx, embedy = svdApprox(adj,dim=3)
    embedx = embedx.unsqueeze(0)
    embedy = embedy.unsqueeze(0)
    features = torch.cat((features,embedx,embedy),axis=2)
    
    model = GNN1(batch_size=adj.shape[0],
                nfeat=features.shape[-1],
                ndim=nb_label,
                hidden=hidden,
                first=first)
    if cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        adj_norm = adj_norm.cuda()
        labels = labels.cuda()
    
    # Train model
    t_total = time.time()

    optimizer = optim.Adam(model.parameters(),
                           lr=lr, weight_decay=weight_decay)
    
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(epochs):

        t = time.time()
        model.train()
        optimizer.zero_grad()

        output = model(features, adj_norm, random_indices)
            
        accuracy = torch.sum(torch.argmax(output,axis=2)==labels.reshape(1,-1))/labels.shape[0]
        
        loss = criterion(output[0],labels.reshape(-1).long())

        loss.backward(retain_graph=True)

        optimizer.step()

        if epoch == 0:
            best_loss = loss
            best_output = output
            best_acc = accuracy
        else:
            if loss < best_loss:
                best_loss = loss
                best_output = output
                best_acc = accuracy

        if epoch % 1000 == 0:
            print('Epoch: {:04d}'.format(epoch + 1),
                  'Accuracy: {:.4f}'.format(best_acc.item()),
                  'Loss: {:.8f}'.format(best_loss.item()),
                  'time: {:.4f}s'.format(time.time() - t))
            
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    
    return best_loss,best_output

In [9]:
def svdApprox(adj, dim, relu=False):
    adj = torch.FloatTensor(adj[0])
    U, S, Vh = torch.linalg.svd(adj)
    mu = torch.matmul(torch.matmul(U[:, :dim], torch.diag(S[:dim])), Vh[:dim, :])

    embedx = torch.matmul(U[:, :dim], torch.diag(torch.pow(S[:dim], 0.5)))
    embedy = torch.transpose(torch.matmul(torch.diag(torch.pow(S[:dim], 0.5)), Vh[:dim, :]), 0, 1)

    return embedx, embedy

In [10]:
def load_data():

    data = pd.read_csv('data/zips_merged.csv', delimiter=',')
    data = data.rename(columns={'total': 'weight', 'w_zip':'origin', 'h_zip':'destination'})
    data = data[data.destination.isin(data.origin.unique())]
    G = nx.from_pandas_edgelist(data, 'origin', 'destination', 'weight',create_using=nx.DiGraph())
    adj_list = np.array([nx.adjacency_matrix(G).todense()], dtype=float)
    init_feat = np.array(data.groupby('origin')['initialFeat'].agg(['unique']))
    true_label = np.array(data.groupby('origin')['true_label'].agg(['unique']))
    init_feat = np.array(list(map(lambda x: x[0], init_feat))).reshape(-1, 1)
    true_label = np.array(list(map(lambda x: x[0][0], true_label))).reshape(-1, 1)
    return adj_list,init_feat,true_label

adj,feature,labels = load_data()

feature = feature - 1
nb_label = int(max(feature)) + 1
featuress = np.eye(nb_label)[np.array(feature,dtype=int).reshape(1,-1)]

In [11]:
mask_percentage = [1.]

for m in mask_percentage:
    
    features = np.copy(featuress)
    
    
    # Masking
    number_of_rows = features[0].shape[0]
    random_indices = np.random.choice(number_of_rows, size=int(m*number_of_rows), replace=False)
    
    # exclude low activity areas from masking
    zip_sum = data.groupby(by='origin', as_index=False).sum()
    low_act = zip_sum[zip_sum.weight < 3000].index
    
    random_indices = np.setdiff1d(random_indices, low_act)
    random_rows = features[0][random_indices, :]
    features[0][random_indices, :] = np.tile(np.array([[0.2]]),random_rows.shape)
    
    
    print("\nMasked {}% of nodes\n".format(int(m*100)))
    prev_loss, op = train(adj,features,labels, random_indices, False)
    #print(op)
    loss, op = train(adj,op.cpu().detach().numpy(),labels, random_indices)
    while loss < prev_loss :
        prev_loss = loss
        loss, op = train(adj,op.cpu().detach().numpy(),labels, random_indices)


Masked 100% of nodes

Epoch: 0001 Accuracy: 0.4567 Loss: 1.48497260 time: 0.4338s
Epoch: 1001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0030s
Epoch: 2001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0040s
Epoch: 3001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0030s
Epoch: 4001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0030s
Epoch: 5001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0040s
Epoch: 6001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0030s
Epoch: 7001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0030s
Epoch: 8001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0030s
Epoch: 9001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0030s
Epoch: 10001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0020s
Optimization Finished!
Total time elapsed: 27.2360s
Epoch: 0001 Accuracy: 0.5048 Loss: 1.34438717 time: 0.0050s
Epoch: 1001 Accuracy: 0.6106 Loss: 1.30145943 time: 0.0040s
Epoch: 2001 Accuracy: 0.6106 Loss: 1.30145943 time: 0.0030s
Epoch: 3001 Accuracy: 0.6106 Loss: 1.30145943 time: 0.0020s
Epoch: 4001 Accuracy: 0.

Epoch: 6001 Accuracy: 0.7404 Loss: 1.16840196 time: 0.0024s
Epoch: 7001 Accuracy: 0.7404 Loss: 1.16840196 time: 0.0020s
Epoch: 8001 Accuracy: 0.7404 Loss: 1.16840196 time: 0.0020s
Epoch: 9001 Accuracy: 0.7404 Loss: 1.16840196 time: 0.0030s
Epoch: 10001 Accuracy: 0.7404 Loss: 1.16840196 time: 0.0030s
Optimization Finished!
Total time elapsed: 22.6067s
Epoch: 0001 Accuracy: 0.7404 Loss: 1.16840196 time: 0.0030s
Epoch: 1001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0030s
Epoch: 2001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0020s
Epoch: 3001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0020s
Epoch: 4001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0030s
Epoch: 5001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0030s
Epoch: 6001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0030s
Epoch: 7001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0030s
Epoch: 8001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0030s
Epoch: 9001 Accuracy: 0.7404 Loss: 1.16374290 time: 0.0020s
Epoch: 10001 Accuracy: 0.7404 Loss: 1.16374290 

Epoch: 1001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0030s
Epoch: 2001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0020s
Epoch: 3001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0020s
Epoch: 4001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0030s
Epoch: 5001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0030s
Epoch: 6001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0030s
Epoch: 7001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0030s
Epoch: 8001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0020s
Epoch: 9001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0030s
Epoch: 10001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0030s
Optimization Finished!
Total time elapsed: 26.3076s
Epoch: 0001 Accuracy: 0.7885 Loss: 1.11766958 time: 0.0020s
Epoch: 1001 Accuracy: 0.7885 Loss: 1.11485624 time: 0.0030s
Epoch: 2001 Accuracy: 0.7885 Loss: 1.11485624 time: 0.0030s
Epoch: 3001 Accuracy: 0.7885 Loss: 1.11485624 time: 0.0030s
Epoch: 4001 Accuracy: 0.7885 Loss: 1.11485624 time: 0.0020s
Epoch: 5001 Accuracy: 0.7885 Loss: 1.11485624 t