In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
torch.set_printoptions(sci_mode=False)
import time

In [2]:
import warnings
warnings.filterwarnings("ignore") 

In [3]:
data = pd.read_csv('data/zips_merged.csv', delimiter=',')
data = data.rename(columns={'total': 'weight', 'w_zip':'origin', 'h_zip':'destination'})
data = data[data.destination.isin(data.origin.unique())]
data.head()

Unnamed: 0,origin,destination,weight,initialFeat,true_label
0,11436,10009,1,4.0,4
1,11436,10011,1,4.0,4
2,11436,10013,1,4.0,4
3,11436,10019,1,4.0,4
4,11436,10021,1,4.0,4


In [4]:
populationByAge = pd.read_csv('data/zicode_populationByAge.csv', delimiter=',')
populationByAge = populationByAge.iloc[:,0:2]
populationByAge.rename(columns={'ZIPCODE':'destination'}, inplace=True)

In [5]:
housePrice = pd.read_csv('data/zipcode_housePrice.csv', delimiter=',')
weights = [5000,12500,17500,22500,27500,32500,37500,45000,55000,65000,75000,85000
                    ,95000,112500,137500,162500,187500,225000,275000,350000,450000,625000,875000
                        ,1250000,1750000,2000000]
for i in range(len(weights)):
    housePrice.iloc[i,2:] = housePrice.iloc[i,2:]*weights[i]
housePrice.iloc[:,1] [housePrice.iloc[:,1] == 0] = 1

tmp = (housePrice.iloc[:,2:] != 0 ).sum(axis=1)
tmp[tmp == 0] = 1
housePrice = pd.concat([housePrice.iloc[:,0],housePrice.iloc[:,2:].sum(axis=1) / tmp ], axis = 1 )
housePrice.rename(columns={'ZIPCODE':'destination'}, inplace=True)

In [6]:
area = pd.read_csv('data/zips_area.csv', delimiter=',')
area = area.iloc[:,:2]
area.rename(columns={'ZIPCODE':'destination'}, inplace=True)

In [7]:
income = pd.read_csv('data/zipcode_income.csv', delimiter=',')
income = income.iloc[:,:2]
income[income.isna()] = 0
income.rename(columns={'ZIPCODE':'destination'}, inplace=True)

In [8]:
populationJobs = pd.read_csv('data/zipcode_population_Jobs.csv', delimiter=',')
populationJobs = populationJobs.iloc[:,:2]
populationJobs.rename(columns={'ZIPCODE':'destination'}, inplace=True)

In [9]:
data = data.merge(populationByAge)
data = data.merge(housePrice)
data = data.merge(area)
data = data.merge(income)
data = data.merge(populationJobs)

In [10]:
data.iloc[:,5:]=(data.iloc[:,5:]-data.iloc[:,5:].min())/(data.iloc[:,5:].max()-data.iloc[:,5:].min())

In [11]:
data

Unnamed: 0,origin,destination,weight,initialFeat,true_label,Estimate!!Total!!Total population,0,AREA,median_familyIncome(USD),totalJobs
0,11436,10009,1,4.0,4,0.300165,0.005332,0.032183,0.243977,0.042630
1,11213,10009,14,3.0,3,0.300165,0.005332,0.032183,0.243977,0.042630
2,11212,10009,27,3.0,3,0.300165,0.005332,0.032183,0.243977,0.042630
3,11225,10009,26,3.0,3,0.300165,0.005332,0.032183,0.243977,0.042630
4,11218,10009,60,3.0,3,0.300165,0.005332,0.032183,0.243977,0.042630
...,...,...,...,...,...,...,...,...,...,...
36407,11211,11371,7,1.0,3,0.000000,0.000000,0.063146,0.000000,0.050874
36408,11373,11371,3,4.0,4,0.000000,0.000000,0.063146,0.000000,0.050874
36409,10168,11371,1,1.0,1,0.000000,0.000000,0.063146,0.000000,0.050874
36410,10278,11371,1,1.0,1,0.000000,0.000000,0.063146,0.000000,0.050874


In [12]:
cuda = torch.cuda.is_available()
weight_decay = 10e-4
epochs = 10001
seed = 165
hidden = 10
lr = 0.0001

In [13]:
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

In [14]:
def normalize(adj):

    adj = torch.FloatTensor(adj)
    adj_id = torch.FloatTensor(torch.eye(adj.shape[1]))
    adj_id = adj_id.reshape((1, adj.shape[1], adj.shape[1]))
    adj_id = adj_id.repeat(adj.shape[0], 1, 1)
    adj = adj + adj_id
    rowsum = torch.FloatTensor(adj.sum(2))
    degree_mat_inv_sqrt = torch.diag_embed(torch.float_power(rowsum,-0.5), dim1=-2, dim2=-1).float()
    adj_norm = torch.bmm(torch.transpose(torch.bmm(adj,degree_mat_inv_sqrt),1,2),degree_mat_inv_sqrt)

    return adj_norm


def doublerelu(x):
    return torch.clamp(x, 0, 1)

class GNN1Layer(Module):

    def __init__(self, batch_size, in_features, out_features, first):
        super(GNN1Layer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.batch_size = batch_size

        weight1_eye = torch.FloatTensor(torch.eye(in_features, out_features))
        weight1_eye = weight1_eye.reshape((1, in_features, out_features))
        weight1_eye = weight1_eye.repeat(batch_size, 1, 1)
        self.weight1 = Parameter(weight1_eye)
        if not first:
            self.weight2 = Parameter(torch.zeros(batch_size, in_features, out_features))
        else:
            self.weight2 = Parameter(torch.empty(batch_size, in_features, out_features))
            nn.init.kaiming_normal_(self.weight2, mode='fan_out')

    def forward(self, input, adj):
        v1 = torch.bmm(input, self.weight1)
        v2 = torch.bmm(torch.bmm(adj, input), self.weight2)
        output = v1 + v2
        return output

In [15]:
class GNN1(nn.Module):

    def __init__(self, batch_size, nfeat, ndim, hidden, first):
        super(GNN1, self).__init__()

        self.gc1 = GNN1Layer(batch_size, nfeat, ndim, first)

    def forward(self, x, adj, random_indices):
        f = torch.clone(x)
        x = doublerelu(self.gc1(x, adj))
        x = x/x.sum(axis=2).unsqueeze(2) #normalize st sum = 1

        f[0][random_indices, :x.shape[2]] = x[0][random_indices, :]
        
        return f[:,:,:x.shape[2]]

In [16]:
def train(adj,features,labels,random_indices,first=False):
    
    adj_norm = normalize(adj)
    
    labels = labels - 1
    
    adj = torch.FloatTensor(adj)
    adj_norm = torch.FloatTensor(adj_norm)
    features = torch.FloatTensor(features)
    labels = torch.FloatTensor(labels)
    
    model = GNN1(batch_size=adj.shape[0],
                nfeat=features.shape[-1],
                ndim=nb_label,
                hidden=hidden,
                first=first)
    if cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        adj_norm = adj_norm.cuda()
        labels = labels.cuda()
    
    # Train model
    t_total = time.time()

    optimizer = optim.Adam(model.parameters(),
                           lr=lr, weight_decay=weight_decay)
    
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(epochs):

        t = time.time()
        model.train()
        optimizer.zero_grad()

        output = model(features, adj_norm, random_indices)
            
        accuracy = torch.sum(torch.argmax(output,axis=2)==labels.reshape(1,-1))/labels.shape[0]
        
        loss = criterion(output[0],labels.reshape(-1).long())

        loss.backward(retain_graph=True)

        optimizer.step()

        if epoch == 0:
            best_loss = loss
            best_output = output
            best_acc = accuracy
        else:
            if loss < best_loss:
                best_loss = loss
                best_output = output
                best_acc = accuracy

        if epoch % 1000 == 0:
            print('Epoch: {:04d}'.format(epoch + 1),
                  'Accuracy: {:.4f}'.format(best_acc.item()),
                  'Loss: {:.8f}'.format(best_loss.item()),
                  'time: {:.4f}s'.format(time.time() - t))
            
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    
    return best_loss,best_output

In [17]:
def svdApprox(adj, dim, relu=False):
    adj = torch.FloatTensor(adj[0])
    U, S, Vh = torch.linalg.svd(adj)
    mu = torch.matmul(torch.matmul(U[:, :dim], torch.diag(S[:dim])), Vh[:dim, :])

    embedx = torch.matmul(U[:, :dim], torch.diag(torch.pow(S[:dim], 0.5)))
    embedy = torch.transpose(torch.matmul(torch.diag(torch.pow(S[:dim], 0.5)), Vh[:dim, :]), 0, 1)

    return embedx, embedy

In [18]:
def load_data():

    G = nx.from_pandas_edgelist(data, 'origin', 'destination', 'weight',create_using=nx.DiGraph())
    adj_list = np.array([nx.adjacency_matrix(G).todense()], dtype=float)
    #init_feat = np.array(data.groupby('origin')['initialFeat'].agg(['unique']))
    
    init_feat1 = np.array(data.groupby('origin')['Estimate!!Total!!Total population'].agg(['unique']))
    init_feat1 = np.array(list(map(lambda x: x[0][0], init_feat1))).reshape(-1, 1)
    init_feat2 = np.array(data.groupby('origin')[0].agg(['unique']))
    init_feat2 = np.array(list(map(lambda x: x[0][0], init_feat2))).reshape(-1, 1)
    init_feat3 = np.array(data.groupby('origin')['AREA'].agg(['unique']))
    init_feat3 = np.array(list(map(lambda x: x[0][0], init_feat3))).reshape(-1, 1)
    init_feat4 = np.array(data.groupby('origin')['median_familyIncome(USD)'].agg(['unique']))
    init_feat4 = np.array(list(map(lambda x: x[0][0], init_feat4))).reshape(-1, 1)
    init_feat5 = np.array(data.groupby('origin')['totalJobs'].agg(['unique']))
    init_feat5 = np.array(list(map(lambda x: x[0][0], init_feat5))).reshape(-1, 1)
    
    init_feat = np.concatenate([init_feat1,init_feat2,init_feat3,init_feat4,init_feat5],axis=1)
    
    true_label = np.array(data.groupby('origin')['true_label'].agg(['unique']))
    
    true_label = np.array(list(map(lambda x: x[0][0], true_label))).reshape(-1, 1)
    return adj_list,init_feat,true_label

adj,feature,labels = load_data()

features = np.expand_dims(feature, axis=0)

#feature = feature - 1
#nb_label = int(max(feature)) + 1
#featuress = np.eye(nb_label)[np.array(feature,dtype=int).reshape(1,-1)]

In [19]:
init_feat = np.array(data.groupby('origin')['initialFeat'].agg(['unique']))
init_feat = np.array(list(map(lambda x: x[0][0], init_feat))).reshape(-1, 1)
init_feat = init_feat - 1
nb_label = int(max(init_feat)) + 1

init_feat = np.eye(nb_label)[np.array(init_feat,dtype=int).reshape(1,-1)]

embedx, embedy = svdApprox(adj,dim=3)
embedx = np.array(embedx.unsqueeze(0))
embedy = np.array(embedy.unsqueeze(0))
features = np.concatenate([init_feat,embedx,embedy],axis=2)


In [21]:
mask_percentage = [1.]

for m in mask_percentage:
    
    #features = np.expand_dims(feature, axis=0)
    #print(features.shape)
    
    # Masking
    number_of_rows = features[0].shape[0]
    random_indices = np.random.choice(number_of_rows, size=int(m*number_of_rows), replace=False)
    
    # exclude low activity areas from masking
    zip_sum = data.groupby(by='origin', as_index=False).sum()
    low_act = zip_sum[zip_sum.weight < 3000].index
    
    random_indices = np.setdiff1d(random_indices, low_act)
    random_rows = features[0][random_indices, :]
    features[0][random_indices, :] = np.tile(np.array([[0.2]]),random_rows.shape)
    
    
    print("\nMasked {}% of nodes\n".format(int(m*100)))
    prev_loss, op = train(adj,features,labels, random_indices, True)
    #print(op)
    loss, op = train(adj,op.cpu().detach().numpy(),labels, random_indices)
    while loss < prev_loss :
        prev_loss = loss
        loss, op = train(adj,op.cpu().detach().numpy(),labels, random_indices)


Masked 100% of nodes

Epoch: 0001 Accuracy: 0.3750 Loss: 1.53493810 time: 0.0020s
Epoch: 1001 Accuracy: 0.4279 Loss: 1.48245108 time: 0.0020s
Epoch: 2001 Accuracy: 0.4471 Loss: 1.45914567 time: 0.0020s
Epoch: 3001 Accuracy: 0.5240 Loss: 1.42639744 time: 0.0030s
Epoch: 4001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0030s
Epoch: 5001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0020s
Epoch: 6001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0020s
Epoch: 7001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0010s
Epoch: 8001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0020s
Epoch: 9001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0020s
Epoch: 10001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0030s
Optimization Finished!
Total time elapsed: 19.6913s
Epoch: 0001 Accuracy: 0.5288 Loss: 1.38942194 time: 0.0030s
Epoch: 1001 Accuracy: 0.5337 Loss: 1.35994554 time: 0.0020s
Epoch: 2001 Accuracy: 0.5288 Loss: 1.34822702 time: 0.0020s
Epoch: 3001 Accuracy: 0.5288 Loss: 1.33670127 time: 0.0020s
Epoch: 4001 Accuracy: 0.

Epoch: 6001 Accuracy: 0.7740 Loss: 1.13269830 time: 0.0030s
Epoch: 7001 Accuracy: 0.7740 Loss: 1.13249767 time: 0.0030s
Epoch: 8001 Accuracy: 0.7740 Loss: 1.13198626 time: 0.0020s
Epoch: 9001 Accuracy: 0.7692 Loss: 1.12637949 time: 0.0030s
Epoch: 10001 Accuracy: 0.7692 Loss: 1.12637949 time: 0.0020s
Optimization Finished!
Total time elapsed: 21.9742s
Epoch: 0001 Accuracy: 0.7692 Loss: 1.12637949 time: 0.0030s
Epoch: 1001 Accuracy: 0.7692 Loss: 1.12628126 time: 0.0020s
Epoch: 2001 Accuracy: 0.7740 Loss: 1.12609851 time: 0.0020s
Epoch: 3001 Accuracy: 0.7692 Loss: 1.12594247 time: 0.0020s
Epoch: 4001 Accuracy: 0.7692 Loss: 1.12576962 time: 0.0020s
Epoch: 5001 Accuracy: 0.7788 Loss: 1.12559521 time: 0.0020s
Epoch: 6001 Accuracy: 0.7788 Loss: 1.12540269 time: 0.0020s
Epoch: 7001 Accuracy: 0.7788 Loss: 1.12518275 time: 0.0020s
Epoch: 8001 Accuracy: 0.7788 Loss: 1.12491333 time: 0.0030s
Epoch: 9001 Accuracy: 0.7788 Loss: 1.12430274 time: 0.0020s
Epoch: 10001 Accuracy: 0.7788 Loss: 1.12352586 

Epoch: 1001 Accuracy: 0.8077 Loss: 1.09714913 time: 0.0020s
Epoch: 2001 Accuracy: 0.8077 Loss: 1.09664798 time: 0.0030s
Epoch: 3001 Accuracy: 0.8077 Loss: 1.09599125 time: 0.0018s
Epoch: 4001 Accuracy: 0.8077 Loss: 1.09586728 time: 0.0030s
Epoch: 5001 Accuracy: 0.8077 Loss: 1.09578371 time: 0.0020s
Epoch: 6001 Accuracy: 0.8077 Loss: 1.09568763 time: 0.0020s
Epoch: 7001 Accuracy: 0.8077 Loss: 1.09551001 time: 0.0020s
Epoch: 8001 Accuracy: 0.8077 Loss: 1.09517157 time: 0.0020s
Epoch: 9001 Accuracy: 0.8077 Loss: 1.09501672 time: 0.0020s
Epoch: 10001 Accuracy: 0.8077 Loss: 1.09501672 time: 0.0030s
Optimization Finished!
Total time elapsed: 20.0555s
Epoch: 0001 Accuracy: 0.8077 Loss: 1.09501672 time: 0.0020s
Epoch: 1001 Accuracy: 0.8077 Loss: 1.09500706 time: 0.0020s
Epoch: 2001 Accuracy: 0.8077 Loss: 1.09498906 time: 0.0020s
Epoch: 3001 Accuracy: 0.8077 Loss: 1.09495521 time: 0.0020s
Epoch: 4001 Accuracy: 0.8077 Loss: 1.09490085 time: 0.0020s
Epoch: 5001 Accuracy: 0.8077 Loss: 1.09482968 t

Epoch: 7001 Accuracy: 0.8077 Loss: 1.09354258 time: 0.0020s
Epoch: 8001 Accuracy: 0.8077 Loss: 1.09354234 time: 0.0020s
Epoch: 9001 Accuracy: 0.8077 Loss: 1.09354198 time: 0.0020s
Epoch: 10001 Accuracy: 0.8077 Loss: 1.09354126 time: 0.0020s
Optimization Finished!
Total time elapsed: 23.0682s
Epoch: 0001 Accuracy: 0.8077 Loss: 1.09354126 time: 0.0030s
Epoch: 1001 Accuracy: 0.8077 Loss: 1.09354103 time: 0.0020s
Epoch: 2001 Accuracy: 0.8077 Loss: 1.09354091 time: 0.0023s
Epoch: 3001 Accuracy: 0.8077 Loss: 1.09354091 time: 0.0024s
Epoch: 4001 Accuracy: 0.8077 Loss: 1.09354091 time: 0.0022s
Epoch: 5001 Accuracy: 0.8077 Loss: 1.09354079 time: 0.0020s
Epoch: 6001 Accuracy: 0.8077 Loss: 1.09354067 time: 0.0030s
Epoch: 7001 Accuracy: 0.8077 Loss: 1.09354043 time: 0.0030s
Epoch: 8001 Accuracy: 0.8077 Loss: 1.09354043 time: 0.0029s
Epoch: 9001 Accuracy: 0.8077 Loss: 1.09354007 time: 0.0024s
Epoch: 10001 Accuracy: 0.8077 Loss: 1.09353936 time: 0.0025s
Optimization Finished!
Total time elapsed: 22.