In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn import decomposition

import warnings
warnings.filterwarnings("ignore") 

## Import census data

In [2]:
data = pd.read_csv('data/data_311_ct_joined.csv', delimiter=',')
data = data.rename(columns={'Complaint Type' : 'Complaint', 'GEOID' : 'ct'})
data = data[['ct', 'Complaint']]
data.head()

Unnamed: 0,GEOID,Complaint
0,36061000100,Lost Property
1,36061000100,Lost Property
2,36061000100,Lost Property
3,36061000100,Lost Property
4,36061000100,Lost Property


In [3]:
data = data.groupby(['ct', 'Complaint']).size().reset_index(name='count')
data.head()

Unnamed: 0,GEOID,Complaint,count
0,36005000100,Blocked Driveway,1
1,36005000100,Consumer Complaint,8
2,36005000100,Food Poisoning,1
3,36005000100,Noise - Residential,1
4,36005000100,Non-Residential Heat,5


## Convert complaint types to categorical one hot variables

In [4]:
#convert a list to a 2d table with zip codes as rows and complaint types as columns
data=pd.pivot_table(data,index='ct',columns='Complaint',values='count',fill_value=0)
data.head()

Complaint,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,Beach/Pool/Sauna Complaint,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36005000100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36005000200,0,65,0,0,2,5,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
36005000400,0,33,0,0,2,3,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
36005001600,0,41,0,0,0,5,0,1,0,0,...,0,0,1,0,0,0,0,2,0,0
36005001901,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
Total=data.sum(axis=1) #total 311 activity per census tract
data=data.div(data.sum(axis=1), axis=0) #normalize activity of various cathegories within census tract by total
data=data.loc[Total>100] #keep only those census tracts having sufficient activity
data.reset_index(inplace=True)
data.head()

Complaint,GEOID,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
0,36005000200,0.0,0.090782,0.0,0.0,0.002793,0.006983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001397,0.0,0.0
1,36005000400,0.0,0.078759,0.0,0.0,0.004773,0.00716,0.0,0.0,0.0,...,0.0,0.0,0.0,0.002387,0.0,0.0,0.0,0.0,0.0,0.0
2,36005001600,0.0,0.127726,0.0,0.0,0.0,0.015576,0.0,0.003115,0.0,...,0.0,0.0,0.003115,0.0,0.0,0.0,0.0,0.006231,0.0,0.0
3,36005001901,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36005001902,0.0,0.036066,0.003279,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003279,0.0,0.0


## Import the labels

In [12]:
data1 = pd.read_csv('data/LEHD_311_2019.csv', delimiter=',')
data1 = data1.rename(columns={'distance': 'weight'})
data1 = data1[data1.destination.isin(data1.origin.unique())]
data1 = data1[['origin', 'destination', 'weight', 'true_label']]
data1 = data1[data1['origin'].isin(data['ct'])][data1['destination'].isin(data['ct'])]

## Merge all the features

In [13]:
data2 = data1[['origin','true_label']].drop_duplicates()
data2 = data2.rename(columns={'origin': 'ct'})
data = data.merge(data2)

In [14]:
data

Unnamed: 0,GEOID,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment,true_label
0,36005000200,0.000000,0.090782,0.000000,0.0,0.002793,0.006983,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.001397,0.0,0.0,2.0
1,36005000400,0.000000,0.078759,0.000000,0.0,0.004773,0.007160,0.0,0.000000,0.0,...,0.000000,0.000000,0.002387,0.000000,0.0,0.000000,0.000000,0.0,0.0,2.0
2,36005001600,0.000000,0.127726,0.000000,0.0,0.000000,0.015576,0.0,0.003115,0.0,...,0.000000,0.003115,0.000000,0.000000,0.0,0.000000,0.006231,0.0,0.0,2.0
3,36005002500,0.004367,0.000000,0.000000,0.0,0.000000,0.008734,0.0,0.000000,0.0,...,0.008734,0.000000,0.000000,0.000000,0.0,0.000000,0.004367,0.0,0.0,2.0
4,36005002701,0.000000,0.006849,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.006849,0.000000,0.000000,0.0,0.000000,0.006849,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1884,36085030301,0.000000,0.035556,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.004444,0.0,0.000000,0.008889,0.0,0.0,5.0
1885,36085030302,0.000000,0.066667,0.000000,0.0,0.000000,0.008696,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.002899,0.002899,0.0,0.0,5.0
1886,36085031901,0.000000,0.104167,0.000000,0.0,0.006944,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.006944,0.0,0.000000,0.000000,0.0,0.0,5.0
1887,36085031902,0.000000,0.112676,0.002347,0.0,0.000000,0.004695,0.0,0.002347,0.0,...,0.000000,0.002347,0.000000,0.000000,0.0,0.000000,0.004695,0.0,0.0,5.0


## Load Data for processing

In [15]:
# function for loading data, returns adjacency matrix, initial feature assignments and true labels

def load_data():
    
    G = nx.from_pandas_edgelist(data1, 'origin', 'destination', 'weight',create_using=nx.DiGraph())
    adj_list = np.array([nx.adjacency_matrix(G).todense()], dtype=float)
    
    init_feat = data.to_numpy()[:,:-1]
    
    true_label = data.to_numpy()[:,-1].reshape(-1, 1)
    
    return adj_list,init_feat,true_label

adj,feature,labels = load_data()

labels = labels - 1

## PCA and reduce dimension

In [16]:
pca = decomposition.PCA(n_components=10)
feature = pca.fit_transform(feature)

features = np.expand_dims(feature, axis=0)

## Importing different classifier from sklearn

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

## Divide into 80% train and 20% validation Set

In [18]:
X_train, X_test, y_train, y_test = train_test_split(feature, labels, test_size = 0.2, random_state = seed)

## Define and train various baseline models on the data

In [19]:
# Baseline Model building to train
names = ['K Nearest Neighbors', 'Logistic Regression', 'Decision Tree', 'Random Forest', 'Neural Net',
         'AddaBoost', 'Gaussian Naive Bayes', 'SVM Sigmoid', 'Gradient Boosting', 'XGBoost', 'GNN']
Classifiers = [
    KNeighborsClassifier(n_neighbors = 5),
    LogisticRegression(),
    DecisionTreeClassifier(max_depth = 5),
    RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features =  "auto" ),
    MLPClassifier(alpha = .01),
    AdaBoostClassifier(),
    GaussianNB(),
    svm.SVC(kernel = 'sigmoid'),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=5),
    XGBClassifier()
    ]
models = zip(names, Classifiers)

train_results = []
val_results = []

for name, model in models:
    
    model.fit(X_train,y_train)
    train_acc = model.score(X_train,y_train)
    val_acc = model.score(X_test,y_test)
    
    msg = "{0}:\nTraining Accuracy : {1} Validation Accuracy : {2}\n".format(name, train_acc, val_acc)
    print(msg)
    
    train_results.append(train_acc)
    val_results.append(val_acc)

K Nearest Neighbors:
Training Accuracy : 0.9841164791528789 Validation Accuracy : 0.9788359788359788

Logistic Regression:
Training Accuracy : 0.4387822634017207 Validation Accuracy : 0.4470899470899471

Decision Tree:
Training Accuracy : 0.9874255459960292 Validation Accuracy : 0.9841269841269841

Random Forest:
Training Accuracy : 0.9702183984116479 Validation Accuracy : 0.9682539682539683

Neural Net:
Training Accuracy : 0.20450033090668432 Validation Accuracy : 0.2222222222222222

AddaBoost:
Training Accuracy : 0.4930509596293845 Validation Accuracy : 0.5317460317460317

Gaussian Naive Bayes:
Training Accuracy : 0.9814692256783587 Validation Accuracy : 0.9867724867724867

SVM Sigmoid:
Training Accuracy : 0.9364659166115156 Validation Accuracy : 0.9285714285714286

Gradient Boosting:
Training Accuracy : 1.0 Validation Accuracy : 0.9814814814814815

XGBoost:
Training Accuracy : 1.0 Validation Accuracy : 0.9814814814814815



## Import packages for GNN

In [20]:
# import packages

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
torch.set_printoptions(sci_mode=False)
import time

## Hyperparameters

In [21]:
# set initial model config

cuda = torch.cuda.is_available()
weight_decay = 1e-6
epochs = 250000
seed = 4830
hidden = 20
lr = 0.0005

In [22]:
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

## Define symmetric normalization for the adjacency matrix

In [23]:
def normalize(adj):

    adj = torch.FloatTensor(adj)
    adj_id = torch.FloatTensor(torch.eye(adj.shape[1]))
    adj_id = adj_id.reshape((1, adj.shape[1], adj.shape[1]))
    adj_id = adj_id.repeat(adj.shape[0], 1, 1)
    adj = adj + adj_id
    rowsum = torch.FloatTensor(adj.sum(2))
    degree_mat_inv_sqrt = torch.diag_embed(torch.float_power(rowsum,-0.5), dim1=-2, dim2=-1).float()
    adj_norm = torch.bmm(torch.transpose(torch.bmm(adj,degree_mat_inv_sqrt),1,2),degree_mat_inv_sqrt)

    return adj_norm

## Define GNN Layers

In [24]:
class GNN1Layer(Module):

    def __init__(self, batch_size, in_features, out_features, first):
        super(GNN1Layer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.batch_size = batch_size
        
        # Initialse W1 = 1, W2 = 0 as pytorch learnable weights (parameters) that have require_grad = True which is
        # required for calculating gradients while backpropogating using gradient descent
        weight1_eye = torch.FloatTensor(torch.eye(in_features, out_features))
        weight1_eye = weight1_eye.reshape((1, in_features, out_features))
        weight1_eye = weight1_eye.repeat(batch_size, 1, 1)
        self.weight1 = Parameter(weight1_eye)
        
        if not first:
            self.weight2 = Parameter(torch.zeros(batch_size, in_features, out_features))
        else:
            self.weight2 = Parameter(torch.empty(batch_size, in_features, out_features))
            nn.init.kaiming_normal_(self.weight2, mode='fan_out')
            nn.init.kaiming_normal_(self.weight1, mode='fan_out')

    def forward(self, input, adj):
        # first term H*W1
        v1 = torch.bmm(input, self.weight1)
        # second term adj_norm*H*W2
        v2 = torch.bmm(torch.bmm(adj, input), self.weight2)
        # adding the two terms
        output = v1 + v2

        return output

## Define GNN Model

In [25]:
class GNN1(nn.Module):

    def __init__(self, batch_size, nfeat, ndim, hidden, first):
        super(GNN1, self).__init__()

        self.gc1 = GNN1Layer(batch_size, nfeat, hidden, first)
        self.gc2 = GNN1Layer(batch_size, hidden, ndim, first)

    def forward(self, x, adj):

        # Applying activation function sigma on the layer propogation
        x = nn.functional.sigmoid(self.gc1(x, adj))
        x = nn.functional.sigmoid(self.gc2(x, adj))
        x = x/x.sum(axis=2).unsqueeze(2) #normalize st sum = 1
        
        return x

## Define the training function

In [26]:
def train(adj,features,labels,train_indices,val_indices,first=False):
    
    # calculate symmetric normalisation for layer propogation
    adj_norm = normalize(adj)
    
    # Convert from numpy to torch tensors
    adj = torch.FloatTensor(adj)
    adj_norm = torch.FloatTensor(adj_norm)
    features = torch.FloatTensor(features)
    labels = torch.FloatTensor(labels)
    
    # initialise the model
    model = GNN1(batch_size=adj.shape[0],
                nfeat=features.shape[-1],
                ndim=5,
                hidden=hidden,
                first=first)
    
    # Transfer the weights to GPU for training
    if cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        adj_norm = adj_norm.cuda()
        labels = labels.cuda()
    
    # Train model
    t_total = time.time()

    # Using adam optimizers for backpropogation
    optimizer = optim.Adam(model.parameters(),
                           lr=lr, weight_decay=weight_decay)
    
    # loss function criteria is cross entropy loss
    criterion = nn.CrossEntropyLoss()
    
    # Train for the no of epochs
    for epoch in range(epochs):

        t = time.time()
        
        model.train()
        
        # Pytorch accumulates gradient after every operation on tensors (defined by the model architecture)
        # with require_grad = True. With each new epoch, we need to reset this gradient to 0 to calculate gradient
        # for this epoch.
        optimizer.zero_grad()

        # get the output from forward propogation of our model
        output = model(features, adj_norm)
        
        
        # Calculate Train accuracy
        train_output = output[:,train_indices,:]
        train_labels = labels[train_indices,:]
        
        train_accuracy = torch.sum(torch.argmax(train_output,axis=2)==train_labels.reshape(1,-1))/train_labels.shape[0]
        
        # Calculate the loss between our models training output and true label
        loss = criterion(output[0],labels.reshape(-1).long())
        
        # Calculate the gradients 
        loss.backward(retain_graph=True)

        # Update the weights
        optimizer.step()
        
        model.eval()
        
        # Calculate Validation accuracy
        with torch.no_grad():
            val_output = output[:,val_indices,:]
            val_labels = labels[val_indices,:]
            val_accuracy = torch.sum(torch.argmax(val_output,axis=2)==val_labels.reshape(1,-1))/val_labels.shape[0]

        # Print summary of training 
        if epoch == 0:
            best_loss = loss
            best_output = output
            best_acc = train_accuracy
            best_val_acc = val_accuracy
            best_val_output = val_output
        else:
            if loss < best_loss:
                best_loss = loss
                best_output = output
                best_acc = train_accuracy
                best_val_acc = val_accuracy
                best_val_output = val_output

        if epoch == 0 or (epoch+1) % 1000 == 0:
            print('Epoch: {:04d}'.format(epoch + 1),
                  'Train Accuracy: {:.4f}'.format(best_acc.item()),
                  'Validation Accuracy: {:.4f}'.format(best_val_acc.item()),
                  'Loss: {:.8f}'.format(best_loss.item()),
                  'time: {:.4f}s'.format(time.time() - t))
            
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    
    return best_loss, best_output, best_acc.item(), best_val_acc.item()

## Divide dataset into 80% train and 20% validation and run the training

In [27]:
# set Train %

train_percentage = .8
    
# Train set
number_of_rows = features[0].shape[0]
train_indices = np.random.choice(number_of_rows, size=int(train_percentage*number_of_rows), replace=False)
val_indices = np.setdiff1d(np.arange(adj.shape[1]),train_indices)

# Start Train
loss, op, train_acc, val_acc = train(adj,features,labels,train_indices,val_indices,True)

train_results.append(train_acc)
val_results.append(val_acc)

Epoch: 0001 Train Accuracy: 0.0079 Validation Accuracy: 0.0053 Loss: 1.58273482 time: 4.4025s
Epoch: 1000 Train Accuracy: 0.6466 Validation Accuracy: 0.6323 Loss: 1.25642002 time: 0.0025s
Epoch: 2000 Train Accuracy: 0.6466 Validation Accuracy: 0.6323 Loss: 1.23692572 time: 0.0000s
Epoch: 3000 Train Accuracy: 0.7174 Validation Accuracy: 0.7011 Loss: 1.20823228 time: 0.0000s
Epoch: 4000 Train Accuracy: 0.7174 Validation Accuracy: 0.7011 Loss: 1.19336462 time: 0.0070s
Epoch: 5000 Train Accuracy: 0.7280 Validation Accuracy: 0.7249 Loss: 1.17925107 time: 0.0000s
Epoch: 6000 Train Accuracy: 0.7743 Validation Accuracy: 0.7672 Loss: 1.16850817 time: 0.0000s
Epoch: 7000 Train Accuracy: 0.7823 Validation Accuracy: 0.7751 Loss: 1.16043556 time: 0.0000s
Epoch: 8000 Train Accuracy: 0.7842 Validation Accuracy: 0.7804 Loss: 1.15424967 time: 0.0086s
Epoch: 9000 Train Accuracy: 0.7829 Validation Accuracy: 0.7831 Loss: 1.14937556 time: 0.0100s
Epoch: 10000 Train Accuracy: 0.7842 Validation Accuracy: 0.7

Epoch: 87000 Train Accuracy: 0.8683 Validation Accuracy: 0.8704 Loss: 1.04028201 time: 0.0000s
Epoch: 88000 Train Accuracy: 0.8683 Validation Accuracy: 0.8704 Loss: 1.04028201 time: 0.0045s
Epoch: 89000 Train Accuracy: 0.8683 Validation Accuracy: 0.8704 Loss: 1.04028201 time: 0.0000s
Epoch: 90000 Train Accuracy: 0.8683 Validation Accuracy: 0.8704 Loss: 1.04028201 time: 0.0051s
Epoch: 91000 Train Accuracy: 0.8670 Validation Accuracy: 0.8624 Loss: 1.03911519 time: 0.0000s
Epoch: 92000 Train Accuracy: 0.8670 Validation Accuracy: 0.8624 Loss: 1.03893185 time: 0.0000s
Epoch: 93000 Train Accuracy: 0.8676 Validation Accuracy: 0.8624 Loss: 1.03874707 time: 0.0000s
Epoch: 94000 Train Accuracy: 0.8676 Validation Accuracy: 0.8624 Loss: 1.03874707 time: 0.0000s
Epoch: 95000 Train Accuracy: 0.8676 Validation Accuracy: 0.8624 Loss: 1.03874707 time: 0.0000s
Epoch: 96000 Train Accuracy: 0.8676 Validation Accuracy: 0.8624 Loss: 1.03874707 time: 0.0101s
Epoch: 97000 Train Accuracy: 0.8676 Validation Acc

Epoch: 173000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0010s
Epoch: 174000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0000s
Epoch: 175000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0000s
Epoch: 176000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0000s
Epoch: 177000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0061s
Epoch: 178000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0000s
Epoch: 179000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0000s
Epoch: 180000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0100s
Epoch: 181000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0030s
Epoch: 182000 Train Accuracy: 0.8782 Validation Accuracy: 0.8757 Loss: 1.03259826 time: 0.0080s
Epoch: 183000 Train Accuracy: 0.8782 Val

## Compare the accuracy of different models

In [28]:
d = {'Algorithm': names, 'Train Accuracy': train_results, 'Validation Accuracy': val_results}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,K Nearest Neighbors,0.984116,0.978836
1,Logistic Regression,0.438782,0.44709
2,Decision Tree,0.987426,0.984127
3,Random Forest,0.970218,0.968254
4,Neural Net,0.2045,0.222222
5,AddaBoost,0.493051,0.531746
6,Gaussian Naive Bayes,0.981469,0.986772
7,SVM Sigmoid,0.936466,0.928571
8,Gradient Boosting,1.0,0.981481
9,XGBoost,1.0,0.981481
