In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn import decomposition

import warnings
warnings.filterwarnings("ignore") 

seed = 433

## Import census data

In [2]:
data = pd.read_csv('data_311_ct_joined.csv', delimiter=',')
data = data.rename(columns={'Incident Zip': 'Zip', 'Complaint Type' : 'Complaint'})
data.Zip=pd.to_numeric(data.Zip,errors='coerce')
data=data.loc[(data.Zip>=10000)&(data.Zip<11500)]
data = data[['GEOID', 'Complaint']]
data.head()

Unnamed: 0,GEOID,Complaint
0,36061000100,Lost Property
1,36061000100,Lost Property
2,36061000100,Lost Property
3,36061000100,Lost Property
4,36061000100,Lost Property


In [3]:
data = data.groupby(['GEOID', 'Complaint']).size().reset_index(name='count')
data.head()

Unnamed: 0,GEOID,Complaint,count
0,36005000100,Blocked Driveway,1
1,36005000100,Consumer Complaint,8
2,36005000100,Food Poisoning,1
3,36005000100,Noise - Residential,1
4,36005000100,Non-Residential Heat,5


## Convert complaint types to categorical one hot variables

In [4]:
#convert a list to a 2d table with zip codes as rows and complaint types as columns
data=pd.pivot_table(data,index='GEOID',columns='Complaint',values='count',fill_value=0)
data.head()

Complaint,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,Beach/Pool/Sauna Complaint,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36005000100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36005000200,0,65,0,0,2,5,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
36005000400,0,33,0,0,2,3,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
36005001600,0,41,0,0,0,5,0,1,0,0,...,0,0,1,0,0,0,0,2,0,0
36005001901,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
Total=data.sum(axis=1) #total 311 activity per census tract
data=data.div(data.sum(axis=1), axis=0) #normalize activity of various cathegories within census tract by total
data=data.loc[Total>100] #keep only those census tracts having sufficient activity
data.reset_index(inplace=True)
data.head()

Complaint,GEOID,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
0,36005000200,0.0,0.090782,0.0,0.0,0.002793,0.006983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001397,0.0,0.0
1,36005000400,0.0,0.078759,0.0,0.0,0.004773,0.00716,0.0,0.0,0.0,...,0.0,0.0,0.0,0.002387,0.0,0.0,0.0,0.0,0.0,0.0
2,36005001600,0.0,0.127726,0.0,0.0,0.0,0.015576,0.0,0.003115,0.0,...,0.0,0.0,0.003115,0.0,0.0,0.0,0.0,0.006231,0.0,0.0
3,36005001901,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36005001902,0.0,0.036066,0.003279,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003279,0.0,0.0


## Add the income level feature into the dataset

In [6]:
"""# Add the income level feature into the dataset

income = pd.read_csv('data/zipcode_income.csv', delimiter=',')
income = income.iloc[:,:2]
income[income.isna()] = 0
income.rename(columns={'ZIPCODE':'Zip', 'median_familyIncome(USD)' : 'income'}, inplace=True)
income=income.loc[(income.Zip>=10000)&(income.Zip<11500)]
income['income'][income['income'] == 0.0] = income['income'].mean() # Replace 0 income with mean
income['income'] = income['income'] / income['income'].sum()
income.head()"""

"# Add the income level feature into the dataset\n\nincome = pd.read_csv('data/zipcode_income.csv', delimiter=',')\nincome = income.iloc[:,:2]\nincome[income.isna()] = 0\nincome.rename(columns={'ZIPCODE':'Zip', 'median_familyIncome(USD)' : 'income'}, inplace=True)\nincome=income.loc[(income.Zip>=10000)&(income.Zip<11500)]\nincome['income'][income['income'] == 0.0] = income['income'].mean() # Replace 0 income with mean\nincome['income'] = income['income'] / income['income'].sum()\nincome.head()"

## Add the area size feature into the dataset

In [7]:
"""# Add the area size feature into the dataset

area = pd.read_csv('data/zips_area.csv', delimiter=',')
area = area.iloc[:,:2]
area.rename(columns={'ZIPCODE':'Zip', 'AREA' : 'area'}, inplace=True)
areas = area.copy()
area=area.loc[(area.Zip>=10000)&(area.Zip<11500)]
area['area'] = area['area'] / area['area'].sum()
area.head()"""

"# Add the area size feature into the dataset\n\narea = pd.read_csv('data/zips_area.csv', delimiter=',')\narea = area.iloc[:,:2]\narea.rename(columns={'ZIPCODE':'Zip', 'AREA' : 'area'}, inplace=True)\nareas = area.copy()\narea=area.loc[(area.Zip>=10000)&(area.Zip<11500)]\narea['area'] = area['area'] / area['area'].sum()\narea.head()"

## Add the population jobs feature into the dataset

In [8]:
"""# Add the population jobs feature into the dataset

jobs = pd.read_csv('data/zipcode_population_Jobs.csv', delimiter=',')
jobs = jobs.iloc[:,:2]
jobs.rename(columns={'ZIPCODE':'Zip', 'totalJobs' : 'jobs'}, inplace=True)
jobs['jobs'] = jobs['jobs'] / areas['area']
jobs.head()"""

"# Add the population jobs feature into the dataset\n\njobs = pd.read_csv('data/zipcode_population_Jobs.csv', delimiter=',')\njobs = jobs.iloc[:,:2]\njobs.rename(columns={'ZIPCODE':'Zip', 'totalJobs' : 'jobs'}, inplace=True)\njobs['jobs'] = jobs['jobs'] / areas['area']\njobs.head()"

## Add the population feature into the dataset

In [9]:
"""# Add the population feature into the dataset

population = pd.read_csv('data/zipcode_population_Jobs.csv', delimiter=',')
population = population[['ZIPCODE','POPULATION']]
population.rename(columns={'ZIPCODE':'Zip', 'POPULATION' : 'population'}, inplace=True)
population['population'][population['population'] < 30.0] = population['population'].mean() # Replace 0 population with mean
population['population'] = population['population'] / areas['area']
population.head()"""

"# Add the population feature into the dataset\n\npopulation = pd.read_csv('data/zipcode_population_Jobs.csv', delimiter=',')\npopulation = population[['ZIPCODE','POPULATION']]\npopulation.rename(columns={'ZIPCODE':'Zip', 'POPULATION' : 'population'}, inplace=True)\npopulation['population'][population['population'] < 30.0] = population['population'].mean() # Replace 0 population with mean\npopulation['population'] = population['population'] / areas['area']\npopulation.head()"

## Add the house price feature into the dataset

In [10]:
"""# Add the house price feature into the dataset

housePrice = pd.read_csv('data/zipcode_housePrice.csv', delimiter=',')
weights = [5000,12500,17500,22500,27500,32500,37500,45000,55000,65000,75000,85000
                    ,95000,112500,137500,162500,187500,225000,275000,350000,450000,625000,875000
                        ,1250000,1750000,2000000]
for i in range(len(weights)):
    housePrice.iloc[i,2:] = housePrice.iloc[i,2:]*weights[i]
housePrice.iloc[:,1] [housePrice.iloc[:,1] == 0] = 1

tmp = (housePrice.iloc[:,2:] != 0 ).sum(axis=1)
tmp[tmp == 0] = 1
housePrice = pd.concat([housePrice.iloc[:,0],housePrice.iloc[:,2:].sum(axis=1) / tmp ], axis = 1 )
housePrice.rename(columns={'ZIPCODE':'Zip', 0: 'house_price'}, inplace=True)
housePrice['house_price'] = housePrice['house_price']/housePrice['house_price'].sum()"""

"# Add the house price feature into the dataset\n\nhousePrice = pd.read_csv('data/zipcode_housePrice.csv', delimiter=',')\nweights = [5000,12500,17500,22500,27500,32500,37500,45000,55000,65000,75000,85000\n                    ,95000,112500,137500,162500,187500,225000,275000,350000,450000,625000,875000\n                        ,1250000,1750000,2000000]\nfor i in range(len(weights)):\n    housePrice.iloc[i,2:] = housePrice.iloc[i,2:]*weights[i]\nhousePrice.iloc[:,1] [housePrice.iloc[:,1] == 0] = 1\n\ntmp = (housePrice.iloc[:,2:] != 0 ).sum(axis=1)\ntmp[tmp == 0] = 1\nhousePrice = pd.concat([housePrice.iloc[:,0],housePrice.iloc[:,2:].sum(axis=1) / tmp ], axis = 1 )\nhousePrice.rename(columns={'ZIPCODE':'Zip', 0: 'house_price'}, inplace=True)\nhousePrice['house_price'] = housePrice['house_price']/housePrice['house_price'].sum()"

## Merge all the features

In [11]:
"""data = pd.merge(data, income, on="Zip")
data = data.merge(population)
data = data.merge(jobs)
data = data.merge(housePrice)
data = data.merge(area)
data = data.drop_duplicates(subset=['Zip'], keep='last')
data.head()"""

'data = pd.merge(data, income, on="Zip")\ndata = data.merge(population)\ndata = data.merge(jobs)\ndata = data.merge(housePrice)\ndata = data.merge(area)\ndata = data.drop_duplicates(subset=[\'Zip\'], keep=\'last\')\ndata.head()'

In [12]:
data1 = pd.read_csv('LEHD_311_2019.csv', delimiter=',')
data1 = data1.rename(columns={'distance': 'weight'})
data1 = data1[data1.destination.isin(data1.origin.unique())]
data1 = data1[['origin', 'destination', 'weight', 'true_label']]
data1 = data1[data1['origin'].isin(data['GEOID'])][data1['destination'].isin(data['GEOID'])]

In [13]:
data2 = data1[['origin','true_label']].drop_duplicates()
data2 = data2.rename(columns={'origin': 'GEOID'})
data = data.merge(data2)

In [14]:
data

Unnamed: 0,GEOID,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment,true_label
0,36005000200,0.000000,0.090782,0.000000,0.0,0.002793,0.006983,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.001397,0.0,0.0,2.0
1,36005000400,0.000000,0.078759,0.000000,0.0,0.004773,0.007160,0.0,0.000000,0.0,...,0.000000,0.000000,0.002387,0.000000,0.0,0.000000,0.000000,0.0,0.0,2.0
2,36005001600,0.000000,0.127726,0.000000,0.0,0.000000,0.015576,0.0,0.003115,0.0,...,0.000000,0.003115,0.000000,0.000000,0.0,0.000000,0.006231,0.0,0.0,2.0
3,36005002500,0.004367,0.000000,0.000000,0.0,0.000000,0.008734,0.0,0.000000,0.0,...,0.008734,0.000000,0.000000,0.000000,0.0,0.000000,0.004367,0.0,0.0,2.0
4,36005002701,0.000000,0.006849,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.006849,0.000000,0.000000,0.0,0.000000,0.006849,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1884,36085030301,0.000000,0.035556,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.004444,0.0,0.000000,0.008889,0.0,0.0,5.0
1885,36085030302,0.000000,0.066667,0.000000,0.0,0.000000,0.008696,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.002899,0.002899,0.0,0.0,5.0
1886,36085031901,0.000000,0.104167,0.000000,0.0,0.006944,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.006944,0.0,0.000000,0.000000,0.0,0.0,5.0
1887,36085031902,0.000000,0.112676,0.002347,0.0,0.000000,0.004695,0.0,0.002347,0.0,...,0.000000,0.002347,0.000000,0.000000,0.0,0.000000,0.004695,0.0,0.0,5.0


## Load Data for processing

In [15]:
# function for loading data, returns adjacency matrix, initial feature assignments and true labels

def load_data():
    
    data1 = pd.read_csv('LEHD_311_2019.csv', delimiter=',')
    data1 = data1[data1.origin.isin(data.GEOID.unique())][data1.destination.isin(data.GEOID.unique())]
    G = nx.from_pandas_edgelist(data1, 'origin', 'destination', 'distance',create_using=nx.DiGraph())
    adj_list = np.array([nx.adjacency_matrix(G).todense()], dtype=float)
    
    init_feat = data.to_numpy()[:,:-1]
    
    true_label = data.to_numpy()[:,-1].reshape(-1, 1)
    
    return adj_list,init_feat,true_label

adj,feature,labels = load_data()

labels = labels - 1

## PCA and reduce dimension

In [16]:
pca = decomposition.PCA(n_components=5)
feature = pca.fit_transform(feature)

features = np.expand_dims(feature, axis=0)

## Importing different classifier from sklearn

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

## Divide into 80% train and 20% validation Set

In [18]:
X_train, X_test, y_train, y_test = train_test_split(feature, labels, test_size = 0.2, random_state = seed)

## Define and train various baseline models on the data

In [19]:
# Baseline Model building to train
names = ['K Nearest Neighbors', 'Logistic Regression', 'Decision Tree', 'Random Forest', 'Neural Net',
         'AddaBoost', 'Gaussian Naive Bayes', 'SVM Sigmoid', 'Gradient Boosting', 'XGBoost', 'Graph Attention']
Classifiers = [
    KNeighborsClassifier(n_neighbors = 5),
    LogisticRegression(),
    DecisionTreeClassifier(max_depth = 5),
    RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features =  "auto" ),
    MLPClassifier(alpha = .01),
    AdaBoostClassifier(),
    GaussianNB(),
    svm.SVC(kernel = 'sigmoid'),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=5),
    XGBClassifier()
    ]
models = zip(names, Classifiers)

train_results = []
val_results = []

for name, model in models:
    
    model.fit(X_train,y_train)
    train_acc = model.score(X_train,y_train)
    val_acc = model.score(X_test,y_test)
    
    msg = "{0}:\nTraining Accuracy : {1} Validation Accuracy : {2}\n".format(name, train_acc, val_acc)
    print(msg)
    
    train_results.append(train_acc)
    val_results.append(val_acc)

K Nearest Neighbors:
Training Accuracy : 0.9841164791528789 Validation Accuracy : 0.9788359788359788

Logistic Regression:
Training Accuracy : 0.4387822634017207 Validation Accuracy : 0.4470899470899471

Decision Tree:
Training Accuracy : 0.986763732627399 Validation Accuracy : 0.9814814814814815

Random Forest:
Training Accuracy : 0.9695565850430179 Validation Accuracy : 0.9391534391534392

Neural Net:
Training Accuracy : 0.4387822634017207 Validation Accuracy : 0.4470899470899471

AddaBoost:
Training Accuracy : 0.4884182660489742 Validation Accuracy : 0.5264550264550265

Gaussian Naive Bayes:
Training Accuracy : 0.9814692256783587 Validation Accuracy : 0.9867724867724867

SVM Sigmoid:
Training Accuracy : 0.9364659166115156 Validation Accuracy : 0.9285714285714286

Gradient Boosting:
Training Accuracy : 1.0 Validation Accuracy : 0.9788359788359788

XGBoost:
Training Accuracy : 1.0 Validation Accuracy : 0.9735449735449735



## Import packages for GNN

In [20]:
# import packages

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
torch.set_printoptions(sci_mode=False)
import time

## Hyperparameters

In [21]:
# set initial model config

cuda = torch.cuda.is_available()
weight_decay = 1e-7
epochs = 2500
seed = 627
hidden = 10
lr = 0.0001

In [22]:
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

## Define symmetric normalization for the adjacency matrix

In [23]:
def normalize(adj):

    adj = torch.FloatTensor(adj)
    adj_id = torch.FloatTensor(torch.eye(adj.shape[1]))
    adj_id = adj_id.reshape((1, adj.shape[1], adj.shape[1]))
    adj_id = adj_id.repeat(adj.shape[0], 1, 1)
    adj = adj + adj_id
    rowsum = torch.FloatTensor(adj.sum(2))
    degree_mat_inv_sqrt = torch.diag_embed(torch.float_power(rowsum,-0.5), dim1=-2, dim2=-1).float()
    adj_norm = torch.bmm(torch.transpose(torch.bmm(adj,degree_mat_inv_sqrt),1,2),degree_mat_inv_sqrt)

    return adj_norm

## Define GNN Layers

In [24]:
class GraphAttentionLayer(nn.Module):

    def __init__(self, batch_size, in_features, out_features, first, concat=True):
        super(GraphAttentionLayer, self).__init__()
        
        self.in_features = in_features
        self.out_features = out_features
        self.batch_size = batch_size
        self.concat = concat

        self.weight1 = Parameter(torch.empty(batch_size, in_features, out_features))
        nn.init.xavier_uniform_(self.weight1.data, gain=1.414)
        self.a = Parameter(torch.empty(size=(batch_size, 2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(0.2)

    def forward(self, input, adj):
        Wh = torch.bmm(input, self.weight1) # h.shape: (N, in_features), Wh.shape: (N, out_features)
        e = self._prepare_attentional_mechanism_input(Wh)

        zero_vec = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        h_prime = torch.bmm(attention, Wh)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

    def _prepare_attentional_mechanism_input(self, Wh):
        # Wh.shape (N, out_feature)
        # self.a.shape (2 * out_feature, 1)
        # Wh1&2.shape (N, 1)
        # e.shape (N, N)
        Wh1 = torch.bmm(Wh, self.a[:, :self.out_features, :])
        Wh2 = torch.bmm(Wh, self.a[:, self.out_features:, :])
        # broadcast add
        e = Wh1 + Wh2.T
        return self.leakyrelu(e)

## Define GNN Model

In [25]:
class GAT(nn.Module):
    def __init__(self, batch_size, nfeat, ndim, hidden, first, nheads):
        super(GAT, self).__init__()

        self.attentions = [GraphAttentionLayer(batch_size, nfeat, hidden, first, concat=True) for _ in range(nheads)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

        self.out_att = GraphAttentionLayer(batch_size, hidden * nheads, ndim, first, concat=False)

    def forward(self, x, adj):
        x = torch.cat([att(x, adj) for att in self.attentions], dim=2)
        x = F.elu(self.out_att(x, adj))
        return F.log_softmax(x, dim=1)

## Define the training function

In [26]:
def train(adj,features,labels,train_indices,val_indices,first=False,nheads=8):
    
    # calculate symmetric normalisation for layer propogation
    adj_norm = normalize(adj)
    
    # Convert from numpy to torch tensors
    adj = torch.FloatTensor(adj)
    adj_norm = torch.FloatTensor(adj_norm)
    features = torch.FloatTensor(features)
    labels = torch.FloatTensor(labels)
    
    # initialise the model
    model = GAT(batch_size=adj.shape[0],
                nfeat=features.shape[-1],
                ndim=5,
                hidden=hidden,
                first=first,
                nheads=nheads)
    
    # Transfer the weights to GPU for training
    if cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        adj_norm = adj_norm.cuda()
        labels = labels.cuda()
    
    # Train model
    t_total = time.time()

    # Using adam optimizers for backpropogation
    optimizer = optim.Adam(model.parameters(),
                           lr=lr, weight_decay=weight_decay)
    
    # loss function criteria is cross entropy loss
    criterion = nn.CrossEntropyLoss()
    
    # Train for the no of epochs
    for epoch in range(epochs):

        t = time.time()
        
        model.train()
        
        # Pytorch accumulates gradient after every operation on tensors (defined by the model architecture)
        # with require_grad = True. With each new epoch, we need to reset this gradient to 0 to calculate gradient
        # for this epoch.
        optimizer.zero_grad()

        # get the output from forward propogation of our model
        output = model(features, adj_norm)
        
        
        # Calculate Train accuracy
        train_output = output[:,train_indices,:]
        train_labels = labels[train_indices,:]
        
        train_accuracy = torch.sum(torch.argmax(train_output,axis=2)==train_labels.reshape(1,-1))/train_labels.shape[0]
        
        # Calculate the loss between our models training output and true label
        loss = criterion(output[0],labels.reshape(-1).long())
        
        # Calculate the gradients 
        loss.backward(retain_graph=True)

        # Update the weights
        optimizer.step()
        
        model.eval()
        
        # Calculate Validation accuracy
        with torch.no_grad():
            val_output = output[:,val_indices,:]
            val_labels = labels[val_indices,:]
            val_accuracy = torch.sum(torch.argmax(val_output,axis=2)==val_labels.reshape(1,-1))/val_labels.shape[0]

        # Print summary of training 
        if epoch == 0:
            best_loss = loss
            best_output = output
            best_acc = train_accuracy
            best_val_acc = val_accuracy
            best_val_output = val_output
        else:
            if loss < best_loss and best_val_acc < val_accuracy :
                best_loss = loss
                best_output = output
                best_acc = train_accuracy
                best_val_acc = val_accuracy
                best_val_output = val_output

        if epoch == 0 or (epoch+1) % 10 == 0:
            print('Epoch: {:04d}'.format(epoch + 1),
                  'Train Accuracy: {:.4f}'.format(best_acc.item()),
                  'Validation Accuracy: {:.4f}'.format(best_val_acc.item()),
                  'Loss: {:.8f}'.format(best_loss.item()),
                  'time: {:.4f}s'.format(time.time() - t))
            
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    
    return best_loss, best_output, best_acc.item(), best_val_acc.item()

## Divide dataset into 80% train and 20% validation and run the training

In [27]:
# set Train %

train_percentage = .8
    
# Train set
number_of_rows = features[0].shape[0]
train_indices = np.random.choice(number_of_rows, size=int(train_percentage*number_of_rows), replace=False)
val_indices = np.setdiff1d(np.arange(adj.shape[1]),train_indices)

# Start Train
loss, op, train_acc, val_acc = train(adj,features,labels,train_indices,val_indices,True,8)

train_results.append(train_acc)
val_results.append(val_acc)

Epoch: 0001 Train Accuracy: 0.0463 Validation Accuracy: 0.0503 Loss: 1.60980046 time: 1.2953s
Epoch: 0010 Train Accuracy: 0.2773 Validation Accuracy: 0.2910 Loss: 1.60941541 time: 0.0440s
Epoch: 0020 Train Accuracy: 0.2773 Validation Accuracy: 0.2910 Loss: 1.60941541 time: 0.0482s
Epoch: 0030 Train Accuracy: 0.2773 Validation Accuracy: 0.2910 Loss: 1.60941541 time: 0.0506s
Epoch: 0040 Train Accuracy: 0.2773 Validation Accuracy: 0.2910 Loss: 1.60941541 time: 0.0395s
Epoch: 0050 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0475s
Epoch: 0060 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0404s
Epoch: 0070 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0394s
Epoch: 0080 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0458s
Epoch: 0090 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0480s
Epoch: 0100 Train Accuracy: 0.3752 Validation Accuracy: 0.35

Epoch: 0880 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0408s
Epoch: 0890 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0469s
Epoch: 0900 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0501s
Epoch: 0910 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0403s
Epoch: 0920 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0404s
Epoch: 0930 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0404s
Epoch: 0940 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0445s
Epoch: 0950 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0456s
Epoch: 0960 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0405s
Epoch: 0970 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0371s
Epoch: 0980 Train Accuracy: 0.3752 Validation Accuracy: 0.35

Epoch: 1760 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0359s
Epoch: 1770 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0474s
Epoch: 1780 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0404s
Epoch: 1790 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0506s
Epoch: 1800 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0460s
Epoch: 1810 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0406s
Epoch: 1820 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0476s
Epoch: 1830 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0460s
Epoch: 1840 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0390s
Epoch: 1850 Train Accuracy: 0.3752 Validation Accuracy: 0.3545 Loss: 1.60927665 time: 0.0405s
Epoch: 1860 Train Accuracy: 0.3752 Validation Accuracy: 0.35

## Compare the accuracy of different models

In [28]:
d = {'Algorithm': names, 'Train Accuracy': train_results, 'Validation Accuracy': val_results}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,K Nearest Neighbors,0.984116,0.978836
1,Logistic Regression,0.438782,0.44709
2,Decision Tree,0.986764,0.981481
3,Random Forest,0.969557,0.939153
4,Neural Net,0.438782,0.44709
5,AddaBoost,0.488418,0.526455
6,Gaussian Naive Bayes,0.981469,0.986772
7,SVM Sigmoid,0.936466,0.928571
8,Gradient Boosting,1.0,0.978836
9,XGBoost,1.0,0.973545
