In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn import decomposition

import warnings
warnings.filterwarnings("ignore") 

seed = 433

## Import census data

In [2]:
data = pd.read_csv('311_2019.csv', delimiter=',')
data = data.rename(columns={'Incident Zip': 'Zip', 'Complaint Type' : 'Complaint'})
data = data[['Zip', 'Complaint']]
data.Zip=pd.to_numeric(data.Zip,errors='coerce')
data=data.loc[(data.Zip>=10000)&(data.Zip<11500)]
data.head()

Unnamed: 0,Zip,Complaint
1,11434.0,Damaged Tree
2,11212.0,Graffiti
3,10016.0,Graffiti
4,10032.0,Graffiti
5,11420.0,Noise - Commercial


In [3]:
data = data.groupby(['Zip', 'Complaint']).size().reset_index(name='count')
data.head()

Unnamed: 0,Zip,Complaint,count
0,10000.0,Abandoned Vehicle,1
1,10000.0,Animal in a Park,9
2,10000.0,Animal-Abuse,5
3,10000.0,Bike/Roller/Skate Chronic,4
4,10000.0,Consumer Complaint,23


## Convert complaint types to categorical one hot variables

In [4]:
#convert a list to a 2d table with zip codes as rows and complaint types as columns
data=pd.pivot_table(data,index='Zip',columns='Complaint',values='count',fill_value=0)
data.head()

Complaint,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Abuse,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000.0,0,1,0,0,0,9,5,0,0,0,...,0,5,25,0,0,0,0,0,0,0
10001.0,0,9,19,0,0,7,16,0,1,0,...,0,64,8,0,2,0,2,23,0,0
10002.0,0,16,9,0,0,14,25,0,1,0,...,1,12,54,0,0,0,6,26,0,0
10003.0,1,18,7,1,0,12,50,0,8,0,...,0,29,43,0,1,0,17,25,0,0
10004.0,0,2,0,0,0,1,1,0,0,0,...,0,149,25,0,0,0,0,1,0,0


In [5]:
Total=data.sum(axis=1) #total 311 activity per zip code
data=data.div(data.sum(axis=1), axis=0) #normalize activity of various cathegories within zip code by total
data=data.loc[Total>100] #keep only those zip codes having sufficient activity
data.head()

Complaint,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Abuse,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000.0,0.0,0.005464,0.0,0.0,0.0,0.04918,0.027322,0.0,0.0,0.0,...,0.0,0.027322,0.136612,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001.0,0.0,0.001866,0.003939,0.0,0.0,0.001451,0.003317,0.0,0.000207,0.0,...,0.0,0.01327,0.001659,0.0,0.000415,0.0,0.000415,0.004769,0.0,0.0
10002.0,0.0,0.002056,0.001157,0.0,0.0,0.001799,0.003213,0.0,0.000129,0.0,...,0.000129,0.001542,0.00694,0.0,0.0,0.0,0.000771,0.003341,0.0,0.0
10003.0,0.000135,0.002431,0.000946,0.000135,0.0,0.001621,0.006754,0.0,0.001081,0.0,...,0.0,0.003917,0.005808,0.0,0.000135,0.0,0.002296,0.003377,0.0,0.0
10004.0,0.0,0.001427,0.0,0.0,0.0,0.000713,0.000713,0.0,0.0,0.0,...,0.0,0.106277,0.017832,0.0,0.0,0.0,0.0,0.000713,0.0,0.0


## Add the income level feature into the dataset

In [6]:
# Add the income level feature into the dataset

income = pd.read_csv('data/zipcode_income.csv', delimiter=',')
income = income.iloc[:,:2]
income[income.isna()] = 0
income.rename(columns={'ZIPCODE':'Zip', 'median_familyIncome(USD)' : 'income'}, inplace=True)
income=income.loc[(income.Zip>=10000)&(income.Zip<11500)]
income['income'][income['income'] == 0.0] = income['income'].mean() # Replace 0 income with mean
income['income'] = income['income'] / income['income'].sum()
income.head()

Unnamed: 0,Zip,income
1,10001,0.007785
2,10002,0.002521
3,10003,0.011663
4,10004,0.005682
5,10006,0.012767


## Add the area size feature into the dataset

In [7]:
# Add the area size feature into the dataset

area = pd.read_csv('data/zips_area.csv', delimiter=',')
area = area.iloc[:,:2]
area.rename(columns={'ZIPCODE':'Zip', 'AREA' : 'area'}, inplace=True)
areas = area.copy()
area=area.loc[(area.Zip>=10000)&(area.Zip<11500)]
area['area'] = area['area'] / area['area'].sum()
area.head()

Unnamed: 0,Zip,area
0,11436,0.00282
1,11213,0.003681
2,11212,0.005214
3,11225,0.002944
4,11218,0.00458


## Add the population jobs feature into the dataset

In [8]:
# Add the population jobs feature into the dataset

jobs = pd.read_csv('data/zipcode_population_Jobs.csv', delimiter=',')
jobs = jobs.iloc[:,:2]
jobs.rename(columns={'ZIPCODE':'Zip', 'totalJobs' : 'jobs'}, inplace=True)
jobs['jobs'] = jobs['jobs'] / areas['area']
jobs.head()

Unnamed: 0,Zip,jobs
0,10001,0.009204
1,10002,0.000904
2,10003,0.002335
3,10004,0.003356
4,10005,0.001389


## Add the population feature into the dataset

In [9]:
# Add the population feature into the dataset

population = pd.read_csv('data/zipcode_population_Jobs.csv', delimiter=',')
population = population[['ZIPCODE','POPULATION']]
population.rename(columns={'ZIPCODE':'Zip', 'POPULATION' : 'population'}, inplace=True)
population['population'][population['population'] < 30.0] = population['population'].mean() # Replace 0 population with mean
population['population'] = population['population'] / areas['area']
population.head()

Unnamed: 0,Zip,population
0,10001,0.000987
1,10002,0.002744
2,10003,0.001331
3,10004,9.2e-05
4,10005,0.00022


## Add the house price feature into the dataset

In [10]:
# Add the house price feature into the dataset

housePrice = pd.read_csv('data/zipcode_housePrice.csv', delimiter=',')
weights = [5000,12500,17500,22500,27500,32500,37500,45000,55000,65000,75000,85000
                    ,95000,112500,137500,162500,187500,225000,275000,350000,450000,625000,875000
                        ,1250000,1750000,2000000]
for i in range(len(weights)):
    housePrice.iloc[i,2:] = housePrice.iloc[i,2:]*weights[i]
housePrice.iloc[:,1] [housePrice.iloc[:,1] == 0] = 1

tmp = (housePrice.iloc[:,2:] != 0 ).sum(axis=1)
tmp[tmp == 0] = 1
housePrice = pd.concat([housePrice.iloc[:,0],housePrice.iloc[:,2:].sum(axis=1) / tmp ], axis = 1 )
housePrice.rename(columns={'ZIPCODE':'Zip', 0: 'house_price'}, inplace=True)
housePrice['house_price'] = housePrice['house_price']/housePrice['house_price'].sum()

## Merge all the features

In [11]:
data = pd.merge(data, income, on="Zip")
data = data.merge(population)
data = data.merge(jobs)
data = data.merge(housePrice)
data = data.merge(area)
data = data.drop_duplicates(subset=['Zip'], keep='last')
data.head()

Unnamed: 0,Zip,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Abuse,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,...,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment,income,population,jobs,house_price,area
0,10001.0,0.0,0.001866,0.003939,0.0,0.0,0.001451,0.003317,0.0,0.000207,...,0.0,0.000415,0.004769,0.0,0.0,0.007785,0.000987,0.009204,0.000437,0.002211
1,10002.0,0.0,0.002056,0.001157,0.0,0.0,0.001799,0.003213,0.0,0.000129,...,0.0,0.000771,0.003341,0.0,0.0,0.002521,0.002744,0.000904,0.000571,0.003265
2,10003.0,0.000135,0.002431,0.000946,0.000135,0.0,0.001621,0.006754,0.0,0.001081,...,0.0,0.002296,0.003377,0.0,0.0,0.011663,0.001331,0.002335,0.002566,0.00193
6,10004.0,0.0,0.001427,0.0,0.0,0.0,0.000713,0.000713,0.0,0.0,...,0.0,0.0,0.000713,0.0,0.0,0.005682,9.2e-05,0.003356,0.0,8.3e-05
7,10006.0,0.0,0.007491,0.001873,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001873,0.0,0.0,0.012767,7.6e-05,0.000546,0.00168,0.000213


In [12]:
data1 = pd.read_csv('data/zips_merged.csv', delimiter=',')
data1 = data1.rename(columns={'total': 'weight', 'w_zip':'origin', 'h_zip':'destination'})
data1 = data1[data1.destination.isin(data1.origin.unique())]
data1 = data1[['origin', 'destination', 'weight', 'true_label']]
data1 = data1[data1['origin'].isin(data['Zip'])][data1['destination'].isin(data['Zip'])]
data2 = data[data['Zip'].isin(data1['destination'])]

In [13]:
data.head()

Unnamed: 0,Zip,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Abuse,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,...,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment,income,population,jobs,house_price,area
0,10001.0,0.0,0.001866,0.003939,0.0,0.0,0.001451,0.003317,0.0,0.000207,...,0.0,0.000415,0.004769,0.0,0.0,0.007785,0.000987,0.009204,0.000437,0.002211
1,10002.0,0.0,0.002056,0.001157,0.0,0.0,0.001799,0.003213,0.0,0.000129,...,0.0,0.000771,0.003341,0.0,0.0,0.002521,0.002744,0.000904,0.000571,0.003265
2,10003.0,0.000135,0.002431,0.000946,0.000135,0.0,0.001621,0.006754,0.0,0.001081,...,0.0,0.002296,0.003377,0.0,0.0,0.011663,0.001331,0.002335,0.002566,0.00193
6,10004.0,0.0,0.001427,0.0,0.0,0.0,0.000713,0.000713,0.0,0.0,...,0.0,0.0,0.000713,0.0,0.0,0.005682,9.2e-05,0.003356,0.0,8.3e-05
7,10006.0,0.0,0.007491,0.001873,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001873,0.0,0.0,0.012767,7.6e-05,0.000546,0.00168,0.000213


## Load Data for processing

In [14]:
# function for loading data, returns adjacency matrix, initial feature assignments and true labels

def load_data():

    G = nx.from_pandas_edgelist(data1, 'origin', 'destination', 'weight',create_using=nx.DiGraph())
    adj_list = np.array([nx.adjacency_matrix(G).todense()], dtype=float)
    
    init_feat = data.to_numpy()[:,:-5]
    
    true_label = data.to_numpy()[:,-5:]
    
    return adj_list,init_feat,true_label

adj,feature,labels = load_data()

## PCA and reduce dimension

In [15]:
pca = decomposition.PCA(n_components=10)
feature = pca.fit_transform(feature)

features = np.expand_dims(feature, axis=0)

## Importing different classifier from sklearn

In [16]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn import tree
from sklearn.gaussian_process import GaussianProcessRegressor

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

## Define various baseline models on the data

In [17]:
train_results = []
val_results = []

# Baseline Model building to train
names = ['Linear Regression', 'Decision Tree Regression', 'Bayesian Ridge Regression', 'XGB Regression', 
         'Gradient Boosting Regression', 'Support Vector Regression', 'Gaussian Process Regression', 'GNN']

Classifiers = [
        LinearRegression(),
        tree.DecisionTreeRegressor(),
        BayesianRidge(),
        XGBRegressor(),
        GradientBoostingRegressor(),
        SVR(),
        GaussianProcessRegressor()
        ]

## Train various baseline models on the data

In [18]:
def baseline(i, label):

    print("Feature consider = "+data.columns[-5+i]+"\n\n\n")

    X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size = 0.2, random_state = seed)

    train_results.append([])
    val_results.append([])
    
    models = zip(names, Classifiers)

    for name, model in models:

        model.fit(X_train,y_train)
        train_acc = model.score(X_train,y_train)
        val_acc = model.score(X_test,y_test)

        msg = "{0}:\nTraining R2 : {1} Validation R2 : {2}\n".format(name, train_acc, val_acc)
        print(msg)

        train_results[i].append(train_acc)
        val_results[i].append(val_acc)

## Train baseline models on income feature

In [19]:
baseline(0,labels[:,0].reshape(-1,1))

Feature consider = income



Linear Regression:
Training R2 : 0.663132445212302 Validation R2 : 0.6223714495685317

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : 0.4997405763156393

Bayesian Ridge Regression:
Training R2 : 0.6600590885455687 Validation R2 : 0.6409722490747973

XGB Regression:
Training R2 : 0.9794873191026398 Validation R2 : 0.7110566680560005

Gradient Boosting Regression:
Training R2 : 0.990418827992623 Validation R2 : 0.7543207243341643

Support Vector Regression:
Training R2 : -0.8889837587972114 Validation R2 : -0.6531644166957946

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -0.2947811984249409



## Train baseline models on population feature

In [20]:
baseline(1,labels[:,1].reshape(-1,1))

Feature consider = population



Linear Regression:
Training R2 : 0.08983565862932819 Validation R2 : -0.05206144642851851

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : -0.5836881326706358

Bayesian Ridge Regression:
Training R2 : 0.047411516015937405 Validation R2 : 0.016913767436320892

XGB Regression:
Training R2 : 0.9999638545231383 Validation R2 : 0.27306075383051365

Gradient Boosting Regression:
Training R2 : 0.9985769163693877 Validation R2 : 0.6367208635775177

Support Vector Regression:
Training R2 : -0.4108050961987164 Validation R2 : -0.82862683772049

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -0.546860806344335



## Train baseline models on jobs feature

In [21]:
baseline(2,labels[:,2].reshape(-1,1))

Feature consider = jobs



Linear Regression:
Training R2 : 0.04882476967617111 Validation R2 : -0.8664389617668129

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : 0.0031831923048470934

Bayesian Ridge Regression:
Training R2 : 0.025131041640210183 Validation R2 : -0.41034146908725355

XGB Regression:
Training R2 : 0.9995908504017639 Validation R2 : -0.04002501338812192

Gradient Boosting Regression:
Training R2 : 0.9983734355966841 Validation R2 : 0.011071922483373742

Support Vector Regression:
Training R2 : -6.614211242179265 Validation R2 : -69.12167940358452

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -11.618351321318992



## Train baseline models on house price feature

In [22]:
baseline(3,labels[:,3].reshape(-1,1))

Feature consider = house_price



Linear Regression:
Training R2 : 0.15612812905474793 Validation R2 : 0.029524361415966482

Decision Tree Regression:
Training R2 : 0.9999999999996514 Validation R2 : -0.17698811625739697

Bayesian Ridge Regression:
Training R2 : 0.09122345854857306 Validation R2 : 0.028226314162391408

XGB Regression:
Training R2 : 0.9995392195449473 Validation R2 : 0.26638928306117104

Gradient Boosting Regression:
Training R2 : 0.9999846805123455 Validation R2 : 0.17384914672856477

Support Vector Regression:
Training R2 : -12.236689134707358 Validation R2 : -1.013870446211953

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : 0.7570748425707411



## Train baseline models on area feature

In [23]:
baseline(4,labels[:,4].reshape(-1,1))

Feature consider = area



Linear Regression:
Training R2 : 0.4024499843948215 Validation R2 : 0.14699050994020357

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : 0.09186763236684692

Bayesian Ridge Regression:
Training R2 : 0.3937176094063686 Validation R2 : 0.1395509667342495

XGB Regression:
Training R2 : 0.9814117850667233 Validation R2 : 0.19852451518650094

Gradient Boosting Regression:
Training R2 : 0.9851595264908393 Validation R2 : 0.056893472531975076

Support Vector Regression:
Training R2 : -3.2565603229584834 Validation R2 : -0.44865544603645113

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -0.14051871234128432



## Import packages for GNN

In [24]:
# import packages

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
torch.set_printoptions(sci_mode=False)
import time
from sklearn.metrics import r2_score

## Hyperparameters

In [25]:
# set initial model config

cuda = torch.cuda.is_available()
weight_decay = 1e-8
epochs = 250000
seed = 635
hidden = 16
lr = 0.001

In [26]:
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

## Define symmetric normalization for the adjacency matrix

In [27]:
def normalize(adj):

    adj = torch.FloatTensor(adj)
    adj_id = torch.FloatTensor(torch.eye(adj.shape[1]))
    adj_id = adj_id.reshape((1, adj.shape[1], adj.shape[1]))
    adj_id = adj_id.repeat(adj.shape[0], 1, 1)
    adj = adj + adj_id
    rowsum = torch.FloatTensor(adj.sum(2))
    degree_mat_inv_sqrt = torch.diag_embed(torch.float_power(rowsum,-0.5), dim1=-2, dim2=-1).float()
    adj_norm = torch.bmm(torch.transpose(torch.bmm(adj,degree_mat_inv_sqrt),1,2),degree_mat_inv_sqrt)

    return adj_norm

## Define GNN Layers

In [28]:
class GNN1Layer(Module):

    def __init__(self, batch_size, in_features, out_features, first):
        super(GNN1Layer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.batch_size = batch_size
        
        # Initialse W1 = 1, W2 = 0 as pytorch learnable weights (parameters) that have require_grad = True which is
        # required for calculating gradients while backpropogating using gradient descent
        weight1_eye = torch.FloatTensor(torch.eye(in_features, out_features))
        weight1_eye = weight1_eye.reshape((1, in_features, out_features))
        weight1_eye = weight1_eye.repeat(batch_size, 1, 1)
        self.weight1 = Parameter(weight1_eye)
        if not first:
            self.weight2 = Parameter(torch.zeros(batch_size, in_features, out_features))
        else:
            self.weight2 = Parameter(torch.empty(batch_size, in_features, out_features))
            nn.init.kaiming_normal_(self.weight2, mode='fan_out')

    def forward(self, input, adj):
        # first term H*W1
        v1 = torch.bmm(input, self.weight1)
        # second term adj_norm*H*W2
        v2 = torch.bmm(torch.bmm(adj, input), self.weight2)
        # adding the two terms
        output = v1 + v2

        return output

## Define GNN Model

In [29]:
class GNN1(nn.Module):

    def __init__(self, batch_size, nfeat, ndim, hidden, first):
        super(GNN1, self).__init__()

        self.gc1 = GNN1Layer(batch_size, nfeat, hidden, first)
        self.gc2 = GNN1Layer(batch_size, hidden, ndim, first)

    def forward(self, x, adj):

        # Applying activation function sigma (doublerelu) on the layer propogation
        x = nn.Sigmoid()(self.gc1(x, adj))
        x = nn.Sigmoid()(self.gc2(x, adj))
        #x = x/x.sum(axis=2).unsqueeze(2) #normalize st sum = 1
        
        return x

## Define the training function

In [30]:
def train(adj,features,labels,train_indices,val_indices,first=False):
    
    # calculate symmetric normalisation for layer propogation
    adj_norm = normalize(adj)
    
    #labels = labels - 1
    
    # Convert from numpy to torch tensors
    adj = torch.FloatTensor(adj)
    adj_norm = torch.FloatTensor(adj_norm)
    features = torch.FloatTensor(features)
    labels = torch.FloatTensor(labels)
    
    # initialise the model
    model = GNN1(batch_size=adj.shape[0],
                nfeat=features.shape[-1],
                ndim=1,
                hidden=hidden,
                first=first)
    
    # Transfer the weights to GPU for training
    if cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        adj_norm = adj_norm.cuda()
        labels = labels.cuda()
    
    # Train model
    t_total = time.time()

    # Using adam optimizers for backpropogation
    optimizer = optim.Adam(model.parameters(),
                           lr=lr, weight_decay=weight_decay)
    
    # loss function criteria is MSE
    criterion = nn.MSELoss()
    
    # Train for the no of epochs
    for epoch in range(epochs):

        t = time.time()
        
        model.train()
        
        # Pytorch accumulates gradient after every operation on tensors (defined by the model architecture)
        # with require_grad = True. With each new epoch, we need to reset this gradient to 0 to calculate gradient
        # for this epoch.
        optimizer.zero_grad()

        # get the output from forward propogation of our model
        output = model(features, adj_norm)
        
        
        # Calculate Train accuracy
        train_output = output[:,train_indices,:]
        train_labels = labels[train_indices,:]
        
        #train_accuracy = torch.sum(torch.argmax(train_output,axis=2)==train_labels.reshape(1,-1))/train_labels.shape[0]
        train_r2 = r2_score(torch.flatten(train_labels).detach().cpu(), torch.flatten(train_output).detach().cpu())
        
        # Calculate the loss between our models training output and true label
        loss = criterion(torch.flatten(train_output),torch.flatten(train_labels))
        
        # Calculate the gradients 
        loss.backward(retain_graph=True)

        # Update the weights
        optimizer.step()
        
        model.eval()
        
        # Calculate Validation accuracy
        with torch.no_grad():
            val_output = output[:,val_indices,:]
            val_labels = labels[val_indices,:]
            val_r2 = r2_score(torch.flatten(val_labels).detach().cpu(), torch.flatten(val_output).detach().cpu())

        # Print summary of training 
        if epoch == 0:
            best_loss = loss
            best_output = output
            best_r2 = train_r2
            best_val_r2 = val_r2
            best_val_output = val_output
        else:
            if loss < best_loss:
                best_loss = loss
                best_output = output
                best_r2 = train_r2
                best_val_r2 = val_r2
                best_val_output = val_output

        if epoch == 0 or (epoch+1) % 10000 == 0:
            print('Epoch: {:04d}'.format(epoch + 1),
                  'Train R2 score: {:.4f}'.format(best_r2.item()),
                  'Validation R2 score: {:.4f}'.format(best_val_r2.item()),
                  'Loss: {:.8f}'.format(best_loss.item()),
                  'time: {:.4f}s'.format(time.time() - t))
            
        if val_r2 > best_val_r2:
            train_r2 = train_r2.item()
            val_r2 = val_r2.item()
            
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    
    return best_loss, best_output, train_r2, val_r2

## Divide dataset into 80% train and 20% validation

In [31]:
# set Train %

train_percentage = .8
    
# Train set
number_of_rows = features[0].shape[0]
train_indices = np.random.choice(number_of_rows, size=int(train_percentage*number_of_rows), replace=False)
val_indices = np.setdiff1d(np.arange(adj.shape[1]),train_indices)


## Run the training on income feature

In [32]:
# Start Train
loss, op, train_acc, val_acc = train(adj,features,labels[:,0].reshape(-1,1),train_indices,val_indices,True)

train_results[0].append(train_acc)
val_results[0].append(val_acc)

Epoch: 0001 Train R2 score: -100896.8242 Validation R2 score: -78176.7761 Loss: 0.89584577 time: 1.6836s
Epoch: 10000 Train R2 score: -0.9356 Validation R2 score: -1.0462 Loss: 0.00001719 time: 0.0030s
Epoch: 20000 Train R2 score: 0.7848 Validation R2 score: 0.4674 Loss: 0.00000191 time: 0.0030s
Epoch: 30000 Train R2 score: 0.8186 Validation R2 score: 0.6349 Loss: 0.00000161 time: 0.0030s
Epoch: 40000 Train R2 score: 0.8262 Validation R2 score: 0.6380 Loss: 0.00000154 time: 0.0030s
Epoch: 50000 Train R2 score: 0.8300 Validation R2 score: 0.6410 Loss: 0.00000151 time: 0.0050s
Epoch: 60000 Train R2 score: 0.8328 Validation R2 score: 0.6440 Loss: 0.00000148 time: 0.0030s
Epoch: 70000 Train R2 score: 0.8349 Validation R2 score: 0.6446 Loss: 0.00000147 time: 0.0030s
Epoch: 80000 Train R2 score: 0.8369 Validation R2 score: 0.6423 Loss: 0.00000145 time: 0.0030s
Epoch: 90000 Train R2 score: 0.8386 Validation R2 score: 0.6407 Loss: 0.00000143 time: 0.0030s
Epoch: 100000 Train R2 score: 0.8399 V

## Compare the accuracy of different models for income feature

In [33]:
d = {'Algorithm': names, 'Train Accuracy': train_results[0], 'Validation Accuracy': val_results[0]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.663132,0.622371
1,Decision Tree Regression,1.0,0.499741
2,Bayesian Ridge Regression,0.660059,0.640972
3,XGB Regression,0.979487,0.711057
4,Gradient Boosting Regression,0.990419,0.754321
5,Support Vector Regression,-0.888984,-0.653164
6,Gaussian Process Regression,1.0,-0.294781
7,GNN,0.846572,0.641759


## Run the training on area size feature

In [34]:

loss, op, train_acc, val_acc = train(adj,features,labels[:,1].reshape(-1,1),train_indices,val_indices,True)

train_results[1].append(train_acc)
val_results[1].append(val_acc)

Epoch: 0001 Train R2 score: -36.6765 Validation R2 score: -2034.5239 Loss: 0.53387612 time: 0.0040s
Epoch: 10000 Train R2 score: 0.5924 Validation R2 score: -0.3825 Loss: 0.00577561 time: 0.0030s
Epoch: 20000 Train R2 score: 0.9244 Validation R2 score: -0.5034 Loss: 0.00107107 time: 0.0030s
Epoch: 30000 Train R2 score: 0.9369 Validation R2 score: -0.5215 Loss: 0.00089396 time: 0.0030s
Epoch: 40000 Train R2 score: 0.9383 Validation R2 score: -0.5840 Loss: 0.00087407 time: 0.0040s
Epoch: 50000 Train R2 score: 0.9389 Validation R2 score: -0.7067 Loss: 0.00086569 time: 0.0030s
Epoch: 60000 Train R2 score: 0.9393 Validation R2 score: -0.8103 Loss: 0.00085985 time: 0.0030s
Epoch: 70000 Train R2 score: 0.9405 Validation R2 score: -1.0999 Loss: 0.00084343 time: 0.0030s
Epoch: 80000 Train R2 score: 0.9713 Validation R2 score: -2.2030 Loss: 0.00040602 time: 0.0030s
Epoch: 90000 Train R2 score: 0.9771 Validation R2 score: -1.5169 Loss: 0.00032406 time: 0.0040s
Epoch: 100000 Train R2 score: 0.9783

## Compare the accuracy of different models for area size feature

In [35]:
d = {'Algorithm': names, 'Train Accuracy': train_results[1], 'Validation Accuracy': val_results[1]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.089836,-0.052061
1,Decision Tree Regression,1.0,-0.583688
2,Bayesian Ridge Regression,0.047412,0.016914
3,XGB Regression,0.999964,0.273061
4,Gradient Boosting Regression,0.998577,0.636721
5,Support Vector Regression,-0.410805,-0.828627
6,Gaussian Process Regression,1.0,-0.546861
7,GNN,0.978838,-0.184267


## Run the training on population jobs feature

In [36]:

loss, op, train_acc, val_acc = train(adj,features,labels[:,2].reshape(-1,1),train_indices,val_indices,True)

train_results[2].append(train_acc)
val_results[2].append(val_acc)

Epoch: 0001 Train R2 score: -703.9153 Validation R2 score: -14380.7131 Loss: 0.91138673 time: 0.0040s
Epoch: 10000 Train R2 score: 0.2461 Validation R2 score: -1.7515 Loss: 0.00097475 time: 0.0040s
Epoch: 20000 Train R2 score: 0.8251 Validation R2 score: -6.6213 Loss: 0.00022609 time: 0.0030s
Epoch: 30000 Train R2 score: 0.8675 Validation R2 score: -70.3280 Loss: 0.00017130 time: 0.0030s
Epoch: 40000 Train R2 score: 0.8728 Validation R2 score: -65.3668 Loss: 0.00016441 time: 0.0040s
Epoch: 50000 Train R2 score: 0.8744 Validation R2 score: -57.7173 Loss: 0.00016239 time: 0.0050s
Epoch: 60000 Train R2 score: 0.8751 Validation R2 score: -47.5837 Loss: 0.00016148 time: 0.0030s
Epoch: 70000 Train R2 score: 0.8756 Validation R2 score: -37.9102 Loss: 0.00016089 time: 0.0040s
Epoch: 80000 Train R2 score: 0.8760 Validation R2 score: -23.8865 Loss: 0.00016036 time: 0.0040s
Epoch: 90000 Train R2 score: 0.8764 Validation R2 score: -11.4257 Loss: 0.00015975 time: 0.0030s
Epoch: 100000 Train R2 scor

## Compare the accuracy of different models for population jobs feature

In [37]:
d = {'Algorithm': names, 'Train Accuracy': train_results[2], 'Validation Accuracy': val_results[2]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.048825,-0.866439
1,Decision Tree Regression,1.0,0.003183
2,Bayesian Ridge Regression,0.025131,-0.410341
3,XGB Regression,0.999591,-0.040025
4,Gradient Boosting Regression,0.998373,0.011072
5,Support Vector Regression,-6.614211,-69.121679
6,Gaussian Process Regression,1.0,-11.618351
7,GNN,0.883863,-21.374517


## Run the training on population feature

In [38]:

loss, op, train_acc, val_acc = train(adj,features,labels[:,3].reshape(-1,1),train_indices,val_indices,True)

train_results[3].append(train_acc)
val_results[3].append(val_acc)

Epoch: 0001 Train R2 score: -815.4537 Validation R2 score: -2930.2359 Loss: 0.81471401 time: 0.0040s
Epoch: 10000 Train R2 score: 0.8877 Validation R2 score: -0.3306 Loss: 0.00011206 time: 0.0020s
Epoch: 20000 Train R2 score: 0.9351 Validation R2 score: 0.2271 Loss: 0.00006480 time: 0.0030s
Epoch: 30000 Train R2 score: 0.9662 Validation R2 score: -0.0566 Loss: 0.00003374 time: 0.0030s
Epoch: 40000 Train R2 score: 0.9831 Validation R2 score: -2.9696 Loss: 0.00001682 time: 0.0030s
Epoch: 50000 Train R2 score: 0.9929 Validation R2 score: -6.4240 Loss: 0.00000706 time: 0.0040s
Epoch: 60000 Train R2 score: 0.9938 Validation R2 score: -8.2449 Loss: 0.00000615 time: 0.0030s
Epoch: 70000 Train R2 score: 0.9942 Validation R2 score: -9.1016 Loss: 0.00000579 time: 0.0020s
Epoch: 80000 Train R2 score: 0.9942 Validation R2 score: -9.1016 Loss: 0.00000579 time: 0.0030s
Epoch: 90000 Train R2 score: 0.9942 Validation R2 score: -9.1016 Loss: 0.00000579 time: 0.0030s
Epoch: 100000 Train R2 score: 0.9942

## Compare the accuracy of different models for population feature

In [39]:
d = {'Algorithm': names, 'Train Accuracy': train_results[3], 'Validation Accuracy': val_results[3]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.156128,0.029524
1,Decision Tree Regression,1.0,-0.176988
2,Bayesian Ridge Regression,0.091223,0.028226
3,XGB Regression,0.999539,0.266389
4,Gradient Boosting Regression,0.999985,0.173849
5,Support Vector Regression,-12.236689,-1.01387
6,Gaussian Process Regression,1.0,0.757075
7,GNN,0.986328,0.047877


## Run the training on house price feature

In [40]:

loss, op, train_acc, val_acc = train(adj,features,labels[:,4].reshape(-1,1),train_indices,val_indices,True)

train_results[4].append(train_acc)
val_results[4].append(val_acc)

Epoch: 0001 Train R2 score: -1036.7012 Validation R2 score: -4523.4729 Loss: 0.04560682 time: 0.0040s
Epoch: 10000 Train R2 score: -0.1532 Validation R2 score: -1.2542 Loss: 0.00005068 time: 0.0030s
Epoch: 20000 Train R2 score: 0.8557 Validation R2 score: -1.6714 Loss: 0.00000634 time: 0.0030s
Epoch: 30000 Train R2 score: 0.8921 Validation R2 score: -2.9480 Loss: 0.00000474 time: 0.0030s
Epoch: 40000 Train R2 score: 0.9071 Validation R2 score: -2.8313 Loss: 0.00000408 time: 0.0030s
Epoch: 50000 Train R2 score: 0.9109 Validation R2 score: -2.5607 Loss: 0.00000392 time: 0.0020s
Epoch: 60000 Train R2 score: 0.9136 Validation R2 score: -2.3149 Loss: 0.00000380 time: 0.0030s
Epoch: 70000 Train R2 score: 0.9217 Validation R2 score: -2.3559 Loss: 0.00000344 time: 0.0050s
Epoch: 80000 Train R2 score: 0.9227 Validation R2 score: -2.4989 Loss: 0.00000340 time: 0.0030s
Epoch: 90000 Train R2 score: 0.9234 Validation R2 score: -2.5047 Loss: 0.00000337 time: 0.0030s
Epoch: 100000 Train R2 score: 0.9

## Compare the accuracy of different models for house price feature

In [41]:
d = {'Algorithm': names, 'Train Accuracy': train_results[4], 'Validation Accuracy': val_results[4]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.40245,0.146991
1,Decision Tree Regression,1.0,0.091868
2,Bayesian Ridge Regression,0.393718,0.139551
3,XGB Regression,0.981412,0.198525
4,Gradient Boosting Regression,0.98516,0.056893
5,Support Vector Regression,-3.25656,-0.448655
6,Gaussian Process Regression,1.0,-0.140519
7,GNN,0.930519,-2.102115
