In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn import decomposition

import warnings
warnings.filterwarnings("ignore") 

seed = 635

## Import census data

In [2]:
data = pd.read_csv('data/data_311_ct_joined.csv', delimiter=',')
data = data.rename(columns={'Complaint Type' : 'Complaint', 'GEOID' : 'ct'})
data = data[['ct', 'Complaint']]
data.head()

Unnamed: 0,ct,Complaint
0,36061000100,Lost Property
1,36061000100,Lost Property
2,36061000100,Lost Property
3,36061000100,Lost Property
4,36061000100,Lost Property


In [3]:
data = data.groupby(['ct', 'Complaint']).size().reset_index(name='count')
data.head()

Unnamed: 0,ct,Complaint,count
0,36005000100,Blocked Driveway,1
1,36005000100,Consumer Complaint,8
2,36005000100,Food Poisoning,1
3,36005000100,Noise - Residential,1
4,36005000100,Non-Residential Heat,5


## Convert complaint types to categorical one hot variables

In [4]:
#convert a list to a 2d table with zip codes as rows and complaint types as columns
data=pd.pivot_table(data,index='ct',columns='Complaint',values='count',fill_value=0)
data.head()

Complaint,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,Beach/Pool/Sauna Complaint,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
ct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36005000100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36005000200,0,65,0,0,2,5,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
36005000400,0,33,0,0,2,3,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
36005001600,0,41,0,0,0,5,0,1,0,0,...,0,0,1,0,0,0,0,2,0,0
36005001901,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
Total=data.sum(axis=1) #total 311 activity per census tract
data=data.div(data.sum(axis=1), axis=0) #normalize activity of various cathegories within census tract by total
data=data.loc[Total>100] #keep only those census tracts having sufficient activity
data.reset_index(inplace=True)
data.head()

Complaint,ct,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Vacant Lot,Vending,Violation of Park Rules,WATER LEAK,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment
0,36005000200,0.0,0.090782,0.0,0.0,0.002793,0.006983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001397,0.0,0.0
1,36005000400,0.0,0.078759,0.0,0.0,0.004773,0.00716,0.0,0.0,0.0,...,0.0,0.0,0.0,0.002387,0.0,0.0,0.0,0.0,0.0,0.0
2,36005001600,0.0,0.127726,0.0,0.0,0.0,0.015576,0.0,0.003115,0.0,...,0.0,0.0,0.003115,0.0,0.0,0.0,0.0,0.006231,0.0,0.0
3,36005001901,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36005001902,0.0,0.036066,0.003279,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003279,0.0,0.0


## Add the income level feature into the dataset

In [6]:
# Add the income level feature into the dataset

income = pd.read_csv('data/ct_income.csv', delimiter=',')
income.rename(columns={'median_income' : 'income'}, inplace=True)
income[income.isna()] = 0
income=income.loc[income.ct.isin(data['ct'])]
income['income'][income['income'] == 0.0] = income['income'].median() # Replace 0 income with median
income['income'] = income['income'] / (income['income'].sum() )#- income['income'].min() )
income.head()

Unnamed: 0,income,ct
1,0.000327,36005000200
2,0.000574,36005000400
3,0.000246,36005001600
8,0.000233,36005002500
9,0.000129,36005002701


## Add the age feature into the dataset

In [7]:
# Add the age feature into the dataset

age = pd.read_csv('data/ct_age.csv', delimiter=',')
age.rename(columns={'median_age' : 'age'}, inplace=True)
age=age.loc[age.ct.isin(data['ct'])]
age['age'] = age['age'] / (age['age'].max() - age['age'].min() )
age.head()

Unnamed: 0,ct,age
1,36005000200,0.657191
2,36005000400,0.688963
3,36005001600,0.563545
8,36005002500,0.553512
9,36005002701,0.48495


## Add the employment rate feature into the dataset

In [8]:
# Add the employment rate feature into the dataset

employment_rate = pd.read_csv('data/ct_employment_rate.csv', delimiter=',')
employment_rate=employment_rate.loc[employment_rate.ct.isin(data['ct'])]
employment_rate['employment_rate'] = employment_rate['employment_rate'] / (employment_rate['employment_rate'].max() - employment_rate['employment_rate'].min() )
employment_rate.head()

Unnamed: 0,ct,employment_rate
1,36005000200,0.508
2,36005000400,0.667
3,36005001600,0.504
8,36005002500,0.591
9,36005002701,0.547


## Add the house price feature into the dataset

In [9]:
# Add the house price feature into the dataset

housePrice = pd.read_csv('data/ct_avg_house_price.csv', delimiter=',')
housePrice=housePrice.loc[housePrice.ct.isin(data['ct'])]
housePrice[housePrice['avg_house_price'].isna()] = housePrice.median()
housePrice['avg_house_price'] = housePrice['avg_house_price']/ (housePrice['avg_house_price'].max() - housePrice['avg_house_price'].min() )
housePrice.head()

Unnamed: 0,avg_house_price,ct
1,0.175879,36005000200
2,0.175879,36005000400
3,0.31407,36047112200
8,0.123356,36005002500
9,0.31407,36047112200


## Merge all the features

In [10]:
data = pd.merge(data, income, on="ct")
data = data.merge(age)
data = data.merge(employment_rate)
data = data.merge(housePrice)
data = data.drop_duplicates(subset=['ct'], keep='last')
data.head()

Unnamed: 0,ct,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment,income,age,employment_rate,avg_house_price
0,36005000200,0.0,0.090782,0.0,0.0,0.002793,0.006983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001397,0.0,0.0,0.000327,0.657191,0.508,0.175879
1,36005000400,0.0,0.078759,0.0,0.0,0.004773,0.00716,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000574,0.688963,0.667,0.175879
2,36005002500,0.004367,0.0,0.0,0.0,0.0,0.008734,0.0,0.0,0.0,...,0.0,0.0,0.0,0.004367,0.0,0.0,0.000233,0.553512,0.591,0.123356
3,36005002800,0.0,0.070796,0.0,0.0,0.00885,0.013274,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00035,0.744147,0.563,0.175879
4,36005003100,0.0,0.015306,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,...,0.0,0.0,0.0,0.005102,0.0,0.0,0.000323,0.551839,0.714,0.013819


In [11]:
data1 = pd.read_csv('data/LEHD_311_2019.csv', delimiter=',')
data1 = data1.rename(columns={'distance': 'weight'})
data1 = data1[data1.destination.isin(data1.origin.unique())]
data1 = data1[['origin', 'destination', 'weight']]
data1 = data1[data1['origin'].isin(data['ct'])][data1['destination'].isin(data['ct'])]

In [12]:
data1.head()

Unnamed: 0,origin,destination,weight
4318,36081045400,36081044602,0.592328
4319,36081045400,36081044800,0.586815
4320,36081045400,36081045800,0.517808
4322,36081045400,36081046300,7.223306
4326,36081045400,36081046800,1.182094


In [13]:
data.head()

Unnamed: 0,ct,APPLIANCE,Abandoned Vehicle,Air Quality,Animal Facility - No Permit,Animal in a Park,Animal-Abuse,Appliance,Asbestos,BEST/Site Safety,...,Water Conservation,Water Leak,Water Quality,Water System,Window Guard,X-Ray Machine/Equipment,income,age,employment_rate,avg_house_price
0,36005000200,0.0,0.090782,0.0,0.0,0.002793,0.006983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001397,0.0,0.0,0.000327,0.657191,0.508,0.175879
1,36005000400,0.0,0.078759,0.0,0.0,0.004773,0.00716,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000574,0.688963,0.667,0.175879
2,36005002500,0.004367,0.0,0.0,0.0,0.0,0.008734,0.0,0.0,0.0,...,0.0,0.0,0.0,0.004367,0.0,0.0,0.000233,0.553512,0.591,0.123356
3,36005002800,0.0,0.070796,0.0,0.0,0.00885,0.013274,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00035,0.744147,0.563,0.175879
4,36005003100,0.0,0.015306,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,...,0.0,0.0,0.0,0.005102,0.0,0.0,0.000323,0.551839,0.714,0.013819


## Load Data for processing

In [14]:
# function for loading data, returns adjacency matrix, initial feature assignments and true labels

def load_data():

    G = nx.from_pandas_edgelist(data1, 'origin', 'destination', 'weight',create_using=nx.DiGraph())
    adj_list = np.array([nx.adjacency_matrix(G).todense()], dtype=float)
    
    init_feat = data.to_numpy()[:,:-4]
    
    true_label = data.to_numpy()[:,-4:]
    
    return adj_list,init_feat,true_label

adj,feature,labels = load_data()

## PCA and reduce dimension

In [15]:
pca = decomposition.PCA(n_components=10)
feature = pca.fit_transform(feature)

features = np.expand_dims(feature, axis=0)

## Importing different classifier from sklearn

In [16]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn import tree
from sklearn.gaussian_process import GaussianProcessRegressor

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

## Define various baseline models on the data

In [17]:
train_results = []
val_results = []

# Baseline Model building to train
names = ['Linear Regression', 'Decision Tree Regression', 'Bayesian Ridge Regression', 'XGB Regression', 
         'Gradient Boosting Regression', 'Support Vector Regression', 'Gaussian Process Regression', 'GNN']

Classifiers = [
        LinearRegression(),
        tree.DecisionTreeRegressor(),
        BayesianRidge(),
        XGBRegressor(),
        GradientBoostingRegressor(),
        SVR(),
        GaussianProcessRegressor()
        ]

## Train various baseline models on the data

In [18]:
def baseline(i, label):

    print("Feature consider = "+data.columns[-4+i]+"\n\n\n")

    X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size = 0.2, random_state = seed)

    train_results.append([])
    val_results.append([])
    
    models = zip(names, Classifiers)

    for name, model in models:

        model.fit(X_train,y_train)
        train_acc = model.score(X_train,y_train)
        val_acc = model.score(X_test,y_test)

        msg = "{0}:\nTraining R2 : {1} Validation R2 : {2}\n".format(name, train_acc, val_acc)
        print(msg)

        train_results[i].append(train_acc)
        val_results[i].append(val_acc)

## Train baseline models on income feature

In [19]:
baseline(0,labels[:,0].reshape(-1,1))

Feature consider = income



Linear Regression:
Training R2 : 0.32050990698772797 Validation R2 : 0.04708650254443869

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : -1.046651078846184

Bayesian Ridge Regression:
Training R2 : 0.3202569685733435 Validation R2 : 0.05772981552704426

XGB Regression:
Training R2 : 0.367449977900704 Validation R2 : 0.0029091721081254684

Gradient Boosting Regression:
Training R2 : 0.7816154807599411 Validation R2 : -0.11586968900720307

Support Vector Regression:
Training R2 : -1.1522069181587788 Validation R2 : -2.5443153088095594

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -6.178594692974762



## Train baseline models on age feature

In [20]:
baseline(1,labels[:,1].reshape(-1,1))

Feature consider = age



Linear Regression:
Training R2 : 0.16070949818127567 Validation R2 : 0.2257788863150797

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : -0.5215770164002791

Bayesian Ridge Regression:
Training R2 : 0.15880870653826018 Validation R2 : 0.22185212318555503

XGB Regression:
Training R2 : 0.9996796272982259 Validation R2 : 0.3796524604702898

Gradient Boosting Regression:
Training R2 : 0.720905476625093 Validation R2 : 0.237972873292715

Support Vector Regression:
Training R2 : 0.07189881541020271 Validation R2 : 0.08108730937978215

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -36.45967037589225



## Train baseline models on employment_rate feature

In [21]:
baseline(2,labels[:,2].reshape(-1,1))

Feature consider = employment_rate



Linear Regression:
Training R2 : 0.19352720717716543 Validation R2 : 0.03009039462395291

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : -1.0318876280933105

Bayesian Ridge Regression:
Training R2 : 0.19251298297339425 Validation R2 : 0.0358428271698239

XGB Regression:
Training R2 : 0.9995826116930051 Validation R2 : -0.04392079061553811

Gradient Boosting Regression:
Training R2 : 0.6629964595659406 Validation R2 : 0.11541313862949054

Support Vector Regression:
Training R2 : 0.12179870366385437 Validation R2 : 0.010832966239813224

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -76.85731060818986



## Train baseline models on house price feature

In [22]:
baseline(3,labels[:,3].reshape(-1,1))

Feature consider = avg_house_price



Linear Regression:
Training R2 : 0.17608272027313987 Validation R2 : 0.17863407813177412

Decision Tree Regression:
Training R2 : 1.0 Validation R2 : -0.5466831867022277

Bayesian Ridge Regression:
Training R2 : 0.1750929053508904 Validation R2 : 0.17968828454795782

XGB Regression:
Training R2 : 0.9996588168150599 Validation R2 : 0.08957867060562275

Gradient Boosting Regression:
Training R2 : 0.6246968922814149 Validation R2 : 0.2516968317289663

Support Vector Regression:
Training R2 : 0.157652833447411 Validation R2 : 0.1709388509114621

Gaussian Process Regression:
Training R2 : 1.0 Validation R2 : -2.499504847051739



## Import packages for GNN

In [23]:
# import packages

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
torch.set_printoptions(sci_mode=False)
import time
from sklearn.metrics import r2_score

## Hyperparameters

In [24]:
# set initial model config

cuda = torch.cuda.is_available()
weight_decay = 1e-6
epochs = 100000
seed = 635
hidden = 20
lr = 0.0001

In [25]:
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

## Define symmetric normalization for the adjacency matrix

In [26]:
def normalize(adj):

    adj = torch.FloatTensor(adj)
    adj_id = torch.FloatTensor(torch.eye(adj.shape[1]))
    adj_id = adj_id.reshape((1, adj.shape[1], adj.shape[1]))
    adj_id = adj_id.repeat(adj.shape[0], 1, 1)
    adj = adj + adj_id
    rowsum = torch.FloatTensor(adj.sum(2))
    degree_mat_inv_sqrt = torch.diag_embed(torch.float_power(rowsum,-0.5), dim1=-2, dim2=-1).float()
    adj_norm = torch.bmm(torch.transpose(torch.bmm(adj,degree_mat_inv_sqrt),1,2),degree_mat_inv_sqrt)

    return adj_norm

## Define GNN Layers

In [27]:
class GNN1Layer(Module):

    def __init__(self, batch_size, in_features, out_features, first):
        super(GNN1Layer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.batch_size = batch_size
        
        # Initialse W1 = 1, W2 = 0 as pytorch learnable weights (parameters) that have require_grad = True which is
        # required for calculating gradients while backpropogating using gradient descent
        weight1_eye = torch.FloatTensor(torch.eye(in_features, out_features))
        weight1_eye = weight1_eye.reshape((1, in_features, out_features))
        weight1_eye = weight1_eye.repeat(batch_size, 1, 1)
        self.weight1 = Parameter(weight1_eye)
        if not first:
            self.weight2 = Parameter(torch.zeros(batch_size, in_features, out_features))
        else:
            self.weight2 = Parameter(torch.empty(batch_size, in_features, out_features))
            nn.init.kaiming_normal_(self.weight2, mode='fan_out')

    def forward(self, input, adj):
        # first term H*W1
        v1 = torch.bmm(input, self.weight1)
        # second term adj_norm*H*W2
        v2 = torch.bmm(torch.bmm(adj, input), self.weight2)
        # adding the two terms
        output = v1 + v2

        return output

## Define GNN Model

In [28]:
class GNN1(nn.Module):

    def __init__(self, batch_size, nfeat, ndim, hidden, first):
        super(GNN1, self).__init__()

        self.gc1 = GNN1Layer(batch_size, nfeat, hidden, first)
        self.gc2 = GNN1Layer(batch_size, hidden, ndim, first)

    def forward(self, x, adj):

        # Applying activation function sigma (doublerelu) on the layer propogation
        x = nn.Sigmoid()(self.gc1(x, adj))
        x = nn.Sigmoid()(self.gc2(x, adj))
        #x = x/x.sum(axis=2).unsqueeze(2) #normalize st sum = 1
        
        return x

## Define the training function

In [29]:
def train(adj,features,labels,train_indices,val_indices,first=False):
    
    # calculate symmetric normalisation for layer propogation
    adj_norm = normalize(adj)
    
    #labels = labels - 1
    
    # Convert from numpy to torch tensors
    adj = torch.FloatTensor(adj)
    adj_norm = torch.FloatTensor(adj_norm)
    features = torch.FloatTensor(features)
    labels = torch.FloatTensor(labels)
    
    # initialise the model
    model = GNN1(batch_size=adj.shape[0],
                nfeat=features.shape[-1],
                ndim=1,
                hidden=hidden,
                first=first)
    
    # Transfer the weights to GPU for training
    if cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        adj_norm = adj_norm.cuda()
        labels = labels.cuda()
    
    # Train model
    t_total = time.time()

    # Using adam optimizers for backpropogation
    optimizer = optim.Adam(model.parameters(),
                           lr=lr, weight_decay=weight_decay)
    
    # loss function criteria is MSE
    criterion = nn.MSELoss()
    
    # Train for the no of epochs
    for epoch in range(epochs):

        t = time.time()
        
        model.train()
        
        # Pytorch accumulates gradient after every operation on tensors (defined by the model architecture)
        # with require_grad = True. With each new epoch, we need to reset this gradient to 0 to calculate gradient
        # for this epoch.
        optimizer.zero_grad()

        # get the output from forward propogation of our model
        output = model(features, adj_norm)
        
        
        # Calculate Train accuracy
        train_output = output[:,train_indices,:]
        train_labels = labels[train_indices,:]
        
        #train_accuracy = torch.sum(torch.argmax(train_output,axis=2)==train_labels.reshape(1,-1))/train_labels.shape[0]
        train_r2 = r2_score(torch.flatten(train_labels).detach().cpu(), torch.flatten(train_output).detach().cpu())
        
        # Calculate the loss between our models training output and true label
        loss = criterion(torch.flatten(train_output),torch.flatten(train_labels))
        
        # Calculate the gradients 
        loss.backward(retain_graph=True)

        # Update the weights
        optimizer.step()
        
        model.eval()
        
        # Calculate Validation accuracy
        with torch.no_grad():
            val_output = output[:,val_indices,:]
            val_labels = labels[val_indices,:]
            val_r2 = r2_score(torch.flatten(val_labels).detach().cpu(), torch.flatten(val_output).detach().cpu())

        # Print summary of training 
        if epoch == 0:
            best_loss = loss
            best_output = output
            best_r2 = train_r2
            best_val_r2 = val_r2
            best_val_output = val_output
        else:
            if loss < best_loss:
                best_loss = loss
                best_output = output
                best_r2 = train_r2
                best_val_r2 = val_r2
                best_val_output = val_output

        if epoch == 0 or (epoch+1) % 10000 == 0:
            print('Epoch: {:04d}'.format(epoch + 1),
                  'Train R2 score: {:.4f}'.format(best_r2.item()),
                  'Validation R2 score: {:.4f}'.format(best_val_r2.item()),
                  'Loss: {:.8f}'.format(best_loss.item()),
                  'time: {:.4f}s'.format(time.time() - t))
            
        if val_r2 > best_val_r2:
            train_r2 = train_r2.item()
            val_r2 = val_r2.item()
            
    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    
    return best_loss, best_output, train_r2, val_r2

## Divide dataset into 80% train and 20% validation

In [30]:
# set Train %

train_percentage = .8
    
# Train set
number_of_rows = features[0].shape[0]
train_indices = np.random.choice(number_of_rows, size=int(train_percentage*number_of_rows), replace=False)
val_indices = np.setdiff1d(np.arange(adj.shape[1]),train_indices)


## Run the training on income feature

In [31]:
# Start Train
loss, op, train_acc, val_acc = train(adj,features,labels[:,0].reshape(-1,1),train_indices,val_indices,True)

train_results[0].append(train_acc)
val_results[0].append(val_acc)

Epoch: 0001 Train R2 score: -188996.5622 Validation R2 score: -188270.4961 Loss: 0.01379014 time: 1.7147s
Epoch: 10000 Train R2 score: -39.6344 Validation R2 score: -36.7962 Loss: 0.00000296 time: 0.0000s
Epoch: 20000 Train R2 score: -3.4766 Validation R2 score: -3.8014 Loss: 0.00000033 time: 0.0100s
Epoch: 30000 Train R2 score: -2.5627 Validation R2 score: -2.9635 Loss: 0.00000026 time: 0.0000s
Epoch: 40000 Train R2 score: -2.5627 Validation R2 score: -2.9635 Loss: 0.00000026 time: 0.0000s
Epoch: 50000 Train R2 score: -2.5627 Validation R2 score: -2.9635 Loss: 0.00000026 time: 0.0000s
Epoch: 60000 Train R2 score: -2.5627 Validation R2 score: -2.9635 Loss: 0.00000026 time: 0.0035s
Epoch: 70000 Train R2 score: -2.5627 Validation R2 score: -2.9635 Loss: 0.00000026 time: 0.0000s
Epoch: 80000 Train R2 score: -2.5627 Validation R2 score: -2.9635 Loss: 0.00000026 time: 0.0000s
Epoch: 90000 Train R2 score: -2.5627 Validation R2 score: -2.9635 Loss: 0.00000026 time: 0.0000s
Epoch: 100000 Train

## Compare the accuracy of different models for income feature

In [32]:
d = {'Algorithm': names, 'Train Accuracy': train_results[0], 'Validation Accuracy': val_results[0]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.32051,0.047087
1,Decision Tree Regression,1.0,-1.046651
2,Bayesian Ridge Regression,0.320257,0.05773
3,XGB Regression,0.36745,0.002909
4,Gradient Boosting Regression,0.781615,-0.11587
5,Support Vector Regression,-1.152207,-2.544315
6,Gaussian Process Regression,1.0,-6.178595
7,GNN,-8.798335,-6.603077


## Run the training on age feature

In [34]:

loss, op, train_acc, val_acc = train(adj,features,labels[:,1].reshape(-1,1),train_indices,val_indices,True)

train_results[1].append(train_acc)
val_results[1].append(val_acc)

Epoch: 0001 Train R2 score: -8.3890 Validation R2 score: -8.8378 Loss: 0.11655407 time: 0.0100s
Epoch: 10000 Train R2 score: 0.0629 Validation R2 score: -0.0425 Loss: 0.01163330 time: 0.0000s
Epoch: 20000 Train R2 score: 0.1281 Validation R2 score: 0.1157 Loss: 0.01082428 time: 0.0101s
Epoch: 30000 Train R2 score: 0.1340 Validation R2 score: 0.1046 Loss: 0.01075037 time: 0.0000s
Epoch: 40000 Train R2 score: 0.1396 Validation R2 score: 0.1023 Loss: 0.01068037 time: 0.0050s
Epoch: 50000 Train R2 score: 0.1396 Validation R2 score: 0.1023 Loss: 0.01068037 time: 0.0040s
Epoch: 60000 Train R2 score: 0.1396 Validation R2 score: 0.1023 Loss: 0.01068037 time: 0.0101s
Epoch: 70000 Train R2 score: 0.1396 Validation R2 score: 0.1023 Loss: 0.01068037 time: 0.0000s
Epoch: 80000 Train R2 score: 0.1396 Validation R2 score: 0.1023 Loss: 0.01068037 time: 0.0000s
Epoch: 90000 Train R2 score: 0.1396 Validation R2 score: 0.1023 Loss: 0.01068037 time: 0.0000s
Epoch: 100000 Train R2 score: 0.1396 Validation 

## Compare the accuracy of different models for age feature

In [35]:
d = {'Algorithm': names, 'Train Accuracy': train_results[1], 'Validation Accuracy': val_results[1]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.160709,0.225779
1,Decision Tree Regression,1.0,-0.521577
2,Bayesian Ridge Regression,0.158809,0.221852
3,XGB Regression,0.99968,0.379652
4,Gradient Boosting Regression,0.720905,0.237973
5,Support Vector Regression,0.071899,0.081087
6,Gaussian Process Regression,1.0,-36.45967
7,GNN,0.115274,0.088015


## Run the training on employment_rate feature

In [36]:

loss, op, train_acc, val_acc = train(adj,features,labels[:,2].reshape(-1,1),train_indices,val_indices,True)

train_results[2].append(train_acc)
val_results[2].append(val_acc)

Epoch: 0001 Train R2 score: -56.4217 Validation R2 score: -62.9077 Loss: 0.37712896 time: 0.0000s
Epoch: 10000 Train R2 score: -0.0727 Validation R2 score: -0.0725 Loss: 0.00704511 time: 0.0060s
Epoch: 20000 Train R2 score: 0.0627 Validation R2 score: 0.0377 Loss: 0.00615620 time: 0.0000s
Epoch: 30000 Train R2 score: 0.1137 Validation R2 score: 0.0636 Loss: 0.00582082 time: 0.0000s
Epoch: 40000 Train R2 score: 0.1173 Validation R2 score: 0.0846 Loss: 0.00579761 time: 0.0000s
Epoch: 50000 Train R2 score: 0.1273 Validation R2 score: 0.0815 Loss: 0.00573142 time: 0.0000s
Epoch: 60000 Train R2 score: 0.1273 Validation R2 score: 0.0815 Loss: 0.00573142 time: 0.0000s
Epoch: 70000 Train R2 score: 0.1291 Validation R2 score: 0.0669 Loss: 0.00571978 time: 0.0031s
Epoch: 80000 Train R2 score: 0.1291 Validation R2 score: 0.0669 Loss: 0.00571978 time: 0.0045s
Epoch: 90000 Train R2 score: 0.1291 Validation R2 score: 0.0669 Loss: 0.00571978 time: 0.0020s
Epoch: 100000 Train R2 score: 0.1304 Validati

## Compare the accuracy of different models for employment_rate feature

In [37]:
d = {'Algorithm': names, 'Train Accuracy': train_results[2], 'Validation Accuracy': val_results[2]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.193527,0.03009
1,Decision Tree Regression,1.0,-1.031888
2,Bayesian Ridge Regression,0.192513,0.035843
3,XGB Regression,0.999583,-0.043921
4,Gradient Boosting Regression,0.662996,0.115413
5,Support Vector Regression,0.121799,0.010833
6,Gaussian Process Regression,1.0,-76.857311
7,GNN,0.081006,0.102662


## Run the training on house_price feature

In [38]:

loss, op, train_acc, val_acc = train(adj,features,labels[:,3].reshape(-1,1),train_indices,val_indices,True)

train_results[3].append(train_acc)
val_results[3].append(val_acc)

Epoch: 0001 Train R2 score: -4.3148 Validation R2 score: -3.4636 Loss: 0.31297469 time: 0.0091s
Epoch: 10000 Train R2 score: 0.0085 Validation R2 score: -0.0168 Loss: 0.05838805 time: 0.0020s
Epoch: 20000 Train R2 score: 0.0803 Validation R2 score: 0.0454 Loss: 0.05415805 time: 0.0000s
Epoch: 30000 Train R2 score: 0.0925 Validation R2 score: 0.0926 Loss: 0.05344074 time: 0.0000s
Epoch: 40000 Train R2 score: 0.1173 Validation R2 score: 0.1071 Loss: 0.05197755 time: 0.0031s
Epoch: 50000 Train R2 score: 0.1256 Validation R2 score: 0.0885 Loss: 0.05149070 time: 0.0080s
Epoch: 60000 Train R2 score: 0.1256 Validation R2 score: 0.0885 Loss: 0.05149070 time: 0.0000s
Epoch: 70000 Train R2 score: 0.1256 Validation R2 score: 0.0885 Loss: 0.05149070 time: 0.0000s
Epoch: 80000 Train R2 score: 0.1256 Validation R2 score: 0.0885 Loss: 0.05149070 time: 0.0040s
Epoch: 90000 Train R2 score: 0.1256 Validation R2 score: 0.0885 Loss: 0.05149070 time: 0.0000s
Epoch: 100000 Train R2 score: 0.1256 Validation 

## Compare the accuracy of different models for house_price feature

In [39]:
d = {'Algorithm': names, 'Train Accuracy': train_results[3], 'Validation Accuracy': val_results[3]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Algorithm,Train Accuracy,Validation Accuracy
0,Linear Regression,0.176083,0.178634
1,Decision Tree Regression,1.0,-0.546683
2,Bayesian Ridge Regression,0.175093,0.179688
3,XGB Regression,0.999659,0.089579
4,Gradient Boosting Regression,0.624697,0.251697
5,Support Vector Regression,0.157653,0.170939
6,Gaussian Process Regression,1.0,-2.499505
7,GNN,0.078257,0.077675
