# Feature extraction from business relationship graphs by GCN

This notebook presents the source code used in the paper "Feature extraction from business relationship graphs by Graph Convolutional Networks".

Although the paper uses actual bank data, this notebook uses fictitious data for demonstration purposes, so the results will differ from those reported in the paper.

## 1. Preparation

### 1.1. Generate dummy data

In [1]:
# Run the generator script in a shell to create the fictitious dataset.
# NOTE(review): presumably this writes the data/train and data/test files read below — verify.
!python generate_dummy_data.py

### 1.2. Import packages and set up devices

In [2]:
import os
import math
import random
import numpy as np
import pandas as pd
from sklearn import metrics
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import dgl
import dgl.function as fn
from tqdm import tqdm_notebook as tqdm

In [3]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can select different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [4]:
# Seed all random number generators before any data loading or model construction.
seed = 123
seed_everything(seed)

In [5]:
# Show whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## 2. Train

### 2.1. Read target file

In [7]:
# Load the training labels, indexed by company_id.
df_target = pd.read_csv('data/train/target.csv', index_col='company_id')
df_target.head()

Unnamed: 0_level_0,target
company_id,Unnamed: 1_level_1
0,1
1,0
2,0
3,1
4,0


### 2.2. Read business relationship graphs

In [8]:
def make_node_feature(n_nodes, n_features):
    """Build the initial one-hot node feature matrix for a graph.

    Node ``k`` with ``k < n_features`` receives the ``k``-th unit vector;
    every later node shares the last unit vector.

    Args:
        n_nodes: Number of nodes in the graph.
        n_features: Width of each feature vector.

    Returns:
        Float tensor of shape ``(n_nodes, n_features)``.
    """
    identity = torch.eye(n_features)
    if n_nodes <= n_features:
        # Small graph: without this guard torch.zeros would be asked for a
        # negative number of rows.
        return identity[:n_nodes]
    overflow = torch.zeros(n_nodes - n_features, n_features)
    overflow[:, n_features - 1] = 1
    return torch.cat([identity, overflow])

In [9]:
# Build one DGL graph per company listed in df_target.
graphs = []
for company_id in tqdm(df_target.index):
    try:
        Di_G = nx.read_weighted_edgelist(
            'data/train/{}.edgelist'.format(company_id), delimiter=',', nodetype=int, create_using=nx.DiGraph()
        )
        # Add every node id below the largest one that the edge list did not
        # mention, so ids 0..last_id are all present.
        last_id = max(Di_G.nodes)
        for node_id in range(last_id):
            if not Di_G.has_node(node_id):
                Di_G.add_node(node_id)
        G = dgl.DGLGraph()
        G.from_networkx(Di_G, edge_attrs=['weight'])
        G.ndata['h'] = make_node_feature(G.number_of_nodes(), 8)
        # Edges read from the file are tagged direction 0 ...
        G.edata['direction'] = torch.zeros(G.number_of_edges())
        src, dst = G.edges()
        weight = G.edata['weight']
        # ... and a reversed copy of each edge is added with direction 1.
        direction = torch.ones(G.number_of_edges())
        G.add_edges(dst, src, data={'weight': weight, 'direction': direction})
        # NOTE(review): the return value is discarded, so this relies on
        # DGLGraph.to() moving features in place — confirm for the installed DGL version.
        G.to(device)
        graphs.append(G)
    except Exception:
        # Print the company whose graph failed (a separate loop variable keeps
        # this id intact), then re-raise with the original traceback.
        print(company_id)
        raise

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [10]:
# Convert the list of graphs into a NumPy array.
# NOTE(review): the training cells below only zip over `graphs`, which a plain
# list also supports — confirm the conversion is still needed.
graphs = np.array(graphs)

### 2.3. Define model

In [11]:
class BiDirectionalGCN(nn.Module):
    """Graph convolution that aggregates messages separately per edge direction.

    Each edge sends its source node's ``h`` scaled by the edge ``weight``.
    For every node, messages are summed in two groups according to the edge's
    ``direction`` flag, and the node's own ``h`` is concatenated with both
    sums before a linear layer and the activation.

    Args:
        in_feats: Size of the incoming node feature vectors.
        out_feats: Size of the produced node feature vectors.
        activation: Callable applied to the output of the linear layer.
    """

    def __init__(self, in_feats, out_feats, activation):
        super().__init__()
        # Input is [own h | direction-1 sum | direction-0 sum], hence 3x in_feats.
        self.fc = nn.Linear(3 * in_feats, out_feats)
        self.activation = activation

    def message_func(self, edges):
        """Emit the weighted source feature and the direction flag of each edge."""
        edge_weight = edges.data['weight'].view(-1, 1)
        return {
            'm': edges.src['h'] * edge_weight,
            'direction': edges.data['direction'],
        }

    def reduce_func(self, nodes):
        """Sum incoming messages into ``h0`` (direction 1) and ``h1`` (direction 0)."""
        inbox = nodes.mailbox['m']
        n_nodes, n_msgs = inbox.size()[0], inbox.size()[1]
        flag = nodes.mailbox['direction'].view(n_nodes, n_msgs, 1)
        flipped = (flag + 1) % 2  # 0=>1, 1=>0
        return {
            'h0': torch.sum(inbox * flag, dim=1),
            'h1': torch.sum(inbox * flipped, dim=1),
        }

    def node_apply_func(self, nodes):
        """Combine a node's own feature with both directional sums."""
        parts = [nodes.data[key] for key in ('h', 'h0', 'h1')]
        stacked = torch.cat(parts, dim=1)
        return {'h': self.activation(self.fc(stacked))}

    def forward(self, g, h):
        """Run one round of message passing on ``g`` and return the new node features.

        ``h`` is stored in ``g.ndata['h']`` for the duration of the call and
        removed again before returning.
        """
        g.ndata['h'] = h
        g.update_all(self.message_func, self.reduce_func)
        g.apply_nodes(func=self.node_apply_func)
        updated = g.ndata.pop('h')
        return updated

In [12]:
class GATWithSelfLoop(nn.Module):
    """Attention layer in which each node also attends to itself.

    Neighbour features are projected with ``fc`` and scored per edge with
    ``attn_fc``; the node's own feature is projected with ``self_fc`` and
    scored with ``self_attn_fc``. A softmax over the neighbour scores plus
    the self score weights the sum that becomes the new node feature.

    Args:
        in_dim: Size of the incoming node feature vectors.
        out_dim: Size of the produced node feature vectors.
        activation: Callable applied to the attention-weighted sum.
    """

    def __init__(self, in_dim, out_dim, activation):
        super().__init__()
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)
        self.self_fc = nn.Linear(in_dim, out_dim, bias=False)
        self.self_attn_fc = nn.Linear(out_dim, 1, bias=False)
        self.activation = activation

    def node_apply_func(self, nodes):
        """Project each node for use as a neighbour (``z``) and as itself (``z_self``)."""
        h = nodes.data['h']
        own = self.self_fc(h)
        own_score = F.leaky_relu(self.self_attn_fc(own))
        return {'z': self.fc(h), 'z_self': own, 'e_self': own_score}

    def edge_attention(self, edges):
        """Score each edge from the projections of its two endpoints."""
        pair = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
        score = F.leaky_relu(self.attn_fc(pair))
        # Keep a gradient-free copy of the latest edge scores on the module.
        self.attentions = score.clone().detach()
        return {'e': score}

    def message_func(self, edges):
        """Send the source projection together with the edge score."""
        return {'z': edges.src['z'], 'e': edges.data['e']}

    def reduce_func(self, nodes):
        """Softmax over neighbour scores plus the self score, then weighted sum."""
        own_z = nodes.data['z_self']
        own_z = own_z.view(-1, 1, own_z.size(1))
        own_score = nodes.data['e_self'].view(-1, 1, 1)
        # Append the self entry as one extra "message" along the message axis.
        scores = torch.cat([nodes.mailbox['e'], own_score], dim=1)
        values = torch.cat([nodes.mailbox['z'], own_z], dim=1)
        alpha = F.softmax(scores, dim=1)
        return {'h': self.activation(torch.sum(alpha * values, dim=1))}

    def forward(self, g, h):
        """Apply the layer to graph ``g`` with node features ``h`` and return the new features.

        NOTE(review): nodes that receive no messages may not pass through
        ``reduce_func`` — confirm how DGL fills ``h`` for them.
        """
        g.ndata['h'] = h
        g.apply_nodes(self.node_apply_func)
        g.apply_edges(self.edge_attention)
        g.update_all(self.message_func, self.reduce_func)
        updated = g.ndata.pop('h')
        return updated

In [13]:
class Classifier(nn.Module):
    """Graph-level classifier built on BiDirectionalGCN and GATWithSelfLoop.

    After the two graph layers, each graph is summarised by the features of
    its first ``n_head_nodes`` nodes plus the element-wise mean and max over
    all remaining nodes. The flattened summary feeds a two-layer MLP.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Hidden size used by the graph layers and the MLP.
        n_classes: Number of outputs per graph.
        n_head_nodes: How many leading nodes are kept individually in the
            readout. Defaults to 7, the value previously hard-coded.
    """

    def __init__(self, in_dim, hidden_dim, n_classes, n_head_nodes=7):
        super(Classifier, self).__init__()
        self.layers = nn.ModuleList([
            BiDirectionalGCN(in_dim, hidden_dim, F.relu),
            GATWithSelfLoop(hidden_dim, hidden_dim, F.relu)
        ])
        # One slot per head node, plus one for the mean and one for the max.
        self.fc1 = nn.Linear(hidden_dim * (n_head_nodes + 2), hidden_dim)
        self.activation = F.relu
        self.fc2 = nn.Linear(hidden_dim, n_classes)
        self.hidden_dim = hidden_dim
        self.n_head_nodes = n_head_nodes

    def forward(self, g):
        """Return one row of ``n_classes`` outputs per graph in the batch ``g``.

        Reads the input features from ``g.ndata['h']`` and overwrites that
        field with the features produced by the graph layers.
        """
        h = g.ndata.pop('h')
        for conv in self.layers:
            h = conv(g, h)
        g.ndata['h'] = h
        n_head = self.n_head_nodes
        features = []
        for graph in dgl.unbatch(g):
            node_h = graph.ndata['h']
            head = node_h[:n_head]
            rest = node_h[n_head:]
            mean = torch.mean(rest, 0).view(1, -1)
            max_values, _ = torch.max(rest, 0)
            max_values = max_values.view(1, -1)
            # Flatten [head nodes | mean | max] into a single row per graph.
            concat = torch.cat([head, mean, max_values], 0).view(1, -1)
            features.append(concat)
        tensor = torch.cat(features)
        return self.fc2(self.activation(self.fc1(tensor)))

### 2.4. Fit

In [14]:
# Training targets converted to a float tensor for use with the loss function.
labels = torch.tensor(df_target['target'].values).float()

In [15]:
# Number of passes over the training set.
epochs = 20
# Graphs per batch; also reused to chunk the test set at prediction time.
batch_size = 100
# Hidden width passed to Classifier.
hidden_dim = 16

In [16]:
def collate(samples):
    """Merge a list of ``(graph, label)`` pairs into a single batch.

    Args:
        samples: Sequence of ``(graph, label)`` pairs.

    Returns:
        Tuple of the batched graph and a label tensor placed on ``device``.
    """
    graph_column, label_column = (list(column) for column in zip(*samples))
    merged = dgl.batch(graph_column)
    targets = torch.tensor(label_column).to(device)
    return merged, targets

In [17]:
# Pair each graph with its label and serve shuffled batches built by `collate`.
trainset = list(zip(graphs, labels))
data_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate)

In [18]:
# The first argument (8) matches the node feature width built when loading graphs.
model = Classifier(8, hidden_dim, 1).to(device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Batches per epoch, used to average the loss without relying on a loop counter.
n_batches = len(data_loader)
for epoch in tqdm(range(epochs)):
    # Train
    model.train()
    epoch_loss = 0
    for bg, label in data_loader:
        prediction = model(bg).view(-1)
        loss = loss_func(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Mean batch loss for this epoch; max() avoids dividing by zero on an empty loader.
    epoch_loss /= max(n_batches, 1)
    print('Train loss:{}'.format(epoch_loss))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Train loss:0.6024249017238616
Train loss:0.5861609876155853
Train loss:0.580128276348114
Train loss:0.5747227817773819
Train loss:0.568929237127304
Train loss:0.5661774933338165
Train loss:0.5617933869361877
Train loss:0.5609187304973602
Train loss:0.5582446873188018
Train loss:0.5574558764696121
Train loss:0.5549972593784332
Train loss:0.5555018901824951
Train loss:0.5538880228996277
Train loss:0.555523294210434
Train loss:0.5564940989017486
Train loss:0.5538007497787476
Train loss:0.5524279206991196
Train loss:0.5502881675958633
Train loss:0.5522221475839615
Train loss:0.5498411387205124



## 3. Test

### 3.1. Read target file

In [19]:
# Load the test labels, indexed by company_id.
# NOTE: this rebinds df_target, replacing the training labels loaded earlier.
df_target = pd.read_csv('data/test/target.csv', index_col='company_id')
df_target.head()

Unnamed: 0_level_0,target
company_id,Unnamed: 1_level_1
0,0
1,0
2,1
3,0
4,0


### 3.2. Read business relationship graphs

In [20]:
# Build one DGL graph per test company listed in df_target.
graphs = []
for company_id in tqdm(df_target.index):
    try:
        Di_G = nx.read_weighted_edgelist(
            'data/test/{}.edgelist'.format(company_id), delimiter=',', nodetype=int, create_using=nx.DiGraph()
        )
        # Add every node id below the largest one that the edge list did not
        # mention, so ids 0..last_id are all present.
        last_id = max(Di_G.nodes)
        for node_id in range(last_id):
            if not Di_G.has_node(node_id):
                Di_G.add_node(node_id)
        G = dgl.DGLGraph()
        G.from_networkx(Di_G, edge_attrs=['weight'])
        G.ndata['h'] = make_node_feature(G.number_of_nodes(), 8)
        # Edges read from the file are tagged direction 0 ...
        G.edata['direction'] = torch.zeros(G.number_of_edges())
        src, dst = G.edges()
        weight = G.edata['weight']
        # ... and a reversed copy of each edge is added with direction 1.
        direction = torch.ones(G.number_of_edges())
        G.add_edges(dst, src, data={'weight': weight, 'direction': direction})
        # NOTE(review): the return value is discarded, so this relies on
        # DGLGraph.to() moving features in place — confirm for the installed DGL version.
        G.to(device)
        graphs.append(G)
    except Exception:
        # Print the company whose graph failed (a separate loop variable keeps
        # this id intact), then re-raise with the original traceback.
        print(company_id)
        raise

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




### 3.3. Predict

In [21]:
# Number of batches of size batch_size needed to cover every test company.
total_count = len(df_target)
iteration_count = math.ceil(total_count / batch_size)

In [22]:
# Score the test graphs batch by batch and collect one frame per batch.
df_probs_list = []
# Switch to evaluation mode once, before the loop.
model.eval()
# Inference only: skip building the autograd graph.
with torch.no_grad():
    for i in tqdm(range(iteration_count)):
        start = i * batch_size
        # Clamp the final batch to the end of the data.
        end = min((i + 1) * batch_size, total_count)
        batch_graphs = dgl.batch(graphs[start:end])
        prediction = model(batch_graphs).view(-1)
        # The model outputs logits; sigmoid turns them into probabilities.
        probs = torch.sigmoid(prediction).cpu().numpy()
        df = df_target.iloc[start:end].copy()
        df['probability'] = probs
        df_probs_list.append(df)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [23]:
# Stitch the per-batch frames back into one table of targets and predicted probabilities.
df_probs = pd.concat(df_probs_list)
df_probs.head()

Unnamed: 0_level_0,target,probability
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0.189258
1,0,0.133027
2,1,0.320061
3,0,0.410629
4,0,0.177759


### 3.4. Evaluation

In [24]:
# Compute the ROC curve on the test set and print the area under it (AUC).
fpr, tpr, thresholds = metrics.roc_curve(df_probs['target'], df_probs['probability'])
print(metrics.auc(fpr, tpr))

0.6317605276509386
