In [1]:
import torch
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.data import InMemoryDataset
import pickle
import pandas as pd
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
import torch.nn.utils.rnn as rnnutils

def unpack_sequence(packed_sequence, lengths):
    assert isinstance(packed_sequence, rnnutils.PackedSequence)
    head = 0
    trailing_dims = packed_sequence.data.shape[1:]
    unpacked_sequence = [torch.zeros(l, *trailing_dims) for l in lengths]
    # l_idx - goes from 0 - maxLen-1
    for l_idx, b_size in enumerate(packed_sequence.batch_sizes):
        for b_idx in range(b_size):
            unpacked_sequence[b_idx][l_idx] = packed_sequence.data[head]
            head += 1
    return unpacked_sequence

In [3]:
path = '../blockchain_files/3000000to3999999_BlockTransaction/'
with open(path+"graphs_data_rnn_1", "rb") as fp:   # Unpickling
    read_graph_data = pickle.load(fp)

In [4]:
data = read_graph_data[0]
edges,edge_attributes,edge_attrib_lenghts,label = data

In [5]:
ins = unpack_sequence(edge_attributes,edge_attrib_lenghts)
len(ins)
label

0

In [6]:
data_list = []
for edges,edge_attributes,edge_attrib_lenghts,label in read_graph_data:
    if len(edge_attributes) == 0:
        continue
    data_list.append((edges,edge_attributes,edge_attrib_lenghts,label))

In [7]:
def get_graph_data(edges,edge_attributes,edge_attrib_lenghts,label):
    pd_edges = pd.DataFrame(edges)
    pd_edges
    nodes = pd.concat([pd_edges[0]
                      ,pd_edges[1]]).unique()
    edge_index = torch.tensor(np.array(edges).T, dtype=torch.long)
    x = torch.ones(nodes.shape).reshape(-1,1)
    output, (hn, cn) = rnn(edge_attributes.to(device))
    
#     out = unpack_sequence(output,edge_attrib_lenghts)
#     print(out[0].shape)
#     print(len(out))
#     reduced = []
#     for o in out:
#         reduced.append(o.sum(axis=0))
    
#     print(len(reduced))
#     print(reduced[0])
#     edge_attribs = torch.stack(reduced)
    
#     print(edge_attribs.shape)
#     print(edge_index.shape)
    
    data = Data(x=x, edge_index=edge_index,edge_attr = hn[0],y=torch.tensor([label]))
    
    return data
# get_graph_data(edges,edge_attributes,edge_attrib_lenghts,label)()

In [8]:
def create_data_loader(n):
    indices = np.random.choice(len(data_list),5)
    
    graphs = []
    for i in indices:
        graph = data_list[i]
        graphs.append(get_graph_data(graph[0],graph[1],graph[2],graph[3]))
        
    train_loader = DataLoader(graphs, batch_size=n,shuffle=True)
    
    return train_loader

# create_data_loader(3)

In [50]:
graph = data_list[0]
print(graph[3])

0


In [9]:
from typing import Tuple, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import BatchNorm1d, Linear

from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj, OptTensor, PairTensor


class CGConv(MessagePassing):

    def __init__(self, channels: Union[int, Tuple[int, int]], dim: int = 0,
                 aggr: str = 'mean', batch_norm: bool = False,
                 bias: bool = True, **kwargs):
        super().__init__(aggr=aggr, **kwargs)
        # print(channels)
        self.channels = channels
        self.dim = dim
        self.batch_norm = batch_norm

        if isinstance(channels, int):
            channels = (channels, channels)
        self.lin_f = Linear(2*channels[0] + dim, channels[1], bias=bias)
        # self.lin_s = Linear(2*channels[0] + dim, channels[1], bias=bias)
        if batch_norm:
            self.bn = BatchNorm1d(channels[1])
        else:
            self.bn = None

        self.reset_parameters()


    def reset_parameters(self):
        self.lin_f.reset_parameters()
        # self.lin_s.reset_parameters()
        if self.bn is not None:
            self.bn.reset_parameters()


    def forward(self, x: Union[Tensor, PairTensor], edge_index: Adj,
                edge_attr: OptTensor = None) -> Tensor:
        """"""
        if isinstance(x, Tensor):
            x: PairTensor = (x, x)

        # propagate_type: (x: PairTensor, edge_attr: OptTensor)
        out = self.propagate(edge_index, x=x, edge_attr=edge_attr, size=None)
        out = out if self.bn is None else self.bn(out)
        out += x[1]
        # print(out.shape)
        # print(x[1])
        return out


    def message(self, x_i, x_j, edge_attr: OptTensor) -> Tensor:
        if edge_attr is None:
            z = torch.cat([x_i, x_j], dim=-1)
        else:
            z = torch.cat([x_i, x_j, edge_attr], dim=-1)

        # print(z.shape)
        # print(self.lin_f)
        return self.lin_f(z)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}({self.channels}, dim={self.dim})'

In [10]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
#         torch.manual_seed(12345)
        self.conv1 = CGConv((1, hidden_channels),16,batch_norm=True)
#         self.conv1 = GCNConv(1, hidden_channels)
        
        self.conv2 = CGConv((hidden_channels, hidden_channels),16,batch_norm=True)
        self.conv3 = CGConv((hidden_channels, hidden_channels),16,batch_norm=True)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 2)

    def forward(self, x, edge_index,edge_attributes, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index,edge_attributes)
        x = x.relu()
        x = self.conv2(x, edge_index,edge_attributes)
        x = x.relu()
        x = self.conv3(x, edge_index,edge_attributes)
        x = x.relu()
        x = self.conv4(x, edge_index)
 

        # 2. Readout layer
#         print(x.shape)
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]
#         print(x.shape)
        # 3. Apply a final classifier
#         x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): CGConv((1, 64), dim=16)
  (conv2): CGConv((64, 64), dim=16)
  (conv3): CGConv((64, 64), dim=16)
  (conv4): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [16]:
rnn = nn.LSTM(4, 16, 1).to(device)
model = GCN(hidden_channels=64).to(device)

net_params = list(model.parameters()) + list(rnn.parameters())
optimizer = torch.optim.Adam(net_params, lr=0.01)

# weights = torch.tensor([.05,.95]).to(device)
criterion = torch.nn.CrossEntropyLoss()

def train_mode():
    rnn.train()
    model.train()
train_mode()

In [17]:
def train():
    train_mode()
    for i in range(len(data_list)):

        train_loader = create_data_loader(10)
        
        data = next(iter(train_loader))
#         print(data.edge_attr)
        data = data.to(device)
        out = model(data.x, data.edge_index,data.edge_attr,data.batch)
        
        loss = criterion(out, data.y)
        
        loss.backward()  # Derive gradients.
        
        optimizer.step()
        optimizer.zero_grad()
#         break
    return loss.cpu().item()
# train()

In [18]:
print(test())
for epoch in range(1, 20):
    loss = train()
    train_acc = test()
#     test_acc = test()

    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, loss : {loss:.4f}')

0.7229299363057324
Epoch: 001, Train Acc: 0.8615, loss : 0.4881
Epoch: 002, Train Acc: 0.9379, loss : 0.2825
Epoch: 003, Train Acc: 0.9347, loss : 0.0337
Epoch: 004, Train Acc: 0.9666, loss : 0.0009
Epoch: 005, Train Acc: 0.9475, loss : 0.0415
Epoch: 006, Train Acc: 0.9857, loss : 0.0018


KeyboardInterrupt: 

In [13]:
def test_mode():
    rnn.eval()
    model.eval()
    
def create_test_loader(st,end):
    graphs = []
    for i in range(st,end):
        if i >= len(data_list):
            break
        graph = data_list[i]
        graphs.append(get_graph_data(graph[0],graph[1],graph[2],graph[3]))
        
    loader = DataLoader(graphs, batch_size=(end-st),shuffle=True)
    
    return loader

# create_test_loader(3,6)

In [14]:
def test():
    test_mode()
    n  = 40
    batch_size = int(np.ceil(len(data_list)/n))
    
    correct = 0
    
    for j in range(n):
#         print((j*batch_size,(j+1)*batch_size))
        test_loader = create_test_loader(j*batch_size,(j+1)*batch_size)
        data = next(iter(test_loader))
        data = data.to(device)
        
        out = model(data.x, data.edge_index,data.edge_attr, data.batch)  
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct / len(data_list)
test()

0.6767515923566879

In [87]:
np.ceil(len(data_list)/10)

63.0

In [9]:
def train():
    model.train()
    for batch_idx,data in enumerate(train_loader):


        data.edge_attr = data.edge_attr.float()
        data = data.to(device)
        out = model(data.x, data.edge_index,data.edge_attr,data.batch)

        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        
        optimizer.step()
        optimizer.zero_grad()
#             print(batch_idx)
        

    return loss.cpu().item()
    
def test(loader):
    model.eval()

    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        


        data.edge_attr = data.edge_attr.float() 
        data = data.to(device)
        out = model(data.x, data.edge_index,data.edge_attr, data.batch)  
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        

        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.

test(test_loader)

0.4915254237288136

In [10]:

for epoch in range(1, 400):
    loss = train()
#     break
    train_acc = test(test_loader)
#     test_acc = test(test_loader)
    if epoch % 5 == 0: 
        print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, loss : {loss:.4f}')

Epoch: 005, Train Acc: 0.7429, loss : 0.4767
Epoch: 010, Train Acc: 0.7401, loss : 0.7171
Epoch: 015, Train Acc: 0.5311, loss : 0.6414
Epoch: 020, Train Acc: 0.7486, loss : 0.6560
Epoch: 025, Train Acc: 0.7740, loss : 0.7737
Epoch: 030, Train Acc: 0.7910, loss : 0.5825
Epoch: 035, Train Acc: 0.7881, loss : 0.4266
Epoch: 040, Train Acc: 0.7712, loss : 0.4582
Epoch: 045, Train Acc: 0.7853, loss : 0.3972
Epoch: 050, Train Acc: 0.7853, loss : 0.5311
Epoch: 055, Train Acc: 0.7966, loss : 0.3808
Epoch: 060, Train Acc: 0.7994, loss : 0.0512
Epoch: 065, Train Acc: 0.8249, loss : 0.2856
Epoch: 070, Train Acc: 0.7571, loss : 0.2421
Epoch: 075, Train Acc: 0.7966, loss : 0.2986
Epoch: 080, Train Acc: 0.8220, loss : 0.9112
Epoch: 085, Train Acc: 0.7740, loss : 0.0514
Epoch: 090, Train Acc: 0.8107, loss : 0.0952
Epoch: 095, Train Acc: 0.8305, loss : 0.1733
Epoch: 100, Train Acc: 0.8249, loss : 0.3040
Epoch: 105, Train Acc: 0.8107, loss : 0.1430
Epoch: 110, Train Acc: 0.8305, loss : 3.9038
Epoch: 115

In [11]:
train_acc = test(test_loader)

In [12]:
train_acc

0.9491525423728814

In [13]:
import numpy as np
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([p.numel() for p in model_parameters])
params

22338

In [22]:
read_graph_data[0]

Data(x=[12995, 1], edge_index=[2, 17027], edge_attr=[17027, 5], y=[1])