In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import EllipticBitcoinDataset
from torch_geometric.utils import k_hop_subgraph
from sklearn.metrics import classification_report
import requests
import os
import pickle

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset_lib = EllipticBitcoinDataset(root='EllipticBitcoin', transform=None)
data_lib = dataset_lib[0]

In [3]:
base_url = 'http://localhost:5004'
node_url = f"{base_url}/api/nodes"
edge_url = f"{base_url}/api/edges"
headers = {"Content-Type": "application/json"}
batch_size = 10_000
cache_file = 'data_cache.pkl'

In [4]:
def fetch_data(url, headers, batch_size, page):
    params = {"page_size": batch_size, "page": page}
    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status()
    return response.json()

def process_nodes(existing_nodes, new_nodes):
    node_id_map = {node['id']: idx for idx, node in enumerate(existing_nodes)}
    start_idx = len(existing_nodes)
    
    for node in new_nodes:
        if node['id'] not in node_id_map:
            node_id_map[node['id']] = start_idx
            existing_nodes.append(node)
            start_idx += 1

def process_edges(existing_edges, new_edges, existing_nodes):
    node_id_map = {node['id']: idx for idx, node in enumerate(existing_nodes)}

    for edge in new_edges:
        if edge['src'] in node_id_map and edge['dst'] in node_id_map:
            existing_edges.append(edge)

def save_cache(nodes, edges, node_page, edge_page):
    with open(cache_file, 'wb') as f:
        pickle.dump((nodes, edges, node_page, edge_page), f)

def load_cache():
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    return None, None, 1, 1

def fetch_and_process_data():
    existing_nodes, existing_edges, node_page, edge_page = load_cache()

    if existing_nodes is None:
        existing_nodes = []
    if existing_edges is None:
        existing_edges = []

    try:
        while True:
            node_data = fetch_data(node_url, headers, batch_size, node_page)
            new_nodes = node_data['results']
            
            if not new_nodes:
                break
            
            process_nodes(existing_nodes, new_nodes)
            start_idx = (node_page - 1) * batch_size
            end_idx = start_idx + len(new_nodes) - 1
            print(f"Nodes {start_idx}-{end_idx} retrieved.")
            node_page += 1
            save_cache(existing_nodes, existing_edges, node_page, edge_page)

        while True:
            edge_data = fetch_data(edge_url, headers, batch_size, edge_page)
            new_edges = edge_data['results']
            
            if not new_edges:
                break
            
            process_edges(existing_edges, new_edges, existing_nodes)
            start_idx = (edge_page - 1) * batch_size
            end_idx = start_idx + len(new_edges) - 1
            print(f"Edges {start_idx}-{end_idx} retrieved.")
            edge_page += 1
            save_cache(existing_nodes, existing_edges, node_page, edge_page)

    except Exception as e:
        print(f"An error occurred: {e}")
        save_cache(existing_nodes, existing_edges, node_page, edge_page)
        raise

    return existing_nodes, existing_edges

In [5]:
nodes, edges = fetch_and_process_data()

In [6]:
def create_data_object(nodes, edges):
    sorted_nodes = sorted(nodes, key=lambda node: node['orig_id'])
    sorted_edges = sorted(edges, key=lambda edge: edge['orig_id'])

    node_features = []
    node_labels = []
    node_id_map = {}

    for idx, node in enumerate(sorted_nodes):
        node_id_map[node['id']] = idx
        node_features.append(node['x'])
        node_labels.append(node['y'])

    node_features = torch.tensor(node_features, dtype=torch.float)
    node_labels = torch.tensor(node_labels, dtype=torch.long)

    edge_index = []
    for edge in sorted_edges:
        if edge['src'] in node_id_map and edge['dst'] in node_id_map:
            src = node_id_map[edge['src']]
            dst = node_id_map[edge['dst']]
            edge_index.append([src, dst])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    return Data(x=node_features, edge_index=edge_index, y=node_labels)

In [7]:
data = create_data_object(nodes, edges)
data.train_mask = data_lib.train_mask.clone()
data.test_mask = data_lib.test_mask.clone()
data = data.to(device)

In [123]:
import torch

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.num_hops = sum(1 for layer in self.children() if isinstance(layer, GCNConv)) + 1
        self.optimizer = None
        self.loss_fn = None

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

    def compile(self, optimizer, loss_fn, class_weights=None):
        self.optimizer = optimizer
        if class_weights is not None:
            class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(next(self.parameters()).device)
            self.loss_fn = loss_fn(weight=class_weights_tensor)
        else:
            self.loss_fn = loss_fn()

    def fit(self, data, epochs=1000):
        for epoch in range(epochs):
            self.train()
            self.optimizer.zero_grad()
            out = self(data)
            loss = self.loss_fn(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            self.optimizer.step()

            if epoch % 100 == 0:
                acc = self.evaluate(data)
                print(f'Epoch {epoch}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')

    def predict(self, data, node_idx=None):
        self.eval()
        with torch.no_grad():
            if node_idx is not None:
                subset, edge_index, mapping, _ = k_hop_subgraph(node_idx, self.num_hops, data.edge_index, relabel_nodes=True)
                sub_data = Data(x=data.x[subset], edge_index=edge_index)
            else:
                sub_data = data
            out = self(sub_data)
            probabilities = F.softmax(out, dim=1)
            predictions = probabilities.argmax(dim=1)
            if node_idx is not None:
                return predictions[mapping.item()], probabilities[mapping.item()]
            else:
                return predictions, probabilities

    def evaluate(self, data):
        self.eval()
        with torch.no_grad():
            out = self(data)
            pred = out.argmax(dim=1)
            correct = pred[data.test_mask] == data.y[data.test_mask]
            acc = int(correct.sum()) / int(data.test_mask.sum())
            return acc

In [115]:
edge_index = torch.tensor([[0, 1, 2, 3, 3, 5],
                           [4, 2, 2, 2, 2, 2]])



In [39]:
subset, edge_index, _, _ = k_hop_subgraph(0, 2, data.edge_index)

In [43]:
data.x[subset].shape

torch.Size([3, 165])

In [44]:
sub_data = Data(x=data.x[subset], edge_index=edge_index)

In [50]:
subset, edge_index, _, _ = k_hop_subgraph(0, 2, data.edge_index, relabel_nodes=True)
sub_data = Data(x=data.x[subset], edge_index=edge_index)

In [51]:
sub_data.edge_index

tensor([[1, 2],
        [2, 0]], device='cuda:0')

In [30]:
k_hop_subgraph(4, 2, data.edge_index, relabel_nodes=True)

(tensor([  4, 281, 282, 304], device='cuda:0'),
 tensor([[1, 3, 1, 3, 2],
         [2, 2, 0, 1, 0]], device='cuda:0'),
 tensor([0], device='cuda:0'),
 tensor([False, False, False,  ..., False, False, False], device='cuda:0'))

In [124]:
model = GCN(in_channels=data.x.shape[1], hidden_channels=100, out_channels=2).to(device)

In [125]:
class_weights = [0.3, 0.7]
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
model.compile(optimizer, torch.nn.CrossEntropyLoss, class_weights)

In [126]:
model.fit(data, epochs=100)

Epoch 0, Loss: 2.3002, Test Accuracy: 0.5040


In [132]:
model.predict(data)

(tensor(0, device='cuda:0'), tensor([0.6453, 0.3547], device='cuda:0'))

In [98]:
# Predict for all nodes
predictions_all, probabilities_all = model.predict(data)
print(predictions_all)
print(probabilities_all)

# Predict for node with index 0
predictions_node_0, probabilities_node_0, mapping_node_0 = model.predict(data, node_idx=2)
print(predictions_node_0)
print(probabilities_node_0)
print(mapping_node_0)

tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')
tensor([[0.8257, 0.1743],
        [0.8332, 0.1668],
        [0.5656, 0.4344],
        ...,
        [0.6775, 0.3225],
        [0.6726, 0.3274],
        [0.5105, 0.4895]], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([[0.5656, 0.4344],
        [0.8038, 0.1962],
        [0.8054, 0.1946],
        [0.6033, 0.3967]], device='cuda:0')
tensor([0], device='cuda:0')


In [103]:
mapping_node_0.item()

0

In [None]:
(tensor([0, 0, 0,  ..., 0, 0, 1], device='cuda:0'),
 tensor([[0.8298, 0.1702],
         [0.8592, 0.1408],
         [0.6100, 0.3900],
         ...,
         [0.5784, 0.4216],
         [0.6302, 0.3698],
         [0.4238, 0.5762]], device='cuda:0'))

In [None]:
(tensor([0, 0, 0], device='cuda:0'),
 tensor([[0.8442, 0.1558],
         [0.7852, 0.2148],
         [0.8122, 0.1878]], device='cuda:0'),
 tensor([0], device='cuda:0'))

In [None]:
(tensor([0, 0, 0,  ..., 1, 0, 1], device='cuda:0'),
 tensor([[0.8067, 0.1933],
         [0.8420, 0.1580],
         [0.6114, 0.3886],
         ...,
         [0.4606, 0.5394],
         [0.6813, 0.3187],
         [0.3630, 0.6370]], device='cuda:0'))

In [None]:
(tensor([0, 0], device='cuda:0'),
 tensor([[0.9275, 0.0725],
         [0.8699, 0.1301]], device='cuda:0'),
 tensor([0], device='cuda:0'))