In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import EllipticBitcoinDataset
from sklearn.metrics import classification_report
import requests
import os
import pickle

In [None]:
# Debugging aid: sets CUDA_LAUNCH_BLOCKING=1 for this process.
# NOTE(review): this is the documented CUDA switch for synchronous kernel launches, which
# makes device-side errors easier to localise but slows GPU work — presumably left over
# from debugging; confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [51]:
# Load the Elliptic Bitcoin dataset through PyTorch Geometric, using 'EllipticBitcoin'
# as the dataset root directory and no transform, and keep the item at index 0 as the
# reference ("library") graph.
dataset_lib = EllipticBitcoinDataset(root='EllipticBitcoin', transform=None)
data_lib = dataset_lib[0]

In [3]:
# Show the library graph's summary (attribute names and tensor shapes).
data_lib

Data(x=[203769, 165], edge_index=[2, 234355], y=[203769], train_mask=[203769], test_mask=[203769])

In [4]:
# Configuration for the graph API and the on-disk fetch checkpoint.
# Root of the API; the node and edge endpoints below are derived from it.
base_url = 'http://localhost:5004'
node_url = f"{base_url}/api/nodes"
edge_url = f"{base_url}/api/edges"
# Headers sent with every GET issued by fetch_data.
headers = {"Content-Type": "application/json"}
# Page size requested per call; also used to compute the printed record ranges.
batch_size = 10_000
# Pickle file written by save_cache and read by load_cache.
cache_file = 'data_cache.pkl'

In [5]:
def fetch_data(url, headers, batch_size, page, timeout=30):
    """Fetch one page of results from a paginated JSON endpoint.

    Args:
        url: Endpoint to query.
        headers: HTTP headers sent with the request.
        batch_size: Number of records requested, sent as the ``page_size``
            query parameter.
        page: Page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            server would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status signals an error
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    params = {"page_size": batch_size, "page": page}
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

def process_nodes(existing_nodes, new_nodes):
    """Append to ``existing_nodes`` each node of ``new_nodes`` whose ``'id'`` is new.

    ``existing_nodes`` is mutated in place and nothing is returned. A node is
    skipped when its id is already present in ``existing_nodes`` or appeared
    earlier in ``new_nodes``; the first occurrence wins.
    """
    seen_ids = {entry['id'] for entry in existing_nodes}

    for candidate in new_nodes:
        candidate_id = candidate['id']
        if candidate_id in seen_ids:
            continue
        seen_ids.add(candidate_id)
        existing_nodes.append(candidate)

def process_edges(existing_edges, new_edges, existing_nodes):
    """Append to ``existing_edges`` each new edge whose endpoints are both known nodes.

    An edge is kept only when both its ``'src'`` and ``'dst'`` values occur as
    the ``'id'`` of some entry in ``existing_nodes``. ``existing_edges`` is
    mutated in place and nothing is returned. Edges are not de-duplicated.
    """
    known_ids = {entry['id'] for entry in existing_nodes}

    for link in new_edges:
        if link['src'] not in known_ids or link['dst'] not in known_ids:
            continue
        existing_edges.append(link)

def save_cache(nodes, edges, node_page, edge_page):
    """Checkpoint the fetch progress to ``cache_file`` as a pickle.

    The tuple ``(nodes, edges, node_page, edge_page)`` is written to a
    temporary sibling file first and then moved over ``cache_file`` with
    ``os.replace``. An interruption while dumping therefore leaves the previous
    checkpoint intact instead of a truncated, unreadable pickle.

    Args:
        nodes: Node records collected so far.
        edges: Edge records collected so far.
        node_page: Next node page to request when resuming.
        edge_page: Next edge page to request when resuming.
    """
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, 'wb') as f:
        pickle.dump((nodes, edges, node_page, edge_page), f)
    # Swap the finished file into place in a single rename.
    os.replace(tmp_file, cache_file)

def load_cache():
    """Return the checkpoint stored in ``cache_file``, or defaults if there is none.

    Returns:
        Whatever object was pickled into ``cache_file`` (``save_cache`` writes a
        ``(nodes, edges, node_page, edge_page)`` tuple), or
        ``(None, None, 1, 1)`` when the file does not exist.
    """
    if not os.path.exists(cache_file):
        return None, None, 1, 1

    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # this notebook wrote itself.
    with open(cache_file, 'rb') as cache_fh:
        return pickle.load(cache_fh)

def fetch_and_process_data():
    """Download all node pages, then all edge pages, checkpointing after every page.

    Progress is restored from ``load_cache`` (lists default to empty when the
    cache holds ``None``), so a rerun continues from the stored page numbers.
    Each phase stops at the first page whose ``'results'`` list is empty. If an
    exception occurs, the current state is checkpointed and the exception is
    re-raised.

    Returns:
        ``(nodes, edges)`` — the accumulated node and edge records.
    """
    cached_nodes, cached_edges, node_page, edge_page = load_cache()
    existing_nodes = [] if cached_nodes is None else cached_nodes
    existing_edges = [] if cached_edges is None else cached_edges

    try:
        # Phase 1: nodes.
        while True:
            new_nodes = fetch_data(node_url, headers, batch_size, node_page)['results']
            if not new_nodes:
                break

            process_nodes(existing_nodes, new_nodes)
            # Zero-based record range covered by this page, for the progress line.
            start_idx = (node_page - 1) * batch_size
            end_idx = start_idx + len(new_nodes) - 1
            print(f"Nodes {start_idx}-{end_idx} retrieved.")
            # Advance first so the checkpoint points at the next page to fetch.
            node_page += 1
            save_cache(existing_nodes, existing_edges, node_page, edge_page)

        # Phase 2: edges, filtered against the nodes gathered above.
        while True:
            new_edges = fetch_data(edge_url, headers, batch_size, edge_page)['results']
            if not new_edges:
                break

            process_edges(existing_edges, new_edges, existing_nodes)
            start_idx = (edge_page - 1) * batch_size
            end_idx = start_idx + len(new_edges) - 1
            print(f"Edges {start_idx}-{end_idx} retrieved.")
            edge_page += 1
            save_cache(existing_nodes, existing_edges, node_page, edge_page)

    except Exception as e:
        # Keep whatever was collected so far, then let the caller see the failure.
        print(f"An error occurred: {e}")
        save_cache(existing_nodes, existing_edges, node_page, edge_page)
        raise

    return existing_nodes, existing_edges

In [6]:
# Pull the node and edge records from the API (resuming from the pickle checkpoint if one exists).
nodes, edges = fetch_and_process_data()

In [7]:
def create_data_object(nodes, edges):
    """Build a PyTorch Geometric ``Data`` graph from node and edge dictionaries.

    Args:
        nodes: Sequence of dicts with keys ``'id'``, ``'x'`` (feature values)
            and ``'y'`` (label). A node's position in this sequence becomes its
            index in the graph.
        edges: Sequence of dicts with keys ``'src'`` and ``'dst'`` holding node ids.

    Returns:
        ``Data`` with ``x`` (float), ``y`` (long) and ``edge_index`` (long,
        shape ``[2, num_edges]``). With no edges, ``edge_index`` has shape
        ``[2, 0]`` rather than the 1-D empty tensor that transposing an empty
        list would produce.

    Raises:
        KeyError: If an edge refers to an id that is not in ``nodes``.
    """
    # Map each external node id to its row index (the last occurrence wins for duplicate ids).
    node_id_map = {node['id']: idx for idx, node in enumerate(nodes)}

    node_features = torch.tensor([node['x'] for node in nodes], dtype=torch.float)
    node_labels = torch.tensor([node['y'] for node in nodes], dtype=torch.long)

    edge_pairs = [[node_id_map[edge['src']], node_id_map[edge['dst']]] for edge in edges]
    if edge_pairs:
        # Rows are (src, dst) pairs; transpose to the [2, num_edges] layout.
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=node_features, edge_index=edge_index, y=node_labels)

In [40]:
# Peek at the library graph's training mask.
data_lib.train_mask

tensor([False, False, False,  ..., False, False, False], device='cuda:0')

In [50]:
# Build the graph from the API records and copy the train/test masks from the library graph.
# NOTE(review): the masks are positional, but a later cell finds data.x[0] at row 200788 of
# data_lib.x, so the two graphs do not appear to share node order — these masks probably
# select the wrong nodes. Also assumes both graphs have the same node count — TODO confirm.
data = create_data_object(nodes, edges)
data.train_mask = data_lib.train_mask.clone()
data.test_mask = data_lib.test_mask.clone()

In [11]:
# Check whether PyTorch reports a usable CUDA device (the same check picks `device` in the model setup cell).
torch.cuda.is_available()

True

In [33]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional classifier that carries its own optimizer and loss.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Width of the hidden layer.
        out_channels: Number of output classes.
        class_weights: Per-class weight tensor handed to ``CrossEntropyLoss``
            (``None`` for an unweighted loss).
    """

    def __init__(self, in_channels, hidden_channels, out_channels, class_weights):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        # Created after the layers so that self.parameters() already contains them.
        self.optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        self.class_weights = class_weights
        self.loss_fn = torch.nn.CrossEntropyLoss(weight=self.class_weights)

    def forward(self, data):
        """Return the second convolution's raw per-class scores for every node.

        ``data`` must expose ``x`` and ``edge_index``. Dropout is applied only
        in training mode.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

    def fit(self, data):
        """Run one full-batch optimisation step on the nodes selected by ``data.train_mask``.

        Returns:
            The training loss for this step as a Python float.
        """
        self.train()
        self.optimizer.zero_grad()
        out = self(data)
        loss = self.loss_fn(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def predict(self, data):
        """Predict a class for every node and score the nodes in ``data.test_mask``.

        The forward pass runs under ``torch.no_grad()`` so that evaluation does
        not build an autograd graph.

        Returns:
            ``(acc, pred)`` — accuracy over the test-mask nodes and the
            predicted class index for every node. ``acc`` is NaN when the test
            mask selects no nodes, rather than raising ``ZeroDivisionError``.
        """
        self.eval()
        with torch.no_grad():
            out = self(data)
        _, pred = out.max(dim=1)

        n_test = int(data.test_mask.sum().item())
        if n_test == 0:
            return float('nan'), pred

        correct = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item()
        acc = correct / n_test
        return acc, pred

In [34]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): this rebinds `data` to the library graph, replacing the API-built graph
# produced by create_data_object in an earlier cell — confirm that is intended.
data = data_lib

# Two output classes with per-class loss weights 0.3 / 0.7.
# NOTE(review): the weights and hidden width look hand-picked — confirm where they come from.
model = GCN(in_channels=data.x.shape[1], hidden_channels=100, out_channels=2, class_weights=torch.tensor([0.3, 0.7])).to(device)

# Move the graph to the same device as the model.
data = data.to(device)

In [32]:
# Count the entries of y_true that equal 2.
# NOTE(review): y_true is only assigned in the training cell further down, so this cell
# fails on a fresh Restart & Run All; the recorded count presumably comes from an earlier
# kernel state — confirm or move this cell below the training cell.
(y_true==2).sum()

tensor(13144)

In [42]:
# Show the summary of the graph currently bound to `data`.
data

Data(x=[203769, 165], edge_index=[2, 234355], y=[203769], train_mask=[203769], test_mask=[203769])

In [43]:
# Show the library graph's summary for side-by-side comparison with `data`.
data_lib

Data(x=[203769, 165], edge_index=[2, 234355], y=[203769], train_mask=[203769], test_mask=[203769])

In [52]:
import torch

def compare_data(data, data_lib):
    """Print an attribute-by-attribute comparison of two graph objects.

    For each of ``x``, ``edge_index``, ``y``, ``train_mask`` and ``test_mask``
    this reports whether the attribute is missing, identical, or different.
    For different attributes it prints either the shape mismatch, the differing
    positions and values, or a note that only dtype/device differ.

    Values are compared on CPU copies, so tensors that live on different
    devices can be compared instead of raising a device-mismatch error.

    Args:
        data: First graph object; attributes are read with ``getattr``.
        data_lib: Second graph object to compare against.

    Returns:
        None. All results are printed.
    """
    attributes = ['x', 'edge_index', 'y', 'train_mask', 'test_mask']

    for attr in attributes:
        data_attr = getattr(data, attr, None)
        data_lib_attr = getattr(data_lib, attr, None)

        if data_attr is None or data_lib_attr is None:
            print(f"Attribute {attr} is missing in one of the objects.")
            continue

        # Record dtype/device agreement before normalising both tensors to the CPU.
        same_meta = (data_attr.dtype == data_lib_attr.dtype
                     and data_attr.device == data_lib_attr.device)
        data_attr = data_attr.detach().cpu()
        data_lib_attr = data_lib_attr.detach().cpu()

        if data_attr.shape != data_lib_attr.shape:
            print(f"Attribute {attr} is different.")
            print(f"Shapes are different: {data_attr.shape} vs {data_lib_attr.shape}")
        elif same_meta and torch.equal(data_attr, data_lib_attr):
            print(f"Attribute {attr} is the same.")
        else:
            print(f"Attribute {attr} is different.")
            # Compare the values and print where they differ
            differences = (data_attr != data_lib_attr).nonzero(as_tuple=True)
            if differences[0].numel() == 0:
                print(f"Values are the same but the tensors are not equal due to dtype or device mismatch.")
            else:
                print(f"Differences found at indices: {differences}")
                print(f"data.{attr} values at differences: {data_attr[differences]}")
                print(f"data_lib.{attr} values at differences: {data_lib_attr[differences]}")

    print("Comparison complete.")

# Example usage
# NOTE(review): an earlier cell rebinds `data = data_lib`; if that binding is still in
# effect this compares the library graph with itself. The recorded output shows
# differences, so `data` was presumably the API-built graph when this ran — confirm the
# intended execution order.
compare_data(data, data_lib)


Attribute x is different.
Differences found at indices: (tensor([     0,      0,      0,  ..., 203768, 203768, 203768]), tensor([  0,   1,   2,  ..., 162, 163, 164]))
data.x values at differences: tensor([-0.1729, -0.1515,  1.0186,  ..., -0.0975, -0.1206, -0.1198])
data_lib.x values at differences: tensor([-0.1715, -0.1847, -1.2014,  ..., -0.1406,  1.5197,  1.5214])
Attribute edge_index is different.
Differences found at indices: (tensor([0, 0, 0,  ..., 1, 1, 1]), tensor([     0,      1,      2,  ..., 234352, 234353, 234354]))
data.edge_index values at differences: tensor([ 11078,    941,   1037,  ..., 202619, 202184, 203242])
data_lib.edge_index values at differences: tensor([     0,      2,      4,  ..., 202042, 201368, 201756])
Attribute y is different.
Differences found at indices: (tensor([     1,      3,      6,  ..., 203762, 203766, 203768]),)
data.y values at differences: tensor([0, 2, 0,  ..., 0, 2, 0])
data_lib.y values at differences: tensor([2, 0, 2,  ..., 2, 1, 2])
Attribu

In [57]:
# Library graph: node feature matrix.
data_lib.x

tensor([[-0.1715, -0.1847, -1.2014,  ..., -0.0975, -0.1206, -0.1198],
        [-0.1715, -0.1847, -1.2014,  ..., -0.0975, -0.1206, -0.1198],
        [-0.1721, -0.1847, -1.2014,  ..., -0.1837, -0.1206, -0.1198],
        ...,
        [-0.1720, -0.0782,  1.0186,  ..., -0.0975, -0.1206, -0.1198],
        [-0.1728, -0.1766,  1.0186,  ..., -0.1406,  1.5197,  1.5214],
        [-0.0120, -0.1323,  0.4636,  ..., -0.1406,  1.5197,  1.5214]])

In [59]:
# Library graph: node labels.
data_lib.y

tensor([2, 2, 2,  ..., 1, 2, 2])

In [58]:
# Graph bound to `data`: node feature matrix, to compare by eye with data_lib.x.
data.x

tensor([[-0.1729, -0.1515,  1.0186,  ...,  0.0748, -0.1206, -0.1198],
        [-0.1728, -0.0511, -0.0914,  ..., -0.1406,  1.5197,  1.5214],
        [-0.1729, -0.1311,  1.0186,  ..., -0.0975, -0.1206, -0.1198],
        ...,
        [-0.1720, -0.1847, -1.2014,  ..., -0.0975, -0.1206, -0.1198],
        [-0.1730, -0.1780,  1.0186,  ..., -0.0975, -0.1206, -0.1198],
        [ 0.5888, -0.2106, -1.7564,  ..., -0.0975, -0.1206, -0.1198]])

In [60]:
# Graph bound to `data`: node labels, to compare by eye with data_lib.y.
data.y

tensor([2, 0, 2,  ..., 2, 2, 0])

In [61]:
import torch

# Function to find the first matching row index
def find_first_matching_row(data, data_lib):
    """Locate the first row of ``data_lib.x`` that equals ``data.x[0]`` exactly.

    Returns:
        The row index as a Python int, or ``None`` when no row of
        ``data_lib.x`` matches element for element.
    """
    target_row = data.x[0]

    # Per-row count of mismatching elements; the target broadcasts across all rows.
    mismatch_counts = (data_lib.x != target_row).sum(dim=1)

    # Rows with zero mismatches are exact matches, listed in ascending index order.
    hits = torch.where(mismatch_counts == 0)[0]
    return hits[0].item() if hits.numel() > 0 else None

# Look up where the first row of data.x sits inside data_lib.x and report the result.
matching_index = find_first_matching_row(data, data_lib)
if matching_index is None:
    print("No matching row found in data_lib.")
else:
    print(f"The first row of data matches row {matching_index} in data_lib.")


The first row of data matches row 200788 in data_lib.


In [35]:
# Full-batch training: one optimiser step per epoch, with a test-accuracy
# readout on every 100th epoch.
for epoch in range(1, 1001):
    loss = model.fit(data)
    if epoch % 100 != 0:
        continue
    acc, _ = model.predict(data)
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Test Accuracy: {acc:.4f}')

# Final evaluation after the last epoch.
acc, pred = model.predict(data)
print(f'Test Accuracy: {acc:.4f}')

# Per-class report restricted to the test-mask nodes; tensors are moved to the
# CPU before being handed to scikit-learn.
y_pred = pred[data.test_mask].cpu()
y_true = data.y[data.test_mask].cpu()
print(classification_report(y_true, y_pred, target_names=['licit', 'illicit']))

Epoch: 100, Loss: 0.2482, Test Accuracy: 0.8139
Epoch: 200, Loss: 0.2036, Test Accuracy: 0.9005
Epoch: 300, Loss: 0.1827, Test Accuracy: 0.9401
Epoch: 400, Loss: 0.1671, Test Accuracy: 0.9469
Epoch: 500, Loss: 0.1557, Test Accuracy: 0.9484
Epoch: 600, Loss: 0.1455, Test Accuracy: 0.9498
Epoch: 700, Loss: 0.1405, Test Accuracy: 0.9507
Epoch: 800, Loss: 0.1324, Test Accuracy: 0.9505
Epoch: 900, Loss: 0.1297, Test Accuracy: 0.9512
Epoch: 1000, Loss: 0.1251, Test Accuracy: 0.9511
Test Accuracy: 0.9511
              precision    recall  f1-score   support

       licit       0.96      0.99      0.97     15587
     illicit       0.71      0.42      0.53      1083

    accuracy                           0.95     16670
   macro avg       0.83      0.71      0.75     16670
weighted avg       0.94      0.95      0.95     16670

