In [2]:
import os
import logging
# Raise the root logger threshold to ERROR to keep cell output quiet.
# The previous level is stored in `old_level` (presumably so it can be
# restored later; no restore is visible in this part of the notebook).
logger = logging.getLogger()
old_level = logger.level
logger.setLevel(logging.ERROR)

# The relative paths used below ('data/...', 'models/...') are resolved from
# a directory named 'graph'; step up one level when the kernel starts elsewhere.
# NOTE(review): assumes the notebook sits exactly one level below 'graph' — confirm.
if os.path.basename(os.getcwd()) != 'graph':
    os.chdir('..')

In [3]:
from config import Config

# Root directory for every dataset/artifact this notebook reads or writes.
data_dir = 'data/transaction'
base_url = Config.BASE_URL

# REST endpoints and page size; in the visible part of the notebook these are
# only referenced by the commented-out fetch cell.
node_url = f"{base_url}/api/nodes"
edge_url = f"{base_url}/api/edges"
batch_size = 10_000
# Cache files for the raw fetch result and the processed graph object
# (pickle files, judging by the extension).
transactions_raw_file = f'{data_dir}/transactions_raw.pkl'
transactions_file = f'{data_dir}/transactions.pkl'

In [4]:
# from torch_geometric.datasets import EllipticBitcoinDataset

# dataset_lib = EllipticBitcoinDataset(root=f'{data_dir}/EllipticBitcoin', transform=None)
# data_lib = dataset_lib[0]

In [5]:
# from models.helpers import fetch_and_process_graph_data

# nodes, edges = fetch_and_process_graph_data(node_url, edge_url, transactions_raw_file, batch_size)

In [6]:
# from models.helpers import create_data_object, save_data

# data = create_data_object(nodes, edges)

In [7]:
# import torch

# labels = data.y.cpu()
# class_0_1_indices = torch.where((labels == 0) | (labels == 1))[0]
# num_samples = len(class_0_1_indices)
# train_end, val_end = int(0.7 * num_samples), int(0.85 * num_samples)

# train_indices, val_indices, test_indices = class_0_1_indices.split([train_end, val_end - train_end, num_samples - val_end])

# masks = torch.zeros((3, len(data.y)), dtype=torch.bool, device=data.y.device)
# masks[0, train_indices] = True
# masks[1, val_indices] = True
# masks[2, test_indices] = True

# data.train_mask, data.val_mask, data.test_mask = masks

In [8]:
# save_data(data, transactions_file)

In [9]:
from models.helpers import load_data

# Load the previously saved transaction graph from `transactions_file`.
# NOTE(review): the file has a .pkl extension, so load_data presumably
# unpickles it — only load files from a trusted source.
data = load_data(transactions_file)

In [10]:
# from models.transaction import TransactionClassifierGCN

# transaction_classifier_gcn = TransactionClassifierGCN()
# transaction_classifier_gcn.fit(data, epochs=1000)
# transaction_classifier_gcn.evaluate(data)

In [11]:
# from models.helpers import save_model
# save_model(transaction_classifier_gcn.state_dict(), 'models/transaction_classifier_gcn.pth')

In [12]:
import torch
from torcheval.metrics.functional import multiclass_f1_score, mean_squared_error


class GNN(torch.nn.Module):
    """Sequential stack of layers with a small compile/fit/predict API.

    Layers are applied in order. A layer whose class name contains ``'Conv'``
    is called as ``layer(x, edge_index)``; every other layer is called as
    ``layer(x)``.

    Subclasses must implement :meth:`calculate_metrics`.
    """

    def __init__(self, layers):
        """Wrap ``layers`` (an iterable of modules) in a ``ModuleList``."""
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)
        # Both are set by compile(); fit() refuses to run while they are None.
        self.optimizer = None
        self.loss_fn = None

    def forward(self, data):
        """Run ``data.x`` through all layers, passing ``data.edge_index`` to conv layers."""
        x, edge_index = data.x, data.edge_index
        for layer in self.layers:
            # Graph convolutions are recognised purely by class name.
            if 'Conv' in layer.__class__.__name__:
                x = layer(x, edge_index)
            else:
                x = layer(x)
        return x

    def compile(self, optimizer, loss_fn):
        """Attach the optimizer and loss function used by :meth:`fit`."""
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def fit(self, data, epochs=1000, patience=10):
        """Train full-batch with early stopping on the validation loss.

        Args:
            data: Object exposing ``x``, ``edge_index``, ``y``, ``train_mask``
                and ``val_mask``.
            epochs: Maximum number of optimisation steps.
            patience: Stop after this many consecutive epochs without a
                strictly lower validation loss.

        Returns:
            dict with the per-epoch lists ``train_loss``, ``val_loss``,
            ``train_metric`` and ``val_metric`` (plain Python numbers).

        Raises:
            RuntimeError: If :meth:`compile` has not been called.
        """
        if self.optimizer is None or self.loss_fn is None:
            raise RuntimeError("compile() must be called before fit()")

        history = {'train_loss': [], 'val_loss': [], 'train_metric': [], 'val_metric': []}
        best_val_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            # --- optimisation step (training mode, dropout active) ---
            self.train()
            self.optimizer.zero_grad()
            out = self(data)
            train_loss, train_metric = self._compute_loss_and_metric(out, data, 'train')
            train_loss.backward()
            self.optimizer.step()

            # --- validation (eval mode, no autograd graph) ---
            # Scoring validation nodes on the training-mode output would leave
            # dropout active and make the early-stopping signal noisy.
            # self(data) is called directly because subclasses may override
            # predict() to return something other than the raw output.
            self.eval()
            with torch.no_grad():
                val_loss, val_metric = self._compute_loss_and_metric(self(data), data, 'val')

            train_loss_value = train_loss.item()
            val_loss_value = val_loss.item()
            history['train_loss'].append(train_loss_value)
            history['val_loss'].append(val_loss_value)
            history['train_metric'].append(train_metric.item())
            history['val_metric'].append(val_metric.item())

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Training Loss: {train_loss_value}, Validation Loss: {val_loss_value}")

            # Compare plain floats so no tensor (or graph) is kept alive across epochs.
            if val_loss_value < best_val_loss:
                best_val_loss = val_loss_value
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break
        return history

    def predict(self, data):
        """Return the raw model output for ``data`` in eval mode without gradients."""
        self.eval()
        with torch.no_grad():
            return self(data)

    def evaluate(self, data, mode='val'):
        """Compute :meth:`calculate_metrics` on the nodes selected by ``data.<mode>_mask``."""
        y_true, y_pred = self._get_true_and_predicted(data, mode)
        return self.calculate_metrics(y_true, y_pred)

    def calculate_metrics(self, y_true, y_pred):
        """Return a scalar metric tensor; must be provided by subclasses."""
        raise NotImplementedError

    def load_for_inference(self, path, map_location=None):
        """Load a state dict from ``path`` and switch the model to eval mode.

        Args:
            path: File containing a state dict for this model.
            map_location: Forwarded to ``torch.load``; pass e.g. ``'cpu'`` to
                load weights that were saved from a GPU on a CPU-only machine.
        """
        # NOTE: torch.load unpickles the file — only load checkpoints you trust.
        self.load_state_dict(torch.load(path, map_location=map_location))
        self.eval()

    def _compute_loss_and_metric(self, out, data, mode):
        """Return ``(loss, metric)`` for the rows of ``out`` selected by ``data.<mode>_mask``.

        The loss is computed on the raw output; the metric is computed on
        ``argmax`` over dim 1, i.e. on class-index predictions.
        """
        mask = getattr(data, f"{mode}_mask")
        y_true = data.y[mask]
        y_pred = out[mask]
        loss = self.loss_fn(y_pred, y_true)
        metric = self.calculate_metrics(y_true, torch.argmax(y_pred, dim=1))
        return loss, metric

    def _get_true_and_predicted(self, data, mode):
        """Return ``(data.y[mask], self.predict(data)[mask])`` for ``data.<mode>_mask``."""
        mask = getattr(data, f"{mode}_mask")
        y_true = data.y[mask]
        y_pred = self.predict(data)[mask]
        return y_true, y_pred


class GNNClassifier(GNN):
    """GNN whose predictions are class indices and whose metric is macro F1."""

    def predict(self, data):
        """Return the arg-max class index per node (eval mode, no gradients)."""
        logits = super().predict(data)
        return torch.argmax(logits, dim=1)

    def calculate_metrics(self, y_true, y_pred):
        """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

        Args:
            y_true: 1-D tensor of ground-truth class indices.
            y_pred: 1-D tensor of predicted class indices.
        """
        # Size the metric by the largest index seen on either side: the
        # evaluated subset may lack the highest class while the model still
        # predicts it, which would otherwise be out of range.
        num_classes = int(torch.max(y_true.max(), y_pred.max()).item()) + 1
        # torcheval's signature is (input=predictions, target=labels).
        return multiclass_f1_score(y_pred, y_true, num_classes=num_classes, average='macro')


class GNNRegressor(GNN):
    """GNN for continuous targets, scored with mean squared error."""

    @staticmethod
    def _match_target_shape(y_pred, y_true):
        """Drop a trailing singleton dim so ``(N, 1)`` outputs line up with ``(N,)`` targets."""
        if y_pred.dim() == y_true.dim() + 1 and y_pred.size(-1) == 1:
            return y_pred.squeeze(-1)
        return y_pred

    def _compute_loss_and_metric(self, out, data, mode):
        """Return ``(loss, metric)`` on the rows selected by ``data.<mode>_mask``.

        Overrides the base implementation, which turns the output into class
        indices with ``argmax`` before computing the metric; for regression
        the metric has to be computed on the raw outputs.
        """
        mask = getattr(data, f"{mode}_mask")
        y_true = data.y[mask]
        y_pred = self._match_target_shape(out[mask], y_true)
        loss = self.loss_fn(y_pred, y_true)
        metric = self.calculate_metrics(y_true, y_pred)
        return loss, metric

    def calculate_metrics(self, y_true, y_pred):
        """Return the mean squared error between ``y_pred`` and ``y_true``."""
        y_pred = self._match_target_shape(y_pred, y_true)
        # torcheval's signature is (input=predictions, target=labels).
        return mean_squared_error(y_pred, y_true)


In [13]:
def count_classes(data, mask=None):
    """Print one ``Class <label>: <count>`` line per distinct value in ``data.y[mask]``.

    Args:
        data: Object exposing a label tensor ``y``.
        mask: Index/mask applied to ``data.y`` before counting; with the
            default ``None`` every label is counted.
    """
    selected = data.y[mask]
    labels, frequencies = torch.unique(selected, return_counts=True)
    per_class = dict(zip(labels.tolist(), frequencies.tolist()))
    for cls, count in per_class.items():
        print(f"Class {cls}: {count}")

In [14]:
from collections import Counter


def objective(trial, data, layer):
    """Optuna objective: build, train and score one ``GNNClassifier``.

    Args:
        trial: Optuna trial used to sample ``num_layers``, ``hidden_channels``,
            ``dropout_rate`` and ``learning_rate``.
        data: Graph object exposing ``x``/``num_features``, ``edge_index``,
            ``y``, ``train_mask`` and ``val_mask``. Training labels are
            expected to be non-negative integer class indices.
        layer: Layer class instantiated as ``layer(in_channels, out_channels)``
            for every hidden block.

    Returns:
        float: The model's validation metric (``model.evaluate(data)``).

    Side effects:
        Stores the training history and the trained model in the trial's user
        attributes under ``'history'`` and ``'model'``.
    """
    num_layers = trial.suggest_int('num_layers', 2, 4)
    hidden_channels = trial.suggest_int('hidden_channels', 16, 64)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    device = data.y.device
    train_labels = data.y[data.train_mask]
    in_channels = data.num_features
    out_channels = int(train_labels.max().item()) + 1

    # Inverse-frequency class weights with exactly one entry per output class.
    # bincount(minlength=...) keeps the vector aligned with the class index
    # even when some index below the maximum never occurs in the training
    # labels; such classes get weight 0 because no training sample uses them.
    class_counts = torch.bincount(train_labels, minlength=out_channels).float()
    class_weights = torch.where(
        class_counts > 0,
        class_counts.sum() / class_counts.clamp(min=1),
        torch.zeros_like(class_counts),
    )

    layers = []
    for _ in range(num_layers):
        layers.append(layer(in_channels, hidden_channels))
        layers.append(torch.nn.ReLU())
        layers.append(torch.nn.Dropout(dropout_rate))
        in_channels = hidden_channels
    layers.append(torch.nn.Linear(hidden_channels, out_channels))

    model = GNNClassifier(layers).to(device)
    model.compile(
        optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate),
        loss_fn=torch.nn.CrossEntropyLoss(weight=class_weights)
    )

    model_history = model.fit(data, epochs=1000)
    trial.set_user_attr('history', model_history)
    # NOTE(review): a live model is not JSON-serialisable, so this only works
    # with Optuna's in-memory storage, and every trial keeps its model alive.
    trial.set_user_attr('model', model)
    # Hand Optuna a plain float instead of a (possibly CUDA) tensor.
    return float(model.evaluate(data))

In [15]:
import optuna

def conduct_study(data, layer_types, n_trials=10):
    """Run one maximising Optuna study per layer class and collect the outcomes.

    Args:
        data: Graph object forwarded to ``objective``.
        layer_types: Iterable of layer classes; each class's ``__name__``
            becomes a key of the result.
        n_trials: Number of trials per study.

    Returns:
        dict mapping layer name to a dict with ``'trial_values'`` (value of
        every trial), ``'best_model'`` and ``'best_model_history'`` (user
        attributes of the best trial).
    """
    def run_single_study(layer_cls):
        # One independent study per layer class.
        study = optuna.create_study(direction='maximize')
        study.optimize(lambda trial: objective(trial, data, layer_cls), n_trials=n_trials)

        winner = study.best_trial
        return {
            'trial_values': [t.value for t in study.trials],
            'best_model': winner.user_attrs['model'],
            'best_model_history': winner.user_attrs['history'],
        }

    return {layer_cls.__name__: run_single_study(layer_cls) for layer_cls in layer_types}

In [16]:
from torch_geometric.nn import GATConv, GCNConv, TransformerConv

# Layer classes compared by the study. torch.nn.Linear serves as a baseline
# that ignores graph structure: GNN.forward only passes edge_index to layers
# whose class name contains 'Conv'.
layer_types = [torch.nn.Linear, GCNConv, GATConv, TransformerConv]

In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torcheval.metrics.functional import multiclass_accuracy, multiclass_precision, multiclass_recall, multiclass_confusion_matrix, multiclass_f1_score

def visualize_study_results(study_results, data):
    """Plot a study comparison and return validation metrics per layer type.

    Four figures are shown: a box plot of trial values per layer type, the
    loss history and the metric history of the overall best model, and that
    model's confusion matrix on the validation nodes.

    Args:
        study_results: Mapping ``layer name -> {'trial_values',
            'best_model', 'best_model_history'}``.
        data: Graph object exposing ``y`` and ``val_mask`` that the stored
            models can predict on.

    Returns:
        pandas.DataFrame with one row per layer type and the columns
        ``Layer Type``, ``Accuracy``, ``Precision``, ``Recall``, ``F1-Score``
        (all macro-averaged on the validation nodes).

    Raises:
        ValueError: If ``study_results`` is empty or holds no trial value.
    """
    if not study_results:
        raise ValueError("study_results is empty; nothing to visualize")

    # Trials without a value are reported as None; ignore them.
    layer_metrics = {
        layer: [value for value in results['trial_values'] if value is not None]
        for layer, results in study_results.items()
    }
    best_per_layer = {layer: max(values) for layer, values in layer_metrics.items() if values}
    if not best_per_layer:
        raise ValueError("study_results contains no completed trial values")

    # max() returns the first maximal key, i.e. ties go to the earliest layer.
    best_layer = max(best_per_layer, key=best_per_layer.get)
    best_model = study_results[best_layer]['best_model']
    best_model_history = study_results[best_layer]['best_model_history']

    def _num_classes(y_true, y_pred):
        # Cover labels and predictions so neither side can be out of range.
        return int(torch.max(y_true.max(), y_pred.max()).item()) + 1

    def _plot_history(train_key, val_key, train_label, val_label, title, ylabel):
        # One figure with the training and validation curve of the best model.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(best_model_history[train_key], label=train_label)
        ax.plot(best_model_history[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(ylabel)
        ax.legend()
        plt.show()

    # --- distribution of trial values per layer type ---
    layer_names = list(layer_metrics)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot([layer_metrics[name] for name in layer_names])
    # Tick labels are set explicitly because the keyword for box labels
    # differs between Matplotlib releases.
    ax.set_xticks(list(range(1, len(layer_names) + 1)))
    ax.set_xticklabels(layer_names)
    ax.set_title('Performance Comparison')
    ax.set_ylabel('Metric Value')
    plt.show()

    # --- training curves of the overall best model ---
    _plot_history('train_loss', 'val_loss', 'Training Loss', 'Validation Loss',
                  f'Loss History for {best_layer}', 'Loss Value')
    _plot_history('train_metric', 'val_metric', 'Training Metric', 'Validation Metric',
                  f'Metric History for {best_layer}', 'Metric Value')

    # --- confusion matrix of the best model on the validation nodes ---
    y_true = data.y[data.val_mask]
    y_pred = best_model.predict(data)[data.val_mask]
    num_classes = _num_classes(y_true, y_pred)
    confusion_mat = multiclass_confusion_matrix(y_pred, y_true, num_classes)

    fig, ax = plt.subplots(figsize=(10, 6))
    image = ax.imshow(confusion_mat.cpu().numpy(), interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title('Confusion Matrix')
    fig.colorbar(image, ax=ax)
    # Ticks follow the matrix size, not the label range of the full dataset.
    tick_marks = list(range(num_classes))
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()

    # --- validation metrics of the best model of every layer type ---
    metrics_data = []
    for layer, results in study_results.items():
        layer_pred = results['best_model'].predict(data)[data.val_mask]
        layer_classes = _num_classes(y_true, layer_pred)
        metrics_data.append({
            'Layer Type': layer,
            'Accuracy': multiclass_accuracy(layer_pred, y_true, num_classes=layer_classes, average='macro').item(),
            'Precision': multiclass_precision(layer_pred, y_true, num_classes=layer_classes, average='macro').item(),
            'Recall': multiclass_recall(layer_pred, y_true, num_classes=layer_classes, average='macro').item(),
            'F1-Score': multiclass_f1_score(layer_pred, y_true, num_classes=layer_classes, average='macro').item(),
        })

    return pd.DataFrame(metrics_data)

In [18]:
import torch

def create_val_test_masks(train_mask, val_test_ratio=0.5, generator=None):
    """Randomly split all non-training positions into validation and test masks.

    Args:
        train_mask: 1-D boolean tensor marking the training positions.
        val_test_ratio: Fraction of the non-training positions assigned to
            validation (rounded down); the remainder goes to test.
        generator: Optional CPU ``torch.Generator`` that makes the shuffle
            reproducible. With ``None`` the global RNG is used.

    Returns:
        tuple ``(val_mask, test_mask)`` of boolean tensors shaped like
        ``train_mask``; they are disjoint and together cover ``~train_mask``.

    Raises:
        ValueError: If ``val_test_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= val_test_ratio <= 1.0:
        raise ValueError(f"val_test_ratio must be in [0, 1], got {val_test_ratio}")

    non_train_indices = torch.where(~train_mask)[0]
    num_val = int(val_test_ratio * len(non_train_indices))

    # The permutation is drawn on CPU (where the generator lives) and then
    # moved next to the indices it shuffles.
    shuffled_indices = torch.randperm(len(non_train_indices), generator=generator)
    shuffled_indices = shuffled_indices.to(non_train_indices.device)

    val_indices = non_train_indices[shuffled_indices[:num_val]]
    test_indices = non_train_indices[shuffled_indices[num_val:]]

    val_mask = torch.zeros_like(train_mask, dtype=torch.bool)
    test_mask = torch.zeros_like(train_mask, dtype=torch.bool)
    val_mask[val_indices] = True
    test_mask[test_indices] = True

    return val_mask, test_mask

In [19]:
from torch_geometric.datasets import Amazon

In [20]:
# Load the Amazon 'Computers' graph (downloaded into data_dir on first use)
# and move it to the GPU when one is available, otherwise keep it on the CPU.
data_amazon = Amazon(root=f'{data_dir}/Amazon', name='Computers')[0].to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

In [21]:
from sklearn.model_selection import train_test_split

def create_masks(data, train_ratio=0.7, val_ratio=0.15, random_state=None):
    """Attach stratified train/val/test boolean masks to ``data`` in place.

    Args:
        data: Object exposing a 1-D label tensor ``y``; receives the
            attributes ``train_mask``, ``val_mask`` and ``test_mask``.
        train_ratio: Fraction of samples used for training.
        val_ratio: Fraction of samples used for validation; the remainder
            ``1 - train_ratio - val_ratio`` becomes the test split.
        random_state: Forwarded to both ``train_test_split`` calls; pass an
            int for a reproducible split.

    Raises:
        ValueError: If the ratios are not each in ``(0, 1)`` or leave no room
            for a test split.
    """
    if not (0 < train_ratio < 1 and 0 < val_ratio < 1 and train_ratio + val_ratio < 1):
        raise ValueError(
            "train_ratio and val_ratio must each be in (0, 1) and sum to less "
            f"than 1, got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    num_samples = len(data.y)
    labels = data.y.cpu().numpy()
    test_ratio = 1 - train_ratio - val_ratio
    # Hand scikit-learn a NumPy index array rather than a torch tensor.
    all_indices = torch.arange(num_samples).numpy()

    train_indices, tmp_indices = train_test_split(
        all_indices, train_size=train_ratio, stratify=labels, random_state=random_state
    )
    # Second split divides the held-out part between validation and test.
    val_indices, test_indices = train_test_split(
        tmp_indices, test_size=test_ratio / (test_ratio + val_ratio),
        stratify=labels[tmp_indices], random_state=random_state
    )

    # Keep the masks on the same device as the labels they index.
    device = data.y.device
    masks = torch.zeros((3, num_samples), dtype=torch.bool, device=device)
    for row, split_indices in enumerate((train_indices, val_indices, test_indices)):
        masks[row, torch.from_numpy(split_indices).to(device)] = True

    data.train_mask, data.val_mask, data.test_mask = masks

In [22]:
# Fix the seed so Restart & Run All reproduces the same stratified split.
create_masks(data_amazon, random_state=42)

In [23]:
data_amazon

Data(x=[13752, 767], edge_index=[2, 491722], y=[13752], train_mask=[13752], val_mask=[13752], test_mask=[13752])

In [24]:
# study_amazon = conduct_study(data_amazon, layer_types)

In [25]:
# visualize_study_results(study_amazon, data_amazon)

In [26]:
from torch_geometric.datasets import WordNet18RR

In [27]:
# Load WordNet18RR (downloaded into data_dir on first use) and move it to
# the GPU when one is available, otherwise keep it on the CPU.
data_wordnet = WordNet18RR(root=f'{data_dir}/WordNet18RR')[0].to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/WN18RR/original/train.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/WN18RR/original/valid.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/WN18RR/original/test.txt
Processing...
Done!


In [28]:
data_wordnet

Data(edge_index=[2, 93003], edge_type=[93003], train_mask=[93003], val_mask=[93003], test_mask=[93003], num_nodes=40943)

In [36]:
data_wordnet.train_mask.count_nonzero(), data_wordnet.val_mask.count_nonzero(), data_wordnet.test_mask.count_nonzero()

(tensor(86835, device='cuda:0'),
 tensor(3034, device='cuda:0'),
 tensor(3134, device='cuda:0'))

In [35]:
data_wordnet.edge_index

tensor([[    0,     0,     1,  ..., 40933, 40934, 40935],
        [10211, 25525,  3891,  ...,  8943,  8648,  6809]], device='cuda:0')