In [1]:
import sys
sys.path.append('../src')

import os
import networkx as nx
import numpy as np
import scipy.sparse as sp
from tqdm.notebook import tqdm
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from my_utils import set_seed, setup_env
from data_loader import create_data_loader
from model_eval import eval_pred


In [2]:
# Hyper parameters
seed = 0
device_id = '1'
dataset_name = 'UAE_sample'
num_splits = 20
train_few_shot_samples = 10
val_few_shot_samples = 10
test_perc = 0.2
overwrite_data = False


In [3]:
# set seed for reproducibility
set_seed(seed)
# set device
os.environ['CUDA_VISIBLE_DEVICES'] = device_id
device, base_dir, interim_data_dir, data_dir = setup_env(device_id, dataset_name, seed, num_splits,
                                                         train_few_shot_samples, val_few_shot_samples,
                                                          test_perc)
print(data_dir)
# Create data loader for datasets
datasets = create_data_loader(dataset_name, base_dir, data_dir,
                              hyper_params={'overwrite_data': overwrite_data,
                                            'train_few_shot_samples': train_few_shot_samples,
                                            'val_few_shot_samples': val_few_shot_samples,
                                            'test_perc': test_perc,
                                            'seed': seed,
                                            'num_splits': num_splits,
                                            },
                              device=device)
# Remap nodeIDs
node_ids_list = list(datasets['graph'].nodes())
# Relabel each node from 0 to N-1
node_remapping = {node_ids_list[i]: i for i in range(len(node_ids_list))}
datasets['graph'] = nx.relabel_nodes(datasets['graph'], node_remapping)


[ Using Seed : 0 ]
/mnt/nas/minici/SocGFM/data/processed/UAE_sample/seed_0_num_splits_20/train_10_val_10_test_0.2


# Load pre-training dataset

In [4]:
# Hyper parameters
PRETRAINING_DATASET_NAME = 'cuba'
hidden_dim = 64
# Constants
GRAPH_FILENAME, USER_LABELS_FILENAME = 'fused_network.gml', 'fused_network_node_labels.npy'
NODE_FEATURES = 'svd_node_features.npy'

pretraining_data_dir = base_dir / 'data' / 'processed' / PRETRAINING_DATASET_NAME
print(pretraining_data_dir)

# Read network and user labels
pretraining_dataset = nx.read_graphml(pretraining_data_dir / GRAPH_FILENAME)
pretraining_user_labels = np.load(pretraining_data_dir / USER_LABELS_FILENAME)
# Remap nodeIDs
node_ids_list = list(pretraining_dataset.nodes())
# Relabel each node from 0 to N-1
node_remapping = {node_ids_list[i]: i for i in range(len(node_ids_list))}
pretraining_dataset = nx.relabel_nodes(pretraining_dataset, node_remapping)


/mnt/nas/minici/SocGFM/data/processed/cuba


In [5]:
def get_svd_features(graph, saving_path, hidden_dim):
    if saving_path.exists():
        return np.load(saving_path)
    else:
        # Step 2. Compute Singular Value Decomposition of the adjacency matrix
        num_nodes = graph.number_of_nodes()
        adj_matrix = nx.to_numpy_array(graph)
        row, col = np.where(adj_matrix == 1.)
        sparse_adj_matrix = sp.coo_matrix((np.ones(row.shape[0]), (row, col)), shape=(num_nodes, num_nodes))
        svd = TruncatedSVD(n_components=hidden_dim, n_iter=128)
        svd.fit(sparse_adj_matrix)
        node_features = svd.components_.T
        np.save(file=saving_path, arr=node_features)
        return node_features

pretraining_svd_embeddings = get_svd_features(pretraining_dataset, pretraining_data_dir / NODE_FEATURES, hidden_dim)


In [52]:
# Alternative feature extractor based on node centrality
centrality_fn = [nx.degree_centrality, nx.eigenvector_centrality, nx.pagerank]
pretraining_node_embeddings = np.full(shape=(pretraining_dataset.number_of_nodes(), len(centrality_fn)), fill_value=None)
for i in tqdm(range(len(centrality_fn))):
    print('centrality ', i)
    centrality_stats = centrality_fn[i](pretraining_dataset)
    for nodeid in centrality_stats:
        pretraining_node_embeddings[nodeid, i] = centrality_stats[nodeid]
np.save(file=pretraining_data_dir / 'centrality_node_features.npy', arr=pretraining_node_embeddings)


  0%|          | 0/3 [00:00<?, ?it/s]

centrality  0
centrality  1
centrality  2


# Train a Random Forest on the pretraining dataset

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 4 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 4, verbose=2, random_state=seed, n_jobs = -1,
                               scoring = 'roc_auc', refit=True)
# Fit the random search model
rf_random.fit(pretraining_svd_embeddings, pretraining_user_labels)


{'n_estimators': [200, 288, 377, 466, 555, 644, 733, 822, 911, 1000], 'max_features': ['log2', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 4 folds for each of 100 candidates, totalling 400 fits


In [66]:
from sklearn.preprocessing import RobustScaler

SCALING=False
if SCALING:
    scaled_pretraining_svd_embeddings = RobustScaler().fit_transform(pretraining_node_embeddings)
else:
    scaled_pretraining_svd_embeddings = np.copy(pretraining_node_embeddings)
rf_random = RandomForestClassifier()
# rf_random.fit(scaled_pretraining_svd_embeddings, pretraining_user_labels)
rf_random.fit(scaled_target_svd_embeddings, datasets['labels'])


## Test this trained RandomForest on another country

#### Extract SVD features from target country

In [59]:
SCALING=False
target_svd_embeddings = get_svd_features(datasets['graph'], data_dir.parent.parent / NODE_FEATURES, hidden_dim)
if SCALING:
    scaled_target_svd_embeddings = RobustScaler().fit_transform(target_svd_embeddings)
else:
    scaled_target_svd_embeddings = np.copy(target_svd_embeddings)


In [54]:
# Alternative feature extractor based on node centrality
centrality_fn = [nx.degree_centrality, nx.eigenvector_centrality, nx.pagerank]
target_node_embeddings = np.full(shape=(datasets['graph'].number_of_nodes(), len(centrality_fn)), fill_value=None)
for i in tqdm(range(len(centrality_fn))):
    print('centrality ', i)
    centrality_stats = centrality_fn[i](datasets['graph'])
    for nodeid in centrality_stats:
        target_node_embeddings[nodeid, i] = centrality_stats[nodeid]
np.save(file=data_dir.parent.parent / 'centrality_node_features.npy', arr=target_node_embeddings)


  0%|          | 0/3 [00:00<?, ?it/s]

centrality  0
centrality  1
centrality  2


In [60]:
SCALING=False
if SCALING:
    scaled_target_svd_embeddings = RobustScaler().fit_transform(target_node_embeddings)
else:
    scaled_target_svd_embeddings = np.copy(target_node_embeddings)



In [67]:
from model_eval import eval_pred, TestLogMetrics

test_logger = TestLogMetrics(num_splits, ['accuracy', 'precision', 'f1_macro', 'f1_micro'])
for run_id in tqdm(range(num_splits), 'data split'):
    # Since this is an unsupervised baseline, we merge training and validation
    # unsupervised_mask = np.logical_or(datasets['splits'][run_id]['train'], datasets['splits'][run_id]['val'])
    # Select the best threshold according to the eval on train+val sets
    # best_val_threshold = get_best_threshold(datasets['labels'], predicted_labels_list, unsupervised_mask, metric_to_optimize)
    # val_metrics = eval_pred(datasets['labels'], predicted_labels_list[best_val_threshold], unsupervised_mask)
    # Compute test statistics
    test_metrics = eval_pred(pretraining_user_labels, rf_random.predict(scaled_pretraining_svd_embeddings), np.full(len(pretraining_user_labels), True))
    # test_metrics = eval_pred(datasets['labels'], rf_random.predict(scaled_target_svd_embeddings), datasets['splits'][run_id]['test'])

    for metric_name in test_metrics:
        test_logger.update(metric_name, run_id, test_metrics[metric_name])


data split:   0%|          | 0/20 [00:00<?, ?it/s]

In [68]:
for metric_name in test_logger.test_metrics_dict:
    avg_val, std_val = test_logger.get_metric_stats(metric_name)
    print(f'Test {metric_name}: {avg_val}+-{std_val}')


Test accuracy: 0.4747+-0.0
Test precision: 0.0641+-0.0
Test f1_macro: 0.373+-0.0
Test f1_micro: 0.4747+-0.0


# Repeat by using a GNN

## Train a GCN on the pretraining dataset

In [None]:
import torch
from torch_geometric.data import Data

edge_index = torch.tensor([[0, 1],
                           [1, 0],
                           [1, 2],
                           [2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index.t().contiguous())
data.validate(raise_on_error=True)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, num_node_features, hidden_dim, dropout_perc=0.2):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, 1)
        self.output_activation = torch.nn.Sigmoid()
        self.dropout = nn.Dropout(dropout_perc)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)

        return self.output_activation(x)

## Train a multi-layer perceptron on the pre-training dataset

In [96]:
import torch
import torch.nn as nn

class MyNet(nn.Module):
    def __init__(self, hidden_dimension, dropout_perc=0.2, with_batch_norm=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if with_batch_norm:
            self.layers = nn.Sequential(
                    nn.Linear(hidden_dimension, hidden_dimension),
                    nn.BatchNorm1d(hidden_dimension),
                    nn.ReLU(),
                    nn.Dropout(dropout_perc),
                    nn.Linear(hidden_dimension, hidden_dimension),
                    nn.BatchNorm1d(hidden_dimension),
                    nn.ReLU(),
                    nn.Dropout(dropout_perc),
                    nn.Linear(hidden_dimension, 1),
                    )
        else:
            self.layers = nn.Sequential(
                    nn.Linear(hidden_dimension, hidden_dimension),
                    nn.ReLU(),
                    nn.Dropout(dropout_perc),
                    nn.Linear(hidden_dimension, hidden_dimension),
                    nn.ReLU(),
                    nn.Dropout(dropout_perc),
                    nn.Linear(hidden_dimension, 1),
                    )
    def forward(self, x):
        return self.layers(x)


In [114]:
pretraining_torch_embeddings = torch.Tensor(pretraining_svd_embeddings).float().to(device)
pretraining_torch_labels = torch.Tensor(pretraining_user_labels).float().to(device)

from torch.utils.data import Dataset, DataLoader
class MyDataset(Dataset):
    def __init__(self, features, labels, transform=None):
        self.features = features
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = self.features[idx]
        sample_labels = self.labels[idx]
        if self.transform:
            sample = self.transform(sample)
        return {'x': sample, 'y': sample_labels}



In [115]:
# Pretraining hyperparameters
num_epochs = 1000
batch_size = 1024
learning_rate = 1e-3
with_batch_norm = True
dropout_perc = 0.2
val_perc = 0.15
VAL_METRIC = 'f1_macro'
early_stopping = 10

# Create model
model = MyNet(hidden_dimension=hidden_dim, dropout_perc=dropout_perc, with_batch_norm=with_batch_norm)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
weight = torch.Tensor([pretraining_torch_embeddings.shape[0] / (pretraining_user_labels.sum() * 2)]).float().to(device)
loss = torch.nn.BCEWithLogitsLoss(pos_weight = weight)

# Create dataloaders
train_dataset = MyDataset(pretraining_torch_embeddings, pretraining_torch_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# Create validation
_, val_idxs = train_test_split(range(len(pretraining_user_labels)), test_size=val_perc, random_state=seed, stratify=pretraining_user_labels)
val_mask = np.full(shape=pretraining_user_labels.shape, fill_value=False)
val_mask[val_idxs] = True


In [116]:
training_loss_list, val_metric_list = [], []
best_val_metric = -np.inf
early_stopping_count = 0

model.eval()
with torch.no_grad():
    val_pred = torch.nn.Sigmoid()(model(pretraining_torch_embeddings)).detach().cpu().numpy()
    val_metrics = roc_auc_score(pretraining_user_labels[val_mask], val_pred[val_mask])
print(f'[Epoch: -1] val: {round(val_metrics, 2)}')

# Model training loop
for epoch in range(num_epochs):
    if early_stopping_count > early_stopping:
        print('Early stopping.')
        break
    model.train()
    tr_loss = 0
    for batch_idx, sample in enumerate(train_loader):
        optimizer.zero_grad()
        y_pred = model(sample['x']).flatten()
        loss_val = loss(y_pred, sample['y'])
        tr_loss += loss_val.item()
        loss_val.backward()
        optimizer.step()
    training_loss_list.append(tr_loss / batch_idx)
    # Validation step
    model.eval()
    with torch.no_grad():
        val_pred = torch.nn.Sigmoid()(model(pretraining_torch_embeddings)).detach().cpu().numpy()
        val_metrics = roc_auc_score(pretraining_user_labels[val_mask], val_pred[val_mask])
        val_metric_list.append(val_metrics)
        if val_metrics > best_val_metric:
            best_val_metric = val_metrics
            early_stopping_count = 0
        else:
            early_stopping_count += 1
    print(f'[Epoch: {epoch}] tr: {round(training_loss_list[-1], 2)} val: {round(val_metric_list[-1], 2)}')


[Epoch: -1] val: 0.32
[Epoch: 0] tr: 1.03 val: 0.66
[Epoch: 1] tr: 0.96 val: 0.66
[Epoch: 2] tr: 0.9 val: 0.66
[Epoch: 3] tr: 0.85 val: 0.67
[Epoch: 4] tr: 0.81 val: 0.69
[Epoch: 5] tr: 0.8 val: 0.73
[Epoch: 6] tr: 0.79 val: 0.73
[Epoch: 7] tr: 0.78 val: 0.73
[Epoch: 8] tr: 0.78 val: 0.73
[Epoch: 9] tr: 0.78 val: 0.73
[Epoch: 10] tr: 0.78 val: 0.73
[Epoch: 11] tr: 0.78 val: 0.73
[Epoch: 12] tr: 0.78 val: 0.73
[Epoch: 13] tr: 0.77 val: 0.73
[Epoch: 14] tr: 0.78 val: 0.73
[Epoch: 15] tr: 0.78 val: 0.73
[Epoch: 16] tr: 0.78 val: 0.73
Early stopping.


# Test the trained model on a different dataset

#### Compute SVD features for the new graph

In [125]:
# Hyper parameters
TEST_DATASET_NAME = 'UAE_sample'
hidden_dim = 64
# Constants
GRAPH_FILENAME, USER_LABELS_FILENAME = 'fused_network.gml', 'fused_network_node_labels.npy'

test_dataset_data_dir = base_dir / 'data' / 'processed' / TEST_DATASET_NAME
print(test_dataset_data_dir)

# Read network and user labels
test_dataset = nx.read_graphml(test_dataset_data_dir / GRAPH_FILENAME)
test_user_labels = np.load(test_dataset_data_dir / USER_LABELS_FILENAME)
# Remap nodeIDs
node_ids_list = list(test_dataset.nodes())
# Relabel each node from 0 to N-1
node_remapping = {node_ids_list[i]: i for i in range(len(node_ids_list))}
test_dataset = nx.relabel_nodes(test_dataset, node_remapping)


/mnt/nas/minici/SocGFM/data/processed/UAE_sample


In [129]:
# Step 2. Compute Singular Value Decomposition of the adjacency matrix
num_nodes = test_dataset.number_of_nodes()
adj_matrix = nx.to_numpy_array(test_dataset)
row, col = np.where(adj_matrix == 1.)
sparse_adj_matrix = sp.coo_matrix((np.ones(row.shape[0]), (row, col)), shape=(num_nodes, num_nodes))
svd = TruncatedSVD(n_components=hidden_dim, n_iter=128)
svd.fit(sparse_adj_matrix)
test_dataset_svd_embeddings = svd.components_.T
test_dataset_torch_embeddings = torch.Tensor(test_dataset_svd_embeddings).float().to(device)


In [131]:
test_rocauc_list = []
for run_id in tqdm(range(num_splits), 'data split'):
    # Since this is an unsupervised baseline, we merge training and validation
    unsupervised_mask = np.logical_or(datasets['splits'][run_id]['train'], datasets['splits'][run_id]['val'])
    # Select the best threshold according to the eval on train+val sets
    # best_val_threshold = get_best_threshold(datasets['labels'],
    #                                         predicted_labels_list, unsupervised_mask, metric_to_optimize)
    test_pred = torch.nn.Sigmoid()(model(test_dataset_torch_embeddings)).detach().cpu().numpy()
    test_rocauc_list.append(roc_auc_score(test_user_labels, test_pred))
    # Compute test statistics
    # test_metrics = eval_pred(datasets['labels'], predicted_labels_list[best_val_threshold],
    #                          datasets['splits'][run_id]['test'])



data split:   0%|          | 0/20 [00:00<?, ?it/s]

In [132]:
test_rocauc_list

[0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491,
 0.5464945502139491]