In [1]:
import sys
sys.path.append('../src')

import os
import networkx as nx
import numpy as np
import scipy.sparse as sp
from tqdm.notebook import tqdm
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from my_utils import set_seed, setup_env
from data_loader import create_data_loader
from model_eval import eval_pred


In [2]:
# Hyper parameters
seed = 0
device_id = '1'
dataset_name = 'UAE_sample'
num_splits = 20
train_few_shot_samples = 10
val_few_shot_samples = 10
test_perc = 0.2
overwrite_data = False


In [3]:
# set seed for reproducibility
set_seed(seed)
# set device
os.environ['CUDA_VISIBLE_DEVICES'] = device_id
device, base_dir, interim_data_dir, data_dir = setup_env(device_id, dataset_name, seed, num_splits,
                                                         train_few_shot_samples, val_few_shot_samples,
                                                          test_perc)
print(data_dir)
# Create data loader for datasets
datasets = create_data_loader(dataset_name, base_dir, data_dir,
                              hyper_params={'overwrite_data': overwrite_data,
                                            'train_few_shot_samples': train_few_shot_samples,
                                            'val_few_shot_samples': val_few_shot_samples,
                                            'test_perc': test_perc,
                                            'seed': seed,
                                            'num_splits': num_splits,
                                            },
                              device=device)
# Remap nodeIDs
node_ids_list = list(datasets['graph'].nodes())
# Relabel each node from 0 to N-1
node_remapping = {node_ids_list[i]: i for i in range(len(node_ids_list))}
datasets['graph'] = nx.relabel_nodes(datasets['graph'], node_remapping)


[ Using Seed : 0 ]
/mnt/nas/minici/SocGFM/data/processed/UAE_sample/seed_0_num_splits_20/train_10_val_10_test_0.2


# Load pre-training dataset

In [4]:
# Hyper parameters
PRETRAINING_DATASET_NAME = 'cuba'
hidden_dim = 64
# Constants
GRAPH_FILENAME, USER_LABELS_FILENAME = 'fused_network.gml', 'fused_network_node_labels.npy'
NODE_FEATURES = 'svd_node_features.npy'

pretraining_data_dir = base_dir / 'data' / 'processed' / PRETRAINING_DATASET_NAME
print(pretraining_data_dir)

# Read network and user labels
pretraining_dataset = nx.read_graphml(pretraining_data_dir / GRAPH_FILENAME)
pretraining_user_labels = np.load(pretraining_data_dir / USER_LABELS_FILENAME)
# Remap nodeIDs
node_ids_list = list(pretraining_dataset.nodes())
# Relabel each node from 0 to N-1
node_remapping = {node_ids_list[i]: i for i in range(len(node_ids_list))}
pretraining_dataset = nx.relabel_nodes(pretraining_dataset, node_remapping)


/mnt/nas/minici/SocGFM/data/processed/cuba


In [5]:
from sknetwork.embedding import SVD
def get_svd_features(graph, saving_path, hidden_dim):
    if saving_path.exists():
        return np.load(saving_path)
    else:
        # Step 2. Compute Singular Value Decomposition of the adjacency matrix
        svd = SVD(hidden_dim)
        adj_matrix = nx.to_numpy_array(graph)
        node_features =  svd.fit_transform(adj_matrix)
        np.save(file=saving_path, arr=node_features)
        return node_features

pretraining_svd_embeddings = get_svd_features(pretraining_dataset, pretraining_data_dir / NODE_FEATURES, hidden_dim)


#### Extract SVD features from target country

In [6]:
target_svd_embeddings = get_svd_features(datasets['graph'], data_dir.parent.parent / NODE_FEATURES, hidden_dim)


#### train a random forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

SCALING = True
if SCALING:
    # from sklearn.preprocessing import RobustScaler
    # scaled_pretraining_svd_embeddings = RobustScaler().fit_transform(pretraining_svd_embeddings)
    from sklearn.preprocessing import QuantileTransformer
    scaled_pretraining_svd_embeddings = QuantileTransformer(n_quantiles=1000).fit_transform(pretraining_svd_embeddings)
else:
    scaled_pretraining_svd_embeddings = np.copy(pretraining_svd_embeddings)

rf_random = RandomForestClassifier()
rf_random.fit(scaled_pretraining_svd_embeddings, pretraining_user_labels)


#### test the classifier on the target dataset

In [30]:
SCALING = True
if SCALING:
    # from sklearn.preprocessing import RobustScaler
    # scaled_target_svd_embeddings = RobustScaler().fit_transform(target_svd_embeddings)
    from sklearn.preprocessing import QuantileTransformer
    scaled_target_svd_embeddings = QuantileTransformer(n_quantiles=1000).fit_transform(target_svd_embeddings)
else:
    scaled_target_svd_embeddings = np.copy(target_svd_embeddings)

from model_eval import eval_pred, TestLogMetrics

test_logger = TestLogMetrics(num_splits, ['accuracy', 'precision', 'f1_macro', 'f1_micro'])
for run_id in tqdm(range(num_splits), 'data split'):
    # Since this is an unsupervised baseline, we merge training and validation
    # unsupervised_mask = np.logical_or(datasets['splits'][run_id]['train'], datasets['splits'][run_id]['val'])
    # Select the best threshold according to the eval on train+val sets
    # best_val_threshold = get_best_threshold(datasets['labels'], predicted_labels_list, unsupervised_mask, metric_to_optimize)
    # val_metrics = eval_pred(datasets['labels'], predicted_labels_list[best_val_threshold], unsupervised_mask)
    # Compute test statistics
    test_metrics = eval_pred(datasets['labels'], rf_random.predict(scaled_target_svd_embeddings),
                             datasets['splits'][run_id]['test'])
    for metric_name in test_metrics:
        test_logger.update(metric_name, run_id, test_metrics[metric_name])

for metric_name in test_logger.test_metrics_dict:
    avg_val, std_val = test_logger.get_metric_stats(metric_name)
    print(f'Test {metric_name}: {avg_val}+-{std_val}')



data split:   0%|          | 0/20 [00:00<?, ?it/s]

Test accuracy: 0.4598+-0.0
Test precision: 0.0+-0.0
Test f1_macro: 0.315+-0.0
Test f1_micro: 0.4598+-0.0


## Train a GCN on the pretraining dataset

In [7]:
SCALING = True
if SCALING:
    # from sklearn.preprocessing import RobustScaler
    # scaled_pretraining_svd_embeddings = RobustScaler().fit_transform(pretraining_svd_embeddings)
    from sklearn.preprocessing import QuantileTransformer
    scaled_pretraining_svd_embeddings = QuantileTransformer(n_quantiles=1000).fit_transform(pretraining_svd_embeddings)
    scaled_target_svd_embeddings = QuantileTransformer(n_quantiles=1000).fit_transform(target_svd_embeddings)
else:
    scaled_pretraining_svd_embeddings = np.copy(pretraining_svd_embeddings)
    scaled_target_svd_embeddings = np.copy(target_svd_embeddings)


In [120]:
# Alternative feature extractor based on node centrality
pretraining_node_embeddings = np.load(pretraining_data_dir / 'centrality_node_features.npy', allow_pickle=True)
target_node_embeddings = np.load(data_dir.parent.parent / 'centrality_node_features.npy', allow_pickle=True)


In [128]:
import torch
from torch_geometric.data import Data

pretraining_edge_index = torch.tensor(np.array(pretraining_dataset.edges()), dtype=torch.long)
pretraining_x = torch.tensor(pretraining_node_embeddings.astype(float), dtype=torch.float)

print('building Data object')
dataset = Data(x=pretraining_x,
               edge_index=pretraining_edge_index.t().contiguous(),
               y=torch.tensor(pretraining_user_labels, dtype=torch.float))
print('Validate Data object')
dataset.validate(raise_on_error=True)


building Data object
Validate Data object


True

In [141]:
import torch
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, num_node_features, hidden_dim, layer_norm=True, dropout_perc=0.2):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, 1)
        self.inner_activation = torch.nn.ReLU()
        self.output_activation = torch.nn.Sigmoid()
        self.dropout = torch.nn.Dropout(dropout_perc)
        self.layer_norm = layer_norm
        if layer_norm:
            self.layern_norm1 = torch.nn.LayerNorm(hidden_dim, elementwise_affine = True)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = self.inner_activation(x)
        if self.layer_norm:
            x = self.layern_norm1(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return self.output_activation(x)


In [148]:
# Sample validation examples for model selection
val_perc = 0.2
train_idxs, val_idxs = train_test_split(range(len(pretraining_user_labels)), test_size=val_perc, random_state=seed, stratify=pretraining_user_labels)
val_mask = np.full(shape=pretraining_user_labels.shape, fill_value=False)
val_mask[val_idxs] = True


In [150]:
from sklearn.metrics import roc_curve

num_epochs = 1000
hidden_dim_gcn = 3
early_stopping_limit = 25
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(pretraining_x.shape[1], hidden_dim_gcn, layer_norm=False, dropout_perc=0.0).to(device)
dataset = dataset.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=5e-4)
loss = torch.nn.BCELoss()

model.train()
tr_loss_list = []
val_metric_list = []
best_val_metric = -np.inf
early_stopping_count = 0
for epoch in tqdm(range(num_epochs)):
    if early_stopping_count > early_stopping_limit:
        break
    model.train()
    optimizer.zero_grad()
    out = model(dataset)
    tr_loss = loss(out.flatten()[train_idxs], dataset.y[train_idxs])
    tr_loss_list.append(tr_loss.item())
    tr_loss.backward()
    optimizer.step()
    # Validation step
    model.eval()
    if True:
        with torch.no_grad():
            val_pred = model(dataset).detach().cpu().numpy()
            # pick best threshold
            fpr, tpr, thresholds = roc_curve(pretraining_user_labels[val_mask], val_pred[val_mask])
            best_threshold_idx = np.argmax(tpr-fpr)
            best_threshold = thresholds[best_threshold_idx]
            val_metrics = eval_pred(pretraining_user_labels, val_pred > best_threshold, val_mask)
            val_metric_list.append(val_metrics['f1_macro'])
            if val_metrics['f1_macro'] > best_val_metric:
                best_val_metric = val_metrics['f1_macro']
                early_stopping_count = 0
            else:
                early_stopping_count += 1
    else:
        with torch.no_grad():
            val_pred = model(dataset).detach().cpu().numpy()
            val_metrics = roc_auc_score(pretraining_user_labels[val_mask], val_pred[val_mask])
            val_metric_list.append(val_metrics)
            if val_metrics > best_val_metric:
                best_val_metric = val_metrics
                early_stopping_count = 0
            else:
                early_stopping_count += 1
    print(f'[Epoch: {epoch}] tr: {round(tr_loss_list[-1], 2)} val: {round(val_metric_list[-1], 2)}')


  0%|          | 0/1000 [00:00<?, ?it/s]

[Epoch: 0] tr: 0.52 val: 0.36
[Epoch: 1] tr: 0.51 val: 0.36
[Epoch: 2] tr: 0.5 val: 0.36
[Epoch: 3] tr: 0.49 val: 0.36
[Epoch: 4] tr: 0.47 val: 0.36
[Epoch: 5] tr: 0.46 val: 0.36
[Epoch: 6] tr: 0.45 val: 0.36
[Epoch: 7] tr: 0.44 val: 0.36
[Epoch: 8] tr: 0.42 val: 0.36
[Epoch: 9] tr: 0.41 val: 0.36
[Epoch: 10] tr: 0.4 val: 0.36
[Epoch: 11] tr: 0.39 val: 0.36
[Epoch: 12] tr: 0.38 val: 0.36
[Epoch: 13] tr: 0.37 val: 0.36
[Epoch: 14] tr: 0.36 val: 0.36
[Epoch: 15] tr: 0.35 val: 0.36
[Epoch: 16] tr: 0.34 val: 0.36
[Epoch: 17] tr: 0.33 val: 0.36
[Epoch: 18] tr: 0.33 val: 0.36
[Epoch: 19] tr: 0.32 val: 0.36
[Epoch: 20] tr: 0.31 val: 0.36
[Epoch: 21] tr: 0.3 val: 0.36
[Epoch: 22] tr: 0.3 val: 0.36
[Epoch: 23] tr: 0.29 val: 0.36
[Epoch: 24] tr: 0.29 val: 0.36
[Epoch: 25] tr: 0.28 val: 0.36
[Epoch: 26] tr: 0.28 val: 0.36


# Test the trained model on a different dataset

In [151]:
import torch
from torch_geometric.data import Data

target_edge_index = torch.tensor(np.array(datasets['graph'].edges()), dtype=torch.long)
target_x = torch.tensor(target_node_embeddings.astype(float), dtype=torch.float)

print('building target Data object')
target_dataset = Data(x=target_x,
               edge_index=target_edge_index.t().contiguous(),
               y=torch.tensor(datasets['labels'], dtype=torch.float))
print('Validate target Data object')
target_dataset.validate(raise_on_error=True)
target_dataset.to(device)


building target Data object
Validate target Data object


Data(x=[5711, 3], edge_index=[2, 509469], y=[5711])

In [152]:
from sklearn.metrics import roc_curve

test_rocauc_list = []
for run_id in tqdm(range(num_splits), 'data split'):
    # Since this is an unsupervised baseline, we merge training and validation
    unsupervised_mask = np.logical_or(datasets['splits'][run_id]['train'], datasets['splits'][run_id]['val'])
    # Select the best threshold according to the eval on train+val sets
    model.eval()
    with torch.no_grad():
        test_pred = model(target_dataset).detach().cpu().numpy()
        fpr, tpr, thresholds = roc_curve(datasets['labels'][unsupervised_mask], test_pred[unsupervised_mask])
        best_threshold_idx = np.argmax(tpr-fpr)
        best_threshold = thresholds[best_threshold_idx]
    test_metrics = eval_pred(datasets['labels'], test_pred > best_threshold, datasets['splits'][run_id]['test'])
    test_rocauc_list.append(test_metrics['f1_macro'])
    # Compute test statistics
    # test_metrics = eval_pred(datasets['labels'], predicted_labels_list[best_val_threshold],
    #                          datasets['splits'][run_id]['test'])



data split:   0%|          | 0/20 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [153]:
test_rocauc_list

[0.31492911668484186,
 0.3174375522887534,
 0.3174375522887534,
 0.3174375522887534,
 0.3301876091355616,
 0.3270739775249982,
 0.3174375522887534,
 0.3174375522887534,
 0.3174375522887534,
 0.3174375522887534,
 0.36314552279970386,
 0.33291922051697476,
 0.333181425881845,
 0.3356082953020542,
 0.35121118355020486,
 0.3174375522887534,
 0.3174375522887534,
 0.3174375522887534,
 0.3174375522887534,
 0.3174375522887534]

In [113]:
datasets['graph'].number_of_nodes()

5711

In [114]:
datasets['graph'].number_of_edges()

509469

In [115]:
datasets['labels'].sum()

3055.0

In [116]:
3055/5711

0.5349325862370863

In [112]:
3

<networkx.classes.graph.Graph at 0x7f8340536030>