# Import Packages

In [1]:
import os
import yaml

import numpy as np
import pandas as pd

import torch
import dgl
import dgl.nn as nn
import dgl.function as fn
import dgl.nn.functional as F

In [2]:
# Root directory under which each dataset lives in its own sub-directory.
data_root = '../datasets'

In [3]:
def save_config(data, config_path):
    """Serialize ``data`` as YAML and write it to ``config_path``.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        data: Object to serialize with ``yaml.dump``.
        config_path: Path of the YAML file to write.
    """
    with open(config_path, 'w') as f:
        # yaml.dump returns None when it is given a stream, so its result is
        # deliberately not bound to anything.
        yaml.dump(data, f)

def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return its content.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.
    """
    with open(config_path, 'r') as stream:
        return yaml.safe_load(stream)

# Functions for Graph Augmentation

* node degree
* numerical feature aggregation (mean, max, min over neighbours)
* categorical feature aggregation (category ratios over neighbours)
* binary feature aggregation (ratios over neighbours)

In [4]:
# graph.ndata['features'] = torch.tensor(num_features)
# graph.update_all(fn.copy_u('features', 'message'), fn.sum('message', 'features_sum'))

def get_structural_features(graph):
    """Return every node's in-degree as a one-column float32 feature matrix.

    Args:
        graph: Object exposing ``in_degrees()`` whose result supports
            ``.numpy()`` (a DGL graph in this notebook).

    Returns:
        A ``(features, names)`` pair: the in-degrees reshaped to a single
        column, and the matching name list ``['node_degree']``.
    """
    feature_names = ['node_degree']
    in_degrees = graph.in_degrees().numpy()
    degree_column = in_degrees.astype(np.float32).reshape(-1, 1)
    return degree_column, feature_names


def get_num_feature_aggregation(graph, features, feature_names):
    """Aggregate numerical features over incoming edges with mean, max and min.

    Args:
        graph: DGL graph whose edges define each node's neighbourhood.
        features: Per-node numerical features accepted by
            ``torch.FloatTensor``; presumably row ``i`` belongs to node ``i``
            (TODO confirm against the caller).
        feature_names: Names of the feature columns, in column order.

    Returns:
        A ``(features, names)`` pair. The float32 array holds the mean block,
        then the max block, then the min block, concatenated along axis 1;
        ``names`` follows the same order as ``"<feature>_<reducer>"``.
    """
    reducers = ['mean', 'max', 'min']

    # local_scope keeps the temporary node data from leaking into the graph.
    with graph.local_scope():
        graph.ndata['f'] = torch.FloatTensor(features)

        per_reducer = []
        for reducer in reducers:
            reduce_fn = getattr(fn, reducer)
            graph.update_all(fn.copy_u('f', 'm'), reduce_fn('m', reducer))
            per_reducer.append(graph.ndata[reducer])

        stacked = torch.cat(per_reducer, dim=1)
        aggregated_features = stacked.numpy().astype(np.float32)

    # Reducer-major order, matching the column order of the stacked blocks.
    aggregated_features_names = []
    for reducer in reducers:
        for feature_name in feature_names:
            aggregated_features_names.append(f"{feature_name}_{reducer}")

    return aggregated_features, aggregated_features_names


def get_cat_feature_aggregation(graph, features, feature_names):
    """Compute, per node, the share of neighbours falling in each category.

    The categorical columns are cast to int32, one-hot encoded with
    ``pd.get_dummies`` (no category dropped), and the indicator columns are
    averaged over incoming edges.

    Args:
        graph: DGL graph whose edges define each node's neighbourhood.
        features: Per-node categorical features; must be castable to int32.
        feature_names: Names of the categorical columns, in column order.

    Returns:
        A ``(features, names)`` pair: a float32 array with one column per
        one-hot indicator, and names of the form ``"<one-hot column>_mean"``.
    """
    categorical = pd.DataFrame(features, columns=feature_names).astype(np.int32)
    one_hot = pd.get_dummies(categorical, columns=feature_names, sparse=False, drop_first=False)
    aggregated_features_names = [f"{column}_mean" for column in one_hot.columns]

    # local_scope keeps the temporary node data from leaking into the graph.
    with graph.local_scope():
        graph.ndata['f'] = torch.FloatTensor(one_hot.values)
        graph.update_all(fn.copy_u('f', 'm'), fn.mean('m', 'mean'))

        neighbour_mean = graph.ndata['mean']
        aggregated_features = neighbour_mean.numpy().astype(np.float32)

    return aggregated_features, aggregated_features_names
    

def get_bin_feature_aggregation(graph, features, feature_names):
    """Average binary features over incoming edges.

    Args:
        graph: DGL graph whose edges define each node's neighbourhood.
        features: Per-node binary features accepted by ``torch.FloatTensor``.
        feature_names: Names of the binary columns, in column order.

    Returns:
        A ``(features, names)`` pair: a float32 array of per-node means and
        names of the form ``"<feature>_mean"``.
    """
    aggregated_features_names = [f"{name}_mean" for name in feature_names]

    # local_scope keeps the temporary node data from leaking into the graph.
    with graph.local_scope():
        graph.ndata['f'] = torch.FloatTensor(features)
        graph.update_all(fn.copy_u('f', 'm'), fn.mean('m', 'mean'))
        neighbour_mean = graph.ndata['mean'].numpy()

    aggregated_features = neighbour_mean.astype(np.float32)
    return aggregated_features, aggregated_features_names

# Prepare Graph Augmentation

In [5]:
# Names of the dataset sub-directories (relative to `data_root`) to process.
# Every entry is currently commented out, so the list is empty; uncomment the
# datasets that should be augmented.
dataset_names = [
    # 'tolokers-tab',
    # 'questions-tab',
    # 'city-reviews',
    # 'browser-games',
    # 'hm-categories',
    # 'web-fraud',
    # 'city-roads-M',
    # 'city-roads-L',
    # 'avazu-devices',
    # 'hm-prices',
    # 'web-traffic'
]

In [None]:
# Build and store neighbourhood-augmented features for every selected dataset.
# For each dataset this reads features.csv, info.yaml and edgelist.csv from
# `{data_root}/{dataset_name}` and writes augmented_features.csv and
# augmented_info.yaml into the same directory.
for dataset_name in dataset_names:
    print(dataset_name)
    dataset_path = f"{data_root}/{dataset_name}"

    df_features = pd.read_csv(f'{dataset_path}/features.csv', index_col=0)
    dataset_info = load_config(f'{dataset_path}/info.yaml')

    # Select the two endpoint columns *before* converting to NumPy, so that an
    # extra non-integer column (e.g. an edge weight) cannot upcast the node ids.
    edge_list = pd.read_csv(f"{dataset_path}/edgelist.csv").iloc[:, :2].values

    # Pin the node count to the feature table. Without it DGL infers the count
    # from the largest id in the edge list, and trailing nodes without edges
    # would make the per-node feature assignment fail on a shape mismatch.
    # The max() keeps the inferred count whenever it is larger, so inputs that
    # worked before still produce the same graph.
    num_nodes = len(df_features)
    if edge_list.size:
        num_nodes = max(num_nodes, int(edge_list.max()) + 1)

    graph = dgl.graph(
        tuple(torch.tensor(indices) for indices in edge_list.T),
        num_nodes=num_nodes,
    )
    graph = dgl.to_bidirected(graph)
    # Self-loops make every node part of its own neighbourhood, so it counts
    # towards `node_degree` and contributes to its own aggregates.
    graph = dgl.add_self_loop(graph)

    structural_features, structural_feature_names = get_structural_features(graph)
    augmented_features_container = [structural_features]
    augmented_feature_names_container = [structural_feature_names]

    # Each feature kind is optional: a missing key in info.yaml is treated the
    # same as an empty list and yields no aggregated columns.
    if dataset_info.get('num_feature_names'):
        num_feature_names = dataset_info['num_feature_names']
        num_features = df_features[num_feature_names].values
        aggregated_num_features, aggregated_num_features_names = get_num_feature_aggregation(
            graph, num_features, num_feature_names
        )

        augmented_features_container.append(aggregated_num_features)
        augmented_feature_names_container.append(aggregated_num_features_names)
    else:
        aggregated_num_features_names = []

    if dataset_info.get('cat_feature_names'):
        cat_feature_names = dataset_info['cat_feature_names']
        cat_features = df_features[cat_feature_names].values
        aggregated_cat_features, aggregated_cat_features_names = get_cat_feature_aggregation(
            graph, cat_features, cat_feature_names
        )

        augmented_features_container.append(aggregated_cat_features)
        augmented_feature_names_container.append(aggregated_cat_features_names)
    else:
        aggregated_cat_features_names = []

    if dataset_info.get('bin_feature_names'):
        bin_feature_names = dataset_info['bin_feature_names']
        bin_features = df_features[bin_feature_names].values
        aggregated_bin_features, aggregated_bin_features_names = get_bin_feature_aggregation(
            graph, bin_features, bin_feature_names
        )

        augmented_features_container.append(aggregated_bin_features)
        augmented_feature_names_container.append(aggregated_bin_features_names)
    else:
        aggregated_bin_features_names = []

    # Column names grouped by origin, stored alongside the feature table.
    augmented_info = {
        'structural_feature_names': structural_feature_names,
        'aggregated_num_feature_names': aggregated_num_features_names,
        'aggregated_cat_feature_names': aggregated_cat_features_names,
        'aggregated_bin_feature_names': aggregated_bin_features_names,
    }

    # Columns are ordered: structural, numerical, categorical, binary.
    augmented_features = np.concatenate(augmented_features_container, axis=1)
    augmented_feature_names = np.concatenate(augmented_feature_names_container, axis=0)

    # NOTE(review): rows are written with a fresh 0..n-1 index rather than
    # df_features.index -- confirm this is what downstream readers expect.
    augmented_features_path = f'{dataset_path}/augmented_features.csv'
    df_augmented_features = pd.DataFrame(augmented_features, columns=augmented_feature_names)
    df_augmented_features.to_csv(augmented_features_path)

    augmented_info_path = f'{dataset_path}/augmented_info.yaml'
    save_config(augmented_info, augmented_info_path)