In [None]:
import os
import yaml
import json
import numpy as np
import pandas as pd

In [None]:
# Locations of the raw datasets and of the directories the converted copies
# are written to, relative to this notebook.
data_root = '../datasets'
tabular_root = '../source/tabular/data'
specialized_root = '../source/bgnn/datasets'

# Datasets to convert. Uncomment the entries you want; with every entry
# commented out the conversion loops in the cells below do nothing.
dataset_names = [
    # 'tolokers-tab',
    # 'questions-tab',
    # 'city-reviews',
    # 'browser-games',
    # 'hm-categories',
    # 'web-fraud',
    # 'city-roads-M',
    # 'city-roads-L',
    # 'avazu-devices',
    # 'hm-prices',
    # 'web-traffic',
]

In [None]:
def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return the loaded object.

    The file is opened in text mode for reading and parsed with
    ``yaml.safe_load``.
    """
    with open(config_path, 'r') as config_file:
        return yaml.safe_load(config_file)

# Prepare Datasets for Tabular Baselines (GBDT, MLP, TabR)

In [None]:
# Feature-augmentation switches for the tabular export:
#   USE_NFA -> append the precomputed columns of augmented_features.csv
#   USE_DWE -> append the node embeddings stored in node_embeddings.npz
# USE_DWE gates node_embeddings.npz in the specialized-models cell of this
# notebook as well, so both cells agree on what the flag means.
USE_DWE = False
USE_NFA = False

for dataset_name in dataset_names:
    print(dataset_name)

    # Output directory. The tags of the enabled switches are appended to the
    # dataset name verbatim (no separator), e.g. '<dataset_name>dwe'.
    # NOTE(review): kept as-is because downstream configs may rely on these
    # exact directory names -- confirm before adding a separator.
    tabular_dataset_path = f'{tabular_root}/{dataset_name}'
    if USE_DWE: tabular_dataset_path += 'dwe'
    if USE_NFA: tabular_dataset_path += 'nfa'

    dataset_path = f'{data_root}/{dataset_name}'

    ### convert data

    df_features = pd.read_csv(f'{dataset_path}/features.csv', index_col=0)
    df_targets = pd.read_csv(f'{dataset_path}/targets.csv', index_col=0)

    dataset_info = load_config(f'{dataset_path}/info.yaml')

    # Split masks are used for boolean row selection below. Cast explicitly:
    # if a mask file stores 0/1 instead of True/False, pandas parses integers
    # and indexing with them would silently pick rows 0 and 1 over and over.
    masks = {}
    for part_name in ['train', 'valid', 'test']:
        masks[part_name] = (
            pd.read_csv(f'{dataset_path}/{part_name}_mask.csv', index_col=0)
            .values.reshape(-1).astype(bool)
        )

    # Each feature group is optional: a missing/empty name list yields None
    # and the corresponding X_* files are simply not written.
    num_features = (
        df_features[dataset_info['num_feature_names']].values.astype(np.float32)
        if dataset_info['num_feature_names'] else None
    )

    # Categorical values are cast to int32 and then to strings.
    cat_features = (
        df_features[dataset_info['cat_feature_names']].values.astype(np.int32).astype(np.str_)
        if dataset_info['cat_feature_names'] else None
    )

    bin_features = (
        df_features[dataset_info['bin_feature_names']].values.astype(np.float32)
        if dataset_info['bin_feature_names'] else None
    )

    targets = df_targets[dataset_info['target_name']].values
    info = {
        'task_type': 'regression' if dataset_info['task'] == 'regression'
            else 'binclass' if dataset_info['task'] == 'binary_classification'
            else 'multiclass',
        'name': dataset_name,
    }

    ### append graph augmented features and node embeddings

    # Both kinds of extra columns are appended to the numerical block, in this
    # fixed order: augmented features first, node embeddings second.
    if USE_NFA:
        df_augmented_features = pd.read_csv(f'{dataset_path}/augmented_features.csv', index_col=0)
        extra_num_features = df_augmented_features.values.astype(np.float32)
        num_features = (
            np.concatenate([num_features, extra_num_features], axis=1)
            if num_features is not None else extra_num_features
        )

    if USE_DWE:
        node_embeds = np.load(f'{dataset_path}/node_embeddings.npz')['node_embeds'].astype(np.float32)
        num_features = (
            np.concatenate([num_features, node_embeds], axis=1)
            if num_features is not None else node_embeds
        )

    ### write data to files

    # Create the output directory only once every input has been loaded, so a
    # missing or broken source dataset does not leave an empty directory behind.
    os.makedirs(tabular_dataset_path, exist_ok=True)

    with open(f'{tabular_dataset_path}/info.json', 'w') as f:
        json.dump(info, f)

    # The source data calls the validation split 'valid'; the output files
    # use 'val'.
    for part_name, conventional_part_name in zip(
        ['train', 'valid', 'test'], ['train', 'val', 'test']
    ):
        mask = masks[part_name]
        if num_features is not None:
            np.save(f'{tabular_dataset_path}/X_num_{conventional_part_name}.npy', num_features[mask])

        if cat_features is not None:
            np.save(f'{tabular_dataset_path}/X_cat_{conventional_part_name}.npy', cat_features[mask])

        if bin_features is not None:
            np.save(f'{tabular_dataset_path}/X_bin_{conventional_part_name}.npy', bin_features[mask])

        # Classification labels are stored as int32, regression targets as float32.
        targets_masked = targets[mask]
        targets_casted = (
            targets_masked.astype(np.int32) if dataset_info['task'] != 'regression'
            else targets_masked.astype(np.float32)
        )

        np.save(f'{tabular_dataset_path}/Y_{conventional_part_name}.npy', targets_casted)

    # Touch an empty READY file as the last step, after all arrays are written.
    open(f'{tabular_dataset_path}/READY', 'a').close()

# Prepare Datasets for Specialized Models (BGNN and EBBS)

In [None]:
# When enabled, the node embeddings from node_embeddings.npz are appended to
# the feature table as extra columns.
USE_DWE = False

for dataset_name in dataset_names:
    print(dataset_name)

    specialized_dataset_path = f'{specialized_root}/{dataset_name}'
    dataset_path = f'{data_root}/{dataset_name}'

    ### convert data

    df_features = pd.read_csv(f'{dataset_path}/features.csv', index_col=0)
    # Targets are stored in targets.csv (the tabular export cell reads them
    # from there too), so take them from that file rather than features.csv.
    df_targets = pd.read_csv(f'{dataset_path}/targets.csv', index_col=0)
    dataset_info = load_config(f'{dataset_path}/info.yaml')
    edgelist = pd.read_csv(f'{dataset_path}/edgelist.csv')

    # Masks are converted to positional indices of the selected rows.
    split_indices = {}
    for part_name in ['train', 'valid', 'test']:
        mask = pd.read_csv(f'{dataset_path}/{part_name}_mask.csv', index_col=0).values.reshape(-1)
        split_indices[part_name] = np.where(mask)[0]

    # A feature group may be absent (None) or empty in info.yaml; treat both
    # as "no columns" instead of failing on list concatenation.
    num_feature_names = dataset_info['num_feature_names'] or []
    cat_feature_names = dataset_info['cat_feature_names'] or []
    bin_feature_names = dataset_info['bin_feature_names'] or []

    targets = df_targets[dataset_info['target_name']]
    features = df_features[num_feature_names + cat_feature_names + bin_feature_names]

    if USE_DWE:
        node_embeds = np.load(f'{dataset_path}/node_embeddings.npz')['node_embeds'].astype(np.float32)
        # Build the embedding frame on the same index as `features`:
        # pd.concat(axis=1) aligns rows by index label, so a default RangeIndex
        # would misalign (or NaN-pad) rows whenever the feature index is not
        # exactly 0..n-1. A row-count mismatch now raises instead.
        df_node_embeds = pd.DataFrame(
            node_embeds,
            index=features.index,
            columns=[
                f'embed_coordinate_{idx}'
                for idx in range(node_embeds.shape[1])
            ],
        )
        features = pd.concat([features, df_node_embeds], axis=1)

    ### write data to files

    # Create the output directory only after all inputs loaded successfully.
    os.makedirs(specialized_dataset_path, exist_ok=True)

    # One categorical feature name per line, followed by a trailing newline.
    with open(f'{specialized_dataset_path}/cat_features.txt', 'w') as f:
        f.write('\n'.join(cat_feature_names) + '\n')

    # The source data calls the validation split 'valid'; masks.json uses 'val'.
    with open(f'{specialized_dataset_path}/masks.json', 'w') as f:
        json.dump({
            conventional_part_name: split_indices[part_name].tolist()
            for part_name, conventional_part_name in zip(
                ['train', 'valid', 'test'], ['train', 'val', 'test']
            )
        }, f)

    # All tables are written without their index column.
    features.to_csv(f'{specialized_dataset_path}/X.csv', index=None)
    targets.to_csv(f'{specialized_dataset_path}/y.csv', index=None)
    edgelist.to_csv(f'{specialized_dataset_path}/edgelist.csv', index=None)