In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from timeit import default_timer as timer
from functools import partial
from random import choices
import logging

In [3]:
import sdgym
from sdgym import load_dataset
from sdgym import benchmark
from sdgym import load_dataset

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

In [5]:
import pgmpy

from pgmpy.models import BayesianModel
from pgmpy.estimators import TreeSearch, HillClimbSearch, BicScore, ExhaustiveSearch, BayesianEstimator
from pgmpy.sampling import BayesianModelSampling

In [6]:
import xgboost as xgb
from xgboost import XGBClassifier

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.isotonic import IsotonicRegression

In [8]:
from scipy import interpolate

In [9]:
from synthsonic.models.kde_utils import kde_smooth_peaks_1dim, kde_smooth_peaks
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf

In [10]:
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
# Show INFO-level log records so that the progress messages emitted by sdgym
# and KDECopulaNNPdf appear in the cell outputs below.
logging.basicConfig(level=logging.INFO)

In [12]:
# Name of the dataset to benchmark on. Exactly one assignment should be active;
# the commented-out names are alternative variants.
# NOTE(review): the `intrusion_*` variants are presumably custom subsets of the
# intrusion dataset -- confirm they are available before enabling them.
#dataset_name = 'intrusion_numeric_u100'
#dataset_name = 'intrusion_categorical'
#dataset_name = 'intrusion_u100'
dataset_name = 'intrusion'

In [13]:
# Load the table together with its categorical and ordinal column specifications.
# In the cells shown here this copy is only inspected; sdgym.run() further down
# loads the dataset again by name (see its log output).
data, categorical_columns, ordinal_columns = load_dataset(dataset_name)

INFO:sdgym.data:Loading dataset intrusion (all variables, -1 distinct values threshold)
INFO:sdgym.data:Columns to zero-code: []


In [14]:
# Dimensions of the loaded table, as (rows, columns).
data.shape

(394021, 41)

In [15]:
#clf = lgb.LGBMClassifier() #**param)
#clffitkw = {'categorical_feature': categorical_columns+ordinal_columns}

# run sdgym

In [16]:
def KDECopulaNNPdf_Synthesizer(real_data, categorical_columns, ordinal_columns):
    """Fit a KDECopulaNNPdf model on ``real_data`` and draw a synthetic sample.

    The signature is the synthesizer-function interface this notebook hands to
    ``sdgym.run``.

    Parameters
    ----------
    real_data : array-like, 2-dimensional
        The real table. It is copied and cast to float64 before fitting.
    categorical_columns : sequence of int
        Indices of the categorical columns.
    ordinal_columns : sequence of int
        Indices of the ordinal columns. They are passed to the model together
        with the categorical ones.

    Returns
    -------
    numpy.ndarray
        The generated sample cast to float32; ``real_data.shape[0]`` rows are
        requested from the model.
    """
    # Both kinds of discrete column are passed to the model as categorical.
    # list() lets callers supply tuples or other sequences, not only lists.
    discrete_columns = list(categorical_columns) + list(ordinal_columns)

    # np.array always copies, so fitting can never modify the caller's table.
    data = np.array(real_data, dtype=np.float64)
    n_samples = data.shape[0]

    # NOTE: an XGBClassifier used to be instantiated here but was never handed
    # to the model (the `clf=clf` argument was disabled), so it has been
    # removed. No `clf` is passed; the model uses whatever it configures itself.
    kde = KDECopulaNNPdf(
        use_KDE=False,
        categorical_columns=discrete_columns,
        distinct_threshold=-1,
        n_bins=25,
    )
    kde = kde.fit(data)

    # Unweighted sampling; the weighted `kde.sample(n_samples)` variant is not used.
    X_gen = kde.sample_no_weights(n_samples, show_progress=True, mode='cheap')
    X_gen = np.array(X_gen, dtype=np.float32)

    # Preview of the first five columns of the generated sample.
    print(X_gen[:, :5])

    return X_gen

In [17]:
def KDECopulaNNPdf_SynthesizerInteger(real_data, categorical_columns, ordinal_columns):
    """Integer-valued variant of ``KDECopulaNNPdf_Synthesizer``.

    Delegates to ``KDECopulaNNPdf_Synthesizer`` with the same arguments and
    rounds every generated value to the nearest integer. The original note says
    it was written for data such as census, which holds integers only.

    Returns the rounded sample; its first five columns are printed as a preview.
    """
    synthetic = KDECopulaNNPdf_Synthesizer(real_data, categorical_columns, ordinal_columns)
    rounded = np.round(synthetic, decimals=0)

    preview = rounded[:, :5]
    print(preview)

    return rounded

In [18]:
from sdgym.synthesizers import (
    CLBNSynthesizer, CTGANSynthesizer, IdentitySynthesizer, IndependentSynthesizer,
    MedganSynthesizer, PrivBNSynthesizer, TableganSynthesizer, TVAESynthesizer,
    UniformSynthesizer, VEEGANSynthesizer, CopulaGAN)

# Synthesizers to benchmark in this run. Only the integer-rounding
# KDECopulaNNPdf variant is active; the commented-out entries are alternatives
# that can be re-enabled for comparison.
all_synthesizers = [
#    IdentitySynthesizer,
#     IndependentSynthesizer,
#     KDECopulaNNPdf_Synthesizer,
#    CopulaGAN
     KDECopulaNNPdf_SynthesizerInteger,
]

In [19]:
# Run the sdgym benchmark for the selected synthesizers on the chosen dataset,
# with a single iteration. The recorded run below took about eight minutes.
scores = sdgym.run(synthesizers=all_synthesizers, datasets=[dataset_name], iterations=1)

INFO:sdgym.benchmark:0%|          | 0/1 [00:00<?, ?it/s]
INFO:sdgym.benchmark:Evaluating KDECopulaNNPdf_SynthesizerInteger on dataset intrusion; iteration 0; 436.74 MB
INFO:sdgym.data:Loading dataset intrusion (all variables, -1 distinct values threshold)
INFO:sdgym.data:Columns to zero-code: []
INFO:sdgym.benchmark:Running KDECopulaNNPdf_SynthesizerInteger on dataset intrusion; iteration 0; 518.58 MB
INFO:KDECopulaNNPdf:Processing 26 numerical and 15 categorical columns
INFO:KDECopulaNNPdf:Transforming numerical variables.
INFO:KDECopulaNNPdf:Configuring Bayesian Network (cat+num).
INFO:KDECopulaNNPdf:Configuring classifier.
INFO:KDECopulaNNPdf:Generating 394021 data points.

  0%|          | 0/41 [00:00<?, ?it/s][A
Generating for node: 1:   0%|          | 0/41 [00:00<?, ?it/s][A
Generating for node: 0:   0%|          | 0/41 [00:00<?, ?it/s][A
Generating for node: 0:   5%|▍         | 2/41 [00:00<00:05,  7.72it/s][A
Generating for node: 7:   5%|▍         | 2/41 [00:00<00:05,  7.72i

Generating for node: 31:  17%|█▋        | 7/41 [00:01<00:10,  3.23it/s][A
Generating for node: 30:  17%|█▋        | 7/41 [00:01<00:10,  3.23it/s][A
Generating for node: 30:  20%|█▉        | 8/41 [00:02<00:10,  3.30it/s][A
Generating for node: 33:  20%|█▉        | 8/41 [00:02<00:10,  3.30it/s][A
Generating for node: 33:  22%|██▏       | 9/41 [00:02<00:09,  3.34it/s][A
Generating for node: 32:  22%|██▏       | 9/41 [00:02<00:09,  3.34it/s][A
Generating for node: 32:  24%|██▍       | 10/41 [00:02<00:09,  3.40it/s][A
Generating for node: 5:  24%|██▍       | 10/41 [00:02<00:09,  3.40it/s] [A
Generating for node: 5:  27%|██▋       | 11/41 [00:03<00:08,  3.42it/s][A
Generating for node: 25:  27%|██▋       | 11/41 [00:03<00:08,  3.42it/s][A
Generating for node: 25:  29%|██▉       | 12/41 [00:03<00:08,  3.27it/s][A
Generating for node: 28:  29%|██▉       | 12/41 [00:03<00:08,  3.27it/s][A
Generating for node: 28:  32%|███▏      | 13/41 [00:03<00:08,  3.15it/s][A
Generating for node

[[ 3.5614783e-01  1.0000000e+00  1.0000000e+00  1.0000000e+00
   4.0846428e+03]
 [ 6.2063038e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00
  -2.6159861e+03]
 [-3.7880585e-01  1.0000000e+00  1.0000000e+00  1.0000000e+00
  -2.6516339e+02]
 ...
 [-3.5882361e-02  1.0000000e+00  1.0000000e+00  1.0000000e+00
  -3.2009357e+02]
 [ 9.7072080e-02  0.0000000e+00  0.0000000e+00  0.0000000e+00
   3.7860693e+03]
 [-1.4612874e-01  1.0000000e+00  1.0000000e+00  1.0000000e+00
   7.0037012e+02]]
[[ 0.000e+00  1.000e+00  1.000e+00  1.000e+00  4.085e+03]
 [ 1.000e+00  0.000e+00  0.000e+00  0.000e+00 -2.616e+03]
 [-0.000e+00  1.000e+00  1.000e+00  1.000e+00 -2.650e+02]
 ...
 [-0.000e+00  1.000e+00  1.000e+00  1.000e+00 -3.200e+02]
 [ 0.000e+00  0.000e+00  0.000e+00  0.000e+00  3.786e+03]
 [-0.000e+00  1.000e+00  1.000e+00  1.000e+00  7.000e+02]]


INFO:sdgym.evaluate:Evaluating using multiclass classifier DecisionTreeClassifier
INFO:sdgym.evaluate:Evaluating using multiclass classifier MLPClassifier
INFO:sdgym.benchmark:Finished KDECopulaNNPdf_SynthesizerInteger on dataset intrusion; iteration 0; 1.23 GB
INFO:sdgym.benchmark:100%|##########| 1/1 [08:05<00:00, 485.68s/it]
INFO:sdgym.benchmark:100%|##########| 1/1 [08:05<00:00, 485.68s/it]


In [20]:
# Model settings used for the run above: n_bins=25, distinct_threshold=-1.
scores

Unnamed: 0,intrusion/accuracy,intrusion/macro_f1,intrusion/micro_f1,timestamp
VEEGANSynthesizer,0.506413,0.180797,0.506413,2020-04-12 09:41:35.096775
CLBNSynthesizer,0.94099,0.385243,0.94099,2020-10-17 09:46:54.494331
CTGAN,0.982863,0.540598,0.982863,2020-10-17 09:46:54.494331
CTGANSynthesizer,0.97664,0.510842,0.97664,2020-10-17 09:46:54.494331
CopulaGAN,0.978257,0.516588,0.978257,2020-10-17 09:46:54.494331
GaussianCopulaCategorical,0.792053,0.176792,0.792053,2020-10-17 09:46:54.494331
GaussianCopulaCategoricalFuzzy,0.843653,0.256447,0.843653,2020-10-17 09:46:54.494331
GaussianCopulaOneHot,0.904267,0.331106,0.904267,2020-10-17 09:46:54.494331
IdentitySynthesizer,0.99915,0.863171,0.99915,2020-10-17 09:46:54.494331
IndependentSynthesizer,0.722643,0.198038,0.722643,2020-10-17 09:46:54.494331


In [21]:
# Last rows of the score table; the synthesizer benchmarked in this notebook
# is the final row (see its more recent timestamp).
scores.tail(4)

Unnamed: 0,intrusion/accuracy,intrusion/macro_f1,intrusion/micro_f1,timestamp
TVAESynthesizer,0.97462,0.432752,0.97462,2020-10-17 09:46:54.494331
TableganSynthesizer,,,,2020-10-17 09:46:54.494331
UniformSynthesizer,0.121007,0.065334,0.121007,2020-10-17 09:46:54.494331
KDECopulaNNPdf_SynthesizerInteger,0.98869,0.590213,0.98869,2021-05-13 20:15:43.016700
