In [None]:
import xgboost as xgb

In [None]:
from sdgym import load_dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
from synthsonic.models.kde_utils import kde_smooth_peaks_1dim, kde_smooth_peaks
from sklearn.model_selection import train_test_split

In [None]:
%matplotlib inline

In [None]:
# Load the raw training table; the path is relative to the notebook's working directory.
data = pd.read_csv('train.csv')

In [None]:
# Columns kept for modelling. The frame restricted to these columns (`df`)
# is what the structure-learning and Bayesian-network cells below consume.
categorical_columns = ['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin']

In [None]:
# Restrict the raw table to the selected columns.
# NOTE: `data` is rebound to a NumPy array in a later cell; `df` keeps the DataFrame view.
df = data[categorical_columns]

In [None]:
import pgmpy

In [None]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import TreeSearch

In [None]:
from pgmpy.estimators import HillClimbSearch, BicScore, ExhaustiveSearch, PC

In [None]:
# Structure learning, option 1 (enabled): tree search rooted at the first column,
# using the "tan" estimator with 'ps_ind_02_cat' as class node.
if True:
    # learn graph structure
    est = TreeSearch(df, root_node=df.columns[0])
    dag = est.estimate(class_node='ps_ind_02_cat', estimator_type="tan") 

In [None]:
# Option 2 (disabled): tree search with the "chow-liu" estimator.
if False:
    # alternative graph structure
    est2 = TreeSearch(df, root_node=df.columns[0])
    dag = est2.estimate(estimator_type="chow-liu")

In [None]:
# Option 3 (disabled): hill-climb search started from the current `dag`.
if False:
    est = HillClimbSearch(df, use_cache=True)
    dag = est.estimate(start_dag=dag)

In [None]:
# Option 4 (enabled): PC estimator. Because this cell runs after option 1,
# `dag` from the tree search is replaced by the PC result.
# NOTE(review): `start_dag` is passed as in the HillClimbSearch cell -- confirm
# that PC.estimate actually uses this argument rather than ignoring it.
if True:
    est = PC(df)
    dag = est.estimate(start_dag=dag)

In [None]:
# Visual check of the learned graph.
nx.draw(dag, with_labels=True, arrowsize=30, node_size=800, alpha=0.3, font_weight='bold')
plt.show()

In [None]:
# Edge list of the learned graph (displayed in the next cell).
edges = dag.edges()

In [None]:
edges

In [None]:
from pgmpy.estimators import BayesianEstimator

# there are many choices of parametrization, here is one example:
# a Bayesian estimator with a Dirichlet prior and pseudo-counts of 0.1.
# The model is built from the edge list only.
# NOTE(review): presumably a node of `dag` without any edge is not part of `model`;
# confirm every modelled column appears in the frames sampled from it later.
model = BayesianModel(dag.edges())
model.fit(df, estimator=BayesianEstimator, prior_type='dirichlet', pseudo_counts=0.1)


In [None]:
# Number of rows in the modelling frame.
len(df)

In [None]:
# Fixed (alphabetical) column order, reused whenever a frame is turned into an array
# so that real and sampled arrays line up column by column.
columns = sorted(df.columns)

In [None]:
# NOTE: rebinds `data` from the raw DataFrame loaded earlier to a plain array of the selected columns.
data = df[columns].values

In [None]:
# set up train-test sample.
# the test sample is used to calibrate the output of the classifier

In [None]:
# All real rows get label 1; 35% are held out as the test (calibration) part.
X1_train, X1_test, y1_train, y1_test = train_test_split(data, np.ones(data.shape[0]), test_size=0.35,
                                                        random_state=0)



In [None]:
X1_train.shape

In [None]:
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate classifiers. Every cell below rebinds `clf`, so when the notebook is
# run top to bottom only the last binding (LogisticRegression) is the one fitted later.
clf=MLPClassifier(random_state=0, max_iter=1000, early_stopping=True)

In [None]:
# Shallow XGBoost baseline, immediately replaced by the larger configuration below.
clf = xgb.XGBClassifier(max_depth=2)

if True:
    clf = xgb.XGBClassifier(
        n_estimators=250,
        reg_lambda=1,
        gamma=0,
        max_depth=9
    )

In [None]:
# probability=True is needed because predict_proba is called on `clf` later.
clf = SVC(probability=True)

In [None]:
clf = LogisticRegression()

In [None]:
import inspect

In [None]:
# Inspect the signature of the chosen classifier's fit method.
argspecs = inspect.getfullargspec(clf.fit)

In [None]:
# True when `fit` lists `sample_weight` among its named positional arguments.
# NOTE: this flag is not read by any cell visible in this notebook.
support_weight = 'sample_weight' in argspecs.args

In [None]:
# Class sizes for the classifier training set: as many sampled (label 0) rows
# as real (label 1) training rows.
n_one = len(X1_train)
n_zero = n_one

In [None]:
from pgmpy.sampling import BayesianModelSampling

# Draw the "null" (label 0) training sample from the fitted Bayesian network:
# one synthetic row per real training row (n_zero == n_one).
inference = BayesianModelSampling(model)
# A fixed seed makes this draw reproducible, like the other forward_sample
# calls in this notebook, which all pass an explicit seed.
df_data = inference.forward_sample(size=n_zero, return_type='dataframe', seed=0)

#df_data.columns = [int(c) for c in df_data.columns]

In [None]:
# Sampled rows as an array, using the same column order as the real data array.
X0_train = df_data[columns].values

In [None]:
X0_train

In [None]:
X1_train

In [None]:
# Labels: 0 for rows sampled from the Bayesian network, 1 for real rows.
zeros = np.zeros(n_zero)
ones = np.ones(n_one)

# Stack sampled rows first, real rows second; `yy` follows the same order.
yy = np.concatenate([zeros, ones], axis = 0)
XX = np.concatenate([X0_train, X1_train], axis = 0)

In [None]:
# Train the classifier to separate sampled rows from real rows.
clf = clf.fit(XX, yy)

In [None]:
# calibrate the probabilities, using the test sample and a new null sample

In [None]:
# Fresh sample from the Bayesian network (fixed seed), used only for calibration.
df_data = inference.forward_sample(size=500000, return_type='dataframe', seed=10)

# Disabled branch: would cast column labels to int and build `X` in sorted column order.
if False:
    df_data.columns = [int(c) for c in df_data.columns]
    X = df_data[sorted(df_data.columns)].values
X0_test = df_data[columns].values

In [None]:
# Classifier score (probability of label 1) for the sampled rows (p0)
# and for the held-out real rows (p2).
p0 = clf.predict_proba(X0_test)[:, 1]
p2 = clf.predict_proba(X1_test)[:, 1]

In [None]:
len(p0), len(p2)

In [None]:
# Overlay the normalised score distributions of the sampled rows (p0) and the
# held-out real rows (p2) on a log scale.
nbins = 200
plt.figure(figsize=(12,7))
hist_style = dict(bins=nbins, range=(0,1), alpha=0.5, log=True, density=True)
for scores in (p0, p2):
    plt.hist(scores, **hist_style)


In [None]:
# nbins equal-width bins on [0, 1], i.e. nbins + 1 edges.
nbins = 200
binning = np.linspace(0, 1, nbins+1)

# Raw (un-normalised) score counts per bin for sampled rows (p0) and real rows (p2).
# Both calls use the same `binning`, so `bin_edges` is simply assigned twice.
hist_p0, bin_edges = np.histogram(p0, binning)
hist_p1, bin_edges = np.histogram(p2, binning)

def poisson_uncertainty(n):
    """Return the Poisson uncertainty sqrt(n) of the counts ``n``.

    Counts of zero get an uncertainty of 1 instead of 0, so that quantities
    derived from it (such as 1 / sigma**2 weights) stay finite.

    Parameters
    ----------
    n : scalar or array-like
        Observed counts.

    Returns
    -------
    numpy.ndarray
        Uncertainties with the shape of ``n`` (0-d for a scalar input).
        The input itself is never modified.
    """
    sigman = np.sqrt(np.asarray(n))
    # correct poisson counts of zero.
    # np.where also handles scalar input, where boolean-mask assignment on a
    # NumPy scalar would raise a TypeError.
    return np.where(sigman == 0, 1., sigman)

def fraction_and_uncertainty(a, b, sigma_a, sigma_b):
    """Return the fraction a / (a + b) and its propagated uncertainty.

    Parameters
    ----------
    a, b : scalar or array
        The two quantities forming the fraction.
    sigma_a, sigma_b : scalar or array
        Uncertainties on ``a`` and ``b``.

    Returns
    -------
    tuple
        ``(frac_a, sigma_frac_a)`` where the variance is
        ``(frac_b * sigma_a)**2 / (a + b)**2 + (frac_a * sigma_b)**2 / (a + b)**2``.
    """
    total = a + b
    frac_a = a / total
    frac_b = b / total
    total_sq = np.power(total, 2)
    # Contribution of each input's uncertainty to the variance of frac_a.
    var_from_a = np.power(frac_b * sigma_a, 2) / total_sq
    var_from_b = np.power(frac_a * sigma_b, 2) / total_sq
    return frac_a, np.sqrt(var_from_a + var_from_b)

# For every bin: the number of entries that fall outside that bin.
rest_p0 = np.sum(hist_p0) - hist_p0
rest_p1 = np.sum(hist_p1) - hist_p1

# Poisson uncertainties on the in-bin and out-of-bin counts.
sigma_bin0 = poisson_uncertainty(hist_p0)
sigma_rest0 = poisson_uncertainty(rest_p0)

sigma_bin1 = poisson_uncertainty(hist_p1)
sigma_rest1 = poisson_uncertainty(rest_p1)

# Per-bin fraction of each sample, bin / (bin + rest), with propagated uncertainty.
frac0, sigma_frac0 = fraction_and_uncertainty(hist_p0, rest_p0, sigma_bin0, sigma_rest0)
frac1, sigma_frac1 = fraction_and_uncertainty(hist_p1, rest_p1, sigma_bin1, sigma_rest1)

# Calibrated probability per bin: frac1 / (frac1 + frac0), with its uncertainty.
# NOTE(review): a bin that is empty in both histograms gives 0 / 0 here -- confirm
# this cannot happen for the chosen binning, or handle it before the fit.
p1calib, sigma_p1calib = fraction_and_uncertainty(frac1, frac0, sigma_frac1, sigma_frac0)

# Inverse-variance weight per bin ...
sample_weight = 1 / (sigma_p1calib * sigma_p1calib)

# ... rescaled so that the smallest weight equals 1.
sample_weight /= min(sample_weight)

sample_weight

In [None]:
# Inspect the per-bin fractions of the sampled (frac0) and real (frac1) score histograms.
frac0

In [None]:
frac1

In [None]:
# Scratch arithmetic.
# NOTE(review): the origin of these two constants is not shown in the notebook.
8.40000e-05 * 6207

In [None]:
# Relative Poisson uncertainty sqrt(n) / n of the last bin of the sampled-score histogram.
np.sqrt(hist_p0[-1]) / hist_p0[-1]

In [None]:
# `bin_centers` is only assigned in a later cell, so define it here as well
# (same formula: left bin edge plus half a bin width). Otherwise this cell
# raises a NameError when the notebook is run top to bottom on a fresh kernel.
bin_centers = bin_edges[:-1] + 0.5/nbins

# Calibrated probability per score bin.
plt.figure(figsize=(12,7))
plt.plot(bin_centers, p1calib)

In [None]:
from sklearn.isotonic import IsotonicRegression
from scipy import interpolate

In [None]:
# we recalibrate per probability bin. NO interpolation (not valid in highest bin)

#hist_p0, bin_edges = np.histogram(p0, bins=nbins, range=(0, 1))
#hist_p1, bin_edges = np.histogram(p2, bins=nbins, range=(0, 1)) #### !!!! p2
# Bin centres: left edge plus half a bin width.
bin_centers = bin_edges[:-1] + 0.5/nbins

# Simple per-bin calibration from the normalised histograms: h1 / (h0 + h1).
# Bins where both histograms are empty keep the initial value 0.
# `p1cb` is only plotted later; the isotonic fit below uses `p1calib` instead.
hnorm_p0 = hist_p0 / sum(hist_p0)
hnorm_p1 = hist_p1 / sum(hist_p1)
hnorm_sum = hnorm_p0 + hnorm_p1
p1cb = np.divide(hnorm_p1, hnorm_sum, out=np.zeros_like(hnorm_p1), where=hnorm_sum != 0)
# self.p1cb = p1cb, bin_centers

# use isotonic regression to smooth out potential fluctuations in the p1 values
# isotonic regression assumes that p1 can only be a rising function.
# I'm assuming that if a classifier predicts a higher probability, the calibrated probability
# will also be higher. This may not always be right, but I think generally it is a safe one.
# The fit is weighted with `sample_weight` (inverse variance of p1calib) from the cell above.
iso_reg = IsotonicRegression(y_min=0, y_max=1).fit(bin_centers, p1calib, sample_weight)
p1pred = iso_reg.predict(bin_centers)

# Lookup from classifier score to calibrated probability, anchored at the left
# bin edges with kind='previous' (no interpolation between bins); scores outside
# the anchored range are extrapolated rather than raising.
p1f_ = interpolate.interp1d(bin_edges[:-1], p1pred, kind='previous', bounds_error=False, fill_value="extrapolate")

# Re-evaluate the lookup at the bin centres.
p1pred = p1f_(bin_centers)

In [None]:
# Manual override of the calibrated probability in the highest score bin.
# NOTE(review): the origin of this hard-coded value is not shown in the notebook;
# it will be stale whenever the data, the classifier or the binning changes.
p1pred[-1] = 0.9998389328412737

# Rebuild the lookup so that it includes the overridden last bin.
p1f_ = interpolate.interp1d(bin_edges[:-1], p1pred, kind='previous', bounds_error=False, fill_value="extrapolate")

p1pred = p1f_(bin_centers)

In [None]:
# Compare the raw per-bin calibration (p1cb), the isotonic result (p1pred)
# and the diagonal (uncalibrated score).
plt.figure(figsize=(12,7))
plt.plot(bin_centers, p1cb)
plt.plot(bin_centers, p1pred)
plt.plot(bin_centers, bin_centers)
#plt.plot(bin_centers, p1lin)

In [None]:
# Fine grid over the top of the score range.
x = np.linspace(0.9,1,500)

In [None]:
pp = p1f_(x)

In [None]:
# Calibration lookup zoomed in on scores in [0.9, 1].
plt.figure(figsize=(12,7))
#plt.plot(bin_centers, p1cb)
plt.plot(x, pp)

In [None]:
# Calibrated probability at a score of 0.999.
maxp1 = p1f_(0.999)
maxp1

In [None]:
# Calibrated probability at a score of 0.991.
maxp2 = p1f_(0.991)
maxp2

In [None]:
# Weight p / (1 - p) that corresponds to maxp1; used later to select rows whose weight equals it.
max_weight = maxp1 / (1. - maxp1)
max_weight

In [None]:
# Same p / (1 - p) weight, evaluated at a score of 0.991 (the value held in maxp2).
sw = p1f_(0.991) / (1. - p1f_(0.991))
sw

In [None]:
# validation - part 1: check if reweighting works okay

In [None]:
from pgmpy.sampling import BayesianModelSampling

# sample data from BN
inference = BayesianModelSampling(model)

# New sample (fixed seed) used to validate the reweighting.
df_data = inference.forward_sample(size=250000, return_type='dataframe', seed=1)

#df_data.columns = [int(c) for c in df_data.columns]

X = df_data[columns].values

In [None]:
# Classifier score -> calibrated probability p -> weight p / (1 - p).
# Rows with a zero denominator (p == 1) keep the initial value 1 from `out`.
p0 = clf.predict_proba(X)[:, 1]
nominator = p1f_(p0)
denominator = 1 - nominator
weight = np.divide(nominator, denominator, out=np.ones_like(nominator), where=denominator != 0)

In [None]:
# Compare the number of sampled rows with the sum of their weights.
len(X), sum(weight)

In [None]:
# `keep`: rows whose weight is exactly `max_weight` (computed earlier from the
# score 0.999); `same`: all other rows.
# NOTE(review): this is an exact floating-point comparison -- confirm it selects the intended rows.
keep = weight == max_weight
same = weight != max_weight

In [None]:
np.sum(weight[same]), np.sum(weight[keep]), 

In [None]:
# Scale factor for the `keep` rows such that all weights together sum to the
# number of sampled rows. Using len(X) instead of a hard-coded 250000 keeps this
# correct if the sample size used to build X is changed.
ratio = (len(X) - np.sum(weight[same])) / np.sum(weight[keep])

In [None]:
# Rescale the weights of the `keep` rows in place.
# NOTE: not idempotent -- re-running this cell multiplies those weights by `ratio` again.
weight[keep] = weight[keep] * ratio

In [None]:
# Distribution of the weights below 20 (log scale).
plt.hist(weight[weight < 20], bins=100, log=True);

In [None]:
# NOTE: rebinds `max_weight` (previously derived from the score 0.999) to the largest weight after rescaling.
max_weight = max(weight)

In [None]:
max_weight

In [None]:
# Convert the weight w = p / (1 - p) back to a probability: p = w / (1 + w).
max_weight / (1 + max_weight)

In [None]:
sum(weight)

In [None]:
from random import choices

In [None]:
#data, sample_weights = self._sample_no_transform(n_samples, random_state)
# Weighted bootstrap: draw len(X) row indices with replacement, with probability
# proportional to each row's weight. A seeded NumPy generator is used so that
# the resample (and everything derived from it) is reproducible between runs.
rng = np.random.default_rng(0)
pop = np.arange(X.shape[0])
probs = weight/np.sum(weight)
sample = rng.choice(pop, size=X.shape[0], replace=True, p=probs)
Xtrans = X[sample]


In [None]:
# Scores of the resampled (reweighted) rows and of the held-out real rows.
# NOTE: rebinds `p0` and `p2` from the earlier calibration cells.
p0 = clf.predict_proba(Xtrans)[:, 1]
p2 = clf.predict_proba(X1_test)[:, 1]


In [None]:
# Score distributions after reweighting, 200 bins.
plt.figure(figsize=(12,7))
plt.hist(p0, bins=200, range=(0,1), alpha=0.5, density=True, log=True) #, weights=weight)#, log=True)
plt.hist(p2, bins=200, range=(0,1), alpha=0.5, density=True, log=True);


In [None]:
# Same comparison with 100 bins.
plt.figure(figsize=(12,7))
plt.hist(p0, bins=100, range=(0,1), alpha=0.5, density=True, log=True) #, weights=weight)#, log=True)
plt.hist(p2, bins=100, range=(0,1), alpha=0.5, density=True, log=True);


In [None]:
# validation - part 2: plot distributions

In [None]:
# Distribution of column `i` for the sampled rows `X` versus the held-out real rows.
# NOTE: `X` is the unweighted sample from the Bayesian network, not the resampled `Xtrans`.
# NOTE(review): range=(0,1) only histograms values inside [0, 1] -- confirm
# that this is intended for the selected column.
i = 1
plt.figure(figsize=(12,7))
plt.hist(X[:, i], bins=100, range=(0,1), alpha=0.5, density=True)#, log=True)
plt.hist(X1_test[:, i], bins=100, range=(0,1), alpha=0.5, density=True)


In [None]:
# validation part 3: check number of duplicates

In [None]:
# Another sample from the Bayesian network (fixed seed) for the duplicate check.
# NOTE: despite the name `X10k`, 500000 rows are drawn.
df_data = inference.forward_sample(size=500000, return_type='dataframe', seed=2)
#df_data.columns = [int(c) for c in df_data.columns]
X10k = df_data[columns].values

In [None]:
# Same score -> calibrated probability -> p / (1 - p) weight computation as in the
# reweighting check; rows with a zero denominator keep the initial value 1.
p0 = clf.predict_proba(X10k)[:, 1]
nominator = p1f_(p0)
denominator = 1 - nominator
weight = np.divide(nominator, denominator, out=np.ones_like(nominator), where=denominator != 0)

In [None]:
sum(weight)

In [None]:
# Weighted bootstrap of the new sample: draw len(X10k) row indices with
# replacement, with probability proportional to each row's weight. A seeded NumPy
# generator keeps the resample, and hence the duplicate counts below, reproducible.
rng = np.random.default_rng(0)
pop = np.arange(X10k.shape[0])
probs = weight/np.sum(weight)
sample = rng.choice(pop, size=X10k.shape[0], replace=True, p=probs)
Xtrans = X10k[sample]


In [None]:
#u, c = np.unique(Xtrans, axis=0, return_counts=True)
# Unique rows of the resampled data and how often each one occurs.
u, c = np.unique(Xtrans, axis=0, return_counts=True)

In [None]:
# Occurrence counts of the resampled rows, largest first.
counts = np.sort(c)[::-1] 

In [None]:
counts

In [None]:
# Same duplicate count for the real data array; rebinds `u` and `c`.
u, c = np.unique(data, axis=0, return_counts=True)

In [None]:
# Occurrence counts of the real rows, largest first.
c2 = np.sort(c)[::-1] 

In [None]:
c2

In [None]:
# Occurrence counts of the 40 most frequent rows: real data (c2) versus resampled data (counts).
plt.figure(figsize=(12,7))
plt.bar(list(range(40)), c2[:40], alpha=0.5)
plt.bar(list(range(40)), counts[:40], alpha=0.5)


In [None]:
# NOTE: this cell repeats the previous plot unchanged.
plt.figure(figsize=(12,7))
plt.bar(list(range(40)), c2[:40], alpha=0.5)
plt.bar(list(range(40)), counts[:40], alpha=0.5)


In [None]:
import numpy as np
import pandas as pd
from sdgym import benchmark
from sdgym import load_dataset
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf
from synthsonic.models.categorical_utils import categorical_round, vec_translate, categorical_frequency_mapping, \
            categorical_frequency_inverse_mapping, encode_one_hot, decode_one_hot
from timeit import default_timer as timer
import xgboost as xgb
from sklearn.decomposition import PCA
%matplotlib inline

from functools import partial

In [None]:

# Persist the resampled rows to 'test.csv' (no index column); that file is read
# back by the synthesizer function defined below.
# NOTE: rebinds `df`, which earlier held the selected training columns.
df = pd.DataFrame(Xtrans)
df.to_csv('test.csv', index=False)


In [None]:
def KDECopulaNNPdf_RoundCategorical(real_data, categorical_columns, ordinal_columns, times=None):
    """Return the first 25000 rows stored in 'test.csv' as a NumPy array.

    None of the arguments are read: ``real_data``, ``categorical_columns``,
    ``ordinal_columns`` and ``times`` are accepted only so that the function
    can be called with this signature.
    """
    max_rows = 25000
    return pd.read_csv('test.csv').values[:max_rows]

In [None]:
# Timing list for the benchmark runs.
# NOTE: no cell visible in this notebook ever appends to it.
alarm_times = []
# partial() without bound arguments; __name__ is copied from the wrapped function
# because partial objects do not have one of their own.
# NOTE(review): presumably the benchmark uses __name__ to label the synthesizer -- confirm.
alarm_thing = partial(KDECopulaNNPdf_RoundCategorical)
alarm_thing.__name__ = KDECopulaNNPdf_RoundCategorical.__name__


In [None]:
# Run the benchmark for the 'alarm' dataset with the wrapper above as the only synthesizer.
alarm_scores = benchmark(synthesizers=[alarm_thing], datasets=['alarm'])

In [None]:
alarm_scores

In [None]:
if True:
    # Re-run the benchmark and attach the mean execution time to the last row.
    alarm_scores = benchmark(synthesizers=[alarm_thing], datasets=['alarm'])
    alarm_scores = alarm_scores.drop(columns=['timestamp'])
    # np.mean of an empty list yields NaN plus a RuntimeWarning; report 'N/A'
    # when no timings were recorded.
    mean_time = round(np.mean(alarm_times), 2) if len(alarm_times) > 0 else 'N/A'
    # Size the filler from the score table instead of assuming exactly 10 rows.
    exec_time = ['N/A'] * (len(alarm_scores) - 1) + [mean_time]
    alarm_scores['alarm/exec_time(s)'] = exec_time