MNIST classification (drawn from sklearn example)
=====================================================
MWEM is not particularly well suited to image data (which has a very large number of features, each with a relatively large range of values), but it can still capture some important information about the underlying distributions if tuned correctly.

We use a feature included with MWEM that lets us give a single column a custom bin count while capping the bin count of every other column at a small value. In this case, we specify that the numerical column (784) has 10 possible values. We do this with the dict {'784': 10}.

Here we borrow from a scikit-learn example, and insert MWEM synthetic data into their training example/visualization, to understand the tradeoffs.

https://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_logistic_regression_mnist.html#sphx-glr-download-auto-examples-linear-model-plot-sparse-logistic-regression-mnist-py


In [1]:
import warnings
warnings.filterwarnings('ignore')
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from snsynth.gsd import GSDSynthesizer

# pip install scikit-image
# from skimage import data, color
# from skimage.transform import rescale

# Author: Arthur Mensch <arthur.mensch@m4x.org>
# License: BSD 3 clause

# Start the wall-clock timer; the plotting cells print the elapsed time
# relative to `t0`.
t0 = time.time()

# Turn down for faster convergence
train_samples = 5000

# Load data from https://www.openml.org/d/554
# as_frame=True is requested explicitly because the next cell calls
# `data.target.to_numpy()`, which requires pandas objects rather than the
# plain ndarrays returned when `as_frame` is False.
data = fetch_openml('mnist_784', version=1, return_X_y=False, as_frame=True)

In [2]:
# Build one 2-D array: the pixel features followed by a single trailing column
# holding the digit labels cast to int.
data_np = np.concatenate(
    [data.data, data.target.to_numpy().astype(int).reshape(-1, 1)],
    axis=1,
)


In [3]:
# Column names: one 'p_<i>_<j>' per cell of the 28x28 grid (i varies slowest),
# followed by the 'Label' column that was appended to `data_np`.
columns = [f'p_{i}_{j}' for i in range(28) for j in range(28)] + ['Label']
data_df = pd.DataFrame(data_np, columns=columns)

# One 3-way marginal (pixel, neighbouring pixel, 'Label') per adjacent pair.
# For every pixel the neighbour at (i, j + 1) is listed first, then the one at
# (i + 1, j); neighbours that would fall outside the grid are skipped.
marginals = [
    (f'p_{i}_{j}', f'p_{ni}_{nj}', 'Label')
    for i in range(28)
    for j in range(28)
    for ni, nj in ((i, j + 1), (i + 1, j))
    if ni < 28 and nj < 28
]


In [16]:
from snsynth.transform import TableTransformer, BinTransformer, LabelTransformer

# Every column except the last one is a pixel column.
n_pixel_columns = data_np.shape[1] - 1

# Each pixel column gets its own BinTransformer (4 bins over [0, 255]); the
# final column gets a LabelTransformer.
transformers = []
for _ in range(n_pixel_columns):
    transformers.append(BinTransformer(lower=0.0, upper=255.0, bins=4))
transformers.append(LabelTransformer())

transformer = TableTransformer(transformers)


In [19]:
# Target privacy budget and the size of the random subsample used for fitting.
epsilon = 1
subsample_size = 100
subsample_frac = subsample_size / len(data_df)

# Seed the draw so that re-running the notebook fits on the same rows.
data_subsample_df = data_df.sample(n=subsample_size, random_state=42)

# The synthesizer receives `epsilon` divided by the sampling fraction.
# NOTE(review): presumably this relies on privacy amplification by
# subsampling to arrive at the target `epsilon` overall -- confirm.
synth = GSDSynthesizer(epsilon=epsilon / subsample_frac, delta=1e-5, tree_height=6, verbose=True)
synth.fit(data_subsample_df,
          transformer=transformer,
          marginals=marginals,
          early_stop_threshold=0.0001,
          N_prime=subsample_size)


Marginal= ('p_0_0', 'p_0_1', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_0', 'p_1_0', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_1', 'p_0_2', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_1', 'p_1_1', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_2', 'p_0_3', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_2', 'p_1_2', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_3', 'p_0_4', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_3', 'p_1_3', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_4', 'p_0_5', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_4', 'p_1_4', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_5', 'p_0_6', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_5', 'p_1_5', 'Label') . Sigma=0.0236. Top.Level=1. Max.Size=None
Marginal= ('p_0_6', 'p_0_7',

2023-09-27 10:44:21.749040: E external/org_tensorflow/tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:429] Could not create cudnn handle: CUDNN_STATUS_NOT_INITIALIZED
2023-09-27 10:44:21.749085: E external/org_tensorflow/tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:438] Possibly insufficient driver version: 530.30.2
2023-09-27 10:44:21.749652: E external/org_tensorflow/tensorflow/compiler/xla/status_macros.cc:54] INTERNAL: RET_CHECK failure (external/org_tensorflow/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:641) dnn != nullptr 
*** Begin stack trace ***
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	_PyObject_MakeTpCall
	
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	PyObject_Call
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_EvalFrameDefault
	
	_PyEval_EvalFrameDefault
	_PyFunction_Vectorcall
	_PyEval_

XlaRuntimeError: INTERNAL: RET_CHECK failure (external/org_tensorflow/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:641) dnn != nullptr 

In [None]:
# Inputs for the comparison, defined here so the cell does not depend on names
# that are only assigned further down the notebook: the first `subsample_size`
# real rows and an equally sized sample from the fitted synthesizer.
real = pd.DataFrame(data_np[:subsample_size])
synthetic = synth.sample(subsample_size)

model = RidgeClassifier

# np.asarray() makes the synthetic values line up with `real.columns` by
# position; passing a DataFrame together with `columns=` would select columns
# by name instead and yield NaNs whenever the names differ.
synth_df = pd.DataFrame(np.asarray(synthetic),
    columns=real.columns)

# The last column is the label; everything before it is a feature.
X = real.iloc[:, :-1]
y = real.iloc[:, -1]
X_synth = synth_df.iloc[:, :-1]
y_synth = synth_df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
x_train_synth, x_test_synth, y_train_synth, y_test_synth = train_test_split(X_synth, y_synth, test_size=0.2, random_state=42)

print(y_train)
print(y_train_synth)

model_real = model()
model_real.fit(x_train, y_train)

# Show the range of the synthetic labels.
print(y_train_synth.max())
print(y_train_synth.min())

# model_fake = model()
# model_fake.fit(x_train_synth, y_train_synth)



In [None]:
# Project-local helper module (not part of this notebook).
import utils

# Use the first `subsample_size` rows as the real comparison set, i.e. as many
# rows as the synthesizer was fitted on.
real = pd.DataFrame(data_np[:subsample_size])

# `synthetic` must already hold the sampled synthetic rows when this cell runs.
# NOTE(review): the helper presumably trains one RidgeClassifier on each
# dataset and returns both fitted models -- confirm against utils.py.
model_real, model_fake = utils.test_real_vs_synthetic_data(real, synthetic, RidgeClassifier, tsne=True)

In [None]:
# Classification
# Plot the weight vector that `model_real` learned for each class as a
# 28x28 image.
coef = model_real.coef_.copy()

# Label each row with the class it belongs to. For a two-class model `coef_`
# has a single row (for the positive class), so keep only the trailing labels.
class_labels = model_real.classes_
if coef.shape[0] != len(class_labels):
    class_labels = class_labels[-coef.shape[0]:]

plt.figure(figsize=(10, 5))
# Shared symmetric colour scale so the panels are comparable.
scale = np.abs(coef).max()
# Iterate over the rows that actually exist instead of assuming ten classes:
# a small training sample may be missing some digits. At most ten panels fit
# into the 2x5 grid.
for i, (label, weights) in enumerate(zip(class_labels[:10], coef[:10])):
    l1_plot = plt.subplot(2, 5, i + 1)
    l1_plot.imshow(weights.reshape(28, 28), interpolation='nearest',
                   cmap=plt.cm.RdBu, vmin=-scale, vmax=scale)
    l1_plot.set_xticks(())
    l1_plot.set_yticks(())
    l1_plot.set_xlabel('Class %i' % label)
plt.suptitle('Classification vector for...')

# Elapsed wall-clock time since `t0` was set.
run_time = time.time() - t0
print('Example run in %.3f s' % run_time)
plt.show()

In [None]:
# Plot the weight vector that `model_fake` learned for each class as a
# 28x28 image.
coef = model_fake.coef_.copy()

# Label each row with the class it belongs to. For a two-class model `coef_`
# has a single row (for the positive class), so keep only the trailing labels.
class_labels = model_fake.classes_
if coef.shape[0] != len(class_labels):
    class_labels = class_labels[-coef.shape[0]:]

plt.figure(figsize=(10, 5))
# Shared symmetric colour scale so the panels are comparable.
scale = np.abs(coef).max()
# Iterate over the rows that actually exist instead of assuming ten classes:
# the training data may not contain every digit. At most ten panels fit into
# the 2x5 grid.
for i, (label, weights) in enumerate(zip(class_labels[:10], coef[:10])):
    l1_plot = plt.subplot(2, 5, i + 1)
    l1_plot.imshow(weights.reshape(28, 28), interpolation='nearest',
                   cmap=plt.cm.RdBu, vmin=-scale, vmax=scale)
    l1_plot.set_xticks(())
    l1_plot.set_yticks(())
    l1_plot.set_xlabel('Class %i' % label)
plt.suptitle('Classification vector for...')

# Elapsed wall-clock time since `t0` was set.
run_time = time.time() - t0
print('Example run in %.3f s' % run_time)
plt.show()