# Import packages

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import os
import random
import bisect
from tqdm import tqdm

# Utility functions

In [2]:
def uncompressArray(file_dir, confirm=True):
  """Load every array stored in an ``.npz`` archive, in key order.

  The archive keys are printed first. With ``confirm=True`` (the default) the
  user must then type ``'y'`` before anything is loaded; pass
  ``confirm=False`` to skip the prompt so the notebook can run unattended
  (e.g. Restart Kernel -> Run All).

  Args:
    file_dir: Path of the ``.npz`` file to read.
    confirm: Whether to ask for interactive confirmation before loading.

  Returns:
    A list with a copy of each stored array, ordered like the archive's keys.
    The list is empty when the prompt is declined, so a caller that unpacks
    the result into several names raises ``ValueError`` in that case.
  """
  uncompressed_data = []
  # np.load on an .npz returns an NpzFile that reads arrays on item access;
  # the context manager closes it once every array has been copied out.
  with np.load(file_dir) as loaded_file:
    ks = list(loaded_file.keys())
    print("First, check the data!")
    print(f"Keys: {ks}")
    if confirm:
      ans = input("Please enter 'y' if you want to proceed: ")
    else:
      ans = 'y'
    if ans == 'y':
      print("\nloading data !")
      for k in ks:
        # .copy() detaches the array from the archive before it is closed.
        uncompressed_data.append(loaded_file[k].copy())
        print(f"load: {k}")
    else:
      print("data is not loaded!")
  return uncompressed_data

def reshape_data(X):
  """Return ``X`` reshaped so that a trailing axis of size 1 follows its first three axes.

  An input of shape ``(d0, d1, d2)`` comes back with shape ``(d0, d1, d2, 1)``.
  """
  dims = X.shape
  target_shape = (dims[0], dims[1], dims[2], 1)
  return X.reshape(target_shape)

# Data preprocessing

## Import data

### MNIST

In [3]:
from keras.datasets import mnist

# Only the test split is used. Index it directly instead of unpacking the
# training split into `_`: in IPython `_` is the last-output variable, and
# binding it would also keep the unused training arrays referenced.
X_mnist_test, Y_mnist_test = mnist.load_data()[1]
X_mnist_test.shape, Y_mnist_test.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


((10000, 28, 28), (10000,))

In [4]:
# Invert the intensities (255 - pixel), append a trailing channel axis, then
# divide by 255. The name is rebound, so re-running this cell would apply the
# transform a second time.
X_mnist_test = (255 - X_mnist_test)[..., np.newaxis] / 255.
X_mnist_test.shape

(10000, 28, 28, 1)

### Binary MNIST

In [5]:
# Binary MNIST: start from copies of the preprocessed MNIST test images and labels.
X_bmnist_test = X_mnist_test.copy()
Y_bmnist_test = Y_mnist_test.copy()

# Threshold at the midpoint of the observed intensity range. The midpoint is
# lower + (upper - lower)/2; the previous abs(upper-lower)/2 is only half the
# range and equals the midpoint solely when lower == 0.
upper, lower = np.max(X_mnist_test), np.min(X_mnist_test)
thr = lower + (upper - lower)/2
# Pixels above the threshold become `upper`, all others become `lower`.
X_bmnist_test = np.where(X_bmnist_test>thr, upper, lower)

X_bmnist_test.shape, Y_bmnist_test.shape

((10000, 28, 28, 1), (10000,))

### KMNIST (Kuzushiji-MNIST)

In [6]:
ds = tfds.load('kmnist', split='test', as_supervised=True)

# Materialise the split as a list of (image, label) NumPy pairs, then drop the
# dataset handle.
kmnist_pairs = list(tfds.as_numpy(ds))
del ds

# Stack the images and the labels into one array each.
# NOTE(review): unlike the MNIST and Fashion-MNIST cells, the images are not
# inverted or divided by 255 here - confirm this is intended.
X_kmnist_test = np.asarray([pair[0] for pair in kmnist_pairs])
Y_kmnist_test = np.asarray([pair[1] for pair in kmnist_pairs])
del kmnist_pairs

X_kmnist_test.shape, Y_kmnist_test.shape

[1mDownloading and preparing dataset kmnist/3.0.1 (download: 20.26 MiB, generated: 31.76 MiB, total: 52.02 MiB) to /root/tensorflow_datasets/kmnist/3.0.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/kmnist/3.0.1.incomplete00B0LP/kmnist-train.tfrecord


  0%|          | 0/60000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/kmnist/3.0.1.incomplete00B0LP/kmnist-test.tfrecord


  0%|          | 0/10000 [00:00<?, ? examples/s]

[1mDataset kmnist downloaded and prepared to /root/tensorflow_datasets/kmnist/3.0.1. Subsequent calls will reuse this data.[0m


((10000, 28, 28, 1), (10000,))

### Fashion-MNIST

In [7]:
from keras.datasets import fashion_mnist

# Only the test split is used. Index it directly instead of unpacking the
# training split into `_`: in IPython `_` is the last-output variable, and
# binding it would also keep the unused training arrays referenced.
X_fmnist_test, Y_fmnist_test = fashion_mnist.load_data()[1]
X_fmnist_test.shape, Y_fmnist_test.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


((10000, 28, 28), (10000,))

In [8]:
# Invert the intensities (255 - pixel), append a trailing channel axis, then
# divide by 255. The name is rebound, so re-running this cell would apply the
# transform a second time.
X_fmnist_test = (255 - X_fmnist_test)[..., np.newaxis] / 255.
X_fmnist_test.shape

(10000, 28, 28, 1)

### Gaussian data

In [10]:
# Archive holding the two Gaussian image sets.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no
# mount step is visible in this notebook - confirm it runs before this cell.
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/gauss_data.npz'

# uncompressArray asks for confirmation and returns the arrays in archive key
# order; the two-name unpacking relies on that order and raises ValueError if
# the prompt is declined (an empty list is returned).
X_num_gauss, X_rnd_gauss = uncompressArray(data_file)

First, check the data!
Keys: ['X_num_gauss', 'X_rnd_gauss']
Please enter 'y' if you want to proceed: y

loading data !
load: X_num_gauss
load: X_rnd_gauss


In [11]:
# Divide both Gaussian image arrays by 255 (true division, so the results are
# floating point). The names are rebound, so re-running this cell divides again.
X_num_gauss, X_rnd_gauss = X_num_gauss/255., X_rnd_gauss/255.

X_num_gauss.shape, X_rnd_gauss.shape

((10000, 28, 28, 1), (10000, 28, 28, 1))

In [12]:
# Every Gaussian sample gets the constant label 1.0. The label count is taken
# from the image arrays instead of a hard-coded 10000, so labels and images
# stay aligned if the archive size changes.
Y_num_gauss = np.ones((len(X_num_gauss), ))
Y_rnd_gauss = np.ones((len(X_rnd_gauss), ))
Y_num_gauss.shape, Y_rnd_gauss.shape

((10000,), (10000,))

### Clean data

In [13]:
# Archive holding the clean train/test images and labels. `data_file` is
# rebound here; it previously pointed at the Gaussian archive.
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/clean_data.npz'

# The four-name unpacking relies on the archive's key order
# (X train, X test, Y train, Y test per the printed keys) and raises
# ValueError if the confirmation prompt is declined.
X_clean_train, X_clean_test, Y_clean_train, Y_clean_test = uncompressArray(data_file)

First, check the data!
Keys: ['X_clean_train', 'X_clean_test', 'Y_clean_train', 'Y_clean_test']
Please enter 'y' if you want to proceed: y

loading data !
load: X_clean_train
load: X_clean_test
load: Y_clean_train
load: Y_clean_test


In [14]:
# Show the shape of each clean-data array (train images, test images, train labels, test labels).
tuple(arr.shape for arr in (X_clean_train, X_clean_test, Y_clean_train, Y_clean_test))

((401302, 28, 28, 1), (10000, 28, 28, 1), (401302,), (10000,))

In [15]:
# Divide the clean images by 255, one array at a time. The names are rebound,
# so re-running this cell divides again.
X_clean_train = np.true_divide(X_clean_train, 255.)
X_clean_test = np.true_divide(X_clean_test, 255.)

# Create splits

In [16]:
# Presumably the number of samples in each test set (every test array loaded
# above has 10000 rows) - only referenced by the commented-out draft below.
full_set_size = 10_000

In [17]:
# Split percentages 0, 5, ..., 100, kept as strings because they are used as
# dictionary keys when the splits are built.
ratios = list(map(str, range(0, 101, 5)))
ratios

['0',
 '5',
 '10',
 '15',
 '20',
 '25',
 '30',
 '35',
 '40',
 '45',
 '50',
 '55',
 '60',
 '65',
 '70',
 '75',
 '80',
 '85',
 '90',
 '95',
 '100']

In [18]:
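# Unused draft, kept for reference: absolute size of each split of a set with `full_set_size` samples.
# ratio_size = [int((full_set_size*int(n))/100) for n in ratios]
# ratio_size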

In [19]:
# Label array of every evaluation set, keyed by a short dataset name.
data_sets = {
  'mnist': Y_mnist_test,
  'bmnist': Y_bmnist_test,
  'kmnist': Y_kmnist_test,
  'fmnist': Y_fmnist_test,
  'num': Y_num_gauss,
  'rnd': Y_rnd_gauss,
  'clean': Y_clean_test,
}
ds_keys = list(data_sets.keys())

# Build the placeholders with comprehensions. dict.fromkeys(keys, mutable)
# stores the *same* object under every key, so every ratio would share one
# list and every dataset one dict.
ratio_dict = {ratio: [] for ratio in ratios}
splits = {ds_key: {ratio: [] for ratio in ratios} for ds_key in ds_keys}

In [20]:
# For every dataset and every ratio r, collect the first r% (rounded down) of
# the sample indices of each class, in order of appearance, so each split keeps
# roughly the per-class proportions of the full set.
for ds, labels in data_sets.items():
  # The per-class index arrays do not depend on the ratio, so compute them
  # once per dataset instead of once per (ratio, class) pair.
  class_idxs = [np.where(labels==label)[0] for label in np.unique(labels)]
  ds_dict = dict()
  for ratio in ratios:
    ratio_idxs = list()
    for label_idxs in class_idxs:
      # Integer floor division: number of samples of this class to keep.
      ratio_size = (len(label_idxs)*int(ratio))//100
      ratio_idxs.extend(label_idxs[:ratio_size])
    ds_dict[ratio] = ratio_idxs
  # Replace the placeholder entry with this dataset's ratio -> indices mapping.
  splits[ds] = ds_dict

In [22]:
# Destination archive for the split indices. `data_file` is rebound again here.
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/ratios_idxs.npz'
# `splits` is a nested dict, not an array, so it is stored as a pickled object
# under the key 'ratios_idxs'; reading it back needs allow_pickle=True.
np.savez_compressed(data_file, ratios_idxs=splits)

In [213]:
# Read the archive back to check the round trip.
# NOTE(review): allow_pickle=True unpickles the stored object, which can run
# arbitrary code - only use it on files written by this notebook.
npz_file = np.load(data_file, allow_pickle=True)
# .item() unwraps the stored object array into the nested dict that was saved.
D = npz_file['ratios_idxs'].item()

In [214]:
# Top-level keys of the reloaded dict: one entry per dataset name.
D.keys()

dict_keys(['mnist', 'bmnist', 'kmnist', 'fmnist', 'num', 'rnd', 'clean'])