# Import packages

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import os
import random
import bisect

# Utility functions

In [3]:
def uncompressArray(file_dir, confirm=True):
  '''
  Load every array stored in an .npz archive, in the archive's key order.

  The keys are always printed first so the caller can check what is about
  to be loaded.

  Args:
    file_dir: path of the .npz archive.
    confirm: when True (default) the user is asked to type 'y' before the
      arrays are read. Pass False to skip the prompt so the notebook can be
      executed non-interactively (e.g. Restart & Run All).

  Returns:
    list of np.ndarray, one per key, ordered like the printed keys. The list
    is empty when the user answers anything other than 'y'.
  '''
  uncompressed_data = []
  # NpzFile is a context manager: the underlying file is closed on exit, so
  # each array is copied out while the archive is still open.
  with np.load(file_dir) as loaded_file:
    ks = list(loaded_file.keys())
    print("First, check the data!")
    print(f"Keys: {ks}")
    if confirm:
      ans = input("Please enter 'y' if you want to proceed: ")
    else:
      ans = 'y'
    if ans == 'y':
      print("\nloading data !")
      for k in ks:
        uncompressed_data.append(loaded_file[k].copy())
        print(f"load: {k}")
    else:
      print("data is not loaded!")
  return uncompressed_data

def reshape_data(X):
  '''Append a trailing axis of size 1 to X, keeping its first three dimensions.'''
  n_samples, height, width = X.shape[0], X.shape[1], X.shape[2]
  return X.reshape((n_samples, height, width, 1))

In [4]:
# def Predict(class_preds=np.array([]), disc_preds=np.array([]), thrs=[]):
#   class_preds = class_preds.tolist()
#   disc_preds = disc_preds.tolist()
#   if disc_preds and (not class_preds):
#     print(f"Predicting using: Discriminator algorithm")
#     Preds = predictDisc(disc_preds)
#   elif class_preds and thrs and (not disc_preds):
#     print(f"Predicting using: Threshold algorithm")
#     Preds = predictThr(class_preds, thrs)
#   elif class_preds and thrs and disc_preds:
#     print(f"Predicting using: Combined algorithm")
#     Preds = predictCombined(class_preds, disc_preds, thrs)
#   else:
#     print("Not enough OR wrong input data, cannot proceed with prediction!!")
#     return None
  
#   return Preds

# def predictThr(class_preds, thrs):
#   results = dict()
#   for thr in thrs:
#     results[thr] = list()
#     for y_pred in class_preds:
#       pred = np.argmax(y_pred)
#       pred_confidence = y_pred[pred]

#       result = 0
#       if isOOD(pred_confidence, thr): result = 1
#       results[thr].append(result)
#   return results

# def predictDisc(disc_preds):
#   thr = 0.5
#   results = dict()
#   results[thr] = list()
#   for disc_pred in disc_preds:
#     result = 0
#     if isOOD(disc_pred[0], thr): result = 1
#     results[thr].append(result)
#   return results

# def predictCombined(class_preds, disc_preds, thrs):
#   combined_results = dict()
#   disc_results = predictDisc(disc_preds)
#   thr_results = predictThr(class_preds, thrs)

#   for thr in thr_results.keys():
#     results = np.logical_or(disc_results[0.5]), np.array(thr_results[thr])
#     combined_results[thr] = results.tolist()
  
#   return combined_results

# def isOOD(pred, thr):
#   return True if pred < thr else False

# def Evaluate(results, clean_data=False, Plot=True):
#   eval_results = dict()
#   for thr in results.keys():
#     target = 0 if clean_data else 1
#     acc = round(((results[thr].count(target))/len(results[thr]))*100, 2)
#     eval_results[thr] = acc
#   if Plot: plotLine(list(eval_results.keys()), list(eval_results.values()))
#   return eval_results

# def plotLine(x, y):
#   plt.ylim(0, 110)
#   plt.grid()
#   plt.plot(x, y)
#   plt.xlabel("threshold")
#   plt.ylabel("Accuracy")

# Data preprocessing

## Import data

### Clean data

In [5]:
# Archive holding the clean train/test splits (absolute path under /content/drive).
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/clean_data.npz'

# uncompressArray returns the arrays in the archive's key order, so this
# unpacking relies on the keys being stored as X_train, X_test, Y_train, Y_test.
# The call prompts for a 'y' confirmation before loading.
X_clean_train, X_clean_test, Y_clean_train, Y_clean_test = uncompressArray(data_file)

First, check the data!
Keys: ['X_clean_train', 'X_clean_test', 'Y_clean_train', 'Y_clean_test']
Please enter 'y' if you want to proceed: y

loading data !
load: X_clean_train
load: X_clean_test
load: Y_clean_train
load: Y_clean_test


In [6]:
# Sanity check: display the shapes of the four loaded arrays.
X_clean_train.shape, X_clean_test.shape, Y_clean_train.shape, Y_clean_test.shape

((401302, 28, 28, 1), (10000, 28, 28, 1), (401302,), (10000,))

In [7]:
# Scale both image arrays by 1/255 (presumably 8-bit pixels -> [0, 1]; confirm the raw value range).
# NOTE(review): the results are rebound to the same names, so re-running this
# cell divides by 255 a second time. Run it exactly once per kernel session.
X_clean_train = X_clean_train/255.
X_clean_test = X_clean_test/255.

# Generate CDF and rCDF

## Import model

In [8]:
# Name of the saved model; it is interpolated into the .h5 path below.
class_name = 'vgg16_DO_classifier'
class_path = f'/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/saved_models/{class_name}.h5'
# Restore the Keras model from disk; `classifier` is used later to score the clean test images.
classifier = tf.keras.models.load_model(class_path)

## DPM implementation

In [9]:
def DistFun(val_decisions, gt, n_classes, reverse=False):
  '''
  Build one empirical distribution table per class from classifier scores.

  For class c the table is built from column c of `val_decisions`:
    * reverse=False: rows whose label equals c, scores sorted ascending,
      closed by a +inf sentinel row.
    * reverse=True : rows whose label differs from c, scores sorted
      descending, closed by a -inf sentinel row.
  Row k of a table is (score, (k+1)/(n+2)), n being the number of selected rows.

  Args:
    val_decisions: num images x num of classes
    gt: integer labels, num images (an N x 1 column is flattened)
    n_classes: number of classes (int)
    reverse: selects the ascending/positive or descending/negative variant

  Returns:
    np.ndarray. When every class selects the same number of rows the result
    is a float array of shape (n_classes, n+1, 2). Otherwise it is a 1-D
    object array of length n_classes whose element c is an (n_c+1, 2) float
    array. Both forms support `result[c][:, 0]` / `result[c][:, 1]`.
  '''
  val_decisions = np.asarray(val_decisions)
  # A column-vector mask would not index the rows of val_decisions correctly.
  gt = np.asarray(gt).ravel()
  sentinel = float('-inf') if reverse else float('inf')

  tables = []
  for class_idx in range(n_classes):
    mask = (gt != class_idx) if reverse else (gt == class_idx)
    scores = np.sort(val_decisions[mask, class_idx])
    if reverse:
      scores = scores[::-1]

    n = scores.shape[0]
    table = np.zeros((n + 1, 2))
    table[:n, 0] = scores
    table[n, 0] = sentinel
    table[:, 1] = np.arange(1, n + 2) / (n + 2)
    tables.append(table)

  if len({table.shape for table in tables}) <= 1:
    return np.array(tables)

  # Per-class tables usually differ in length; np.array() on such a ragged
  # list raises on current NumPy, so the object array is filled explicitly.
  ragged = np.empty(len(tables), dtype=object)
  for class_idx, table in enumerate(tables):
    ragged[class_idx] = table
  return ragged

def CDFs(decisions, gt, n_classes):
  '''Return the (forward, reverse) distribution tables DistFun builds for these decisions.'''
  forward, backward = (
    DistFun(decisions, gt, n_classes, reverse=flag) for flag in (False, True)
  )
  return forward, backward

# def DPM_pred(cdf, rcdf, test_decisions, n_classes, method):
#   dpm_decisions = np.zeros(len(test_decisions[0]), len(test_decisions[1])+1) ### ????

#   for ii in range(len(test_decisions[0])):
#     prod = 0
#     for class_idx1 in range(n_classes):
#       for cdf_idx, cdf_val in cdf[class_idx1]:
#         if test_decisions[ii][class_idx1] < cdf_val[cdf_idx][1]:
#           prod = cdf_val[cdf_idx][1]
#           break
      
#       for class_idx2 in range(n_classes):
#         if class_idx2 != class_idx1:
#           for rcdf_idx, rcdf_val in rcdf[class_idx2]:
#             if test_decisions[ii][class_idx2] > rcdf_val[rcdf_idx][0]:
#               if method.lower() == 'plus':
#                 prod += rcdf_val[rcdf_idx][1]
#               else:
#                 prod *= rcdf_val[rcdf_idx][1]
#       dpm_decisions[ii][class_idx1] = prod
    
#     prod = 0
#     for class_idx2 in range(n_classes):
#       for rcdf_idx, rcdf_val in rcdf[class_idx2]:
#         if test_decisions[ii][class_idx2] > rcdf_val[rcdf_idx][0]:
#           if method.lower() == 'plus':
#             prod += rcdf_val[rcdf_idx][1]
#           else:
#             prod *= rcdf_val[rcdf_idx][1]
#     dpm_decisions[ii][n_classes+1] = prod
  
#   ood_list = np.array((len(dpm_decisions[0]), 1))
#   for i in range(len(ood_list)):
#     ind = max(dpm_decisions[i])
#     if ind == n_classes+1:
#       ood_list[i] = 'ood'
#     else:
#       ood_list[i] = ind

#   return dpm_decisions, ood_list

In [10]:
def DPM_pred(cdf, rcdf, test_decisions, n_classes, method=None):
  '''
  Label every test sample as a known class or as out-of-distribution (OOD).

  test_decisions: num test images x num of classes
  cdf: num of classes x (num of images in this class + 1) x 2 (score, cumulative prob),
       scores ascending with a trailing +inf row
  rcdf: same layout built from the images NOT in the class,
        scores descending with a trailing -inf row
  n_classes: number of classes (int); also the label returned for OOD
  method: unused, kept so existing calls keep working

  Returns a list with one label per test sample.
  '''
  preds = []
  for test_decision in test_decisions:
    # Each sample is judged on its own scores only, not on the history of earlier samples.
    prod_k = porbOOD(cdf, rcdf, test_decision, n_classes)
    Pi = porbID(cdf, rcdf, test_decision, n_classes, prod_k)
    preds.append(Pred(Pi, prod_k, test_decision, n_classes))

  return preds

def _rcdf_value(rcdf_class, score):
  '''Look up the reverse-CDF probability of `score` in one class table.'''
  scores = rcdf_class[:, 0]
  # The scores are stored in descending order, which bisect cannot search
  # directly. The reversed view is ascending: bisect_left on it counts the
  # entries strictly below `score`, which sit at the tail of the table.
  n_below = bisect.bisect_left(scores[::-1], score)
  item_idx = len(scores) - n_below  # first entry that is strictly below score
  return rcdf_class[:, 1][item_idx]

def porbOOD(cdf, rcdf, test_decision, n_classes):
  '''OOD score of one sample: product of its reverse-CDF values over all classes.'''
  prod_k = 1
  for class_idx in range(n_classes):
    prod_k *= _rcdf_value(rcdf[class_idx], test_decision[class_idx])
  return prod_k

def porbID(cdf, rcdf, test_decision, n_classes, Pk):
  '''
  Per-class in-distribution scores of one sample.

  Pk is the sample's OOD score from porbOOD; a sequence is also accepted,
  in which case its last element is used.
  '''
  prod_k = Pk[-1] if np.ndim(Pk) else Pk
  Pi = []
  for class_idx in range(n_classes):
    item_idx = bisect.bisect_right(cdf[class_idx][:,0], test_decision[class_idx]) # (list, value to compare)
    item_idx = max(0, item_idx-1) # the idx of item from the left side
    Fpi = cdf[class_idx][:,1][item_idx]
    # cdf and rcdf tables have different lengths, so the reverse-CDF value
    # needs its own lookup instead of reusing the cdf index.
    Fni = _rcdf_value(rcdf[class_idx], test_decision[class_idx])
    Fnj = prod_k/Fni # product of the reverse-CDF values of all other classes
    Pi.append(Fpi*Fnj)
  return Pi

def Pred(Pis, Pk, test_decision, n_classes):
  '''
  Pick the label of one sample: its arg-max class when the OOD score does not
  exceed the best in-distribution score, otherwise n_classes (OOD).

  Pis / Pk are the sample's in-distribution scores and OOD score; accumulated
  histories (list of lists / list) are accepted and their last entry is used.
  '''
  ood_score = Pk[-1] if np.ndim(Pk) else Pk
  id_scores = Pis[-1] if np.ndim(Pis) > 1 else Pis
  pred = n_classes
  if ood_score <= max(id_scores):
    pred = int(np.argmax(test_decision))
  return pred

In [11]:
# Classifier outputs for the clean test images; used below as the per-class decision scores.
clean_class_Preds = classifier.predict(X_clean_test)

In [12]:
# Build the per-class CDF / reverse-CDF tables from the clean test predictions
# and their labels. 37 is passed as the number of classes.
cdf, rcdf = CDFs(clean_class_Preds, Y_clean_test, 37)



In [13]:
# DistFun already returns np.ndarray, so this only copies the tables; contents are unchanged.
cdf, rcdf = np.array(cdf), np.array(rcdf)

In [15]:
# NOTE: data_file is rebound here; from this cell on it points at the CDF archive,
# no longer at the clean-data archive.
data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/CDFs.npz'
# Store both tables in one compressed .npz under the keys 'cdf' and 'rcdf'.
np.savez_compressed(data_file, cdf=cdf, rcdf=rcdf)

In [15]:
# data_file = '/content/drive/MyDrive/PhD/Szeged22_paper/Atca_Cyber_long_paper/data/CDFs.npz'

# cdf, rcdf = uncompressArray(data_file)

In [None]:
# allow_pickle=True lets np.load unpickle object arrays; unpickling can execute
# arbitrary code, so only open archives you created yourself.
npz_file = np.load(data_file, allow_pickle=True)
# Materialise every stored array into a plain dict keyed by archive key.
D = {key: npz_file[key] for key in npz_file}